mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Split position DB into fid and relative position DB
This commit is contained in:
parent
56b7209f26
commit
9b2653427d
@ -21,5 +21,5 @@ pub use self::roaring_bitmap_length::{
|
|||||||
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
|
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
|
||||||
};
|
};
|
||||||
pub use self::script_language_codec::ScriptLanguageCodec;
|
pub use self::script_language_codec::ScriptLanguageCodec;
|
||||||
pub use self::str_beu32_codec::StrBEU32Codec;
|
pub use self::str_beu32_codec::{StrBEU32Codec, StrBEU16Codec};
|
||||||
pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
|
pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||||
|
@ -36,3 +36,37 @@ impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
|
|||||||
Some(Cow::Owned(bytes))
|
Some(Cow::Owned(bytes))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Codec for keys made of a UTF-8 string immediately followed by a big-endian `u16`.
pub struct StrBEU16Codec;
|
||||||
|
|
||||||
|
impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
|
||||||
|
type DItem = (&'a str, u16);
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let footer_len = size_of::<u16>();
|
||||||
|
|
||||||
|
if bytes.len() < footer_len {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
|
||||||
|
let word = str::from_utf8(word).ok()?;
|
||||||
|
let pos = bytes.try_into().map(u16::from_be_bytes).ok()?;
|
||||||
|
|
||||||
|
Some((word, pos))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
|
||||||
|
type EItem = (&'a str, u16);
|
||||||
|
|
||||||
|
fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
let pos = pos.to_be_bytes();
|
||||||
|
|
||||||
|
let mut bytes = Vec::with_capacity(word.len() + pos.len());
|
||||||
|
bytes.extend_from_slice(word.as_bytes());
|
||||||
|
bytes.extend_from_slice(&pos[..]);
|
||||||
|
|
||||||
|
Some(Cow::Owned(bytes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -19,12 +19,12 @@ use crate::heed_codec::facet::{
|
|||||||
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||||
FieldIdCodec, OrderedF64Codec,
|
FieldIdCodec, OrderedF64Codec,
|
||||||
};
|
};
|
||||||
use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec};
|
use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
|
||||||
use crate::{
|
use crate::{
|
||||||
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
|
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
|
||||||
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
|
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
|
||||||
FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
|
FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
|
||||||
Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32,
|
Search, U8StrStrCodec, BEU16, BEU32,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
||||||
@ -76,7 +76,9 @@ pub mod db_name {
|
|||||||
pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
|
pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
|
||||||
pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
|
pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
|
||||||
pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
|
pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
|
||||||
|
pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
|
||||||
pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
|
pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
|
||||||
|
pub const WORD_PREFIX_FIELD_ID_DOCIDS: &str = "word-prefix-field-id-docids";
|
||||||
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
|
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
|
||||||
pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
|
pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
|
||||||
pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
|
pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
|
||||||
@ -118,11 +120,16 @@ pub struct Index {
|
|||||||
pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
/// Maps the word and the position with the docids that corresponds to it.
|
/// Maps the word and the position with the docids that corresponds to it.
|
||||||
pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
|
pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||||
|
/// Maps the word and the field id with the docids that correspond to it.
|
||||||
|
pub word_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
/// Maps the field id and the word count with the docids that corresponds to it.
|
/// Maps the field id and the word count with the docids that corresponds to it.
|
||||||
pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
|
pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the position of a word prefix with all the docids where this prefix appears.
|
/// Maps the position of a word prefix with all the docids where this prefix appears.
|
||||||
pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
|
pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||||
|
/// Maps the word prefix and the field id with the docids that correspond to it.
|
||||||
|
pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
/// Maps the script and language with all the docids that corresponds to it.
|
/// Maps the script and language with all the docids that corresponds to it.
|
||||||
pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
|
pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
|
||||||
@ -153,7 +160,7 @@ impl Index {
|
|||||||
) -> Result<Index> {
|
) -> Result<Index> {
|
||||||
use db_name::*;
|
use db_name::*;
|
||||||
|
|
||||||
options.max_dbs(19);
|
options.max_dbs(21);
|
||||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||||
|
|
||||||
let env = options.open(path)?;
|
let env = options.open(path)?;
|
||||||
@ -170,8 +177,10 @@ impl Index {
|
|||||||
let prefix_word_pair_proximity_docids =
|
let prefix_word_pair_proximity_docids =
|
||||||
env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
|
env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||||
let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
|
let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
|
||||||
|
let word_fid_docids = env.create_database(Some(WORD_FIELD_ID_DOCIDS))?;
|
||||||
let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
|
let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
|
||||||
let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
|
let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
|
||||||
|
let word_prefix_fid_docids = env.create_database(Some(WORD_PREFIX_FIELD_ID_DOCIDS))?;
|
||||||
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
|
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
|
||||||
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
|
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
|
||||||
let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
|
let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
|
||||||
@ -196,7 +205,9 @@ impl Index {
|
|||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
prefix_word_pair_proximity_docids,
|
prefix_word_pair_proximity_docids,
|
||||||
word_position_docids,
|
word_position_docids,
|
||||||
|
word_fid_docids,
|
||||||
word_prefix_position_docids,
|
word_prefix_position_docids,
|
||||||
|
word_prefix_fid_docids,
|
||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
|
@ -152,6 +152,23 @@ pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, Relative
|
|||||||
pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
|
pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
|
||||||
(field_id as u32) << 16 | (relative as u32)
|
(field_id as u32) << 16 | (relative as u32)
|
||||||
}
|
}
|
||||||
|
// TODO: this is wrong, but will do for now
|
||||||
|
/// Compute the "bucketed" version of a relative position inside a field.
///
/// In a bucketed position, the accuracy of the relative position is reduced
/// exponentially as it gets larger:
///
/// - positions below 16 are kept intact;
/// - positions in `16..24` all become 24;
/// - any other position is rounded up to the next power of two (a position that
///   already is a power of two is returned unchanged);
/// - positions above 32768, whose next power of two does not fit in a `u16`,
///   become `u16::MAX`.
pub fn bucketed_position(relative: u16) -> u16 {
    match relative {
        // The first few relative positions are kept intact.
        0..=15 => relative,
        // Relative positions between 16 and 24 all become equal to 24.
        16..=23 => 24,
        // Integer arithmetic keeps the rounding exact and makes the clamp for
        // values above 2^15 explicit instead of relying on a float-to-int cast.
        _ => relative.checked_next_power_of_two().unwrap_or(u16::MAX),
    }
}
|
||||||
|
|
||||||
/// Transform a raw obkv store into a JSON Object.
|
/// Transform a raw obkv store into a JSON Object.
|
||||||
pub fn obkv_to_json(
|
pub fn obkv_to_json(
|
||||||
|
@ -199,7 +199,7 @@ impl<'t> Criterion for Attribute<'t> {
|
|||||||
struct QueryPositionIterator<'t> {
|
struct QueryPositionIterator<'t> {
|
||||||
#[allow(clippy::type_complexity)]
|
#[allow(clippy::type_complexity)]
|
||||||
inner:
|
inner:
|
||||||
Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u32), RoaringBitmap)>> + 't>>>,
|
Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u16), RoaringBitmap)>> + 't>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> QueryPositionIterator<'t> {
|
impl<'t> QueryPositionIterator<'t> {
|
||||||
@ -241,7 +241,7 @@ impl<'t> QueryPositionIterator<'t> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Iterator for QueryPositionIterator<'t> {
|
impl<'t> Iterator for QueryPositionIterator<'t> {
|
||||||
type Item = heed::Result<(u32, RoaringBitmap)>;
|
type Item = heed::Result<(u16, RoaringBitmap)>;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
// sort inner words from the closest next position to the farthest next position.
|
// sort inner words from the closest next position to the farthest next position.
|
||||||
@ -281,9 +281,9 @@ impl<'t> Iterator for QueryPositionIterator<'t> {
|
|||||||
/// A Branch is represent a possible alternative of the original query and is build with the Query Tree,
|
/// A Branch is represent a possible alternative of the original query and is build with the Query Tree,
|
||||||
/// This branch allows us to iterate over meta-interval of positions.
|
/// This branch allows us to iterate over meta-interval of positions.
|
||||||
struct Branch<'t> {
|
struct Branch<'t> {
|
||||||
query_level_iterator: Vec<(u32, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
|
query_level_iterator: Vec<(u16, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
|
||||||
last_result: (u32, RoaringBitmap),
|
last_result: (u16, RoaringBitmap),
|
||||||
branch_size: u32,
|
branch_size: u16,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Branch<'t> {
|
impl<'t> Branch<'t> {
|
||||||
@ -303,7 +303,7 @@ impl<'t> Branch<'t> {
|
|||||||
let mut branch = Self {
|
let mut branch = Self {
|
||||||
query_level_iterator,
|
query_level_iterator,
|
||||||
last_result: (0, RoaringBitmap::new()),
|
last_result: (0, RoaringBitmap::new()),
|
||||||
branch_size: flatten_branch.len() as u32,
|
branch_size: flatten_branch.len() as u16,
|
||||||
};
|
};
|
||||||
|
|
||||||
branch.update_last_result();
|
branch.update_last_result();
|
||||||
@ -342,7 +342,7 @@ impl<'t> Branch<'t> {
|
|||||||
Some(result) => {
|
Some(result) => {
|
||||||
result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0)
|
result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0)
|
||||||
}
|
}
|
||||||
None => u32::MAX,
|
None => u16::MAX,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@ -378,7 +378,8 @@ impl<'t> Branch<'t> {
|
|||||||
fn compute_rank(&self) -> u32 {
|
fn compute_rank(&self) -> u32 {
|
||||||
// we compute a rank from the position.
|
// we compute a rank from the position.
|
||||||
let (pos, _) = self.last_result;
|
let (pos, _) = self.last_result;
|
||||||
pos.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size
|
pos.saturating_sub((0..self.branch_size).sum()) as u32 * LCM_10_FIRST_NUMBERS
|
||||||
|
/ self.branch_size as u32
|
||||||
}
|
}
|
||||||
|
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
|
@ -171,7 +171,7 @@ pub trait Context<'c> {
|
|||||||
&self,
|
&self,
|
||||||
word: &str,
|
word: &str,
|
||||||
in_prefix_cache: bool,
|
in_prefix_cache: bool,
|
||||||
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>;
|
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>>;
|
||||||
fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
|
fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
|
||||||
fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>;
|
fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>;
|
||||||
fn field_id_word_count_docids(
|
fn field_id_word_count_docids(
|
||||||
@ -322,11 +322,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
|||||||
&self,
|
&self,
|
||||||
word: &str,
|
word: &str,
|
||||||
in_prefix_cache: bool,
|
in_prefix_cache: bool,
|
||||||
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>
|
) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>>
|
||||||
{
|
{
|
||||||
let range = {
|
let range = {
|
||||||
let left = u32::min_value();
|
let left = u16::min_value(); // TODO: this is wrong
|
||||||
let right = u32::max_value();
|
let right = u16::max_value(); // TODO: this is wrong
|
||||||
let left = (word, left);
|
let left = (word, left);
|
||||||
let right = (word, right);
|
let right = (word, right);
|
||||||
left..=right
|
left..=right
|
||||||
@ -360,7 +360,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result<Option<RoaringBitmap>> {
|
fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result<Option<RoaringBitmap>> {
|
||||||
let key = (word, pos);
|
let key = (word, pos as u16); // TODO: this is wrong
|
||||||
self.index.word_position_docids.get(self.rtxn, &key)
|
self.index.word_position_docids.get(self.rtxn, &key)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -899,7 +899,7 @@ pub mod test {
|
|||||||
_word: &str,
|
_word: &str,
|
||||||
_in_prefix_cache: bool,
|
_in_prefix_cache: bool,
|
||||||
) -> heed::Result<
|
) -> heed::Result<
|
||||||
Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>,
|
Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>,
|
||||||
> {
|
> {
|
||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
|
@ -28,8 +28,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
prefix_word_pair_proximity_docids,
|
prefix_word_pair_proximity_docids,
|
||||||
word_position_docids,
|
word_position_docids,
|
||||||
|
word_fid_docids,
|
||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
word_prefix_position_docids,
|
word_prefix_position_docids,
|
||||||
|
word_prefix_fid_docids,
|
||||||
script_language_docids,
|
script_language_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
@ -81,8 +83,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
prefix_word_pair_proximity_docids.clear(self.wtxn)?;
|
prefix_word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
word_position_docids.clear(self.wtxn)?;
|
word_position_docids.clear(self.wtxn)?;
|
||||||
|
word_fid_docids.clear(self.wtxn)?;
|
||||||
field_id_word_count_docids.clear(self.wtxn)?;
|
field_id_word_count_docids.clear(self.wtxn)?;
|
||||||
word_prefix_position_docids.clear(self.wtxn)?;
|
word_prefix_position_docids.clear(self.wtxn)?;
|
||||||
|
word_prefix_fid_docids.clear(self.wtxn)?;
|
||||||
script_language_docids.clear(self.wtxn)?;
|
script_language_docids.clear(self.wtxn)?;
|
||||||
facet_id_f64_docids.clear(self.wtxn)?;
|
facet_id_f64_docids.clear(self.wtxn)?;
|
||||||
facet_id_exists_docids.clear(self.wtxn)?;
|
facet_id_exists_docids.clear(self.wtxn)?;
|
||||||
|
@ -2,8 +2,8 @@ use std::collections::btree_map::Entry;
|
|||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
|
|
||||||
use fst::IntoStreamer;
|
use fst::IntoStreamer;
|
||||||
use heed::types::{ByteSlice, DecodeIgnore, Str};
|
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
|
||||||
use heed::Database;
|
use heed::{BytesDecode, BytesEncode, Database, RwIter};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
@ -239,6 +239,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
prefix_word_pair_proximity_docids,
|
prefix_word_pair_proximity_docids,
|
||||||
word_position_docids,
|
word_position_docids,
|
||||||
word_prefix_position_docids,
|
word_prefix_position_docids,
|
||||||
|
word_fid_docids,
|
||||||
|
word_prefix_fid_docids,
|
||||||
facet_id_f64_docids: _,
|
facet_id_f64_docids: _,
|
||||||
facet_id_string_docids: _,
|
facet_id_string_docids: _,
|
||||||
field_id_docid_facet_f64s: _,
|
field_id_docid_facet_f64s: _,
|
||||||
@ -361,97 +363,34 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
|
for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
|
||||||
// We delete the documents ids from the word prefix pair proximity database docids
|
// We delete the documents ids from the word prefix pair proximity database docids
|
||||||
// and remove the empty pairs too.
|
// and remove the empty pairs too.
|
||||||
let db = db.remap_key_type::<ByteSlice>();
|
Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?;
|
||||||
let mut iter = db.iter_mut(self.wtxn)?;
|
|
||||||
while let Some(result) = iter.next() {
|
|
||||||
let (key, mut docids) = result?;
|
|
||||||
let previous_len = docids.len();
|
|
||||||
docids -= &self.to_delete_docids;
|
|
||||||
if docids.is_empty() {
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.del_current()? };
|
|
||||||
} else if docids.len() != previous_len {
|
|
||||||
let key = key.to_owned();
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.put_current(&key, &docids)? };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
Self::delete_from_db(
|
||||||
// We delete the documents ids that are under the pairs of words,
|
word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||||
// it is faster and use no memory to iterate over all the words pairs than
|
&self.to_delete_docids,
|
||||||
// to compute the cartesian product of every words of the deleted documents.
|
)?;
|
||||||
let mut iter =
|
Self::delete_from_db(
|
||||||
word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
|
word_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||||
while let Some(result) = iter.next() {
|
&self.to_delete_docids,
|
||||||
let (bytes, mut docids) = result?;
|
)?;
|
||||||
let previous_len = docids.len();
|
Self::delete_from_db(
|
||||||
docids -= &self.to_delete_docids;
|
word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||||
if docids.is_empty() {
|
&self.to_delete_docids,
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
)?;
|
||||||
unsafe { iter.del_current()? };
|
Self::delete_from_db(
|
||||||
} else if docids.len() != previous_len {
|
word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||||
let bytes = bytes.to_owned();
|
&self.to_delete_docids,
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
)?;
|
||||||
unsafe { iter.put_current(&bytes, &docids)? };
|
Self::delete_from_db(
|
||||||
}
|
word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||||
}
|
&self.to_delete_docids,
|
||||||
|
)?;
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
// We delete the documents ids that are under the word level position docids.
|
|
||||||
let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
|
||||||
while let Some(result) = iter.next() {
|
|
||||||
let (bytes, mut docids) = result?;
|
|
||||||
let previous_len = docids.len();
|
|
||||||
docids -= &self.to_delete_docids;
|
|
||||||
if docids.is_empty() {
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.del_current()? };
|
|
||||||
} else if docids.len() != previous_len {
|
|
||||||
let bytes = bytes.to_owned();
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.put_current(&bytes, &docids)? };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
// We delete the documents ids that are under the word prefix level position docids.
|
|
||||||
let mut iter =
|
|
||||||
word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
|
||||||
while let Some(result) = iter.next() {
|
|
||||||
let (bytes, mut docids) = result?;
|
|
||||||
let previous_len = docids.len();
|
|
||||||
docids -= &self.to_delete_docids;
|
|
||||||
if docids.is_empty() {
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.del_current()? };
|
|
||||||
} else if docids.len() != previous_len {
|
|
||||||
let bytes = bytes.to_owned();
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.put_current(&bytes, &docids)? };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
// Remove the documents ids from the field id word count database.
|
// Remove the documents ids from the field id word count database.
|
||||||
let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
|
Self::delete_from_db(
|
||||||
while let Some((key, mut docids)) = iter.next().transpose()? {
|
field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||||
let previous_len = docids.len();
|
&self.to_delete_docids,
|
||||||
docids -= &self.to_delete_docids;
|
)?;
|
||||||
if docids.is_empty() {
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.del_current()? };
|
|
||||||
} else if docids.len() != previous_len {
|
|
||||||
let key = key.to_owned();
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.put_current(&key, &docids)? };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? {
|
if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? {
|
||||||
let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?;
|
let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?;
|
||||||
@ -501,21 +440,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Remove the documents ids from the script language database.
|
// Remove the documents ids from the script language database.
|
||||||
let mut iter = script_language_docids.iter_mut(self.wtxn)?;
|
Self::delete_from_db(
|
||||||
while let Some((key, mut docids)) = iter.next().transpose()? {
|
script_language_docids.iter_mut(self.wtxn)?.remap_key_type(),
|
||||||
let previous_len = docids.len();
|
&self.to_delete_docids,
|
||||||
docids -= &self.to_delete_docids;
|
)?;
|
||||||
if docids.is_empty() {
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.del_current()? };
|
|
||||||
} else if docids.len() != previous_len {
|
|
||||||
let key = key.to_owned();
|
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
|
||||||
unsafe { iter.put_current(&key, &docids)? };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
drop(iter);
|
|
||||||
// We delete the documents ids that are under the facet field id values.
|
// We delete the documents ids that are under the facet field id values.
|
||||||
remove_docids_from_facet_id_exists_docids(
|
remove_docids_from_facet_id_exists_docids(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
@ -531,6 +459,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
soft_deletion_used: false,
|
soft_deletion_used: false,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn delete_from_db<C>(
|
||||||
|
mut iter: RwIter<UnalignedSlice<u8>, C>,
|
||||||
|
to_delete_docids: &RoaringBitmap,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
C: for<'a> BytesDecode<'a, DItem = RoaringBitmap>
|
||||||
|
+ for<'a> BytesEncode<'a, EItem = RoaringBitmap>,
|
||||||
|
{
|
||||||
|
while let Some(result) = iter.next() {
|
||||||
|
let (bytes, mut docids) = result?;
|
||||||
|
let previous_len = docids.len();
|
||||||
|
docids -= to_delete_docids;
|
||||||
|
if docids.is_empty() {
|
||||||
|
// safety: we don't keep references from inside the LMDB database.
|
||||||
|
unsafe { iter.del_current()? };
|
||||||
|
} else if docids.len() != previous_len {
|
||||||
|
let bytes = bytes.to_owned();
|
||||||
|
// safety: we don't keep references from inside the LMDB database.
|
||||||
|
unsafe { iter.put_current(&bytes, &docids)? };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn remove_from_word_prefix_docids(
|
fn remove_from_word_prefix_docids(
|
||||||
|
@ -7,14 +7,17 @@ use super::helpers::{
|
|||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::{DocumentId, Result};
|
use crate::{
|
||||||
|
absolute_from_relative_position, bucketed_position, relative_from_absolute_position,
|
||||||
|
DocumentId, Result,
|
||||||
|
};
|
||||||
|
|
||||||
/// Extracts the word positions and the documents ids where this word appear.
|
/// Extracts the word positions and the documents ids where this word appear.
|
||||||
///
|
///
|
||||||
/// Returns a grenad reader with the list of extracted words at positions and
|
/// Returns a grenad reader with the list of extracted words at positions and
|
||||||
/// documents ids from the given chunk of docid word positions.
|
/// documents ids from the given chunk of docid word positions.
|
||||||
#[logging_timer::time]
|
#[logging_timer::time]
|
||||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
@ -39,11 +42,15 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
|||||||
for position in read_u32_ne_bytes(value) {
|
for position in read_u32_ne_bytes(value) {
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.extend_from_slice(word_bytes);
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
|
let (fid, position) = relative_from_absolute_position(position);
|
||||||
|
let position = bucketed_position(position);
|
||||||
|
let position = absolute_from_relative_position(fid, position);
|
||||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
|
|
||||||
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(word_position_docids_sorter, indexer)
|
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
||||||
|
|
||||||
|
Ok(word_position_docids_reader)
|
||||||
}
|
}
|
||||||
|
@ -23,7 +23,7 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
|||||||
use self::extract_geo_points::extract_geo_points;
|
use self::extract_geo_points::extract_geo_points;
|
||||||
use self::extract_word_docids::extract_word_docids;
|
use self::extract_word_docids::extract_word_docids;
|
||||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||||
use self::extract_word_position_docids::extract_word_position_docids;
|
use self::extract_word_position_docids::extract_word_fid_and_position_docids;
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
||||||
GrenadParameters, MergeFn, MergeableReader,
|
GrenadParameters, MergeFn, MergeableReader,
|
||||||
@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
docid_word_positions_chunks,
|
docid_word_positions_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_word_position_docids,
|
extract_word_fid_and_position_docids,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
TypedChunk::WordPositionDocids,
|
TypedChunk::WordPositionDocids,
|
||||||
"word-position-docids",
|
"word-position-docids",
|
||||||
|
@ -8,13 +8,13 @@ use heed::{BytesDecode, BytesEncode};
|
|||||||
use log::debug;
|
use log::debug;
|
||||||
|
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::heed_codec::StrBEU32Codec;
|
use crate::heed_codec::{StrBEU16Codec, StrBEU32Codec};
|
||||||
use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
|
use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
|
||||||
use crate::update::index_documents::{
|
use crate::update::index_documents::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
|
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
|
||||||
CursorClonableMmap, MergeFn,
|
CursorClonableMmap, MergeFn,
|
||||||
};
|
};
|
||||||
use crate::{Index, Result};
|
use crate::{bucketed_position, relative_from_absolute_position, Index, Result};
|
||||||
|
|
||||||
pub struct WordPrefixPositionDocids<'t, 'u, 'i> {
|
pub struct WordPrefixPositionDocids<'t, 'u, 'i> {
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
@ -82,6 +82,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
|
|||||||
let mut prefixes_cache = HashMap::new();
|
let mut prefixes_cache = HashMap::new();
|
||||||
while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? {
|
while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? {
|
||||||
let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
|
let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
|
||||||
|
let (_fid, pos) = relative_from_absolute_position(pos);
|
||||||
|
|
||||||
current_prefixes = match current_prefixes.take() {
|
current_prefixes = match current_prefixes.take() {
|
||||||
Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes),
|
Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes),
|
||||||
@ -127,12 +128,12 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
|
|||||||
let iter = db
|
let iter = db
|
||||||
.remap_key_type::<ByteSlice>()
|
.remap_key_type::<ByteSlice>()
|
||||||
.prefix_iter(self.wtxn, prefix_bytes.as_bytes())?
|
.prefix_iter(self.wtxn, prefix_bytes.as_bytes())?
|
||||||
.remap_key_type::<StrBEU32Codec>();
|
.remap_key_type::<StrBEU16Codec>();
|
||||||
for result in iter {
|
for result in iter {
|
||||||
let ((word, pos), data) = result?;
|
let ((word, pos), data) = result?;
|
||||||
if word.starts_with(prefix) {
|
if word.starts_with(prefix) {
|
||||||
let key = (prefix, pos);
|
let key = (prefix, pos);
|
||||||
let bytes = StrBEU32Codec::bytes_encode(&key).unwrap();
|
let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
|
||||||
prefix_position_docids_sorter.insert(bytes, data)?;
|
prefix_position_docids_sorter.insert(bytes, data)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user