From 9b2653427ded198a8d744e112dba68a93470dd51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 23 Mar 2023 09:22:01 +0100 Subject: [PATCH] Split position DB into fid and relative position DB --- milli/src/heed_codec/mod.rs | 2 +- milli/src/heed_codec/str_beu32_codec.rs | 34 ++++ milli/src/index.rs | 21 ++- milli/src/lib.rs | 17 ++ milli/src/search/criteria/attribute.rs | 17 +- milli/src/search/criteria/mod.rs | 12 +- milli/src/update/clear_documents.rs | 4 + milli/src/update/delete_documents.rs | 162 ++++++------------ .../extract/extract_word_position_docids.rs | 15 +- .../src/update/index_documents/extract/mod.rs | 4 +- .../update/words_prefix_position_docids.rs | 9 +- 11 files changed, 162 insertions(+), 135 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index a4df63e22..b7a8c3c88 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -21,5 +21,5 @@ pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; pub use self::script_language_codec::ScriptLanguageCodec; -pub use self::str_beu32_codec::StrBEU32Codec; +pub use self::str_beu32_codec::{StrBEU32Codec, StrBEU16Codec}; pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; diff --git a/milli/src/heed_codec/str_beu32_codec.rs b/milli/src/heed_codec/str_beu32_codec.rs index d1f379bdc..17f3c996f 100644 --- a/milli/src/heed_codec/str_beu32_codec.rs +++ b/milli/src/heed_codec/str_beu32_codec.rs @@ -36,3 +36,37 @@ impl<'a> heed::BytesEncode<'a> for StrBEU32Codec { Some(Cow::Owned(bytes)) } } + +pub struct StrBEU16Codec; + +impl<'a> heed::BytesDecode<'a> for StrBEU16Codec { + type DItem = (&'a str, u16); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let footer_len = size_of::(); + + if bytes.len() < footer_len { + return None; + } + + let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let word = str::from_utf8(word).ok()?; + let pos = bytes.try_into().map(u16::from_be_bytes).ok()?; + + Some((word, pos)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrBEU16Codec { + type EItem = (&'a str, u16); + + fn bytes_encode((word, pos): &Self::EItem) -> Option> { + let pos = pos.to_be_bytes(); + + let mut bytes = Vec::with_capacity(word.len() + pos.len()); + bytes.extend_from_slice(word.as_bytes()); + bytes.extend_from_slice(&pos[..]); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index a4048dfb0..7848ddf5a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -19,12 +19,12 @@ use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, OrderedF64Codec, }; -use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec}; +use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec}; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, - Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32, + Search, U8StrStrCodec, BEU16, BEU32, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -76,7 +76,9 @@ pub mod db_name { pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; + pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids"; pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; + pub const WORD_PREFIX_FIELD_ID_DOCIDS: &str = "word-prefix-field-id-docids"; pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; @@ -118,11 +120,16 @@ pub struct Index { pub prefix_word_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. - pub word_position_docids: Database, + pub word_position_docids: Database, + /// Maps the word and the field id with the docids that corresponds to it. + pub word_fid_docids: Database, + /// Maps the field id and the word count with the docids that corresponds to it. pub field_id_word_count_docids: Database, /// Maps the position of a word prefix with all the docids where this prefix appears. - pub word_prefix_position_docids: Database, + pub word_prefix_position_docids: Database, + /// Maps the word and the field id with the docids that corresponds to it. + pub word_prefix_fid_docids: Database, /// Maps the script and language with all the docids that corresponds to it. pub script_language_docids: Database, @@ -153,7 +160,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(19); + options.max_dbs(21); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -170,8 +177,10 @@ impl Index { let prefix_word_pair_proximity_docids = env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; + let word_fid_docids = env.create_database(Some(WORD_FIELD_ID_DOCIDS))?; let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; + let word_prefix_fid_docids = env.create_database(Some(WORD_PREFIX_FIELD_ID_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; @@ -196,7 +205,9 @@ impl Index { word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids, word_position_docids, + word_fid_docids, word_prefix_position_docids, + word_prefix_fid_docids, field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index b256192bd..a62c344f9 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -152,6 +152,23 @@ pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, Relative pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { (field_id as u32) << 16 | (relative as u32) } +// TODO: this is wrong, but will do for now +/// Compute the "bucketed" absolute position from the field id and relative position in the field. +/// +/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger. +pub fn bucketed_position(relative: u16) -> u16 { + // The first few relative positions are kept intact. + if relative < 16 { + relative + } else if relative < 24 { + // Relative positions between 16 and 24 all become equal to 24 + 24 + } else { + // Then, groups of positions that have the same base-2 logarithm are reduced to + // the same relative position: the smallest power of 2 that is greater than them + (relative as f64).log2().ceil().exp2() as u16 + } +} /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 5b33fdf54..322f6e051 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -199,7 +199,7 @@ impl<'t> Criterion for Attribute<'t> { struct QueryPositionIterator<'t> { #[allow(clippy::type_complexity)] inner: - Vec> + 't>>>, + Vec> + 't>>>, } impl<'t> QueryPositionIterator<'t> { @@ -241,7 +241,7 @@ impl<'t> QueryPositionIterator<'t> { } impl<'t> Iterator for QueryPositionIterator<'t> { - type Item = heed::Result<(u32, RoaringBitmap)>; + type Item = heed::Result<(u16, RoaringBitmap)>; fn next(&mut self) -> Option { // sort inner words from the closest next position to the farthest next position. @@ -281,9 +281,9 @@ impl<'t> Iterator for QueryPositionIterator<'t> { /// A Branch is represent a possible alternative of the original query and is build with the Query Tree, /// This branch allows us to iterate over meta-interval of positions. struct Branch<'t> { - query_level_iterator: Vec<(u32, RoaringBitmap, Peekable>)>, - last_result: (u32, RoaringBitmap), - branch_size: u32, + query_level_iterator: Vec<(u16, RoaringBitmap, Peekable>)>, + last_result: (u16, RoaringBitmap), + branch_size: u16, } impl<'t> Branch<'t> { @@ -303,7 +303,7 @@ impl<'t> Branch<'t> { let mut branch = Self { query_level_iterator, last_result: (0, RoaringBitmap::new()), - branch_size: flatten_branch.len() as u32, + branch_size: flatten_branch.len() as u16, }; branch.update_last_result(); @@ -342,7 +342,7 @@ impl<'t> Branch<'t> { Some(result) => { result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0) } - None => u32::MAX, + None => u16::MAX, } } }) @@ -378,7 +378,8 @@ impl<'t> Branch<'t> { fn compute_rank(&self) -> u32 { // we compute a rank from the position. let (pos, _) = self.last_result; - pos.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size + pos.saturating_sub((0..self.branch_size).sum()) as u32 * LCM_10_FIRST_NUMBERS + / self.branch_size as u32 } fn cmp(&self, other: &Self) -> Ordering { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 0c1c8add1..5e491672f 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -171,7 +171,7 @@ pub trait Context<'c> { &self, word: &str, in_prefix_cache: bool, - ) -> heed::Result> + 'c>>; + ) -> heed::Result> + 'c>>; fn synonyms(&self, word: &str) -> heed::Result>>>; fn searchable_fields_ids(&self) -> Result>; fn field_id_word_count_docids( @@ -322,11 +322,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { &self, word: &str, in_prefix_cache: bool, - ) -> heed::Result> + 'c>> + ) -> heed::Result> + 'c>> { let range = { - let left = u32::min_value(); - let right = u32::max_value(); + let left = u16::min_value(); // TODO: this is wrong + let right = u16::max_value(); // TODO: this is wrong let left = (word, left); let right = (word, right); left..=right @@ -360,7 +360,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { } fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result> { - let key = (word, pos); + let key = (word, pos as u16); // TODO: this is wrong self.index.word_position_docids.get(self.rtxn, &key) } } @@ -899,7 +899,7 @@ pub mod test { _word: &str, _in_prefix_cache: bool, ) -> heed::Result< - Box> + 'c>, + Box> + 'c>, > { todo!() } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 0296bc192..c9de4d9ab 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -28,8 +28,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids, word_position_docids, + word_fid_docids, field_id_word_count_docids, word_prefix_position_docids, + word_prefix_fid_docids, script_language_docids, facet_id_f64_docids, facet_id_string_docids, @@ -81,8 +83,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_pair_proximity_docids.clear(self.wtxn)?; prefix_word_pair_proximity_docids.clear(self.wtxn)?; word_position_docids.clear(self.wtxn)?; + word_fid_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; word_prefix_position_docids.clear(self.wtxn)?; + word_prefix_fid_docids.clear(self.wtxn)?; script_language_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; facet_id_exists_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index eeb67b829..47a7bde4c 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,8 +2,8 @@ use std::collections::btree_map::Entry; use std::collections::{HashMap, HashSet}; use fst::IntoStreamer; -use heed::types::{ByteSlice, DecodeIgnore, Str}; -use heed::Database; +use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice}; +use heed::{BytesDecode, BytesEncode, Database, RwIter}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; @@ -239,6 +239,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, + word_fid_docids, + word_prefix_fid_docids, facet_id_f64_docids: _, facet_id_string_docids: _, field_id_docid_facet_f64s: _, @@ -361,97 +363,34 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { // We delete the documents ids from the word prefix pair proximity database docids // and remove the empty pairs too. - let db = db.remap_key_type::(); - let mut iter = db.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (key, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } + Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?; } - - // We delete the documents ids that are under the pairs of words, - // it is faster and use no memory to iterate over all the words pairs than - // to compute the cartesian product of every words of the deleted documents. - let mut iter = - word_pair_proximity_docids.remap_key_type::().iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? }; - } - } - - drop(iter); - - // We delete the documents ids that are under the word level position docids. - let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? }; - } - } - - drop(iter); - - // We delete the documents ids that are under the word prefix level position docids. - let mut iter = - word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &docids)? }; - } - } - - drop(iter); + Self::delete_from_db( + word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; + Self::delete_from_db( + word_position_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; + Self::delete_from_db( + word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; + Self::delete_from_db( + word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; + Self::delete_from_db( + word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; // Remove the documents ids from the field id word count database. - let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?; - while let Some((key, mut docids)) = iter.next().transpose()? { - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } - - drop(iter); + Self::delete_from_db( + field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; @@ -501,21 +440,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // Remove the documents ids from the script language database. - let mut iter = script_language_docids.iter_mut(self.wtxn)?; - while let Some((key, mut docids)) = iter.next().transpose()? { - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } - - drop(iter); + Self::delete_from_db( + script_language_docids.iter_mut(self.wtxn)?.remap_key_type(), + &self.to_delete_docids, + )?; // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_id_exists_docids( self.wtxn, @@ -531,6 +459,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { soft_deletion_used: false, }) } + + fn delete_from_db( + mut iter: RwIter, C>, + to_delete_docids: &RoaringBitmap, + ) -> Result<()> + where + C: for<'a> BytesDecode<'a, DItem = RoaringBitmap> + + for<'a> BytesEncode<'a, EItem = RoaringBitmap>, + { + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids -= to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &docids)? }; + } + } + Ok(()) + } } fn remove_from_word_prefix_docids( diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index d95db4157..cd3ec691b 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -7,14 +7,17 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{DocumentId, Result}; +use crate::{ + absolute_from_relative_position, bucketed_position, relative_from_absolute_position, + DocumentId, Result, +}; /// Extracts the word positions and the documents ids where this word appear. /// /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_word_position_docids( +pub fn extract_word_fid_and_position_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { @@ -39,11 +42,15 @@ pub fn extract_word_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); + let (fid, position) = relative_from_absolute_position(position); + let position = bucketed_position(position); + let position = absolute_from_relative_position(fid, position); key_buffer.extend_from_slice(&position.to_be_bytes()); - word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } } - sorter_into_reader(word_position_docids_sorter, indexer) + let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?; + + Ok(word_position_docids_reader) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c0f07cf79..844efed36 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -23,7 +23,7 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; -use self::extract_word_position_docids::extract_word_position_docids; +use self::extract_word_position_docids::extract_word_fid_and_position_docids; use super::helpers::{ as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, @@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents( docid_word_positions_chunks, indexer, lmdb_writer_sx.clone(), - extract_word_position_docids, + extract_word_fid_and_position_docids, merge_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 6f12dde38..0822d0d26 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -8,13 +8,13 @@ use heed::{BytesDecode, BytesEncode}; use log::debug; use crate::error::SerializationError; -use crate::heed_codec::StrBEU32Codec; +use crate::heed_codec::{StrBEU16Codec, StrBEU32Codec}; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, CursorClonableMmap, MergeFn, }; -use crate::{Index, Result}; +use crate::{bucketed_position, relative_from_absolute_position, Index, Result}; pub struct WordPrefixPositionDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -82,6 +82,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { let mut prefixes_cache = HashMap::new(); while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + let (_fid, pos) = relative_from_absolute_position(pos); current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), @@ -127,12 +128,12 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { let iter = db .remap_key_type::() .prefix_iter(self.wtxn, prefix_bytes.as_bytes())? - .remap_key_type::(); + .remap_key_type::(); for result in iter { let ((word, pos), data) = result?; if word.starts_with(prefix) { let key = (prefix, pos); - let bytes = StrBEU32Codec::bytes_encode(&key).unwrap(); + let bytes = StrBEU16Codec::bytes_encode(&key).unwrap(); prefix_position_docids_sorter.insert(bytes, data)?; } }