diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 9d60d59ca..ba1e6b74e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -36,7 +36,7 @@ use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, - WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, + WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; @@ -373,6 +373,7 @@ where let mut final_documents_ids = RoaringBitmap::new(); let mut word_pair_proximity_docids = None; let mut word_position_docids = None; + let mut word_fid_docids = None; let mut word_docids = None; let mut exact_word_docids = None; @@ -406,6 +407,11 @@ where word_position_docids = Some(cloneable_chunk); TypedChunk::WordPositionDocids(chunk) } + TypedChunk::WordFidDocids(chunk) => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + word_fid_docids = Some(cloneable_chunk); + TypedChunk::WordFidDocids(chunk) + } otherwise => otherwise, }; @@ -449,6 +455,7 @@ where exact_word_docids, word_pair_proximity_docids, word_position_docids, + word_fid_docids, )?; Ok(all_documents_ids.len()) @@ -461,6 +468,7 @@ where exact_word_docids: Option>, word_pair_proximity_docids: Option>, word_position_docids: Option>, + word_fid_docids: Option>, ) -> Result<()> where FP: Fn(UpdateIndexingStep) + Sync, @@ -595,17 +603,16 @@ where if let Some(word_position_docids) = word_position_docids { // Run the words prefix position docids update operation. - let mut builder = WordPrefixPositionDocids::new(self.wtxn, self.index); + let mut builder = WordPrefixIntegerDocids::new( + self.wtxn, + self.index.word_prefix_position_docids, + self.index.word_position_docids, + ); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; builder.chunk_compression_level = self.indexer_config.chunk_compression_level; builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; - if let Some(value) = self.config.words_positions_level_group_size { - builder.level_group_size(value); - } - if let Some(value) = self.config.words_positions_min_level_size { - builder.min_level_size(value); - } + builder.execute( word_position_docids, &new_prefix_fst_words, @@ -613,6 +620,24 @@ where &del_prefix_fst_words, )?; } + if let Some(word_fid_docids) = word_fid_docids { + // Run the words prefix fid docids update operation. + let mut builder = WordPrefixIntegerDocids::new( + self.wtxn, + self.index.word_prefix_fid_docids, + self.index.word_fid_docids, + ); + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + builder.max_nb_chunks = self.indexer_config.max_nb_chunks; + builder.max_memory = self.indexer_config.max_memory; + builder.execute( + word_fid_docids, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } if (self.should_abort)() { return Err(Error::InternalError(InternalError::AbortedIndexation)); diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 948811a6b..7a3fd1fd9 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -14,7 +14,7 @@ pub use self::prefix_word_pairs::{ pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; -pub use self::words_prefix_position_docids::WordPrefixPositionDocids; +pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids; pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; @@ -27,5 +27,5 @@ mod prefix_word_pairs; mod settings; mod update_step; mod word_prefix_docids; -mod words_prefix_position_docids; +mod words_prefix_integer_docids; mod words_prefixes_fst; diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_integer_docids.rs similarity index 71% rename from milli/src/update/words_prefix_position_docids.rs rename to milli/src/update/words_prefix_integer_docids.rs index b09555264..63ca178ef 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -1,10 +1,9 @@ use std::collections::{HashMap, HashSet}; -use std::num::NonZeroU32; -use std::{cmp, str}; +use std::str; use grenad::CompressionType; use heed::types::ByteSlice; -use heed::{BytesDecode, BytesEncode}; +use heed::{BytesDecode, BytesEncode, Database}; use log::debug; use crate::error::SerializationError; @@ -14,57 +13,46 @@ use crate::update::index_documents::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, CursorClonableMmap, MergeFn, }; -use crate::{Index, Result}; +use crate::{CboRoaringBitmapCodec, Result}; -pub struct WordPrefixPositionDocids<'t, 'u, 'i> { +pub struct WordPrefixIntegerDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, + prefix_database: Database, + word_database: Database, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, - level_group_size: NonZeroU32, - min_level_size: NonZeroU32, } -impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { +impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordPrefixPositionDocids<'t, 'u, 'i> { - WordPrefixPositionDocids { + prefix_database: Database, + word_database: Database, + ) -> WordPrefixIntegerDocids<'t, 'u, 'i> { + WordPrefixIntegerDocids { wtxn, - index, + prefix_database, + word_database, chunk_compression_type: CompressionType::None, chunk_compression_level: None, max_nb_chunks: None, max_memory: None, - level_group_size: NonZeroU32::new(4).unwrap(), - min_level_size: NonZeroU32::new(5).unwrap(), } } - pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { - self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); - self - } - - pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { - self.min_level_size = value; - self - } - - #[logging_timer::time("WordPrefixPositionDocids::{}")] + #[logging_timer::time("WordPrefixIntegerDocids::{}")] pub fn execute( self, - new_word_position_docids: grenad::Reader, + new_word_integer_docids: grenad::Reader, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { - debug!("Computing and writing the word levels positions docids into LMDB on disk..."); + debug!("Computing and writing the word levels integers docids into LMDB on disk..."); - let mut prefix_position_docids_sorter = create_sorter( + let mut prefix_integer_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, self.chunk_compression_type, @@ -73,14 +61,14 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { self.max_memory, ); - let mut new_word_position_docids_iter = new_word_position_docids.into_cursor()?; + let mut new_word_integer_docids_iter = new_word_integer_docids.into_cursor()?; if !common_prefix_fst_words.is_empty() { // We fetch all the new common prefixes between the previous and new prefix fst. let mut buffer = Vec::new(); let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { + while let Some((key, data)) = new_word_integer_docids_iter.move_on_next()? { let (word, pos) = StrBEU16Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; current_prefixes = match current_prefixes.take() { @@ -88,7 +76,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { _otherwise => { write_prefixes_in_sorter( &mut prefixes_cache, - &mut prefix_position_docids_sorter, + &mut prefix_integer_docids_sorter, )?; common_prefix_fst_words .iter() @@ -101,6 +89,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { if word.starts_with(prefix) { buffer.clear(); buffer.extend_from_slice(prefix.as_bytes()); + buffer.push(0); buffer.extend_from_slice(&pos.to_be_bytes()); match prefixes_cache.get_mut(&buffer) { Some(value) => value.push(data.to_owned()), @@ -113,11 +102,11 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { } } - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_integer_docids_sorter)?; } // We fetch the docids associated to the newly added word prefix fst only. - let db = self.index.word_position_docids.remap_data_type::(); + let db = self.word_database.remap_data_type::(); for prefix_bytes in new_prefix_fst_words { let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| { SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } @@ -133,19 +122,18 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { if word.starts_with(prefix) { let key = (prefix, pos); let bytes = StrBEU16Codec::bytes_encode(&key).unwrap(); - prefix_position_docids_sorter.insert(bytes, data)?; + prefix_integer_docids_sorter.insert(bytes, data)?; } } } - // We remove all the entries that are no more required in this word prefix position + // We remove all the entries that are no more required in this word prefix integer // docids database. - // We also avoid iterating over the whole `word_prefix_position_docids` database if we know in + // We also avoid iterating over the whole `word_prefix_integer_docids` database if we know in // advance that the `if del_prefix_fst_words.contains(prefix.as_bytes()) {` condition below // will always be false (i.e. if `del_prefix_fst_words` is empty). if !del_prefix_fst_words.is_empty() { - let mut iter = - self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + let mut iter = self.prefix_database.iter_mut(self.wtxn)?.lazily_decode_data(); while let Some(((prefix, _), _)) = iter.next().transpose()? { if del_prefix_fst_words.contains(prefix.as_bytes()) { unsafe { iter.del_current()? }; @@ -154,11 +142,11 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { drop(iter); } - // We finally write all the word prefix position docids into the LMDB database. + // We finally write all the word prefix integer docids into the LMDB database. sorter_into_lmdb_database( self.wtxn, - *self.index.word_prefix_position_docids.as_polymorph(), - prefix_position_docids_sorter, + *self.prefix_database.as_polymorph(), + prefix_integer_docids_sorter, merge_cbo_roaring_bitmaps, )?;