diff --git a/milli/src/index.rs b/milli/src/index.rs index 3d6d954f0..80f62f684 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -61,6 +61,7 @@ pub mod db_name { pub const WORD_DOCIDS: &str = "word-docids"; pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; + pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; @@ -91,6 +92,9 @@ pub struct Index { /// A prefix of word and all the documents ids containing this prefix. pub word_prefix_docids: Database, + /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. + pub exact_word_prefix_docids: Database, + /// Maps a word and a document id (u32) to all the positions where the given word appears. 
pub docid_word_positions: Database, @@ -124,7 +128,7 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(15); + options.max_dbs(16); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -132,6 +136,7 @@ impl Index { let word_docids = env.create_database(Some(WORD_DOCIDS))?; let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; + let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let word_prefix_pair_proximity_docids = @@ -154,6 +159,7 @@ impl Index { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 57c0969c7..3665d2313 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -21,6 +21,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, @@ -58,6 +59,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; + exact_word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 46a4721c0..58c4d4f70 100644 --- a/milli/src/update/delete_documents.rs +++ 
b/milli/src/update/delete_documents.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::{BytesDecode, BytesEncode}; +use heed::{BytesDecode, BytesEncode, Database}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -113,6 +113,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, field_id_word_count_docids, @@ -254,34 +255,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We write the new words FST into the main database. self.index.put_words_fst(self.wtxn, &new_words_fst)?; - // We iterate over the word prefix docids database and remove the deleted documents ids - // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. - let mut prefixes_to_delete = fst::SetBuilder::memory(); - let mut iter = word_prefix_docids.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (prefix, mut docids) = result?; - let prefix = prefix.to_owned(); - let previous_len = docids.len(); - docids -= &self.documents_ids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - prefixes_to_delete.insert(prefix)?; - } else if docids.len() != previous_len { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&prefix, &docids)? }; - } - } + let prefixes_to_delete = + remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?; - drop(iter); + let exact_prefix_to_delete = remove_from_word_prefix_docids( + self.wtxn, + exact_word_prefix_docids, + &self.documents_ids, + )?; + + let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); // We compute the new prefix FST and write it only if there is a change. 
- let prefixes_to_delete = prefixes_to_delete.into_set(); - if !prefixes_to_delete.is_empty() { + if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() { let new_words_prefixes_fst = { // We retrieve the current words prefixes FST from the database. let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?; - let difference = words_prefixes_fst.op().add(&prefixes_to_delete).difference(); + let difference = + words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference(); // We stream the new external ids that does no more contains the to-delete external ids. let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory(); @@ -457,6 +448,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } +fn remove_from_word_prefix_docids( + txn: &mut heed::RwTxn, + db: &Database, + to_remove: &RoaringBitmap, +) -> Result>> { + let mut prefixes_to_delete = fst::SetBuilder::memory(); + + // We iterate over the word prefix docids database and remove the deleted documents ids + // from every docids list. We register the empty prefixes in an fst Set for future deletion. + let mut iter = db.iter_mut(txn)?; + while let Some(result) = iter.next() { + let (prefix, mut docids) = result?; + let prefix = prefix.to_owned(); + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + prefixes_to_delete.insert(prefix)?; + } else if docids.len() != previous_len { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&prefix, &docids)? }; + } + } + + drop(iter); + + Ok(prefixes_to_delete.into_set()) +} + fn remove_from_word_docids( txn: &mut heed::RwTxn, db: &heed::Database,