diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 396134436..85adaf750 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -11,6 +11,7 @@ type Word = Vec; // TODO make it be a SmallVec pub struct RawIndexer { word_limit: usize, // the maximum number of indexed words + stop_words: fst::Set, words_doc_indexes: BTreeMap>, docs_words: HashMap>, } @@ -21,13 +22,14 @@ pub struct Indexed { } impl RawIndexer { - pub fn new() -> RawIndexer { - RawIndexer::with_word_limit(1000) + pub fn new(stop_words: fst::Set) -> RawIndexer { + RawIndexer::with_word_limit(stop_words, 1000) } - pub fn with_word_limit(limit: usize) -> RawIndexer { + pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer { RawIndexer { word_limit: limit, + stop_words, words_doc_indexes: BTreeMap::new(), docs_words: HashMap::new(), } @@ -56,6 +58,7 @@ impl RawIndexer { id, attr, self.word_limit, + &self.stop_words, &mut self.words_doc_indexes, &mut self.docs_words, ); @@ -87,6 +90,7 @@ impl RawIndexer { id, attr, self.word_limit, + &self.stop_words, &mut self.words_doc_indexes, &mut self.docs_words, ); @@ -118,6 +122,7 @@ impl RawIndexer { id, attr, self.word_limit, + &self.stop_words, &mut self.words_doc_indexes, &mut self.docs_words, ); @@ -152,17 +157,12 @@ impl RawIndexer { } } -impl Default for RawIndexer { - fn default() -> Self { - Self::new() - } -} - fn index_token( token: Token, id: DocumentId, attr: SchemaAttr, word_limit: usize, + stop_words: &fst::Set, words_doc_indexes: &mut BTreeMap>, docs_words: &mut HashMap>, ) -> bool { @@ -170,16 +170,18 @@ fn index_token( return false; } - match token_to_docindex(id, attr, token) { - Some(docindex) => { - let word = Vec::from(token.word); - words_doc_indexes - .entry(word.clone()) - .or_insert_with(Vec::new) - .push(docindex); - docs_words.entry(id).or_insert_with(Vec::new).push(word); + if !stop_words.contains(&token.word) { + match token_to_docindex(id, attr, token) { + 
Some(docindex) => { + let word = Vec::from(token.word); + words_doc_indexes + .entry(word.clone()) + .or_insert_with(Vec::new) + .push(docindex); + docs_words.entry(id).or_insert_with(Vec::new).push(word); + } + None => return false, } - None => return false, } true @@ -207,7 +209,7 @@ mod tests { #[test] fn strange_apostrophe() { - let mut indexer = RawIndexer::new(); + let mut indexer = RawIndexer::new(fst::Set::default()); let docid = DocumentId(0); let attr = SchemaAttr(0); @@ -231,7 +233,7 @@ mod tests { #[test] fn strange_apostrophe_in_sequence() { - let mut indexer = RawIndexer::new(); + let mut indexer = RawIndexer::new(fst::Set::default()); let docid = DocumentId(0); let attr = SchemaAttr(0); @@ -252,4 +254,33 @@ mod tests { .get(&"l’éteindre".to_owned().into_bytes()) .is_some()); } + + #[test] + fn basic_stop_words() { + let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]); + let stop_words = fst::Set::from_iter(stop_words).unwrap(); + + let mut indexer = RawIndexer::new(stop_words); + + let docid = DocumentId(0); + let attr = SchemaAttr(0); + let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !"; + indexer.index_text(docid, attr, text); + + let Indexed { + words_doc_indexes, .. + } = indexer.build(); + + assert!(words_doc_indexes.get(&b"l"[..]).is_none()); + assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); + assert!(words_doc_indexes.get(&b"j"[..]).is_none()); + assert!(words_doc_indexes.get(&b"ai"[..]).is_none()); + assert!(words_doc_indexes.get(&b"de"[..]).is_none()); + assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); + + // with the ugly apostrophe... 
+ assert!(words_doc_indexes + .get(&"l’éteindre".to_owned().into_bytes()) + .is_some()); + } } diff --git a/meilidb-core/src/store/main.rs b/meilidb-core/src/store/main.rs index 416fcfe37..dca995759 100644 --- a/meilidb-core/src/store/main.rs +++ b/meilidb-core/src/store/main.rs @@ -9,6 +9,7 @@ const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents"; const RANKED_MAP_KEY: &str = "ranked-map"; const SCHEMA_KEY: &str = "schema"; const SYNONYMS_KEY: &str = "synonyms"; +const STOP_WORDS_KEY: &str = "stop-words"; const WORDS_KEY: &str = "words"; #[derive(Copy, Clone)] @@ -71,6 +72,24 @@ impl Main { } } + pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> { + let bytes = fst.as_fst().as_bytes(); + self.main + .put::(writer, STOP_WORDS_KEY, bytes) + } + + pub fn stop_words_fst(self, reader: &heed::RoTxn) -> ZResult> { + match self.main.get::(reader, STOP_WORDS_KEY)? { + Some(bytes) => { + let len = bytes.len(); + let bytes = Arc::from(bytes); + let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); + Ok(Some(fst::Set::from(fst))) + } + None => Ok(None), + } + } + pub fn put_number_of_documents(self, writer: &mut heed::RwTxn, f: F) -> ZResult where F: Fn(u64) -> u64, diff --git a/meilidb-core/src/store/mod.rs b/meilidb-core/src/store/mod.rs index 4909ce77b..3198f455f 100644 --- a/meilidb-core/src/store/mod.rs +++ b/meilidb-core/src/store/mod.rs @@ -187,6 +187,22 @@ impl Index { ) } + pub fn stop_words_addition(&self) -> update::StopWordsAddition { + update::StopWordsAddition::new( + self.updates, + self.updates_results, + self.updates_notifier.clone(), + ) + } + + pub fn stop_words_deletion(&self) -> update::StopWordsDeletion { + update::StopWordsDeletion::new( + self.updates, + self.updates_results, + self.updates_notifier.clone(), + ) + } + pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult> { match self.updates.last_update_id(reader)? 
{ Some((id, _)) => Ok(Some(id)), diff --git a/meilidb-core/src/update/documents_addition.rs b/meilidb-core/src/update/documents_addition.rs index 79387600c..17e55527d 100644 --- a/meilidb-core/src/update/documents_addition.rs +++ b/meilidb-core/src/update/documents_addition.rs @@ -87,7 +87,6 @@ pub fn apply_documents_addition( addition: Vec, ) -> MResult<()> { let mut documents_additions = HashMap::new(); - let mut indexer = RawIndexer::new(); let schema = match main_store.schema(writer)? { Some(schema) => schema, @@ -124,7 +123,14 @@ pub fn apply_documents_addition( None => RankedMap::default(), }; + let stop_words = match main_store.stop_words_fst(writer)? { + Some(stop_words) => stop_words, + None => fst::Set::default(), + }; + // 3. index the documents fields in the stores + let mut indexer = RawIndexer::new(stop_words); + for (document_id, document) in documents_additions { let serializer = Serializer { txn: writer, @@ -180,8 +186,13 @@ pub fn reindex_all_documents( postings_lists_store.clear(writer)?; docs_words_store.clear(writer)?; + let stop_words = match main_store.stop_words_fst(writer)? { + Some(stop_words) => stop_words, + None => fst::Set::default(), + }; + // 3. 
re-index one document by one document (otherwise we make the borrow checker unhappy) - let mut indexer = RawIndexer::new(); + let mut indexer = RawIndexer::new(stop_words); let mut ram_store = HashMap::new(); for document_id in documents_ids_to_reindex { diff --git a/meilidb-core/src/update/mod.rs b/meilidb-core/src/update/mod.rs index 82290cb4f..755df56c0 100644 --- a/meilidb-core/src/update/mod.rs +++ b/meilidb-core/src/update/mod.rs @@ -3,6 +3,8 @@ mod customs_update; mod documents_addition; mod documents_deletion; mod schema_update; +mod stop_words_addition; +mod stop_words_deletion; mod synonyms_addition; mod synonyms_deletion; @@ -11,11 +13,13 @@ pub use self::customs_update::{apply_customs_update, push_customs_update}; pub use self::documents_addition::{apply_documents_addition, DocumentsAddition}; pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion}; pub use self::schema_update::{apply_schema_update, push_schema_update}; +pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition}; +pub use self::stop_words_deletion::{apply_stop_words_deletion, StopWordsDeletion}; pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition}; pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion}; use std::cmp; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; use std::time::{Duration, Instant}; use heed::Result as ZResult; @@ -34,6 +38,8 @@ pub enum Update { DocumentsDeletion(Vec), SynonymsAddition(BTreeMap>), SynonymsDeletion(BTreeMap>>), + StopWordsAddition(BTreeSet), + StopWordsDeletion(BTreeSet), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -45,6 +51,8 @@ pub enum UpdateType { DocumentsDeletion { number: usize }, SynonymsAddition { number: usize }, SynonymsDeletion { number: usize }, + StopWordsAddition { number: usize }, + StopWordsDeletion { number: usize }, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -210,6 +218,37 @@ pub fn 
update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult { + let start = Instant::now(); + + let update_type = UpdateType::StopWordsAddition { + number: stop_words.len(), + }; + + let result = + apply_stop_words_addition(writer, index.main, index.postings_lists, stop_words); + + (update_type, result, start.elapsed()) + } + Update::StopWordsDeletion(stop_words) => { + let start = Instant::now(); + + let update_type = UpdateType::StopWordsDeletion { + number: stop_words.len(), + }; + + let result = apply_stop_words_deletion( + writer, + index.main, + index.documents_fields, + index.documents_fields_counts, + index.postings_lists, + index.docs_words, + stop_words, + ); + (update_type, result, start.elapsed()) } }; diff --git a/meilidb-core/src/update/stop_words_addition.rs b/meilidb-core/src/update/stop_words_addition.rs new file mode 100644 index 000000000..6adba450b --- /dev/null +++ b/meilidb-core/src/update/stop_words_addition.rs @@ -0,0 +1,116 @@ +use std::collections::BTreeSet; + +use fst::{set::OpBuilder, SetBuilder}; + +use crate::automaton::normalize_str; +use crate::update::{next_update_id, Update}; +use crate::{store, MResult}; + +pub struct StopWordsAddition { + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, + stop_words: BTreeSet, +} + +impl StopWordsAddition { + pub fn new( + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, + ) -> StopWordsAddition { + StopWordsAddition { + updates_store, + updates_results_store, + updates_notifier, + stop_words: BTreeSet::new(), + } + } + + pub fn add_stop_word>(&mut self, stop_word: S) { + let stop_word = normalize_str(stop_word.as_ref()); + self.stop_words.insert(stop_word); + } + + pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult { + let _ = self.updates_notifier.send(()); + let update_id = push_stop_words_addition( + writer, + 
self.updates_store, + self.updates_results_store, + self.stop_words, + )?; + Ok(update_id) + } +} + +pub fn push_stop_words_addition( + writer: &mut heed::RwTxn, + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + addition: BTreeSet, +) -> MResult { + let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; + + let update = Update::StopWordsAddition(addition); + updates_store.put_update(writer, last_update_id, &update)?; + + Ok(last_update_id) +} + +pub fn apply_stop_words_addition( + writer: &mut heed::RwTxn, + main_store: store::Main, + postings_lists_store: store::PostingsLists, + addition: BTreeSet, +) -> MResult<()> { + let mut stop_words_builder = SetBuilder::memory(); + + for word in addition { + stop_words_builder.insert(&word).unwrap(); + // we remove every posting list associated with a new stop word + postings_lists_store.del_postings_list(writer, word.as_bytes())?; + } + + // create the new delta stop words fst + let delta_stop_words = stop_words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + // we also need to remove all the stop words from the main fst + if let Some(word_fst) = main_store.words_fst(writer)? 
{ + let op = OpBuilder::new() + .add(&word_fst) + .add(&delta_stop_words) + .difference(); + + let mut word_fst_builder = SetBuilder::memory(); + word_fst_builder.extend_stream(op).unwrap(); + let word_fst = word_fst_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + main_store.put_words_fst(writer, &word_fst)?; + } + + // now we add all of these stop words to the main store + let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default(); + + let op = OpBuilder::new() + .add(&stop_words_fst) + .add(&delta_stop_words) + .r#union(); + + let mut stop_words_builder = SetBuilder::memory(); + stop_words_builder.extend_stream(op).unwrap(); + let stop_words_fst = stop_words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + main_store.put_stop_words_fst(writer, &stop_words_fst)?; + + Ok(()) +} diff --git a/meilidb-core/src/update/stop_words_deletion.rs b/meilidb-core/src/update/stop_words_deletion.rs new file mode 100644 index 000000000..11c72ded9 --- /dev/null +++ b/meilidb-core/src/update/stop_words_deletion.rs @@ -0,0 +1,112 @@ +use std::collections::BTreeSet; + +use fst::{set::OpBuilder, SetBuilder}; + +use crate::automaton::normalize_str; +use crate::update::documents_addition::reindex_all_documents; +use crate::update::{next_update_id, Update}; +use crate::{store, MResult}; + +pub struct StopWordsDeletion { + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, + stop_words: BTreeSet, +} + +impl StopWordsDeletion { + pub fn new( + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + updates_notifier: crossbeam_channel::Sender<()>, + ) -> StopWordsDeletion { + StopWordsDeletion { + updates_store, + updates_results_store, + updates_notifier, + stop_words: BTreeSet::new(), + } + } + + pub fn delete_stop_word>(&mut self, stop_word: S) { + let stop_word = normalize_str(stop_word.as_ref()); + 
self.stop_words.insert(stop_word); + } + + pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult { + let _ = self.updates_notifier.send(()); + let update_id = push_stop_words_deletion( + writer, + self.updates_store, + self.updates_results_store, + self.stop_words, + )?; + Ok(update_id) + } +} + +pub fn push_stop_words_deletion( + writer: &mut heed::RwTxn, + updates_store: store::Updates, + updates_results_store: store::UpdatesResults, + deletion: BTreeSet, +) -> MResult { + let last_update_id = next_update_id(writer, updates_store, updates_results_store)?; + + let update = Update::StopWordsDeletion(deletion); + updates_store.put_update(writer, last_update_id, &update)?; + + Ok(last_update_id) +} + +pub fn apply_stop_words_deletion( + writer: &mut heed::RwTxn, + main_store: store::Main, + documents_fields_store: store::DocumentsFields, + documents_fields_counts_store: store::DocumentsFieldsCounts, + postings_lists_store: store::PostingsLists, + docs_words_store: store::DocsWords, + deletion: BTreeSet, +) -> MResult<()> { + let mut stop_words_builder = SetBuilder::memory(); + + for word in deletion { + stop_words_builder.insert(&word).unwrap(); + } + + // create the new delta stop words fst + let delta_stop_words = stop_words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + // now we delete all of these stop words from the main store + let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default(); + + let op = OpBuilder::new() + .add(&stop_words_fst) + .add(&delta_stop_words) + .difference(); + + let mut stop_words_builder = SetBuilder::memory(); + stop_words_builder.extend_stream(op).unwrap(); + let stop_words_fst = stop_words_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + main_store.put_stop_words_fst(writer, &stop_words_fst)?; + + // now that we have set up the stop words + // let's reindex everything... 
+ reindex_all_documents( + writer, + main_store, + documents_fields_store, + documents_fields_counts_store, + postings_lists_store, + docs_words_store, + )?; + + Ok(()) +}