Merge pull request #244 from meilisearch/reintroduce-stop-words

Reintroduce stop words
Clément Renault 2019-10-29 16:35:03 +01:00 committed by GitHub
commit 41065305aa
7 changed files with 367 additions and 23 deletions

View File

@@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
 pub struct RawIndexer {
     word_limit: usize, // the maximum number of indexed words
+    stop_words: fst::Set,
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
 }
@@ -21,13 +22,14 @@ pub struct Indexed {
 }
 
 impl RawIndexer {
-    pub fn new() -> RawIndexer {
-        RawIndexer::with_word_limit(1000)
+    pub fn new(stop_words: fst::Set) -> RawIndexer {
+        RawIndexer::with_word_limit(stop_words, 1000)
     }
 
-    pub fn with_word_limit(limit: usize) -> RawIndexer {
+    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
         RawIndexer {
             word_limit: limit,
+            stop_words,
             words_doc_indexes: BTreeMap::new(),
             docs_words: HashMap::new(),
         }
@@ -56,6 +58,7 @@ impl RawIndexer {
                 id,
                 attr,
                 self.word_limit,
+                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -87,6 +90,7 @@ impl RawIndexer {
                 id,
                 attr,
                 self.word_limit,
+                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -118,6 +122,7 @@ impl RawIndexer {
                 id,
                 attr,
                 self.word_limit,
+                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -152,17 +157,12 @@ impl RawIndexer {
     }
 }
 
-impl Default for RawIndexer {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 fn index_token(
     token: Token,
     id: DocumentId,
     attr: SchemaAttr,
     word_limit: usize,
+    stop_words: &fst::Set,
     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool {
@@ -170,16 +170,18 @@ fn index_token(
         return false;
     }
 
-    match token_to_docindex(id, attr, token) {
-        Some(docindex) => {
-            let word = Vec::from(token.word);
-            words_doc_indexes
-                .entry(word.clone())
-                .or_insert_with(Vec::new)
-                .push(docindex);
-            docs_words.entry(id).or_insert_with(Vec::new).push(word);
+    if !stop_words.contains(&token.word) {
+        match token_to_docindex(id, attr, token) {
+            Some(docindex) => {
+                let word = Vec::from(token.word);
+                words_doc_indexes
+                    .entry(word.clone())
+                    .or_insert_with(Vec::new)
+                    .push(docindex);
+                docs_words.entry(id).or_insert_with(Vec::new).push(word);
+            }
+            None => return false,
         }
-        None => return false,
     }
 
     true
@@ -207,7 +209,7 @@ mod tests {
     #[test]
     fn strange_apostrophe() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());
 
         let docid = DocumentId(0);
         let attr = SchemaAttr(0);
@@ -231,7 +233,7 @@ mod tests {
     #[test]
     fn strange_apostrophe_in_sequence() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());
 
         let docid = DocumentId(0);
         let attr = SchemaAttr(0);
@@ -252,4 +254,33 @@ mod tests {
             .get(&"léteindre".to_owned().into_bytes())
             .is_some());
     }
+
+    #[test]
+    fn basic_stop_words() {
+        let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
+        let stop_words = fst::Set::from_iter(stop_words).unwrap();
+
+        let mut indexer = RawIndexer::new(stop_words);
+
+        let docid = DocumentId(0);
+        let attr = SchemaAttr(0);
+        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
+        indexer.index_text(docid, attr, text);
+
+        let Indexed {
+            words_doc_indexes, ..
+        } = indexer.build();
+
+        assert!(words_doc_indexes.get(&b"l"[..]).is_none());
+        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
+        assert!(words_doc_indexes.get(&b"j"[..]).is_none());
+        assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
+        assert!(words_doc_indexes.get(&b"de"[..]).is_none());
+        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
+
+        // with the ugly apostrophe...
+        assert!(words_doc_indexes
+            .get(&"léteindre".to_owned().into_bytes())
+            .is_some());
+    }
 }
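
A hedged usage sketch, not part of the diff and mirroring the tests above: any token found in the fst::Set handed to RawIndexer::new is skipped at indexing time, while every other word keeps being indexed. The stop word list and the document text below are illustrative.

    fn stop_words_are_skipped() {
        // fst::Set::from_iter needs lexicographically sorted, deduplicated input
        let stop_words = fst::Set::from_iter(vec!["de", "la", "le"]).unwrap();

        let mut indexer = RawIndexer::new(stop_words);
        indexer.index_text(DocumentId(0), SchemaAttr(0), "le chat de la maison");

        let Indexed { words_doc_indexes, .. } = indexer.build();
        assert!(words_doc_indexes.get(&b"chat"[..]).is_some()); // regular word: indexed
        assert!(words_doc_indexes.get(&b"le"[..]).is_none());   // stop word: never stored
    }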

View File

@@ -9,6 +9,7 @@ const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
 const RANKED_MAP_KEY: &str = "ranked-map";
 const SCHEMA_KEY: &str = "schema";
 const SYNONYMS_KEY: &str = "synonyms";
+const STOP_WORDS_KEY: &str = "stop-words";
 const WORDS_KEY: &str = "words";
 
 #[derive(Copy, Clone)]
@@ -71,6 +72,24 @@ impl Main {
         }
     }
 
+    pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
+        let bytes = fst.as_fst().as_bytes();
+        self.main
+            .put::<Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
+    }
+
+    pub fn stop_words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
+        match self.main.get::<Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
+            Some(bytes) => {
+                let len = bytes.len();
+                let bytes = Arc::from(bytes);
+                let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
+                Ok(Some(fst::Set::from(fst)))
+            }
+            None => Ok(None),
+        }
+    }
+
     pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64>
     where
         F: Fn(u64) -> u64,
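
A minimal sketch, not from the PR, of how the two new Main methods round-trip the stop words fst under the "stop-words" key. It assumes an open write transaction and a store::Main handle coming from the surrounding code (ZResult being the heed::Result alias this file already uses); the helper name and word list are made up.

    fn save_and_reload_stop_words(writer: &mut heed::RwTxn, main: store::Main) -> ZResult<()> {
        // fst::Set::from_iter needs sorted, deduplicated input
        let set = fst::Set::from_iter(vec!["ai", "de", "j", "l"]).unwrap();

        // serialize the fst bytes under the "stop-words" key...
        main.put_stop_words_fst(writer, &set)?;

        // ...and load them back; `None` means no stop words were ever stored
        let reloaded = main.stop_words_fst(writer)?.unwrap_or_default();
        assert_eq!(reloaded.len(), set.len());

        Ok(())
    }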

View File

@@ -187,6 +187,22 @@ impl Index {
         )
     }
 
+    pub fn stop_words_addition(&self) -> update::StopWordsAddition {
+        update::StopWordsAddition::new(
+            self.updates,
+            self.updates_results,
+            self.updates_notifier.clone(),
+        )
+    }
+
+    pub fn stop_words_deletion(&self) -> update::StopWordsDeletion {
+        update::StopWordsDeletion::new(
+            self.updates,
+            self.updates_results,
+            self.updates_notifier.clone(),
+        )
+    }
+
     pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
         match self.updates.last_update_id(reader)? {
             Some((id, _)) => Ok(Some(id)),
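
A usage sketch, not part of the diff: the new entry points follow the same pattern as the synonyms updates — stage the words, then call finalize to push an update that the update loop applies later. The index and writer values, the helper name, and the word list are assumptions for illustration.

    fn register_stop_words(index: &store::Index, writer: &mut heed::RwTxn) -> MResult<u64> {
        let mut addition = index.stop_words_addition();
        addition.add_stop_word("le"); // each word is normalized by add_stop_word
        addition.add_stop_word("la");
        addition.add_stop_word("de");

        // pushes an Update::StopWordsAddition and wakes the update thread,
        // which will eventually run apply_stop_words_addition
        addition.finalize(writer)
    }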

View File

@@ -87,7 +87,6 @@ pub fn apply_documents_addition(
     addition: Vec<serde_json::Value>,
 ) -> MResult<()> {
     let mut documents_additions = HashMap::new();
-    let mut indexer = RawIndexer::new();
 
     let schema = match main_store.schema(writer)? {
         Some(schema) => schema,
@@ -124,7 +123,14 @@
         None => RankedMap::default(),
     };
 
+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
     // 3. index the documents fields in the stores
+    let mut indexer = RawIndexer::new(stop_words);
+
     for (document_id, document) in documents_additions {
         let serializer = Serializer {
             txn: writer,
@@ -180,8 +186,13 @@ pub fn reindex_all_documents(
     postings_lists_store.clear(writer)?;
     docs_words_store.clear(writer)?;
 
+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
     // 3. re-index one document by one document (otherwise we make the borrow checker unhappy)
-    let mut indexer = RawIndexer::new();
+    let mut indexer = RawIndexer::new(stop_words);
     let mut ram_store = HashMap::new();
 
     for document_id in documents_ids_to_reindex {

View File

@@ -3,6 +3,8 @@ mod customs_update;
 mod documents_addition;
 mod documents_deletion;
 mod schema_update;
+mod stop_words_addition;
+mod stop_words_deletion;
 mod synonyms_addition;
 mod synonyms_deletion;
@@ -11,11 +13,13 @@ pub use self::customs_update::{apply_customs_update, push_customs_update};
 pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
 pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
 pub use self::schema_update::{apply_schema_update, push_schema_update};
+pub use self::stop_words_addition::{apply_stop_words_addition, StopWordsAddition};
+pub use self::stop_words_deletion::{apply_stop_words_deletion, StopWordsDeletion};
 pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
 pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
 
 use std::cmp;
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, BTreeSet};
 use std::time::{Duration, Instant};
 
 use heed::Result as ZResult;
@@ -34,6 +38,8 @@ pub enum Update {
     DocumentsDeletion(Vec<DocumentId>),
     SynonymsAddition(BTreeMap<String, Vec<String>>),
     SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
+    StopWordsAddition(BTreeSet<String>),
+    StopWordsDeletion(BTreeSet<String>),
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -45,6 +51,8 @@ pub enum UpdateType {
     DocumentsDeletion { number: usize },
     SynonymsAddition { number: usize },
     SynonymsDeletion { number: usize },
+    StopWordsAddition { number: usize },
+    StopWordsDeletion { number: usize },
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -210,6 +218,37 @@ pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Opt
             let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
 
+            (update_type, result, start.elapsed())
+        }
+        Update::StopWordsAddition(stop_words) => {
+            let start = Instant::now();
+
+            let update_type = UpdateType::StopWordsAddition {
+                number: stop_words.len(),
+            };
+
+            let result =
+                apply_stop_words_addition(writer, index.main, index.postings_lists, stop_words);
+
+            (update_type, result, start.elapsed())
+        }
+        Update::StopWordsDeletion(stop_words) => {
+            let start = Instant::now();
+
+            let update_type = UpdateType::StopWordsDeletion {
+                number: stop_words.len(),
+            };
+
+            let result = apply_stop_words_deletion(
+                writer,
+                index.main,
+                index.documents_fields,
+                index.documents_fields_counts,
+                index.postings_lists,
+                index.docs_words,
+                stop_words,
+            );
+
             (update_type, result, start.elapsed())
         }
     };

View File

@@ -0,0 +1,116 @@
+use std::collections::BTreeSet;
+
+use fst::{set::OpBuilder, SetBuilder};
+
+use crate::automaton::normalize_str;
+use crate::update::{next_update_id, Update};
+use crate::{store, MResult};
+
+pub struct StopWordsAddition {
+    updates_store: store::Updates,
+    updates_results_store: store::UpdatesResults,
+    updates_notifier: crossbeam_channel::Sender<()>,
+    stop_words: BTreeSet<String>,
+}
+
+impl StopWordsAddition {
+    pub fn new(
+        updates_store: store::Updates,
+        updates_results_store: store::UpdatesResults,
+        updates_notifier: crossbeam_channel::Sender<()>,
+    ) -> StopWordsAddition {
+        StopWordsAddition {
+            updates_store,
+            updates_results_store,
+            updates_notifier,
+            stop_words: BTreeSet::new(),
+        }
+    }
+
+    pub fn add_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
+        let stop_word = normalize_str(stop_word.as_ref());
+        self.stop_words.insert(stop_word);
+    }
+
+    pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
+        let _ = self.updates_notifier.send(());
+        let update_id = push_stop_words_addition(
+            writer,
+            self.updates_store,
+            self.updates_results_store,
+            self.stop_words,
+        )?;
+        Ok(update_id)
+    }
+}
+
+pub fn push_stop_words_addition(
+    writer: &mut heed::RwTxn,
+    updates_store: store::Updates,
+    updates_results_store: store::UpdatesResults,
+    addition: BTreeSet<String>,
+) -> MResult<u64> {
+    let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
+
+    let update = Update::StopWordsAddition(addition);
+    updates_store.put_update(writer, last_update_id, &update)?;
+
+    Ok(last_update_id)
+}
+
+pub fn apply_stop_words_addition(
+    writer: &mut heed::RwTxn,
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+    addition: BTreeSet<String>,
+) -> MResult<()> {
+    let mut stop_words_builder = SetBuilder::memory();
+
+    for word in addition {
+        stop_words_builder.insert(&word).unwrap();
+        // we remove every posting list associated to a new stop word
+        postings_lists_store.del_postings_list(writer, word.as_bytes())?;
+    }
+
+    // create the new delta stop words fst
+    let delta_stop_words = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    // we also need to remove all the stop words from the main fst
+    if let Some(word_fst) = main_store.words_fst(writer)? {
+        let op = OpBuilder::new()
+            .add(&word_fst)
+            .add(&delta_stop_words)
+            .difference();
+
+        let mut word_fst_builder = SetBuilder::memory();
+        word_fst_builder.extend_stream(op).unwrap();
+        let word_fst = word_fst_builder
+            .into_inner()
+            .and_then(fst::Set::from_bytes)
+            .unwrap();
+
+        main_store.put_words_fst(writer, &word_fst)?;
+    }
+
+    // now we add all of these stop words to the main store
+    let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
+
+    let op = OpBuilder::new()
+        .add(&stop_words_fst)
+        .add(&delta_stop_words)
+        .r#union();
+
+    let mut stop_words_builder = SetBuilder::memory();
+    stop_words_builder.extend_stream(op).unwrap();
+    let stop_words_fst = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    main_store.put_stop_words_fst(writer, &stop_words_fst)?;
+
+    Ok(())
+}
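
For readers unfamiliar with fst set algebra, here is a standalone sketch of the two operations used above, built with the plain fst crate and made-up word lists: difference strips the newly added stop words out of the words fst, and union folds the delta into the already persisted stop words fst.

    use fst::{set::OpBuilder, Set, SetBuilder};

    fn delta_difference_and_union() -> Result<(), fst::Error> {
        let words = Set::from_iter(vec!["chat", "de", "la", "maison"])?; // current words fst
        let stored = Set::from_iter(vec!["the"])?;                       // stop words already stored
        let delta = Set::from_iter(vec!["de", "la"])?;                   // stop words being added

        // words \ delta: what stays searchable
        let mut builder = SetBuilder::memory();
        builder.extend_stream(OpBuilder::new().add(&words).add(&delta).difference())?;
        let words = builder.into_inner().and_then(Set::from_bytes)?;
        assert!(words.contains("chat") && !words.contains("de"));

        // stored ∪ delta: the new persisted stop words set
        let mut builder = SetBuilder::memory();
        builder.extend_stream(OpBuilder::new().add(&stored).add(&delta).r#union())?;
        let stop_words = builder.into_inner().and_then(Set::from_bytes)?;
        assert!(stop_words.contains("de") && stop_words.contains("the"));

        Ok(())
    }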

View File

@@ -0,0 +1,112 @@
+use std::collections::BTreeSet;
+
+use fst::{set::OpBuilder, SetBuilder};
+
+use crate::automaton::normalize_str;
+use crate::update::documents_addition::reindex_all_documents;
+use crate::update::{next_update_id, Update};
+use crate::{store, MResult};
+
+pub struct StopWordsDeletion {
+    updates_store: store::Updates,
+    updates_results_store: store::UpdatesResults,
+    updates_notifier: crossbeam_channel::Sender<()>,
+    stop_words: BTreeSet<String>,
+}
+
+impl StopWordsDeletion {
+    pub fn new(
+        updates_store: store::Updates,
+        updates_results_store: store::UpdatesResults,
+        updates_notifier: crossbeam_channel::Sender<()>,
+    ) -> StopWordsDeletion {
+        StopWordsDeletion {
+            updates_store,
+            updates_results_store,
+            updates_notifier,
+            stop_words: BTreeSet::new(),
+        }
+    }
+
+    pub fn delete_stop_word<S: AsRef<str>>(&mut self, stop_word: S) {
+        let stop_word = normalize_str(stop_word.as_ref());
+        self.stop_words.insert(stop_word);
+    }
+
+    pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
+        let _ = self.updates_notifier.send(());
+        let update_id = push_stop_words_deletion(
+            writer,
+            self.updates_store,
+            self.updates_results_store,
+            self.stop_words,
+        )?;
+        Ok(update_id)
+    }
+}
+
+pub fn push_stop_words_deletion(
+    writer: &mut heed::RwTxn,
+    updates_store: store::Updates,
+    updates_results_store: store::UpdatesResults,
+    deletion: BTreeSet<String>,
+) -> MResult<u64> {
+    let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
+
+    let update = Update::StopWordsDeletion(deletion);
+    updates_store.put_update(writer, last_update_id, &update)?;
+
+    Ok(last_update_id)
+}
+
+pub fn apply_stop_words_deletion(
+    writer: &mut heed::RwTxn,
+    main_store: store::Main,
+    documents_fields_store: store::DocumentsFields,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
+    postings_lists_store: store::PostingsLists,
+    docs_words_store: store::DocsWords,
+    deletion: BTreeSet<String>,
+) -> MResult<()> {
+    let mut stop_words_builder = SetBuilder::memory();
+
+    for word in deletion {
+        stop_words_builder.insert(&word).unwrap();
+    }
+
+    // create the new delta stop words fst
+    let delta_stop_words = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    // now we delete all of these stop words from the main store
+    let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
+
+    let op = OpBuilder::new()
+        .add(&stop_words_fst)
+        .add(&delta_stop_words)
+        .difference();
+
+    let mut stop_words_builder = SetBuilder::memory();
+    stop_words_builder.extend_stream(op).unwrap();
+    let stop_words_fst = stop_words_builder
+        .into_inner()
+        .and_then(fst::Set::from_bytes)
+        .unwrap();
+
+    main_store.put_stop_words_fst(writer, &stop_words_fst)?;
+
+    // now that we have set up the stop words
+    // let's reindex everything...
+    reindex_all_documents(
+        writer,
+        main_store,
+        documents_fields_store,
+        documents_fields_counts_store,
+        postings_lists_store,
+        docs_words_store,
+    )?;
+
+    Ok(())
+}