From ff7dde752205c613d6a4ec0ff8add7c1366a2ed4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Tue, 29 Oct 2019 15:53:45 +0100
Subject: [PATCH] Make the RawIndexer support stop words

---
 meilidb-core/src/raw_indexer.rs               | 26 +++++++++++--------
 meilidb-core/src/update/documents_addition.rs | 15 +++++++++--
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs
index 396134436..967033d37 100644
--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
 
 pub struct RawIndexer {
     word_limit: usize, // the maximum number of indexed words
+    stop_words: fst::Set,
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
 }
@@ -21,13 +22,14 @@ pub struct Indexed {
 }
 
 impl RawIndexer {
-    pub fn new() -> RawIndexer {
-        RawIndexer::with_word_limit(1000)
+    pub fn new(stop_words: fst::Set) -> RawIndexer {
+        RawIndexer::with_word_limit(stop_words, 1000)
     }
 
-    pub fn with_word_limit(limit: usize) -> RawIndexer {
+    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
         RawIndexer {
             word_limit: limit,
+            stop_words,
             words_doc_indexes: BTreeMap::new(),
             docs_words: HashMap::new(),
         }
@@ -56,6 +58,7 @@ impl RawIndexer {
                 id,
                 attr,
                 self.word_limit,
+                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -87,6 +90,7 @@ impl RawIndexer {
                 id,
                 attr,
                 self.word_limit,
+                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -118,6 +122,7 @@ impl RawIndexer {
                 id,
                 attr,
                 self.word_limit,
+                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -152,17 +157,12 @@ impl RawIndexer {
     }
 }
 
-impl Default for RawIndexer {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 fn index_token(
     token: Token,
     id: DocumentId,
     attr: SchemaAttr,
     word_limit: usize,
+    stop_words: &fst::Set,
     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool {
@@ -170,6 +170,10 @@ fn index_token(
         return false;
     }
 
+    if stop_words.contains(&token.word) {
+        return false;
+    }
+
     match token_to_docindex(id, attr, token) {
         Some(docindex) => {
             let word = Vec::from(token.word);
@@ -207,7 +211,7 @@ mod tests {
 
     #[test]
     fn strange_apostrophe() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());
 
         let docid = DocumentId(0);
         let attr = SchemaAttr(0);
@@ -231,7 +235,7 @@ mod tests {
 
     #[test]
     fn strange_apostrophe_in_sequence() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());
 
         let docid = DocumentId(0);
         let attr = SchemaAttr(0);
diff --git a/meilidb-core/src/update/documents_addition.rs b/meilidb-core/src/update/documents_addition.rs
index 79387600c..17e55527d 100644
--- a/meilidb-core/src/update/documents_addition.rs
+++ b/meilidb-core/src/update/documents_addition.rs
@@ -87,7 +86,6 @@ pub fn apply_documents_addition(
     addition: Vec,
 ) -> MResult<()> {
     let mut documents_additions = HashMap::new();
-    let mut indexer = RawIndexer::new();
 
     let schema = match main_store.schema(writer)? {
         Some(schema) => schema,
@@ -124,7 +123,14 @@ pub fn apply_documents_addition(
         None => RankedMap::default(),
     };
 
+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
     // 3. index the documents fields in the stores
+    let mut indexer = RawIndexer::new(stop_words);
+
     for (document_id, document) in documents_additions {
         let serializer = Serializer {
             txn: writer,
@@ -180,8 +186,13 @@ pub fn reindex_all_documents(
     postings_lists_store.clear(writer)?;
     docs_words_store.clear(writer)?;
 
+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
     // 3. re-index one document by one document (otherwise we make the borrow checker unhappy)
-    let mut indexer = RawIndexer::new();
+    let mut indexer = RawIndexer::new(stop_words);
     let mut ram_store = HashMap::new();
 
     for document_id in documents_ids_to_reindex {
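
For context, and not part of the patch itself: a minimal sketch of how a caller can build the stop-word set that this change threads into RawIndexer::new. It assumes the fst crate's Set::from_iter, which requires keys in lexicographic byte order; the hard-coded word list is purely illustrative, since the real set is loaded from main_store.stop_words_fst(writer)? as shown above.

    use fst::Set;

    // Stop words must be sorted lexicographically for `Set::from_iter`.
    let stop_words = Set::from_iter(vec!["a", "of", "the"]).unwrap();

    // Any token contained in the set is now rejected by `index_token`,
    // the same way tokens beyond `word_limit` are rejected.
    let mut indexer = RawIndexer::new(stop_words);

    // An empty set keeps the previous behaviour; the unit tests rely on this.
    let mut permissive_indexer = RawIndexer::new(fst::Set::default());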