From ff7dde752205c613d6a4ec0ff8add7c1366a2ed4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Tue, 29 Oct 2019 15:53:45 +0100
Subject: [PATCH] Make the RawIndexer support stop words

---
 meilidb-core/src/raw_indexer.rs               | 26 +++++++++++--------
 meilidb-core/src/update/documents_addition.rs | 15 +++++++++--
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs
index 396134436..967033d37 100644
--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
 
 pub struct RawIndexer {
     word_limit: usize, // the maximum number of indexed words
+    stop_words: fst::Set,
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
 }
@@ -21,13 +22,14 @@ pub struct Indexed {
 }
 
 impl RawIndexer {
-    pub fn new() -> RawIndexer {
-        RawIndexer::with_word_limit(1000)
+    pub fn new(stop_words: fst::Set) -> RawIndexer {
+        RawIndexer::with_word_limit(stop_words, 1000)
     }
 
-    pub fn with_word_limit(limit: usize) -> RawIndexer {
+    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
         RawIndexer {
             word_limit: limit,
+            stop_words,
             words_doc_indexes: BTreeMap::new(),
             docs_words: HashMap::new(),
         }
@@ -56,6 +58,7 @@ impl RawIndexer {
                 id,
                 attr,
                 self.word_limit,
+                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -87,6 +90,7 @@ impl RawIndexer {
                 id,
                 attr,
                 self.word_limit,
+                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -118,6 +122,7 @@ impl RawIndexer {
                 id,
                 attr,
                 self.word_limit,
+                &self.stop_words,
                 &mut self.words_doc_indexes,
                 &mut self.docs_words,
             );
@@ -152,17 +157,12 @@ impl RawIndexer {
     }
 }
 
-impl Default for RawIndexer {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 fn index_token(
     token: Token,
     id: DocumentId,
     attr: SchemaAttr,
     word_limit: usize,
+    stop_words: &fst::Set,
     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool {
@@ -170,6 +170,10 @@ fn index_token(
         return false;
     }
 
+    if stop_words.contains(&token.word) {
+        return false;
+    }
+
     match token_to_docindex(id, attr, token) {
         Some(docindex) => {
             let word = Vec::from(token.word);
@@ -207,7 +211,7 @@ mod tests {
 
     #[test]
     fn strange_apostrophe() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());
 
         let docid = DocumentId(0);
         let attr = SchemaAttr(0);
@@ -231,7 +235,7 @@ mod tests {
 
     #[test]
     fn strange_apostrophe_in_sequence() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());
 
         let docid = DocumentId(0);
         let attr = SchemaAttr(0);
diff --git a/meilidb-core/src/update/documents_addition.rs b/meilidb-core/src/update/documents_addition.rs
index 79387600c..17e55527d 100644
--- a/meilidb-core/src/update/documents_addition.rs
+++ b/meilidb-core/src/update/documents_addition.rs
@@ -87,7 +86,6 @@ pub fn apply_documents_addition(
     addition: Vec,
 ) -> MResult<()> {
     let mut documents_additions = HashMap::new();
-    let mut indexer = RawIndexer::new();
 
     let schema = match main_store.schema(writer)? {
         Some(schema) => schema,
@@ -124,7 +123,14 @@ pub fn apply_documents_addition(
         None => RankedMap::default(),
     };
 
+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
     // 3. index the documents fields in the stores
+    let mut indexer = RawIndexer::new(stop_words);
+
     for (document_id, document) in documents_additions {
         let serializer = Serializer {
             txn: writer,
@@ -180,8 +186,13 @@ pub fn reindex_all_documents(
     postings_lists_store.clear(writer)?;
     docs_words_store.clear(writer)?;
 
+    let stop_words = match main_store.stop_words_fst(writer)? {
+        Some(stop_words) => stop_words,
+        None => fst::Set::default(),
+    };
+
     // 3. re-index one document by one document (otherwise we make the borrow checker unhappy)
-    let mut indexer = RawIndexer::new();
+    let mut indexer = RawIndexer::new(stop_words);
     let mut ram_store = HashMap::new();
 
     for document_id in documents_ids_to_reindex {
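
For context, and not part of the patch itself: a minimal sketch of how a caller can build the stop-word set that this change threads into RawIndexer::new. It assumes the fst crate's Set::from_iter, which requires keys in lexicographic byte order; the hard-coded word list is purely illustrative, since the real set is loaded from main_store.stop_words_fst(writer)? as shown above.

    use fst::Set;

    // Stop words must be sorted lexicographically for `Set::from_iter`.
    let stop_words = Set::from_iter(vec!["a", "of", "the"]).unwrap();

    // Any token contained in the set is now rejected by `index_token`,
    // the same way tokens beyond `word_limit` are rejected.
    let mut indexer = RawIndexer::new(stop_words);

    // An empty set keeps the previous behaviour; the unit tests rely on this.
    let mut permissive_indexer = RawIndexer::new(fst::Set::default());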