From 2d1727697dbc802822a5b427f11cdfd1aada768c Mon Sep 17 00:00:00 2001 From: many Date: Tue, 17 Aug 2021 12:25:07 +0200 Subject: [PATCH] Take stop word in account --- .../index_documents/extract/extract_docid_word_positions.rs | 7 ++++++- milli/src/update/index_documents/extract/mod.rs | 2 ++ milli/src/update/index_documents/mod.rs | 4 ++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 9a9d7cb85..3ee7ee3b3 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -21,6 +21,7 @@ pub fn extract_docid_word_positions( mut obkv_documents: grenad::Reader, indexer: GrenadParameters, searchable_fields: &Option>, + stop_words: Option<&fst::Set<&[u8]>>, ) -> Result<(RoaringBitmap, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); @@ -35,6 +36,10 @@ pub fn extract_docid_word_positions( let mut key_buffer = Vec::new(); let mut field_buffer = String::new(); + let mut config = AnalyzerConfig::default(); + if let Some(stop_words) = stop_words { + config.stop_words(stop_words); + } - let analyzer = Analyzer::>::new(AnalyzerConfig::default()); + let analyzer = Analyzer::new(config); while let Some((key, value)) = obkv_documents.next()? 
{ diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index a389f36cf..00c0a4a5f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -37,6 +37,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx: Sender, searchable_fields: Option>, faceted_fields: HashSet, + stop_words: Option>, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() @@ -54,6 +55,7 @@ pub(crate) fn data_from_obkv_documents( documents_chunk.clone(), indexer.clone(), &searchable_fields, + stop_words.as_ref(), )?; // send documents_ids to DB writer diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 51b0a6613..c9f5da0c1 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -231,6 +231,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // get filterable fields for facet databases let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + let stop_words = self.index.stop_words(self.wtxn)?; + // let stop_words = stop_words.as_ref(); + // Run extraction pipeline in parallel. pool.install(|| { let params = GrenadParameters { @@ -255,6 +258,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { lmdb_writer_sx, searchable_fields, faceted_fields, + stop_words, ) .unwrap(); });