From 0bc1a18f524377ef9e8596367dfb17478502f5cd Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Wed, 1 Feb 2023 18:57:43 +0100
Subject: [PATCH] Use Languages list detected during indexing at search time

---
 milli/src/index.rs      | 19 +++++++++++++++++++
 milli/src/search/mod.rs |  5 +++++
 2 files changed, 24 insertions(+)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 803c04a50..c14d131a6 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1206,6 +1206,25 @@ impl Index {
         let doc_ids = self.script_language_docids.get(rtxn, key)?;
         Ok(doc_ids.map(|ids| ids - soft_deleted_documents))
     }
+
+    /// Returns the languages stored in `script_language_docids`, grouped by script.
+    ///
+    /// A language is kept only when at least one of its documents is not soft-deleted.
+    pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Script, Vec<Language>>> {
+        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
+
+        let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new();
+        for sl in self.script_language_docids.iter(rtxn)? {
+            let ((script, language), docids) = sl?;
+
+            // Keep only the languages that still contain at least one document.
+            if !soft_deleted_documents.is_superset(&docids) {
+                script_language.entry(script).or_default().push(language);
+            }
+        }
+
+        Ok(script_language)
+    }
 }
 
 #[cfg(test)]
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index b5274599c..f6970fcd1 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -152,6 +152,11 @@ impl<'a> Search<'a> {
                     tokbuilder.stop_words(stop_words);
                 }
 
+                let script_lang_map = self.index.script_language(self.rtxn)?;
+                if !script_lang_map.is_empty() {
+                    tokbuilder.allow_list(&script_lang_map);
+                }
+
                 let tokenizer = tokbuilder.build();
                 let tokens = tokenizer.tokenize(query);
                 builder