diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 86f965368..f068b5b9a 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, HashMap, HashSet};
+use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fmt::Display;
 use std::fs::{File, create_dir_all};
 use std::net::SocketAddr;
@@ -128,7 +128,10 @@ struct Highlighter<'a, A> {
 
 impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
     fn new(stop_words: &'a fst::Set<A>) -> Self {
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let mut config = AnalyzerConfig::default();
+        config.stop_words(stop_words);
+        let analyzer = Analyzer::new(config);
+
         Self { analyzer }
     }
 
@@ -266,6 +269,13 @@ struct Settings {
         skip_serializing_if = "Option::is_none",
     )]
     criteria: Option<Option<Vec<String>>>,
+
+    #[serde(
+        default,
+        deserialize_with = "deserialize_some",
+        skip_serializing_if = "Option::is_none",
+    )]
+    stop_words: Option<Option<BTreeSet<String>>>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -439,6 +449,14 @@ async fn main() -> anyhow::Result<()> {
                         }
                     }
 
+                    // We transpose the settings JSON struct into a real setting update.
+                    if let Some(stop_words) = settings.stop_words {
+                        match stop_words {
+                            Some(stop_words) => builder.set_stop_words(stop_words),
+                            None => builder.reset_stop_words(),
+                        }
+                    }
+
                     let result = builder.execute(|indexing_step, update_id| {
                         let (current, total) = match indexing_step {
                             TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 2e0d329ef..642ad4ab7 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -28,6 +28,7 @@ pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
 pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
 pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
 pub const WORDS_FST_KEY: &str = "words-fst";
+pub const STOP_WORDS_KEY: &str = "stop-words";
 pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
 const CREATED_AT_KEY: &str = "created-at";
 const UPDATED_AT_KEY: &str = "updated-at";
@@ -377,6 +378,22 @@ impl Index {
         }
     }
 
+    /* stop words */
+
+    pub fn put_stop_words<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
+        self.main.put::<_, Str, ByteSlice>(wtxn, STOP_WORDS_KEY, fst.as_fst().as_bytes())
+    }
+
+    pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY)
+    }
+
+    pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
+        match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? {
+            Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
+            None => Ok(None),
+        }
+    }
+
     /* words prefixes fst */
 
     /// Writes the FST which is the words prefixes dictionnary of the engine.
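
Note (reading aid, not part of the patch): a minimal sketch of how the new stop-words accessors on `Index` above are meant to be used, assuming an open `index`, the same `heed` transaction types, and a caller that returns `anyhow::Result`; the word list is illustrative only.

    // Hypothetical usage of put_stop_words / stop_words / delete_stop_words.
    let mut wtxn = index.write_txn()?;
    // fst::Set::from_iter requires the words in lexicographic order.
    let fst = fst::Set::from_iter(vec!["a", "of", "the"])?;
    index.put_stop_words(&mut wtxn, &fst)?; // raw FST bytes stored under the "stop-words" key
    wtxn.commit()?;

    let rtxn = index.read_txn()?;
    if let Some(stop_words) = index.stop_words(&rtxn)? {
        // An fst::Set borrowing the bytes stored in LMDB, no copy.
        assert!(stop_words.contains("the"));
    }
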
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 7560fbf0a..c88800f38 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -4,7 +4,7 @@ use std::fmt;
 use std::str::Utf8Error;
 use std::time::Instant;
 
-use fst::{IntoStreamer, Streamer, Set};
+use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
 use log::debug;
 use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
@@ -91,8 +91,7 @@ impl<'a> Search<'a> {
                 let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
                 builder.optional_words(self.optional_words);
                 builder.authorize_typos(self.authorize_typos);
-                let stop_words = &Set::default();
-                let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+                let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
                 let result = analyzer.analyze(query);
                 let tokens = result.tokens();
                 builder.build(tokens)?
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 114032eb8..f7367d826 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -543,7 +543,6 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
 mod test {
     use std::collections::HashMap;
 
-    use fst::Set;
     use maplit::{hashmap, hashset};
     use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
     use rand::{Rng, SeedableRng, rngs::StdRng};
@@ -646,8 +645,7 @@
     #[test]
     fn prefix() {
         let query = "hey friends";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -667,8 +665,7 @@
     #[test]
     fn no_prefix() {
         let query = "hey friends ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -688,8 +685,7 @@
     #[test]
     fn synonyms() {
         let query = "hello world ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -720,8 +716,7 @@
     #[test]
     fn complex_synonyms() {
         let query = "new york city ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -766,8 +761,7 @@
     #[test]
     fn ngrams() {
         let query = "n grams ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -787,8 +781,7 @@
     #[test]
     fn word_split() {
         let query = "wordsplit fish ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -814,8 +807,7 @@
     #[test]
     fn phrase() {
         let query = "\"hey friends\" \" \" \"wooop";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -835,8 +827,7 @@
     #[test]
     fn optional_word() {
         let query = "hey my friend ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -875,8 +866,7 @@
     #[test]
     fn optional_word_phrase() {
         let query = "\"hey my\"";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -892,8 +882,7 @@
     #[test]
     fn optional_word_multiple_phrases() {
         let query = r#""hey" my good "friend""#;
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -927,8 +916,7 @@
     #[test]
     fn no_typo() {
         let query = "hey friends ";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
@@ -947,8 +935,7 @@
     #[test]
     fn fetching_words() {
         let query = "wordsplit nyc world";
-        let stop_words = &Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let result = analyzer.analyze(query);
         let tokens = result.tokens();
 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index a19d8c0a7..f4a7c7f25 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -410,6 +410,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
         };
 
+        let stop_words = self.index.stop_words(self.wtxn)?;
+        let stop_words = stop_words.as_ref();
         let linked_hash_map_size = self.linked_hash_map_size;
         let max_nb_chunks = self.max_nb_chunks;
         let max_memory = self.max_memory;
@@ -436,7 +438,6 @@
         let readers = rayon::iter::repeatn(documents, num_threads)
             .enumerate()
             .map(|(i, documents)| {
-                let stop_words = fst::Set::default();
                 let store = Store::new(
                     searchable_fields.clone(),
                     faceted_fields.clone(),
@@ -446,7 +447,7 @@
                     chunk_compression_type,
                     chunk_compression_level,
                     chunk_fusing_shrink_size,
-                    &stop_words,
+                    stop_words,
                 )?;
                 store.index(
                     documents,
diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs
index 05767080a..03d91af24 100644
--- a/milli/src/update/index_documents/store.rs
+++ b/milli/src/update/index_documents/store.rs
@@ -86,7 +86,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
         chunk_compression_type: CompressionType,
         chunk_compression_level: Option<u32>,
         chunk_fusing_shrink_size: Option<u64>,
-        stop_words: &'s Set<A>,
+        stop_words: Option<&'s Set<A>>,
     ) -> anyhow::Result<Self>
     {
         // We divide the max memory by the number of sorter the Store have.
@@ -141,7 +141,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             create_writer(chunk_compression_type, chunk_compression_level, f)
         })?;
 
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+        let mut config = AnalyzerConfig::default();
+        if let Some(stop_words) = stop_words {
+            config.stop_words(stop_words);
+        }
+        let analyzer = Analyzer::new(config);
 
         Ok(Store {
             // Indexing parameters.
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 7ce8b98c1..451447102 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BTreeSet, HashMap};
 use std::str::FromStr;
 
 use anyhow::Context;
@@ -32,6 +32,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
     displayed_fields: Option<Option<Vec<String>>>,
     faceted_fields: Option<Option<HashMap<String, String>>>,
     criteria: Option<Option<Vec<String>>>,
+    stop_words: Option<Option<BTreeSet<String>>>,
 }
 
 impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@@ -55,6 +56,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             displayed_fields: None,
             faceted_fields: None,
             criteria: None,
+            stop_words: None,
             update_id,
         }
     }
@@ -91,6 +93,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         self.criteria = Some(Some(criteria));
     }
 
+    pub fn reset_stop_words(&mut self) {
+        self.stop_words = Some(None);
+    }
+
+    pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
+        self.stop_words = if stop_words.is_empty() {
+            Some(None)
+        } else {
+            Some(Some(stop_words))
+        }
+    }
+
     fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()>
     where
         F: Fn(UpdateIndexingStep, u64) + Sync
@@ -210,6 +224,28 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         Ok(true)
     }
 
+    fn update_stop_words(&mut self) -> anyhow::Result<bool> {
+        match self.stop_words {
+            Some(Some(ref stop_words)) => {
+                let current = self.index.stop_words(self.wtxn)?;
+                // Since we can't compare a BTreeSet with an FST we are going to convert the
+                // BTreeSet to an FST and then compare the two FSTs byte by byte.
+                let fst = fst::Set::from_iter(&*stop_words)?;
+
+                // Does the new FST differ from the previous one?
+                if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) {
+                    // We want to re-create our FST.
+                    self.index.put_stop_words(self.wtxn, &fst)?;
+                    Ok(true)
+                } else {
+                    Ok(false)
+                }
+            }
+            Some(None) => Ok(self.index.delete_stop_words(self.wtxn)?),
+            None => Ok(false),
+        }
+    }
+
     fn update_facets(&mut self) -> anyhow::Result<bool> {
         match self.faceted_fields {
             Some(Some(ref fields)) => {
@@ -248,22 +284,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
 
     pub fn execute<F>(mut self, progress_callback: F) -> anyhow::Result<()>
     where
-            F: Fn(UpdateIndexingStep, u64) + Sync
-        {
-            self.index.set_updated_at(self.wtxn, &Utc::now())?;
-            let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
-            self.update_displayed()?;
-            let facets_updated = self.update_facets()?;
-            // update_criteria MUST be called after update_facets, since criterion fields must be set
-            // as facets.
-            self.update_criteria()?;
-            let searchable_updated = self.update_searchable()?;
+        F: Fn(UpdateIndexingStep, u64) + Sync
+    {
+        self.index.set_updated_at(self.wtxn, &Utc::now())?;
+        let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
+        self.update_displayed()?;
+        let stop_words_updated = self.update_stop_words()?;
+        let facets_updated = self.update_facets()?;
+        // update_criteria MUST be called after update_facets, since criterion fields must be set
+        // as facets.
+        self.update_criteria()?;
+        let searchable_updated = self.update_searchable()?;
 
-            if facets_updated || searchable_updated {
-                self.reindex(&progress_callback, old_fields_ids_map)?;
-            }
-            Ok(())
+        if facets_updated || searchable_updated || stop_words_updated {
+            self.reindex(&progress_callback, old_fields_ids_map)?;
         }
+        Ok(())
+    }
 }
 
 #[cfg(test)]
@@ -271,7 +308,7 @@
 mod tests {
     use super::*;
     use heed::EnvOpenOptions;
-    use maplit::hashmap;
+    use maplit::{hashmap, btreeset};
 
     use crate::facet::FacetType;
     use crate::update::{IndexDocuments, UpdateFormat};
@@ -328,7 +365,6 @@
         assert_eq!(result.documents_ids.len(), 1);
         let documents = index.documents(&rtxn, result.documents_ids).unwrap();
         assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..]));
-        drop(rtxn);
     }
 
     #[test]
@@ -372,7 +408,6 @@
         let rtxn = index.read_txn().unwrap();
         let fields_ids = index.displayed_fields(&rtxn).unwrap();
         assert_eq!(fields_ids.unwrap(), &["age"][..]);
-        drop(rtxn);
     }
 
     #[test]
@@ -394,7 +429,6 @@
         let rtxn = index.read_txn().unwrap();
         let fields_ids = index.displayed_fields(&rtxn).unwrap();
         assert_eq!(fields_ids, None);
-        drop(rtxn);
     }
 
     #[test]
@@ -434,7 +468,6 @@
         let rtxn = index.read_txn().unwrap();
         let fields_ids = index.displayed_fields(&rtxn).unwrap();
         assert_eq!(fields_ids, None);
-        drop(rtxn);
     }
 
     #[test]
@@ -478,7 +511,96 @@
         // Only count the field_id 0 and level 0 facet values.
         let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
         assert_eq!(count, 4);
-        drop(rtxn);
+    }
+
+    #[test]
+    fn default_stop_words() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure there is no stop_words by default.
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_none());
+    }
+
+    #[test]
+    fn set_and_reset_stop_words() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // First we send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+
+        // In the same transaction we provide some stop_words.
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        let set = btreeset!{ "i".to_string(), "the".to_string(), "are".to_string() };
+        builder.set_stop_words(set.clone());
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure the stop_words are effectively stored.
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_some()); // at this point the index should return something
+
+        let stop_words = stop_words.unwrap();
+        let expected = fst::Set::from_iter(&set).unwrap();
+        assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes());
+
+        // When we search for something that is a non-prefix stop word it should be ignored.
+        let result = index.search(&rtxn).query("the ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("i ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("are ").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+
+        let result = index.search(&rtxn).query("dog").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
+        let result = index.search(&rtxn).query("benoît").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
+
+        // Now we'll reset the stop_words and ensure it's None.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.reset_stop_words();
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let stop_words = index.stop_words(&rtxn).unwrap();
+        assert!(stop_words.is_none());
+
+        // Now we can search for the stop words.
+        let result = index.search(&rtxn).query("the").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+        let result = index.search(&rtxn).query("i").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+        let result = index.search(&rtxn).query("are").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+
+        // The rest of the search is still not impacted.
+        let result = index.search(&rtxn).query("dog").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
+        let result = index.search(&rtxn).query("benoît").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
     }
 
     #[test]
@@ -519,6 +641,5 @@
         assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap());
         assert!(index.primary_key(&rtxn).unwrap().is_none());
         assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap());
-        drop(rtxn);
     }
 }
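
For orientation only (this is not part of the patch), here is a minimal sketch of how a caller is expected to drive the new setting through the `Settings` update builder added above, mirroring the tests; the word list, the update id `0`, and the empty progress callback are illustrative, and the surrounding function is assumed to return `anyhow::Result`.

    // Hypothetical end-to-end usage of the new stop-words setting.
    use std::collections::BTreeSet;

    let mut wtxn = index.write_txn()?;
    let mut builder = Settings::new(&mut wtxn, &index, 0);

    // A non-empty set is converted to an FST, stored under the "stop-words" key,
    // and triggers a reindex when it differs from what is already stored.
    let stop_words: BTreeSet<String> = ["a", "of", "the"].iter().map(|s| s.to_string()).collect();
    builder.set_stop_words(stop_words);

    // Passing an empty set behaves like builder.reset_stop_words(): the entry is cleared.
    builder.execute(|_step, _update_id| ())?;
    wtxn.commit()?;

Note that `set_stop_words` maps an empty set to `Some(None)`, so clearing can be done either with an empty set or with an explicit `reset_stop_words()` call.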