diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 605b6a7ba..ad9f1646d 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -1021,11 +1021,11 @@ mod tests {
             faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }),
             criteria: Setting::Set(vec!["asc(age)".to_string()]),
             stop_words: Setting::Set(btreeset! { "and".to_string() }),
-            synonyms: Setting::NotSet
+            synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] })
         };

         assert_tokens(&settings, &[
-            Token::Struct { name: "Settings", len: 5 },
+            Token::Struct { name: "Settings", len: 6 },
             Token::Str("displayedAttributes"),
             Token::Some,
             Token::Seq { len: Some(1) },
@@ -1052,6 +1052,14 @@ mod tests {
             Token::Seq { len: Some(1) },
             Token::Str("and"),
             Token::SeqEnd,
+            Token::Str("synonyms"),
+            Token::Some,
+            Token::Map { len: Some(1) },
+            Token::Str("alex"),
+            Token::Seq { len: Some(1) },
+            Token::Str("alexey"),
+            Token::SeqEnd,
+            Token::MapEnd,
             Token::StructEnd,
         ]);
     }
@@ -1064,11 +1072,11 @@ mod tests {
             faceted_attributes: Setting::Reset,
             criteria: Setting::Reset,
             stop_words: Setting::Reset,
-            synonyms: Setting::NotSet
+            synonyms: Setting::Reset,
         };

         assert_tokens(&settings, &[
-            Token::Struct { name: "Settings", len: 5 },
+            Token::Struct { name: "Settings", len: 6 },
             Token::Str("displayedAttributes"),
             Token::None,
             Token::Str("searchableAttributes"),
@@ -1079,6 +1087,8 @@ mod tests {
             Token::None,
             Token::Str("stopWords"),
             Token::None,
+            Token::Str("synonyms"),
+            Token::None,
             Token::StructEnd,
         ]);
     }
@@ -1091,6 +1101,7 @@ mod tests {
             faceted_attributes: Setting::NotSet,
             criteria: Setting::NotSet,
             stop_words: Setting::NotSet,
+            synonyms: Setting::NotSet,
         };

         assert_tokens(&settings, &[
diff --git a/milli/src/index.rs b/milli/src/index.rs
index d743445e3..045eabc3c 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -417,22 +417,13 @@ impl Index {
         self.main.delete::<_, Str>(wtxn, SYNONYMS_KEY)
     }

-    pub fn synonyms(&self, rtxn: &RoTxn) -> anyhow::Result<Option<HashMap<Vec<String>, Vec<Vec<String>>>>> {
-        match self.main.get::<_, Str, SerdeBincode<HashMap<Vec<String>, Vec<Vec<String>>>>>(rtxn, SYNONYMS_KEY)? {
-            Some(synonyms) => Ok(Some(synonyms)),
-            None => Ok(None),
-        }
+    pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
+        Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, SYNONYMS_KEY)?.unwrap_or_default())
     }

-    pub fn words_synonyms<S: AsRef<str>>(&self, rtxn: &RoTxn, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>> {
-        let words: Vec<_> = words.iter().map(|s| s.as_ref().to_string()).collect();
-
-        match self.synonyms(rtxn)? {
-            Some(synonyms) => Ok(Some(
-                synonyms.get(&words).cloned().unwrap_or(Vec::default())
-            )),
-            None => Ok(None)
-        }
+    pub fn words_synonyms<S: AsRef<str>>(&self, rtxn: &RoTxn, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
+        let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
+        Ok(self.synonyms(rtxn)?.remove(&words))
     }

     /* words prefixes fst */
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index b2fd62771..d21227507 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -155,7 +155,7 @@ impl fmt::Debug for Query {

 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
-    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>>;
+    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
             Some(rb) => Ok(Some(rb.len())),
@@ -177,7 +177,7 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.word_docids.get(self.rtxn, word)
     }

-    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>> {
+    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         self.index.words_synonyms(self.rtxn, words)
     }

@@ -270,10 +270,10 @@ fn typos(word: String, authorize_typos: bool) -> QueryKind {
     }
 }

-/// Fetch synonyms from the `Context` for the provided words
+/// Fetch synonyms from the `Context` for the provided word
 /// and create the list of operations for the query tree
-fn synonyms(ctx: &impl Context, words: &[&str]) -> anyhow::Result<Option<Vec<Operation>>> {
-    let synonyms = ctx.synonyms(words)?;
+fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operation>>> {
+    let synonyms = ctx.synonyms(word)?;

     Ok(synonyms.map(|synonyms| {
         synonyms.into_iter().map(|synonym| {
@@ -581,8 +581,8 @@ mod test {
             Ok(self.postings.get(word).cloned())
         }

-        fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>> {
-            let words: Vec<_> = words.iter().map(|s| s.as_ref().to_string()).collect();
+        fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
+            let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
             Ok(self.synonyms.get(&words).cloned())
         }
     }
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 336c0e253..a0cfbd315 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -5,6 +5,7 @@ use anyhow::Context;
 use chrono::Utc;
 use grenad::CompressionType;
 use itertools::Itertools;
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use rayon::ThreadPool;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};

@@ -13,7 +14,6 @@ use crate::criterion::Criterion;
 use crate::facet::FacetType;
 use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
 use crate::update::index_documents::{IndexDocumentsMethod, Transform};
-use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};

 #[derive(Debug, Clone, PartialEq)]
 pub enum Setting<T> {
@@ -328,18 +328,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> anyhow::Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                let old_synonyms = self.index.synonyms(self.wtxn)?.unwrap_or_default();
-
-                let mut config = AnalyzerConfig::default();
-
-                let stop_words = self.index.stop_words(self.wtxn)?;
-                if let Some(stop_words) = &stop_words {
-                    config.stop_words(stop_words);
-                }
-
-                let analyzer = Analyzer::new(config);
-
-                let normalize = |text: &String| {
+                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec<String> {
                     analyzer
                         .analyze(text)
                         .tokens()
@@ -347,20 +336,40 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                             if token.is_word() { Some(token.text().to_string()) } else { None }
                         )
                         .collect::<Vec<_>>()
-                };
+                }

-                let new_synonyms = synonyms
-                    .iter()
-                    .map(|(word, synonyms)| {
-                        let normalized_word = normalize(word);
-                        let normalized_synonyms = synonyms.iter()
-                            .map(normalize)
-                            .unique()
-                            .collect::<Vec<_>>();
+                let mut config = AnalyzerConfig::default();
+                let stop_words = self.index.stop_words(self.wtxn)?;
+                if let Some(stop_words) = &stop_words {
+                    config.stop_words(stop_words);
+                }
+                let analyzer = Analyzer::new(config);

-                        (normalized_word, normalized_synonyms)
-                    })
-                    .collect();
+                let mut new_synonyms = HashMap::new();
+                for (word, synonyms) in synonyms {
+                    // Normalize both the word and associated synonyms.
+                    let normalized_word = normalize(&analyzer, word);
+                    let normalized_synonyms = synonyms
+                        .iter()
+                        .map(|synonym| normalize(&analyzer, synonym));
+
+                    // Store the normalized synonyms under the normalized word,
+                    // merging the possible duplicate words.
+                    let entry = new_synonyms
+                        .entry(normalized_word)
+                        .or_insert_with(Vec::new);
+                    entry.extend(normalized_synonyms);
+                }
+
+                // Make sure that we don't have duplicate synonyms.
+                new_synonyms
+                    .iter_mut()
+                    .for_each(|(_, synonyms)| {
+                        synonyms.sort_unstable();
+                        synonyms.dedup();
+                    });
+
+                let old_synonyms = self.index.synonyms(self.wtxn)?;

                 if new_synonyms != old_synonyms {
                     self.index.put_synonyms(self.wtxn, &new_synonyms)?;
@@ -734,6 +743,64 @@ mod tests {
         assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
     }

+    #[test]
+    fn set_and_reset_synonyms() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+
+        // In the same transaction, provide some synonyms.
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.set_synonyms(hashmap! {
+            "blini".to_string() => vec!["crepes".to_string()],
+            "super like".to_string() => vec!["love".to_string()],
+            "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()]
+        });
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure the synonyms are effectively stored.
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(!synonyms.is_empty()); // at this point the index should return something
+
+        // Check that we can use the synonyms.
+        let result = index.search(&rtxn).query("blini").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+        let result = index.search(&rtxn).query("super like").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+        let result = index.search(&rtxn).query("puppies").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+
+        // Reset the synonyms.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.reset_synonyms();
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure the synonyms are reset.
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(synonyms.is_empty());
+
+        // Check that the synonyms no longer work.
+        let result = index.search(&rtxn).query("blini").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("super like").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("puppies").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+    }
+
     #[test]
     fn setting_searchable_recomputes_other_settings() {
         let path = tempfile::tempdir().unwrap();
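
Usage sketch (reviewer note, not part of the patch): after this change the two
read paths deliberately differ in shape. `Index::synonyms` now returns an empty
map instead of an `Option` when nothing has been set, while
`Index::words_synonyms` keeps the `Option`, which is now `None` whenever there
is no entry for the given word sequence. A minimal read-side example, assuming
an already-opened milli `Index` like the one built in the tests above:

    use std::collections::HashMap;

    let rtxn = index.read_txn().unwrap();

    // Every synonym mapping, keyed by the normalized word sequence;
    // an empty map when no synonyms were ever set.
    let all: HashMap<Vec<String>, Vec<Vec<String>>> = index.synonyms(&rtxn).unwrap();

    // The synonyms registered for one word sequence; `None` when absent.
    let blini: Option<Vec<Vec<String>>> = index.words_synonyms(&rtxn, &["blini"]).unwrap();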