diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 08e28be56..ad9f1646d 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -260,6 +260,9 @@ struct Settings {
 
     #[serde(default, skip_serializing_if = "Setting::is_not_set")]
     stop_words: Setting<BTreeSet<String>>,
+
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    synonyms: Setting<HashMap<String, Vec<String>>>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -431,6 +434,13 @@ async fn main() -> anyhow::Result<()> {
                     Setting::NotSet => ()
                 }
 
+                // We transpose the settings JSON struct into a real setting update.
+                match settings.synonyms {
+                    Setting::Set(synonyms) => builder.set_synonyms(synonyms),
+                    Setting::Reset => builder.reset_synonyms(),
+                    Setting::NotSet => ()
+                }
+
                 let result = builder.execute(|indexing_step, update_id| {
                     let (current, total) = match indexing_step {
                         TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
@@ -1011,10 +1021,11 @@ mod tests {
             faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }),
             criteria: Setting::Set(vec!["asc(age)".to_string()]),
             stop_words: Setting::Set(btreeset! { "and".to_string() }),
+            synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] })
         };
 
         assert_tokens(&settings, &[
-            Token::Struct { name: "Settings", len: 5 },
+            Token::Struct { name: "Settings", len: 6 },
             Token::Str("displayedAttributes"),
             Token::Some,
             Token::Seq { len: Some(1) },
@@ -1041,6 +1052,14 @@ mod tests {
             Token::Seq { len: Some(1) },
             Token::Str("and"),
             Token::SeqEnd,
+            Token::Str("synonyms"),
+            Token::Some,
+            Token::Map { len: Some(1) },
+            Token::Str("alex"),
+            Token::Seq { len: Some(1) },
+            Token::Str("alexey"),
+            Token::SeqEnd,
+            Token::MapEnd,
             Token::StructEnd,
         ]);
     }
@@ -1053,10 +1072,11 @@ mod tests {
             faceted_attributes: Setting::Reset,
             criteria: Setting::Reset,
             stop_words: Setting::Reset,
+            synonyms: Setting::Reset,
         };
 
         assert_tokens(&settings, &[
-            Token::Struct { name: "Settings", len: 5 },
+            Token::Struct { name: "Settings", len: 6 },
             Token::Str("displayedAttributes"),
             Token::None,
             Token::Str("searchableAttributes"),
@@ -1067,6 +1087,8 @@ mod tests {
             Token::None,
             Token::Str("stopWords"),
             Token::None,
+            Token::Str("synonyms"),
+            Token::None,
             Token::StructEnd,
         ]);
     }
@@ -1079,6 +1101,7 @@ mod tests {
             faceted_attributes: Setting::NotSet,
             criteria: Setting::NotSet,
             stop_words: Setting::NotSet,
+            synonyms: Setting::NotSet,
         };
 
         assert_tokens(&settings, &[
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 7be618789..045eabc3c 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -3,19 +3,19 @@ use std::collections::HashMap;
 use std::path::Path;
 
 use anyhow::Context;
+use chrono::{DateTime, Utc};
+use heed::{Database, PolyDatabase, RoTxn, RwTxn};
 use heed::types::*;
-use heed::{PolyDatabase, Database, RwTxn, RoTxn};
 use roaring::RoaringBitmap;
-use chrono::{Utc, DateTime};
 
+use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search};
+use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId};
+use crate::{
+    BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
+    ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec,
+};
 use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
-use crate::{default_criteria, Criterion, Search, FacetDistribution, FieldsDistribution};
-use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
-use crate::{
-    RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec,
-    StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
-};
 
 pub const CRITERIA_KEY: &str = "criteria";
 pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
@@ -31,6 +31,7 @@ pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
 pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
 pub const WORDS_FST_KEY: &str = "words-fst";
 pub const STOP_WORDS_KEY: &str = "stop-words";
+pub const SYNONYMS_KEY: &str = "synonyms";
 pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
 const CREATED_AT_KEY: &str = "created-at";
 const UPDATED_AT_KEY: &str = "updated-at";
@@ -376,12 +377,12 @@ impl Index {
 
     /* words fst */
 
-    /// Writes the FST which is the words dictionnary of the engine.
+    /// Writes the FST which is the words dictionary of the engine.
     pub fn put_words_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
         self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes())
     }
 
-    /// Returns the FST which is the words dictionnary of the engine.
+    /// Returns the FST which is the words dictionary of the engine.
     pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<fst::Set<Cow<'t, [u8]>>> {
         match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? {
             Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
@@ -398,6 +399,7 @@ impl Index {
     pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
         self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY)
     }
+
     pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
         match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? {
             Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
@@ -405,6 +407,25 @@ impl Index {
         }
     }
 
+    /* synonyms */
+
+    pub fn put_synonyms(&self, wtxn: &mut RwTxn, synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>) -> heed::Result<()> {
+        self.main.put::<_, Str, SerdeBincode<_>>(wtxn, SYNONYMS_KEY, synonyms)
+    }
+
+    pub fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, SYNONYMS_KEY)
+    }
+
+    pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
+        Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, SYNONYMS_KEY)?.unwrap_or_default())
+    }
+
+    pub fn words_synonyms<S: AsRef<str>>(&self, rtxn: &RoTxn, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
+        let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
+        Ok(self.synonyms(rtxn)?.remove(&words))
+    }
+
     /* words prefixes fst */
 
     /// Writes the FST which is the words prefixes dictionnary of the engine.
@@ -536,7 +557,7 @@ pub(crate) mod tests {
 
         let rtxn = index.read_txn().unwrap();
         let fields_distribution = index.fields_distribution(&rtxn).unwrap();
-        assert_eq!(fields_distribution, hashmap!{
+        assert_eq!(fields_distribution, hashmap! {
{ "name".to_string() => 2, "age".to_string() => 1, }); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 1941f0c6f..d21227507 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -177,12 +177,12 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.index.word_docids.get(self.rtxn, word) } - fn word_documents_count(&self, word: &str) -> heed::Result> { - self.index.word_documents_count(self.rtxn, word) + fn synonyms>(&self, words: &[S]) -> heed::Result>>> { + self.index.words_synonyms(self.rtxn, words) } - fn synonyms>(&self, _words: &[S]) -> heed::Result>>> { - Ok(None) + fn word_documents_count(&self, word: &str) -> heed::Result> { + self.index.word_documents_count(self.rtxn, word) } } @@ -588,7 +588,6 @@ mod test { } impl Default for TestContext { - fn default() -> TestContext { let mut rng = StdRng::seed_from_u64(102); let rng = &mut rng; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index e63948082..a0cfbd315 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -5,6 +5,7 @@ use anyhow::Context; use chrono::Utc; use grenad::CompressionType; use itertools::Itertools; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use rayon::ThreadPool; use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -71,6 +72,7 @@ pub struct Settings<'a, 't, 'u, 'i> { criteria: Setting>, stop_words: Setting>, distinct_attribute: Setting, + synonyms: Setting>>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -96,6 +98,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { criteria: Setting::NotSet, stop_words: Setting::NotSet, distinct_attribute: Setting::NotSet, + synonyms: Setting::NotSet, update_id, } } @@ -144,12 +147,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn reset_distinct_attribute(&mut self) { + self.distinct_attribute = Setting::Reset; + } + pub fn set_distinct_attribute(&mut self, distinct_attribute: String) { self.distinct_attribute = Setting::Set(distinct_attribute); } - pub fn reset_distinct_attribute(&mut self) { - self.distinct_attribute = Setting::Reset; + pub fn reset_synonyms(&mut self) { + self.synonyms = Setting::Reset; + } + + pub fn set_synonyms(&mut self, synonyms: HashMap>) { + self.synonyms = if synonyms.is_empty() { + Setting::Reset + } else { + Setting::Set(synonyms) + } } fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> @@ -294,7 +309,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let current = self.index.stop_words(self.wtxn)?; // since we can't compare a BTreeSet with an FST we are going to convert the // BTreeSet to an FST and then compare bytes per bytes the two FSTs. - let fst = fst::Set::from_iter(&*stop_words)?; + let fst = fst::Set::from_iter(stop_words)?; // Does the new FST differ from the previous one? 
                 if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) {
@@ -310,6 +325,64 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         }
     }
 
+    fn update_synonyms(&mut self) -> anyhow::Result<bool> {
+        match self.synonyms {
+            Setting::Set(ref synonyms) => {
+                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec<String> {
+                    analyzer
+                        .analyze(text)
+                        .tokens()
+                        .filter_map(|token|
+                            if token.is_word() { Some(token.text().to_string()) } else { None }
+                        )
+                        .collect::<Vec<_>>()
+                }
+
+                let mut config = AnalyzerConfig::default();
+                let stop_words = self.index.stop_words(self.wtxn)?;
+                if let Some(stop_words) = &stop_words {
+                    config.stop_words(stop_words);
+                }
+                let analyzer = Analyzer::new(config);
+
+                let mut new_synonyms = HashMap::new();
+                for (word, synonyms) in synonyms {
+                    // Normalize both the word and associated synonyms.
+                    let normalized_word = normalize(&analyzer, word);
+                    let normalized_synonyms = synonyms
+                        .iter()
+                        .map(|synonym| normalize(&analyzer, synonym));
+
+                    // Store the normalized synonyms under the normalized word,
+                    // merging the possible duplicate words.
+                    let entry = new_synonyms
+                        .entry(normalized_word)
+                        .or_insert_with(Vec::new);
+                    entry.extend(normalized_synonyms);
+                }
+
+                // Make sure that we don't have duplicate synonyms.
+                new_synonyms
+                    .iter_mut()
+                    .for_each(|(_, synonyms)| {
+                        synonyms.sort_unstable();
+                        synonyms.dedup();
+                    });
+
+                let old_synonyms = self.index.synonyms(self.wtxn)?;
+
+                if new_synonyms != old_synonyms {
+                    self.index.put_synonyms(self.wtxn, &new_synonyms)?;
+                    Ok(true)
+                } else {
+                    Ok(false)
+                }
+            }
+            Setting::Reset => Ok(self.index.delete_synonyms(self.wtxn)?),
+            Setting::NotSet => Ok(false),
+        }
+    }
+
     fn update_facets(&mut self) -> anyhow::Result<bool> {
         match self.faceted_fields {
             Setting::Set(ref fields) => {
@@ -359,9 +432,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         // update_criteria MUST be called after update_facets, since criterion fields must be set
         // as facets.
         self.update_criteria()?;
+        let synonyms_updated = self.update_synonyms()?;
         let searchable_updated = self.update_searchable()?;
-        if facets_updated || searchable_updated || stop_words_updated {
+        if stop_words_updated || facets_updated || synonyms_updated || searchable_updated {
             self.reindex(&progress_callback, old_fields_ids_map)?;
         }
         Ok(())
     }
@@ -669,6 +743,64 @@ mod tests {
         assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
     }
 
+    #[test]
+    fn set_and_reset_synonyms() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+
+        // In the same transaction provide some synonyms
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.set_synonyms(hashmap! {
+            "blini".to_string() => vec!["crepes".to_string()],
+            "super like".to_string() => vec!["love".to_string()],
+            "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()]
+        });
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure synonyms are effectively stored
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(!synonyms.is_empty()); // at this point the index should return something
+
+        // Check that we can use synonyms
+        let result = index.search(&rtxn).query("blini").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+        let result = index.search(&rtxn).query("super like").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+        let result = index.search(&rtxn).query("puppies").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+
+        // Reset the synonyms
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.reset_synonyms();
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure synonyms are reset
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(synonyms.is_empty());
+
+        // Check that synonyms no longer work
+        let result = index.search(&rtxn).query("blini").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("super like").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("puppies").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+    }
+
     #[test]
     fn setting_searchable_recomputes_other_settings() {
         let path = tempfile::tempdir().unwrap();
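Usage note (not part of the patch): the hunks above add the whole synonyms surface, Settings::set_synonyms/reset_synonyms on the write side and Index::put_synonyms/delete_synonyms/synonyms/words_synonyms on the read side. Below is a minimal sketch of how a caller could drive it end to end, using only calls visible in this diff; the milli::update::Settings import path and the maplit/tempfile helper crates are assumptions borrowed from the tests above, not something this patch defines.

use heed::EnvOpenOptions;
use maplit::hashmap;
use milli::Index;
use milli::update::Settings; // assumed re-export path

fn main() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    let mut options = EnvOpenOptions::new();
    options.map_size(10 * 1024 * 1024); // 10 MB
    let index = Index::new(options, dir.path())?;

    // Register synonyms through the settings builder; update_synonyms()
    // normalizes both sides with the same analyzer used at indexing time.
    let mut wtxn = index.write_txn()?;
    let mut builder = Settings::new(&mut wtxn, &index, 0);
    builder.set_synonyms(hashmap! {
        "super like".to_string() => vec!["love".to_string()],
    });
    builder.execute(|_, _| ())?;
    wtxn.commit()?;

    // Synonyms are stored keyed by the normalized word sequence
    // (a Vec<String>), so a multi-word entry is looked up as a
    // slice of its words rather than as one string.
    let rtxn = index.read_txn()?;
    let synonyms = index.words_synonyms(&rtxn, &["super", "like"])?;
    assert_eq!(synonyms, Some(vec![vec!["love".to_string()]]));
    Ok(())
}

Storing the key as a Vec<String> of normalized words is what lets the query tree match a multi-word phrase like "super like" against its synonyms without re-tokenizing the stored form at search time.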