test(update, settings): set & reset synonyms

fixes after review

more fixes after review
Alexey Shekhirin 2021-04-09 22:56:20 +03:00
parent e39aabbfe6
commit 33860bc3b7
4 changed files with 119 additions and 50 deletions

View File

@@ -1021,11 +1021,11 @@ mod tests {
             faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }),
             criteria: Setting::Set(vec!["asc(age)".to_string()]),
             stop_words: Setting::Set(btreeset! { "and".to_string() }),
-            synonyms: Setting::NotSet
+            synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] })
         };
 
         assert_tokens(&settings, &[
-            Token::Struct { name: "Settings", len: 5 },
+            Token::Struct { name: "Settings", len: 6 },
             Token::Str("displayedAttributes"),
             Token::Some,
             Token::Seq { len: Some(1) },
@@ -1052,6 +1052,14 @@ mod tests {
             Token::Seq { len: Some(1) },
             Token::Str("and"),
             Token::SeqEnd,
+            Token::Str("synonyms"),
+            Token::Some,
+            Token::Map { len: Some(1) },
+            Token::Str("alex"),
+            Token::Seq { len: Some(1) },
+            Token::Str("alexey"),
+            Token::SeqEnd,
+            Token::MapEnd,
             Token::StructEnd,
         ]);
     }
@@ -1064,11 +1072,11 @@ mod tests {
             faceted_attributes: Setting::Reset,
             criteria: Setting::Reset,
             stop_words: Setting::Reset,
-            synonyms: Setting::NotSet
+            synonyms: Setting::Reset,
         };
 
         assert_tokens(&settings, &[
-            Token::Struct { name: "Settings", len: 5 },
+            Token::Struct { name: "Settings", len: 6 },
             Token::Str("displayedAttributes"),
             Token::None,
             Token::Str("searchableAttributes"),
@@ -1079,6 +1087,8 @@ mod tests {
             Token::None,
             Token::Str("stopWords"),
             Token::None,
+            Token::Str("synonyms"),
+            Token::None,
             Token::StructEnd,
         ]);
     }
@@ -1091,6 +1101,7 @@ mod tests {
             faceted_attributes: Setting::NotSet,
             criteria: Setting::NotSet,
             stop_words: Setting::NotSet,
+            synonyms: Setting::NotSet,
         };
 
         assert_tokens(&settings, &[
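Reading aid, not part of the diff: the token streams above imply that, in JSON terms, a Setting::Set synonyms map serializes as an object while Setting::Reset serializes as null. A minimal stand-in sketch, assuming serde and serde_json are available (SettingsSketch is a hypothetical struct, not the real Settings):

use std::collections::HashMap;

use serde::Serialize;

// Hypothetical stand-in for the real Settings struct, for illustration only.
#[derive(Serialize)]
struct SettingsSketch {
    // Some(..) plays the role of Setting::Set, None the role of Setting::Reset.
    synonyms: Option<HashMap<String, Vec<String>>>,
}

fn main() {
    let set = SettingsSketch {
        synonyms: Some(HashMap::from([("alex".to_string(), vec!["alexey".to_string()])])),
    };
    let reset = SettingsSketch { synonyms: None };
    // Prints {"synonyms":{"alex":["alexey"]}} and then {"synonyms":null}.
    println!("{}", serde_json::to_string(&set).unwrap());
    println!("{}", serde_json::to_string(&reset).unwrap());
}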

View File

@@ -417,22 +417,13 @@ impl Index {
         self.main.delete::<_, Str>(wtxn, SYNONYMS_KEY)
     }
 
-    pub fn synonyms(&self, rtxn: &RoTxn) -> anyhow::Result<Option<HashMap<Vec<String>, Vec<Vec<String>>>>> {
-        match self.main.get::<_, Str, SerdeBincode<HashMap<Vec<String>, Vec<Vec<String>>>>>(rtxn, SYNONYMS_KEY)? {
-            Some(synonyms) => Ok(Some(synonyms)),
-            None => Ok(None),
-        }
+    pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
+        Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, SYNONYMS_KEY)?.unwrap_or_default())
     }
 
-    pub fn words_synonyms<S: AsRef<str>>(&self, rtxn: &RoTxn, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>> {
-        let words: Vec<_> = words.iter().map(|s| s.as_ref().to_string()).collect();
-
-        match self.synonyms(rtxn)? {
-            Some(synonyms) => Ok(Some(
-                synonyms.get(&words).cloned().unwrap_or(Vec::default())
-            )),
-            None => Ok(None)
-        }
+    pub fn words_synonyms<S: AsRef<str>>(&self, rtxn: &RoTxn, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
+        let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
+        Ok(self.synonyms(rtxn)?.remove(&words))
     }
 
     /* words prefixes fst */
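A minimal usage sketch of the reworked accessors, not part of the commit: synonyms() now always returns a map (empty when SYNONYMS_KEY is absent) and words_synonyms() looks up a single, possibly multi-word, key. The milli::Index import path and the heed dependency are assumptions; adjust them to the actual crate layout:

use std::collections::HashMap;

use milli::Index; // assumed re-export; adjust the path if needed

fn print_synonyms_of(index: &Index, words: &[&str]) -> heed::Result<()> {
    let rtxn = index.read_txn()?;
    // Whole map: an index without synonyms now yields an empty map, not None.
    let all: HashMap<Vec<String>, Vec<Vec<String>>> = index.synonyms(&rtxn)?;
    println!("{} synonym entries stored", all.len());
    // Lookup for one (possibly multi-word) key.
    if let Some(alternatives) = index.words_synonyms(&rtxn, words)? {
        println!("{:?} -> {:?}", words, alternatives);
    }
    Ok(())
}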

View File

@@ -155,7 +155,7 @@ impl fmt::Debug for Query {
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
-    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>>;
+    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
             Some(rb) => Ok(Some(rb.len())),
@@ -177,7 +177,7 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.word_docids.get(self.rtxn, word)
     }
 
-    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>> {
+    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         self.index.words_synonyms(self.rtxn, words)
     }
@@ -270,10 +270,10 @@ fn typos(word: String, authorize_typos: bool) -> QueryKind {
     }
 }
 
-/// Fetch synonyms from the `Context` for the provided words
+/// Fetch synonyms from the `Context` for the provided word
 /// and create the list of operations for the query tree
-fn synonyms(ctx: &impl Context, words: &[&str]) -> anyhow::Result<Option<Vec<Operation>>> {
-    let synonyms = ctx.synonyms(words)?;
+fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operation>>> {
+    let synonyms = ctx.synonyms(word)?;
 
     Ok(synonyms.map(|synonyms| {
         synonyms.into_iter().map(|synonym| {
@@ -581,8 +581,8 @@ mod test {
             Ok(self.postings.get(word).cloned())
         }
 
-        fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> anyhow::Result<Option<Vec<Vec<String>>>> {
-            let words: Vec<_> = words.iter().map(|s| s.as_ref().to_string()).collect();
+        fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
+            let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
             Ok(self.synonyms.get(&words).cloned())
         }
     }
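For context, not part of the commit: the test Context above keys its synonyms map by token sequences, which is what lets multi-word sources such as "super like" (used in the settings test further down) act as a single entry. A small self-contained illustration:

use std::collections::HashMap;

fn main() {
    // Multi-word sources are stored as vectors of words, e.g. "super like".
    let mut synonyms: HashMap<Vec<String>, Vec<Vec<String>>> = HashMap::new();
    synonyms.insert(
        vec!["super".to_string(), "like".to_string()],
        vec![vec!["love".to_string()]],
    );

    // Mirrors the lookup in the test Context: clone the alternatives for this key.
    let words = vec!["super".to_string(), "like".to_string()];
    let found = synonyms.get(&words).cloned();
    assert_eq!(found, Some(vec![vec!["love".to_string()]]));
}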

View File

@@ -5,6 +5,7 @@ use anyhow::Context;
 use chrono::Utc;
 use grenad::CompressionType;
 use itertools::Itertools;
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use rayon::ThreadPool;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -13,7 +14,6 @@ use crate::criterion::Criterion;
 use crate::facet::FacetType;
 use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
 use crate::update::index_documents::{IndexDocumentsMethod, Transform};
-use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
 
 #[derive(Debug, Clone, PartialEq)]
 pub enum Setting<T> {
@@ -328,18 +328,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> anyhow::Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                let old_synonyms = self.index.synonyms(self.wtxn)?.unwrap_or_default();
-
-                let mut config = AnalyzerConfig::default();
-                let stop_words = self.index.stop_words(self.wtxn)?;
-                if let Some(stop_words) = &stop_words {
-                    config.stop_words(stop_words);
-                }
-
-                let analyzer = Analyzer::new(config);
-
-                let normalize = |text: &String| {
+                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec<String> {
                     analyzer
                         .analyze(text)
                         .tokens()
@@ -347,20 +336,40 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                             if token.is_word() { Some(token.text().to_string()) } else { None }
                         )
                         .collect::<Vec<_>>()
-                };
-
-                let new_synonyms = synonyms
-                    .iter()
-                    .map(|(word, synonyms)| {
-                        let normalized_word = normalize(word);
-                        let normalized_synonyms = synonyms.iter()
-                            .map(normalize)
-                            .unique()
-                            .collect::<Vec<_>>();
-
-                        (normalized_word, normalized_synonyms)
-                    })
-                    .collect();
+                }
+
+                let mut config = AnalyzerConfig::default();
+                let stop_words = self.index.stop_words(self.wtxn)?;
+                if let Some(stop_words) = &stop_words {
+                    config.stop_words(stop_words);
+                }
+                let analyzer = Analyzer::new(config);
+
+                let mut new_synonyms = HashMap::new();
+                for (word, synonyms) in synonyms {
+                    // Normalize both the word and associated synonyms.
+                    let normalized_word = normalize(&analyzer, word);
+                    let normalized_synonyms = synonyms
+                        .iter()
+                        .map(|synonym| normalize(&analyzer, synonym));
+
+                    // Store the normalized synonyms under the normalized word,
+                    // merging the possible duplicate words.
+                    let entry = new_synonyms
+                        .entry(normalized_word)
+                        .or_insert_with(Vec::new);
+                    entry.extend(normalized_synonyms);
+                }
+
+                // Make sure that we don't have duplicate synonyms.
+                new_synonyms
+                    .iter_mut()
+                    .for_each(|(_, synonyms)| {
+                        synonyms.sort_unstable();
+                        synonyms.dedup();
+                    });
+
+                let old_synonyms = self.index.synonyms(self.wtxn)?;
+
                 if new_synonyms != old_synonyms {
                     self.index.put_synonyms(self.wtxn, &new_synonyms)?;
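A standalone sketch, not part of the commit, of the merge-and-dedup behaviour implemented above; a trivial lowercase and whitespace-split normalizer stands in for the meilisearch_tokenizer Analyzer:

use std::collections::HashMap;

// Stand-in for normalize(&analyzer, text): lowercase and split on whitespace.
fn normalize(text: &str) -> Vec<String> {
    text.split_whitespace().map(str::to_lowercase).collect()
}

fn main() {
    // Two source words that collide once normalized.
    let input = vec![
        ("Super Like", vec!["love", "adore"]),
        ("super like", vec!["love"]),
    ];

    let mut new_synonyms: HashMap<Vec<String>, Vec<Vec<String>>> = HashMap::new();
    for (word, synonyms) in input {
        let normalized_word = normalize(word);
        let normalized_synonyms = synonyms.iter().map(|s| normalize(s));
        // Duplicate source words end up merged into a single entry...
        new_synonyms
            .entry(normalized_word)
            .or_insert_with(Vec::new)
            .extend(normalized_synonyms);
    }
    // ...and duplicate synonyms are removed afterwards.
    for synonyms in new_synonyms.values_mut() {
        synonyms.sort_unstable();
        synonyms.dedup();
    }

    let key = vec!["super".to_string(), "like".to_string()];
    assert_eq!(
        new_synonyms[&key],
        vec![vec!["adore".to_string()], vec!["love".to_string()]]
    );
}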
@@ -734,6 +743,64 @@ mod tests {
         assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data
     }
 
+    #[test]
+    fn set_and_reset_synonyms() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Send 3 documents with ids from 1 to 3.
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Csv);
+        builder.execute(content, |_, _| ()).unwrap();
+
+        // In the same transaction provide some synonyms
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.set_synonyms(hashmap! {
+            "blini".to_string() => vec!["crepes".to_string()],
+            "super like".to_string() => vec!["love".to_string()],
+            "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()]
+        });
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure synonyms are effectively stored
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(!synonyms.is_empty()); // at this point the index should return something
+
+        // Check that we can use synonyms
+        let result = index.search(&rtxn).query("blini").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+        let result = index.search(&rtxn).query("super like").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 1);
+        let result = index.search(&rtxn).query("puppies").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+
+        // Reset the synonyms
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.reset_synonyms();
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure synonyms are reset
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(synonyms.is_empty());
+
+        // Check that synonyms no longer work
+        let result = index.search(&rtxn).query("blini").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("super like").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+        let result = index.search(&rtxn).query("puppies").execute().unwrap();
+        assert!(result.documents_ids.is_empty());
+    }
+
     #[test]
     fn setting_searchable_recomputes_other_settings() {
         let path = tempfile::tempdir().unwrap();