mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 05:14:27 +01:00
Merge #505
505: normalize exact words r=curquiza a=MarinPostma Normalize the exact words, as specified in the specification. Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
commit
8010eca9c7
@ -580,6 +580,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
fn update_exact_words(&mut self) -> Result<()> {
|
fn update_exact_words(&mut self) -> Result<()> {
|
||||||
match self.exact_words {
|
match self.exact_words {
|
||||||
Setting::Set(ref mut words) => {
|
Setting::Set(ref mut words) => {
|
||||||
|
fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> String {
|
||||||
|
analyzer.analyze(text).tokens().map(|token| token.text().to_string()).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut config = AnalyzerConfig::default();
|
||||||
|
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||||
|
if let Some(stop_words) = &stop_words {
|
||||||
|
config.stop_words(stop_words);
|
||||||
|
}
|
||||||
|
let analyzer = Analyzer::new(config);
|
||||||
|
|
||||||
|
let mut words: Vec<_> =
|
||||||
|
words.iter().map(|word| normalize(&analyzer, word)).collect();
|
||||||
|
|
||||||
|
// normalization could reorder words
|
||||||
|
words.sort_unstable();
|
||||||
|
|
||||||
let words = fst::Set::from_iter(words.iter())?;
|
let words = fst::Set::from_iter(words.iter())?;
|
||||||
self.index.put_exact_words(&mut self.wtxn, &words)?;
|
self.index.put_exact_words(&mut self.wtxn, &words)?;
|
||||||
}
|
}
|
||||||
@ -1461,4 +1478,22 @@ mod tests {
|
|||||||
builder.set_min_word_len_two_typos(7);
|
builder.set_min_word_len_two_typos(7);
|
||||||
assert!(builder.execute(|_| ()).is_err());
|
assert!(builder.execute(|_| ()).is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn update_exact_words_normalization() {
    let index = TempIndex::new();
    let config = IndexerConfig::default();

    // Register exact words with mixed casing so that normalization has
    // something to change: "Ab" is expected to be stored as "ab".
    let mut txn = index.write_txn().unwrap();
    let mut builder = Settings::new(&mut txn, &index, &config);

    let words = btreeset! { S("Ab"), S("ac") };
    builder.set_exact_words(words);
    assert!(builder.execute(|_| ()).is_ok());

    // Compare the whole stored set instead of asserting inside a loop: a
    // per-word loop passes vacuously when the set is empty or when one of
    // the expected words is missing. The fst stream yields its keys in
    // lexicographic order, hence the expected ordering below.
    let exact_words = index.exact_words(&txn).unwrap();
    let stored: Vec<String> = exact_words
        .into_fst()
        .stream()
        .into_str_vec()
        .unwrap()
        .into_iter()
        .map(|(word, _)| word)
        .collect();
    assert_eq!(stored, ["ab", "ac"]);
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user