1184: normalize synonyms during indexation r=MarinPostma a=LegendreM

fix #1135 #964

Normalizes the synonyms before indexing them, so they are not case sensitive anymore. Then normalization also involves deunicoding is some cases, such as accents, so `été` and `ete` are considered equivalent in a search for synonyms.

Co-authored-by: many <maxime@meilisearch.com>
Co-authored-by: Many <legendre.maxime.isn@gmail.com>
This commit is contained in:
bors[bot] 2021-02-01 14:12:57 +00:00 committed by GitHub
commit 81e9fd8933
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 97 additions and 2 deletions

View file

@ -1,9 +1,10 @@
use std::collections::{BTreeMap, BTreeSet};
use std::{borrow::Cow, collections::{BTreeMap, BTreeSet}};
use heed::Result as ZResult;
use fst::{set::OpBuilder, SetBuilder};
use sdset::SetBuf;
use meilisearch_schema::Schema;
use meilisearch_tokenizer::analyzer::{Analyzer, AnalyzerConfig};
use crate::database::{MainT, UpdateT};
use crate::settings::{UpdateState, SettingsUpdate, RankingRule};
@ -289,13 +290,24 @@ pub fn apply_synonyms_update(
let main_store = index.main;
let synonyms_store = index.synonyms;
let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?;
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
fn normalize<T: AsRef<[u8]>>(analyzer: &Analyzer<T>, text: &str) -> String {
analyzer.analyze(&text)
.tokens()
.fold(String::new(), |s, t| s + t.text())
}
let mut synonyms_builder = SetBuilder::memory();
synonyms_store.clear(writer)?;
for (word, alternatives) in synonyms.clone() {
for (word, alternatives) in synonyms {
let word = normalize(&analyzer, &word);
synonyms_builder.insert(&word)?;
let alternatives = {
let alternatives = alternatives.iter().map(|text| normalize(&analyzer, &text)).collect();
let alternatives = SetBuf::from_dirty(alternatives);
let mut alternatives_builder = SetBuilder::memory();
alternatives_builder.extend_iter(alternatives)?;