feat: Normalize synonym strings and query strings to search for synonyms

This commit is contained in:
Clément Renault 2019-06-20 16:25:14 +02:00
parent 3dcbc737f3
commit 0a5d4eb7ed
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
4 changed files with 84 additions and 6 deletions

View File

@ -6,6 +6,7 @@ edition = "2018"
[dependencies] [dependencies]
byteorder = "1.3.1" byteorder = "1.3.1"
deunicode = "1.0.0"
hashbrown = "0.2.2" hashbrown = "0.2.2"
lazy_static = "1.2.0" lazy_static = "1.2.0"
log = "0.4.6" log = "0.4.6"

View File

@ -15,7 +15,7 @@ use serde::{Serialize, Deserialize};
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use zerocopy::{AsBytes, FromBytes}; use zerocopy::{AsBytes, FromBytes};
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder}; pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
pub use self::store::Store; pub use self::store::Store;
/// Represent an internally generated document unique identifier. /// Represent an internally generated document unique identifier.

View File

@ -37,6 +37,16 @@ impl Automaton {
} }
} }
/// Normalizes `string` so that synonym keys and query words can be compared.
///
/// The input is lowercased first. If the lowercased text contains at least
/// one character for which `is_cjk` returns `true`, it is returned as is.
/// Otherwise it is passed through `deunicode::deunicode_with_tofu` with an
/// empty replacement string (presumably dropping every character that has
/// no transliteration; confirm against the deunicode documentation).
pub fn normalize_str(string: &str) -> String {
    let lowered = string.to_lowercase();

    // Text holding CJK characters is only lowercased, never transliterated.
    if lowered.chars().any(is_cjk) {
        return lowered;
    }

    deunicode::deunicode_with_tofu(&lowered, "")
}
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> { fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
@ -55,7 +65,10 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
let has_following_word = ngrams.peek().is_some(); let has_following_word = ngrams.peek().is_some();
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) }; let lev = {
let normalized = normalize_str(&ngram);
if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }
};
let mut stream = synonyms.search(&lev).into_stream(); let mut stream = synonyms.search(&lev).into_stream();
while let Some(base) = stream.next() { while let Some(base) = stream.next() {
@ -82,6 +95,7 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
} }
if n == 1 { if n == 1 {
let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
let automaton = Automaton::original(index, ngram_nb_words, lev); let automaton = Automaton::original(index, ngram_nb_words, lev);
automatons.push((automaton, ngram)); automatons.push((automaton, ngram));
} }
@ -443,7 +457,7 @@ mod tests {
fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set { fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set {
let mut builder = fst::SetBuilder::memory(); let mut builder = fst::SetBuilder::memory();
let set = SetBuf::from_dirty(set.into_iter().map(|s| s.to_lowercase()).collect()); let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect());
builder.extend_iter(set.into_iter()).unwrap(); builder.extend_iter(set.into_iter()).unwrap();
builder.into_inner().and_then(Set::from_bytes).unwrap() builder.into_inner().and_then(Set::from_bytes).unwrap()
} }
@ -953,4 +967,65 @@ mod tests {
}); });
assert_matches!(iter.next(), None); assert_matches!(iter.next(), None);
} }
/// Checks that a synonym registered under an accented word is found whether
/// the query is typed without accents, with all accents, or with only some.
#[test]
fn deunicoded_synonyms() {
    // meilidb-data indexes both the unidecoded and the original spelling of a
    // word under the same DocIndex, hence the two entries for document 1.
    let mut store = InMemorySetStore::from_iter(vec![
        ("iPhone", &[doc_index(0, 0)][..]),
        ("telephone", &[doc_index(1, 0)][..]),
        ("téléphone", &[doc_index(1, 0)][..]),
    ]);

    store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iPhone"]));

    // The unaccented and the fully accented spellings must behave the same:
    // document 0 is reached once (through the synonym) and document 1 twice
    // (once per indexed spelling).
    for &query in ["telephone", "téléphone"].iter() {
        let documents = QueryBuilder::new(&store).query(query, 0..20).unwrap();
        let mut documents = documents.into_iter();

        assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches }) => {
            let mut matches = matches.into_iter();
            assert_matches!(matches.next(), Some(Match { query_index: 0, .. }));
            assert_matches!(matches.next(), None);
        });
        assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches }) => {
            let mut matches = matches.into_iter();
            assert_matches!(matches.next(), Some(Match { query_index: 0, .. }));
            assert_matches!(matches.next(), Some(Match { query_index: 0, .. }));
            assert_matches!(matches.next(), None);
        });
        assert_matches!(documents.next(), None);
    }

    // A partially accented spelling still returns both documents, but
    // document 1 is expected to carry a single match this time.
    let documents = QueryBuilder::new(&store).query("télephone", 0..20).unwrap();
    let mut documents = documents.into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches }) => {
        let mut matches = matches.into_iter();
        assert_matches!(matches.next(), Some(Match { query_index: 0, .. }));
        assert_matches!(matches.next(), None);
    });
    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches }) => {
        let mut matches = matches.into_iter();
        assert_matches!(matches.next(), Some(Match { query_index: 0, .. }));
        assert_matches!(matches.next(), None);
    });
    assert_matches!(documents.next(), None);
}
} }

View File

@ -2,6 +2,8 @@ use std::collections::BTreeMap;
use std::sync::Arc; use std::sync::Arc;
use fst::{SetBuilder, set::OpBuilder}; use fst::{SetBuilder, set::OpBuilder};
use meilidb_tokenizer::is_cjk;
use meilidb_core::normalize_str;
use sdset::SetBuf; use sdset::SetBuf;
use crate::database::index::InnerIndex; use crate::database::index::InnerIndex;
@ -20,6 +22,8 @@ impl<'a> SynonymsAddition<'a> {
pub fn add_synonym<I>(&mut self, synonym: String, alternatives: I) pub fn add_synonym<I>(&mut self, synonym: String, alternatives: I)
where I: Iterator<Item=String>, where I: Iterator<Item=String>,
{ {
let mut synonym = normalize_str(&synonym);
let alternatives = alternatives.map(|s| s.to_lowercase());
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives); self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
} }
@ -30,13 +34,11 @@ impl<'a> SynonymsAddition<'a> {
let mut synonyms_builder = SetBuilder::memory(); let mut synonyms_builder = SetBuilder::memory();
for (synonym, mut alternatives) in self.synonyms { for (synonym, alternatives) in self.synonyms {
synonyms_builder.insert(&synonym).unwrap(); synonyms_builder.insert(&synonym).unwrap();
let alternatives = { let alternatives = {
alternatives.iter_mut().for_each(|s| *s = s.to_lowercase());
let alternatives = SetBuf::from_dirty(alternatives); let alternatives = SetBuf::from_dirty(alternatives);
let mut alternatives_builder = SetBuilder::memory(); let mut alternatives_builder = SetBuilder::memory();
alternatives_builder.extend_iter(alternatives).unwrap(); alternatives_builder.extend_iter(alternatives).unwrap();
alternatives_builder.into_inner().unwrap() alternatives_builder.into_inner().unwrap()