feat: Normalize synonym strings and query strings to search for synonyms

Clément Renault 2019-06-20 16:25:14 +02:00
parent 3dcbc737f3
commit 0a5d4eb7ed
4 changed files with 84 additions and 6 deletions

meilidb-core/Cargo.toml

@@ -6,6 +6,7 @@ edition = "2018"
 [dependencies]
 byteorder = "1.3.1"
+deunicode = "1.0.0"
 hashbrown = "0.2.2"
 lazy_static = "1.2.0"
 log = "0.4.6"

meilidb-core/src/lib.rs

@@ -15,7 +15,7 @@ use serde::{Serialize, Deserialize};
 use slice_group_by::GroupBy;
 use zerocopy::{AsBytes, FromBytes};
 
-pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
+pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
 pub use self::store::Store;
 
 /// Represent an internally generated document unique identifier.

meilidb-core/src/query_builder.rs

@@ -37,6 +37,16 @@ impl Automaton {
     }
 }
 
+pub fn normalize_str(string: &str) -> String {
+    let mut string = string.to_lowercase();
+
+    if !string.contains(is_cjk) {
+        string = deunicode::deunicode_with_tofu(&string, "");
+    }
+
+    string
+}
+
 fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
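For reference, `normalize_str` lowercases first and only transliterates when the string contains no CJK character, so CJK queries keep their exact codepoints. A self-contained sketch of the same logic; the `is_cjk` below is a simplified stand-in for the `meilidb-tokenizer` predicate used in the diff, and the expected outputs assume deunicode 1.0 behavior:

    use deunicode::deunicode_with_tofu;

    // Simplified stand-in for meilidb_tokenizer::is_cjk (hypothetical ranges).
    fn is_cjk(c: char) -> bool {
        matches!(c as u32, 0x3040..=0x30FF | 0x4E00..=0x9FFF | 0xAC00..=0xD7AF)
    }

    // Same shape as the normalize_str added above.
    fn normalize_str(string: &str) -> String {
        let mut string = string.to_lowercase();
        if !string.contains(is_cjk) {
            // Transliterate to ASCII; untransliterable chars become "" (the tofu).
            string = deunicode_with_tofu(&string, "");
        }
        string
    }

    fn main() {
        assert_eq!(normalize_str("Téléphone"), "telephone"); // accents stripped
        assert_eq!(normalize_str("東京"), "東京");           // CJK left untouched
    }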
@@ -55,7 +65,10 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
         let has_following_word = ngrams.peek().is_some();
         let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
-        let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
+        let lev = {
+            let normalized = normalize_str(&ngram);
+            if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }
+        };
 
         let mut stream = synonyms.search(&lev).into_stream();
         while let Some(base) = stream.next() {
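Both sides of the lookup now agree on normalized keys: `add_synonym` (last file below) stores `normalize_str(synonym)` in the synonyms fst, and the query side searches it with a DFA built from `normalize_str(&ngram)`. `build_dfa` and `build_prefix_dfa` are project helpers not shown in this diff; a rough equivalent of the search using the `fst` crate's own Levenshtein automaton (a sketch assuming fst 0.4 with the `levenshtein` feature, not the project's helpers):

    use fst::{IntoStreamer, Set};
    use fst::automaton::Levenshtein;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        // Keys are stored already normalized (lowercased, deunicoded).
        let synonyms = Set::from_iter(vec!["iphone", "telephone"])?;

        // "téléphone" normalizes to "telephone"; the DFA tolerates one typo.
        let lev = Levenshtein::new("telephone", 1)?;
        let keys = synonyms.search(&lev).into_stream().into_strs()?;
        assert_eq!(keys, vec!["telephone"]);
        Ok(())
    }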
@@ -82,6 +95,7 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
             }
 
             if n == 1 {
+                let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
                 let automaton = Automaton::original(index, ngram_nb_words, lev);
                 automatons.push((automaton, ngram));
             }
@@ -443,7 +457,7 @@ mod tests {
     fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set {
         let mut builder = fst::SetBuilder::memory();
-        let set = SetBuf::from_dirty(set.into_iter().map(|s| s.to_lowercase()).collect());
+        let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect());
         builder.extend_iter(set.into_iter()).unwrap();
         builder.into_inner().and_then(Set::from_bytes).unwrap()
     }
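`SetBuf::from_dirty` (from the `sdset` crate) sorts and deduplicates its input, which is what makes the collected strings acceptable to `extend_iter`, since fst builders require keys in lexicographic order. A minimal illustration:

    use sdset::SetBuf;

    fn main() {
        // Unsorted input with a duplicate...
        let dirty = vec!["telephone", "iphone", "telephone"];
        // ...comes out sorted and deduplicated, ready for an fst builder.
        let set = SetBuf::from_dirty(dirty);
        assert_eq!(set.as_slice(), ["iphone", "telephone"]);
    }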
@@ -953,4 +967,65 @@ mod tests {
         });
         assert_matches!(iter.next(), None);
     }
+
+    #[test]
+    fn deunicoded_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("iPhone",    &[doc_index(0, 0)][..]),
+            ("telephone", &[doc_index(1, 0)][..]), // meilidb-data indexes the unidecoded
+            ("téléphone", &[doc_index(1, 0)][..]), // and the original words with the same DocIndex
+        ]);
+
+        store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iPhone"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("telephone", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("téléphone", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("télephone", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
 }

meilidb-data/src/database/synonyms_addition.rs

@@ -2,6 +2,8 @@ use std::collections::BTreeMap;
 use std::sync::Arc;
 
 use fst::{SetBuilder, set::OpBuilder};
 use meilidb_tokenizer::is_cjk;
+use meilidb_core::normalize_str;
 use sdset::SetBuf;
 
 use crate::database::index::InnerIndex;
@@ -20,6 +22,8 @@ impl<'a> SynonymsAddition<'a> {
     pub fn add_synonym<I>(&mut self, synonym: String, alternatives: I)
     where I: Iterator<Item=String>,
     {
+        let synonym = normalize_str(&synonym);
+        let alternatives = alternatives.map(|s| s.to_lowercase());
         self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
     }
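Note the asymmetry introduced here: the synonym key is fully normalized while the alternatives are only lowercased at this point. The `entry`/`or_insert_with` pattern also means repeated calls for the same normalized key accumulate alternatives rather than overwrite them; a small standalone sketch of that behavior, with hypothetical inputs:

    use std::collections::BTreeMap;

    fn main() {
        let mut synonyms: BTreeMap<String, Vec<String>> = BTreeMap::new();

        // Two add_synonym-style calls with the same normalized key
        // merge into one alternatives list.
        for (key, alts) in [("telephone", vec!["iPhone"]), ("telephone", vec!["Mobile"])] {
            synonyms.entry(key.to_string())
                .or_insert_with(Vec::new)
                .extend(alts.into_iter().map(|s| s.to_lowercase()));
        }

        assert_eq!(synonyms["telephone"], ["iphone", "mobile"]);
    }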
@@ -30,13 +34,11 @@ impl<'a> SynonymsAddition<'a> {
         let mut synonyms_builder = SetBuilder::memory();
 
-        for (synonym, mut alternatives) in self.synonyms {
+        for (synonym, alternatives) in self.synonyms {
             synonyms_builder.insert(&synonym).unwrap();
 
             let alternatives = {
-                alternatives.iter_mut().for_each(|s| *s = s.to_lowercase());
                 let alternatives = SetBuf::from_dirty(alternatives);
                 let mut alternatives_builder = SetBuilder::memory();
                 alternatives_builder.extend_iter(alternatives).unwrap();
                 alternatives_builder.into_inner().unwrap()
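The `SetBuilder::memory()` → `extend_iter` → `into_inner` chain at the end yields the raw fst bytes for each alternatives set. A self-contained sketch of that round trip (fst 0.4 shown here; the 0.3-era API in this diff reads the bytes back with `Set::from_bytes` instead of `Set::new`):

    use fst::{Set, SetBuilder};

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let mut builder = SetBuilder::memory();
        // extend_iter requires ordered, unique keys,
        // which SetBuf::from_dirty guarantees above.
        builder.extend_iter(vec!["iphone", "mobile"])?;
        let bytes = builder.into_inner()?;

        // The raw bytes round-trip into a queryable Set.
        let set = Set::new(bytes)?;
        assert!(set.contains("iphone"));
        Ok(())
    }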