diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml
index 8c1d4ee36..037a7788c 100644
--- a/meilidb-core/Cargo.toml
+++ b/meilidb-core/Cargo.toml
@@ -6,6 +6,7 @@ edition = "2018"
 
 [dependencies]
 byteorder = "1.3.1"
+deunicode = "1.0.0"
 hashbrown = "0.2.2"
 lazy_static = "1.2.0"
 log = "0.4.6"
diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs
index 3235cd6af..72435ea46 100644
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@@ -15,7 +15,7 @@ use serde::{Serialize, Deserialize};
 use slice_group_by::GroupBy;
 use zerocopy::{AsBytes, FromBytes};
 
-pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
+pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
 pub use self::store::Store;
 
 /// Represent an internally generated document unique identifier.
diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 3036283f2..147908906 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -37,6 +37,16 @@ impl Automaton {
     }
 }
 
+pub fn normalize_str(string: &str) -> String {
+    let mut string = string.to_lowercase();
+
+    if !string.contains(is_cjk) {
+        string = deunicode::deunicode_with_tofu(&string, "");
+    }
+
+    string
+}
+
 fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
@@ -55,7 +65,10 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error>
 […]
@@ … @@
 […]) -> Set {
     let mut builder = fst::SetBuilder::memory();
-    let set = SetBuf::from_dirty(set.into_iter().map(|s| s.to_lowercase()).collect());
+    let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect());
     builder.extend_iter(set.into_iter()).unwrap();
     builder.into_inner().and_then(Set::from_bytes).unwrap()
 }
@@ -953,4 +967,65 @@ mod tests {
         });
         assert_matches!(iter.next(), None);
     }
+
+    #[test]
+    fn deunicoded_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("iPhone",    &[doc_index(0, 0)][..]),
+            ("telephone", &[doc_index(1, 0)][..]), // meilidb-data indexes the unidecoded
+            ("téléphone", &[doc_index(1, 0)][..]), // and the original words with the same DocIndex
+        ]);
+
+        store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iPhone"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("telephone", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("téléphone", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("télephone", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
 }
diff --git a/meilidb-data/src/database/synonyms_addition.rs b/meilidb-data/src/database/synonyms_addition.rs
index 755c11710..d0f6c2160 100644
--- a/meilidb-data/src/database/synonyms_addition.rs
+++ b/meilidb-data/src/database/synonyms_addition.rs
@@ -2,6 +2,8 @@ use std::collections::BTreeMap;
 use std::sync::Arc;
 
 use fst::{SetBuilder, set::OpBuilder};
+use meilidb_tokenizer::is_cjk;
+use meilidb_core::normalize_str;
 use sdset::SetBuf;
 
 use crate::database::index::InnerIndex;
@@ -20,6 +22,8 @@ impl<'a> SynonymsAddition<'a> {
     pub fn add_synonym<I>(&mut self, synonym: String, alternatives: I)
     where I: Iterator<Item = String>,
     {
+        let mut synonym = normalize_str(&synonym);
+        let alternatives = alternatives.map(|s| s.to_lowercase());
         self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
     }
 
@@ -30,13 +34,11 @@ impl<'a> SynonymsAddition<'a> {
 
         let mut synonyms_builder = SetBuilder::memory();
 
-        for (synonym, mut alternatives) in self.synonyms {
+        for (synonym, alternatives) in self.synonyms {
             synonyms_builder.insert(&synonym).unwrap();
 
             let alternatives = {
-                alternatives.iter_mut().for_each(|s| *s = s.to_lowercase());
                 let alternatives = SetBuf::from_dirty(alternatives);
-
                 let mut alternatives_builder = SetBuilder::memory();
                 alternatives_builder.extend_iter(alternatives).unwrap();
                 alternatives_builder.into_inner().unwrap()
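For readers skimming the patch: the new `normalize_str` lowercases its input and, when the string contains no CJK characters, strips diacritics through `deunicode`, so a synonym registered as "téléphone" is also reachable from the query "telephone", while strings containing CJK characters are left untouched. The snippet below is a minimal standalone sketch of that behaviour, not the crate's code: it depends only on the `deunicode` crate, and its `is_cjk` helper is a simplified stand-in for `meilidb_tokenizer::is_cjk` (an assumption covering only a few common CJK ranges).

```rust
use deunicode::deunicode_with_tofu;

/// Simplified stand-in for `meilidb_tokenizer::is_cjk`; only a few
/// common CJK ranges are checked here, for illustration purposes.
fn is_cjk(c: char) -> bool {
    matches!(c,
        '\u{3040}'..='\u{30FF}'   // Hiragana and Katakana
        | '\u{4E00}'..='\u{9FFF}' // CJK Unified Ideographs
        | '\u{AC00}'..='\u{D7AF}' // Hangul syllables
    )
}

/// Same shape as the function added in this patch: lowercase first,
/// then deunicode unless the string contains CJK characters.
fn normalize_str(string: &str) -> String {
    let mut string = string.to_lowercase();

    if !string.contains(is_cjk) {
        // The empty "tofu" placeholder drops characters that have no
        // ASCII transliteration instead of inserting a replacement mark.
        string = deunicode_with_tofu(&string, "");
    }

    string
}

fn main() {
    assert_eq!(normalize_str("Téléphone"), "telephone"); // accents stripped
    assert_eq!(normalize_str("iPhone"), "iphone");       // only lowercased
    assert_eq!(normalize_str("電話"), "電話");           // CJK left as-is
    println!("normalization behaves as expected");
}
```

The change in `SynonymsAddition::add_synonym` applies the same normalization to the synonym key itself, which is what the `deunicoded_synonyms` test exercises with its "telephone", "téléphone", and "télephone" queries.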