From e8b2e860074bb0d1d22665147266c1118391f4ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 13 Jun 2019 15:47:49 +0200 Subject: [PATCH 01/16] feat: Introduce a basic way to handle synonyms --- meilidb-core/Cargo.toml | 3 + meilidb-core/src/lib.rs | 3 + meilidb-core/src/query_builder.rs | 212 ++++++++++++++++++++++++++++-- 3 files changed, 207 insertions(+), 11 deletions(-) diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index 8790889e2..8c1d4ee36 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -25,6 +25,9 @@ git = "https://github.com/Kerollmops/levenshtein-automata.git" branch = "arc-byte-slice" features = ["fst_automaton"] +[dev-dependencies] +assert_matches = "1.3" + [features] i128 = ["byteorder/i128"] nightly = ["hashbrown/nightly", "slice-group-by/nightly"] diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index e61fa543b..bb2de2dec 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -1,3 +1,6 @@ +#[cfg(test)] +#[macro_use] extern crate assert_matches; + mod automaton; mod distinct_map; mod query_builder; diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index f9f51dba2..6e8194e2f 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -17,19 +17,28 @@ use crate::criterion::Criteria; use crate::raw_documents_from_matches; use crate::{Match, DocumentId, Store, RawDocument, Document}; -fn generate_automatons(query: &str) -> Vec { +fn generate_automatons(query: &str, synonyms: &HashMap<&str, &[&str]>) -> Vec<(usize, DfaExt)> { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let mut groups = split_query_string(query).map(str::to_lowercase).peekable(); let mut automatons = Vec::new(); + let mut index = 0; while let Some(word) = groups.next() { + let word = word.as_str(); let has_following_word = groups.peek().is_some(); - let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) { - automaton::build_dfa(&word) - } else { - automaton::build_prefix_dfa(&word) - }; - automatons.push(lev); + let is_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); + let words = synonyms.get(word).cloned().unwrap_or_default().iter().chain(Some(&word)); + + for word in words { + let lev = if is_prefix_dfa { + automaton::build_dfa(word) + } else { + automaton::build_prefix_dfa(word) + }; + automatons.push((index, lev)); + } + + index += 1; } automatons @@ -82,12 +91,22 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI> where S: Store, { fn query_all(&self, query: &str) -> Result, S::Error> { - let automatons = generate_automatons(query); + let map = { + let mut map = HashMap::new(); + + map.insert("hello", &["bonjour", "salut"][..]); + map.insert("bonjour", &["hello", "salut"]); + map.insert("salut", &["hello", "bonjour"]); + + map + }; + + let automatons = generate_automatons(query, &map); let words = self.store.words()?.as_fst(); let mut stream = { let mut op_builder = fst::raw::OpBuilder::new(); - for automaton in &automatons { + for (_index, automaton) in &automatons { let stream = words.search(automaton); op_builder.push(stream); } @@ -98,7 +117,7 @@ where S: Store, while let Some((input, indexed_values)) = stream.next() { for iv in indexed_values { - let automaton = &automatons[iv.index]; + let (index, automaton) = &automatons[iv.index]; let distance = automaton.eval(input).to_u8(); let is_exact = distance == 0 && input.len() == automaton.query_len(); @@ -111,7 +130,7 @@ where S: Store, for di in doc_indexes.as_slice() { if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) { let match_ = Match { - query_index: iv.index as u32, + query_index: *index as u32, distance, attribute: di.attribute, word_index: di.word_index, @@ -321,3 +340,174 @@ where S: Store, Ok(out_documents) } } + +#[cfg(test)] +mod tests { + use super::*; + + use std::collections::{BTreeSet, HashMap}; + use std::iter::FromIterator; + + use sdset::SetBuf; + use fst::Set; + + use crate::DocIndex; + use crate::store::Store; + + #[derive(Default)] + struct InMemorySetStore { + set: Set, + indexes: HashMap, SetBuf>, + } + + impl Store for InMemorySetStore { + type Error = std::io::Error; + + fn words(&self) -> Result<&Set, Self::Error> { + Ok(&self.set) + } + + fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { + Ok(self.indexes.get(word).cloned()) + } + } + + impl<'a> FromIterator<(&'a [u8], &'a [DocIndex])> for InMemorySetStore { + fn from_iter>(iter: I) -> Self { + let mut tree = BTreeSet::new(); + let mut map = HashMap::new(); + + for (word, indexes) in iter { + tree.insert(word); + map.insert(word.to_vec(), SetBuf::from_dirty(indexes.to_vec())); + } + + InMemorySetStore { + set: Set::from_iter(tree).unwrap(), + indexes: map, + } + } + } + + const fn doc_index(document_id: u64, word_index: u16) -> DocIndex { + DocIndex { + document_id: DocumentId(document_id), + attribute: 0, + word_index, + char_index: 0, + char_length: 0, + } + } + + #[test] + fn simple_synonymes() { + let store = InMemorySetStore::from_iter(vec![ + (&b"hello"[..], &[doc_index(0, 0)][..]), + ]); + + let builder = QueryBuilder::new(&store); + let results = builder.query("hello", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("bonjour", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), None); + } + + #[test] + fn harder_synonymes() { + let store = InMemorySetStore::from_iter(vec![ + (&b"hello"[..], &[doc_index(0, 0)][..]), + (&b"bonjour"[..], &[doc_index(1, 3)]), + (&b"salut"[..], &[doc_index(2, 5)]), + ]); + + let builder = QueryBuilder::new(&store); + let results = builder.query("hello", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 3); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 5); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("bonjour", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 3); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 5); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("salut", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 3); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 5); + }); + assert_matches!(iter.next(), None); + } +} From 18736bdcd0fb5e7444f724595204ac8ff7b13b44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 13 Jun 2019 16:20:01 +0200 Subject: [PATCH 02/16] feat: Introduce the synonyms concept to the Store trait --- meilidb-core/src/query_builder.rs | 66 +++++++++++++++++++++++++++---- meilidb-core/src/store.rs | 11 ++++++ 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 6e8194e2f..7c089c8dc 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -349,7 +349,7 @@ mod tests { use std::iter::FromIterator; use sdset::SetBuf; - use fst::Set; + use fst::{Set, IntoStreamer}; use crate::DocIndex; use crate::store::Store; @@ -357,18 +357,46 @@ mod tests { #[derive(Default)] struct InMemorySetStore { set: Set, + synonyms: Set, indexes: HashMap, SetBuf>, + alternatives: HashMap, Set>, } - impl Store for InMemorySetStore { - type Error = std::io::Error; + fn set_from_stream<'f, I, S>(stream: I) -> Set + where + I: for<'a> fst::IntoStreamer<'a, Into=S, Item=&'a [u8]>, + S: 'f + for<'a> fst::Streamer<'a, Item=&'a [u8]>, + { + let mut builder = fst::SetBuilder::memory(); + builder.extend_stream(stream); + builder.into_inner().and_then(Set::from_bytes).unwrap() + } - fn words(&self) -> Result<&Set, Self::Error> { - Ok(&self.set) - } + fn insert_key(set: &Set, key: &[u8]) -> Set { + let unique_key = { + let mut builder = fst::SetBuilder::memory(); + builder.insert(key); + builder.into_inner().and_then(Set::from_bytes).unwrap() + }; - fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { - Ok(self.indexes.get(word).cloned()) + let union_ = set.op().add(unique_key.into_stream()).r#union(); + + set_from_stream(union_) + } + + fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set { + let mut builder = fst::SetBuilder::memory(); + builder.extend_iter(set.into_iter()); + builder.into_inner().and_then(Set::from_bytes).unwrap() + } + + impl InMemorySetStore { + pub fn add_synonym(&mut self, word: &str, new: SetBuf<&str>) { + let alternatives = self.alternatives.entry(word.as_bytes().to_vec()).or_default(); + let new = sdset_into_fstset(&new); + *alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union()); + + self.synonyms = insert_key(&self.synonyms, word.as_bytes()); } } @@ -384,11 +412,33 @@ mod tests { InMemorySetStore { set: Set::from_iter(tree).unwrap(), + synonyms: Set::default(), indexes: map, + alternatives: HashMap::new(), } } } + impl Store for InMemorySetStore { + type Error = std::io::Error; + + fn words(&self) -> Result<&Set, Self::Error> { + Ok(&self.set) + } + + fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { + Ok(self.indexes.get(word).cloned()) + } + + fn synonyms(&self) -> Result<&Set, Self::Error> { + Ok(&self.synonyms) + } + + fn alternatives_to(&self, word: &[u8]) -> Result, Self::Error> { + Ok(self.alternatives.get(word).map(|s| Set::from_bytes(s.as_fst().to_vec()).unwrap())) + } + } + const fn doc_index(document_id: u64, word_index: u16) -> DocIndex { DocIndex { document_id: DocumentId(document_id), diff --git a/meilidb-core/src/store.rs b/meilidb-core/src/store.rs index 14e95f0cc..6e429a1b4 100644 --- a/meilidb-core/src/store.rs +++ b/meilidb-core/src/store.rs @@ -8,6 +8,9 @@ pub trait Store { fn words(&self) -> Result<&Set, Self::Error>; fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error>; + + fn synonyms(&self) -> Result<&Set, Self::Error>; + fn alternatives_to(&self, word: &[u8]) -> Result, Self::Error>; } impl Store for &'_ T where T: Store { @@ -20,4 +23,12 @@ impl Store for &'_ T where T: Store { fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { (*self).word_indexes(word) } + + fn synonyms(&self) -> Result<&Set, Self::Error> { + (*self).synonyms() + } + + fn alternatives_to(&self, word: &[u8]) -> Result, Self::Error> { + (*self).alternatives_to(word) + } } From 707d7b062b78d815f908d453c23ca1f8cd52e719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 13 Jun 2019 16:38:37 +0200 Subject: [PATCH 03/16] feat: Made query handle synonyms via the Store --- meilidb-core/src/query_builder.rs | 58 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 7c089c8dc..0756b9634 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -8,40 +8,46 @@ use rayon::slice::ParallelSliceMut; use slice_group_by::GroupByMut; use meilidb_tokenizer::{is_cjk, split_query_string}; use hashbrown::{HashMap, HashSet}; -use fst::Streamer; +use fst::{Streamer, IntoStreamer}; use log::info; -use crate::automaton::{self, DfaExt, AutomatonExt}; +use crate::automaton::{self, DfaExt, AutomatonExt, build_dfa, build_prefix_dfa}; use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::criterion::Criteria; use crate::raw_documents_from_matches; use crate::{Match, DocumentId, Store, RawDocument, Document}; -fn generate_automatons(query: &str, synonyms: &HashMap<&str, &[&str]>) -> Vec<(usize, DfaExt)> { +fn generate_automatons(query: &str, store: &S) -> Result, S::Error> { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let mut groups = split_query_string(query).map(str::to_lowercase).peekable(); let mut automatons = Vec::new(); let mut index = 0; + let synonyms = store.synonyms()?; + while let Some(word) = groups.next() { let word = word.as_str(); let has_following_word = groups.peek().is_some(); - let is_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); - let words = synonyms.get(word).cloned().unwrap_or_default().iter().chain(Some(&word)); + let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); - for word in words { - let lev = if is_prefix_dfa { - automaton::build_dfa(word) - } else { - automaton::build_prefix_dfa(word) - }; - automatons.push((index, lev)); + let lev = if not_prefix_dfa { build_dfa(word) } else { build_prefix_dfa(word) }; + let mut stream = synonyms.search(&lev).into_stream(); + while let Some(synonym) = stream.next() { + if let Some(words) = store.alternatives_to(synonym)? { + let mut stream = words.into_stream(); + while let Some(word) = stream.next() { + let word = std::str::from_utf8(word).unwrap(); + let lev = if not_prefix_dfa { build_dfa(word) } else { build_prefix_dfa(word) }; + automatons.push((index, lev)); + } + } } + automatons.push((index, lev)); index += 1; } - automatons + Ok(automatons) } pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> { @@ -91,17 +97,7 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI> where S: Store, { fn query_all(&self, query: &str) -> Result, S::Error> { - let map = { - let mut map = HashMap::new(); - - map.insert("hello", &["bonjour", "salut"][..]); - map.insert("bonjour", &["hello", "salut"]); - map.insert("salut", &["hello", "bonjour"]); - - map - }; - - let automatons = generate_automatons(query, &map); + let automatons = generate_automatons(query, &self.store)?; let words = self.store.words()?.as_fst(); let mut stream = { @@ -450,11 +446,13 @@ mod tests { } #[test] - fn simple_synonymes() { - let store = InMemorySetStore::from_iter(vec![ + fn simple_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ (&b"hello"[..], &[doc_index(0, 0)][..]), ]); + store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); + let builder = QueryBuilder::new(&store); let results = builder.query("hello", 0..20).unwrap(); let mut iter = results.into_iter(); @@ -481,13 +479,17 @@ mod tests { } #[test] - fn harder_synonymes() { - let store = InMemorySetStore::from_iter(vec![ + fn harder_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ (&b"hello"[..], &[doc_index(0, 0)][..]), (&b"bonjour"[..], &[doc_index(1, 3)]), (&b"salut"[..], &[doc_index(2, 5)]), ]); + store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"])); + store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello", "salut"])); + store.add_synonym("salut", SetBuf::from_dirty(vec!["hello", "bonjour"])); + let builder = QueryBuilder::new(&store); let results = builder.query("hello", 0..20).unwrap(); let mut iter = results.into_iter(); From 9861c3878e1f1ec8e9dbfcc4e44c90efc209c54f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 13 Jun 2019 16:44:09 +0200 Subject: [PATCH 04/16] tests: Add more tests about synonyms --- meilidb-core/src/query_builder.rs | 87 +++++++++++++++++++++++++++++-- 1 file changed, 83 insertions(+), 4 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 0756b9634..c7641e764 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -11,7 +11,7 @@ use hashbrown::{HashMap, HashSet}; use fst::{Streamer, IntoStreamer}; use log::info; -use crate::automaton::{self, DfaExt, AutomatonExt, build_dfa, build_prefix_dfa}; +use crate::automaton::{DfaExt, AutomatonExt, build_dfa, build_prefix_dfa}; use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::criterion::Criteria; use crate::raw_documents_from_matches; @@ -364,14 +364,14 @@ mod tests { S: 'f + for<'a> fst::Streamer<'a, Item=&'a [u8]>, { let mut builder = fst::SetBuilder::memory(); - builder.extend_stream(stream); + builder.extend_stream(stream).unwrap(); builder.into_inner().and_then(Set::from_bytes).unwrap() } fn insert_key(set: &Set, key: &[u8]) -> Set { let unique_key = { let mut builder = fst::SetBuilder::memory(); - builder.insert(key); + builder.insert(key).unwrap(); builder.into_inner().and_then(Set::from_bytes).unwrap() }; @@ -382,7 +382,7 @@ mod tests { fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set { let mut builder = fst::SetBuilder::memory(); - builder.extend_iter(set.into_iter()); + builder.extend_iter(set.into_iter()).unwrap(); builder.into_inner().and_then(Set::from_bytes).unwrap() } @@ -478,6 +478,85 @@ mod tests { assert_matches!(iter.next(), None); } + #[test] + fn prefix_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + (&b"hello"[..], &[doc_index(0, 0)][..]), + ]); + + store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); + store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("sal", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("bonj", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("sal blabla", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("bonj blabla", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), None); + } + + #[test] + fn levenshtein_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + (&b"hello"[..], &[doc_index(0, 0)][..]), + ]); + + store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("salutution", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("saluttion", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), None); + } + #[test] fn harder_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ From 6cb57aa8a4bdc2a075fa6eb7d1fd4f5c725547d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 17 Jun 2019 10:28:43 +0200 Subject: [PATCH 05/16] feat: Unique word has multi-word synonyms basically work --- meilidb-core/src/criterion/mod.rs | 2 +- meilidb-core/src/query_builder.rs | 50 +++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/meilidb-core/src/criterion/mod.rs b/meilidb-core/src/criterion/mod.rs index 9b2962bdb..6ce42007c 100644 --- a/meilidb-core/src/criterion/mod.rs +++ b/meilidb-core/src/criterion/mod.rs @@ -113,7 +113,7 @@ impl<'a> Default for Criteria<'a> { } } -impl<'a> AsRef<[Box]> for Criteria<'a> { +impl<'a> AsRef<[Box]> for Criteria<'a> { fn as_ref(&self) -> &[Box] { &self.inner } diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index c7641e764..e11ec4535 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -37,8 +37,10 @@ fn generate_automatons(query: &str, store: &S) -> Result) { + let word = word.to_lowercase(); let alternatives = self.alternatives.entry(word.as_bytes().to_vec()).or_default(); let new = sdset_into_fstset(&new); *alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union()); @@ -641,4 +644,47 @@ mod tests { }); assert_matches!(iter.next(), None); } + + /// Unique word has multi-word synonyms + #[test] + fn multiword_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + (&b"new"[..], &[doc_index(0, 0)][..]), + (&b"york"[..], &[doc_index(0, 1)][..]), + (&b"subway"[..], &[doc_index(0, 2)][..]), + ]); + + store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); + store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NY subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NYC subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } } From 62930ecc4e3747f3e13e649598f8b5d5ceb958ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 17 Jun 2019 10:44:16 +0200 Subject: [PATCH 06/16] feat: Deduplicate automatons when synonyms produce duplicated ones --- meilidb-core/src/query_builder.rs | 35 ++++++++++++++++--------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index e11ec4535..4158bd1ba 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -25,30 +25,35 @@ fn generate_automatons(query: &str, store: &S) -> Result { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway assert_matches!(iter.next(), None); @@ -679,8 +682,6 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway assert_matches!(iter.next(), None); From d2bd99cc2a1fe41143010b7c8abdd6b77a213498 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 17 Jun 2019 11:31:10 +0200 Subject: [PATCH 07/16] fix: Append DocIndexes when building InMemorySetStore from an Iterator --- meilidb-core/src/query_builder.rs | 45 +++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 4158bd1ba..d8065bba2 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -389,6 +389,7 @@ mod tests { fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set { let mut builder = fst::SetBuilder::memory(); + let set = SetBuf::from_dirty(set.into_iter().map(|s| s.to_lowercase()).collect()); builder.extend_iter(set.into_iter()).unwrap(); builder.into_inner().and_then(Set::from_bytes).unwrap() } @@ -404,20 +405,21 @@ mod tests { } } - impl<'a> FromIterator<(&'a [u8], &'a [DocIndex])> for InMemorySetStore { - fn from_iter>(iter: I) -> Self { + impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for InMemorySetStore { + fn from_iter>(iter: I) -> Self { let mut tree = BTreeSet::new(); let mut map = HashMap::new(); for (word, indexes) in iter { - tree.insert(word); - map.insert(word.to_vec(), SetBuf::from_dirty(indexes.to_vec())); + let word = word.to_lowercase().into_bytes(); + tree.insert(word.clone()); + map.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes); } InMemorySetStore { set: Set::from_iter(tree).unwrap(), synonyms: Set::default(), - indexes: map, + indexes: map.into_iter().map(|(k, v)| (k, SetBuf::from_dirty(v))).collect(), alternatives: HashMap::new(), } } @@ -456,7 +458,7 @@ mod tests { #[test] fn simple_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ - (&b"hello"[..], &[doc_index(0, 0)][..]), + ("hello", &[doc_index(0, 0)][..]), ]); store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); @@ -489,7 +491,7 @@ mod tests { #[test] fn prefix_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ - (&b"hello"[..], &[doc_index(0, 0)][..]), + ("hello", &[doc_index(0, 0)][..]), ]); store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"])); @@ -535,7 +537,7 @@ mod tests { #[test] fn levenshtein_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ - (&b"hello"[..], &[doc_index(0, 0)][..]), + ("hello", &[doc_index(0, 0)][..]), ]); store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"])); @@ -568,9 +570,9 @@ mod tests { #[test] fn harder_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ - (&b"hello"[..], &[doc_index(0, 0)][..]), - (&b"bonjour"[..], &[doc_index(1, 3)]), - (&b"salut"[..], &[doc_index(2, 5)]), + ("hello", &[doc_index(0, 0)][..]), + ("bonjour", &[doc_index(1, 3)]), + ("salut", &[doc_index(2, 5)]), ]); store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"])); @@ -654,9 +656,12 @@ mod tests { #[test] fn multiword_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ - (&b"new"[..], &[doc_index(0, 0)][..]), - (&b"york"[..], &[doc_index(0, 1)][..]), - (&b"subway"[..], &[doc_index(0, 2)][..]), + ("new", &[doc_index(0, 0)][..]), + ("york", &[doc_index(0, 1)][..]), + ("subway", &[doc_index(0, 2)][..]), + + ("NY", &[doc_index(1, 0)][..]), + ("subway", &[doc_index(1, 1)][..]), ]); store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); @@ -666,6 +671,12 @@ mod tests { let results = builder.query("NY subway", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new @@ -679,6 +690,12 @@ mod tests { let results = builder.query("NYC subway", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new From 59fafb8b30bf4d8d911cd41d0de7ab52795fd917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 17 Jun 2019 16:01:31 +0200 Subject: [PATCH 08/16] feat: Support one word has multi-word alternatives --- meilidb-core/src/lib.rs | 8 +- meilidb-core/src/query_builder.rs | 230 ++++++++++++++++++++++++++---- 2 files changed, 206 insertions(+), 32 deletions(-) diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index bb2de2dec..3235cd6af 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -10,7 +10,7 @@ pub mod criterion; use std::fmt; use std::sync::Arc; -use rayon::slice::ParallelSliceMut; +use sdset::SetBuf; use serde::{Serialize, Deserialize}; use slice_group_by::GroupBy; use zerocopy::{AsBytes, FromBytes}; @@ -229,12 +229,10 @@ impl fmt::Debug for RawDocument { } } -pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec { - let mut docs_ranges = Vec::<(DocumentId, Range)>::new(); +pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, Match)>) -> Vec { + let mut docs_ranges = Vec::<(_, Range)>::new(); let mut matches2 = Matches::with_capacity(matches.len()); - matches.par_sort_unstable(); - for group in matches.linear_group_by(|(a, _), (b, _)| a == b) { let id = group[0].0; let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0); diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index d8065bba2..c93a7be9e 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -4,12 +4,13 @@ use std::rc::Rc; use std::time::Instant; use std::{cmp, mem}; -use rayon::slice::ParallelSliceMut; -use slice_group_by::GroupByMut; -use meilidb_tokenizer::{is_cjk, split_query_string}; -use hashbrown::{HashMap, HashSet}; use fst::{Streamer, IntoStreamer}; +use hashbrown::{HashMap, HashSet}; use log::info; +use meilidb_tokenizer::{is_cjk, split_query_string}; +use rayon::slice::ParallelSliceMut; +use sdset::SetBuf; +use slice_group_by::GroupByMut; use crate::automaton::{DfaExt, AutomatonExt, build_dfa, build_prefix_dfa}; use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; @@ -46,12 +47,11 @@ fn generate_automatons(query: &str, store: &S) -> Result QueryBuilder<'c, S, FI> store: self.store, criteria: self.criteria, searchable_attrs: self.searchable_attrs, - filter: Some(function) + filter: Some(function), } } @@ -147,8 +147,22 @@ where S: Store, } } + matches.par_sort_unstable(); + + for document_matches in matches.linear_group_by_mut(|(a, _), (b, _)| a == b) { + let mut offset = 0; + for query_indexes in document_matches.linear_group_by_mut(|(_, a), (_, b)| a.query_index == b.query_index) { + let word_index = query_indexes[0].1.word_index - offset as u16; + for (_, match_) in query_indexes.iter_mut() { + match_.word_index = word_index; + } + offset += query_indexes.len() - 1; + } + } + let total_matches = matches.len(); - let raw_documents = raw_documents_from_matches(matches); + let padded_matches = SetBuf::from_dirty(matches); + let raw_documents = raw_documents_from_matches(padded_matches); info!("{} total documents to classify", raw_documents.len()); info!("{} total matches to classify", total_matches); @@ -455,6 +469,16 @@ mod tests { } } + const fn doc_char_index(document_id: u64, word_index: u16, char_index: u16) -> DocIndex { + DocIndex { + document_id: DocumentId(document_id), + attribute: 0, + word_index, + char_index, + char_length: 0, + } + } + #[test] fn simple_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ @@ -652,35 +676,97 @@ mod tests { assert_matches!(iter.next(), None); } - /// Unique word has multi-word synonyms #[test] - fn multiword_synonyms() { + /// Unique word has multi-word synonyms + fn unique_to_multiword_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ - ("new", &[doc_index(0, 0)][..]), - ("york", &[doc_index(0, 1)][..]), - ("subway", &[doc_index(0, 2)][..]), + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("subway", &[doc_char_index(0, 3, 3)][..]), - ("NY", &[doc_index(1, 0)][..]), - ("subway", &[doc_index(1, 1)][..]), + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("subway", &[doc_char_index(1, 1, 1)][..]), ]); - store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); - store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); + store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); + store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); let builder = QueryBuilder::new(&store); let results = builder.query("NY subway", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway assert_matches!(iter.next(), None); }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NYC subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + /// Unique word has multi-word synonyms + fn harder_unique_to_multiword_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("yellow", &[doc_char_index(0, 3, 3)][..]), + ("subway", &[doc_char_index(0, 4, 4)][..]), + ("broken", &[doc_char_index(0, 5, 5)][..]), + + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ]); + + store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); + store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NY subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway assert_matches!(iter.next(), None); }); @@ -690,19 +776,109 @@ mod tests { let results = builder.query("NYC subway", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); } + + #[test] + /// Unique word has multi-word synonyms + fn even_harder_unique_to_multiword_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("yellow", &[doc_char_index(0, 3, 3)][..]), + ("underground", &[doc_char_index(0, 4, 4)][..]), + ("train", &[doc_char_index(0, 5, 5)][..]), + ("broken", &[doc_char_index(0, 6, 6)][..]), + + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ]); + + store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); + store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); + store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NY subway broken", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // underground = subway + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway + assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 3, .. })); // broken + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NYC subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // underground = subway + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + #[ignore] + /// Multi-word has multi-word synonyms + fn multiword_to_multiword_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + ("NY", &[doc_index(0, 0)][..]), + ("subway", &[doc_index(0, 1)][..]), + ]); + + store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("new york subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } } From 0633f16b4dca0fe2d61da4534862b69440b22e69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 17 Jun 2019 18:21:10 +0200 Subject: [PATCH 09/16] feat: Make multi-word support multi-word synonyms --- meilidb-core/src/query_builder.rs | 92 ++++++++++++++++++++++--------- 1 file changed, 67 insertions(+), 25 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index c93a7be9e..088d2bc2e 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -18,36 +18,46 @@ use crate::criterion::Criteria; use crate::raw_documents_from_matches; use crate::{Match, DocumentId, Store, RawDocument, Document}; +const NGRAMS: usize = 3; + fn generate_automatons(query: &str, store: &S) -> Result, S::Error> { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let mut groups = split_query_string(query).map(str::to_lowercase).peekable(); + let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); let mut automatons = Vec::new(); - let mut index = 0; let synonyms = store.synonyms()?; - while let Some(query_word) = groups.next() { - let query_word_str = query_word.as_str(); - let has_following_word = groups.peek().is_some(); - let not_prefix_dfa = has_following_word || has_end_whitespace || query_word_str.chars().all(is_cjk); + for n in 1..=NGRAMS { + let mut index = 0; + let mut ngrams = query_words.windows(n).peekable(); - let lev = if not_prefix_dfa { build_dfa(query_word_str) } else { build_prefix_dfa(query_word_str) }; - let mut stream = synonyms.search(&lev).into_stream(); - while let Some(word) = stream.next() { - if let Some(synonyms) = store.alternatives_to(word)? { - let mut stream = synonyms.into_stream(); - while let Some(synonyms) = stream.next() { - let synonyms = std::str::from_utf8(synonyms).unwrap(); - for synonym in split_query_string(synonyms) { - let lev = if not_prefix_dfa { build_dfa(synonym) } else { build_prefix_dfa(synonym) }; - automatons.push((index, synonym.to_owned(), lev)); + while let Some(ngram) = ngrams.next() { + let ngram = ngram.join(" "); + + let has_following_word = ngrams.peek().is_some(); + let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); + + let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) }; + let mut stream = synonyms.search(&lev).into_stream(); + while let Some(word) = stream.next() { + if let Some(synonyms) = store.alternatives_to(word)? { + let mut stream = synonyms.into_stream(); + while let Some(synonyms) = stream.next() { + let synonyms = std::str::from_utf8(synonyms).unwrap(); + for synonym in split_query_string(synonyms) { + let lev = if not_prefix_dfa { build_dfa(synonym) } else { build_prefix_dfa(synonym) }; + automatons.push((index, synonym.to_owned(), lev)); + } } } } - } - automatons.push((index, query_word, lev)); - index += 1; + if n == 1 { + automatons.push((index, ngram, lev)); + } + + index += 1; + } } automatons.sort_unstable_by(|a, b| (a.0, &a.1).cmp(&(b.0, &b.1))); @@ -859,24 +869,56 @@ mod tests { } #[test] - #[ignore] /// Multi-word has multi-word synonyms fn multiword_to_multiword_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ - ("NY", &[doc_index(0, 0)][..]), - ("subway", &[doc_index(0, 1)][..]), + ("NY", &[doc_char_index(0, 0, 0)][..]), + ("subway", &[doc_char_index(0, 1, 1)][..]), + + ("NYC", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ("broken", &[doc_char_index(1, 3, 3)][..]), ]); store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"])); + store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC", "NY", "new york"])); + store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"])); let builder = QueryBuilder::new(&store); - let results = builder.query("new york subway", 0..20).unwrap(); + let results = builder.query("new york underground train broken", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york + assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 2, .. })); // subway = underground train + assert_matches!(iter.next(), Some(Match { query_index: 4, word_index: 3, .. })); // broken + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 1, .. })); // subway + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY = new york + assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 1, .. })); // subway = underground train + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("new york city underground train broken", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york city + assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 2, .. })); // subway = underground train + assert_matches!(iter.next(), Some(Match { query_index: 5, word_index: 3, .. })); // broken + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY = new york city + assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 1, .. })); // subway = underground train assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); From a76c00a787d706f42c17de01ffea53bc3e4643ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 17 Jun 2019 16:49:31 +0200 Subject: [PATCH 10/16] feat: Create types to edit synonyms and keep them in the database --- .../src/database/documents_addition.rs | 3 +- .../src/database/documents_deletion.rs | 3 +- meilidb-data/src/database/index.rs | 30 ++++++- meilidb-data/src/database/main_index.rs | 16 ++++ meilidb-data/src/database/mod.rs | 22 ++++- meilidb-data/src/database/raw_index.rs | 4 +- .../src/database/synonyms_addition.rs | 83 +++++++++++++++++++ .../src/database/synonyms_deletion.rs | 71 ++++++++++++++++ meilidb-data/src/database/synonyms_index.rs | 23 +++++ 9 files changed, 248 insertions(+), 7 deletions(-) create mode 100644 meilidb-data/src/database/synonyms_addition.rs create mode 100644 meilidb-data/src/database/synonyms_deletion.rs create mode 100644 meilidb-data/src/database/synonyms_index.rs diff --git a/meilidb-data/src/database/documents_addition.rs b/meilidb-data/src/database/documents_addition.rs index 177d1975c..15323be70 100644 --- a/meilidb-data/src/database/documents_addition.rs +++ b/meilidb-data/src/database/documents_addition.rs @@ -120,11 +120,12 @@ impl<'a> DocumentsAddition<'a> { // update the "consistent" view of the Index let ranked_map = self.ranked_map; + let synonyms = fst::Set::from_bytes(lease_inner.synonyms.as_fst().to_vec()).unwrap(); // clone() let schema = lease_inner.schema.clone(); let raw = lease_inner.raw.clone(); lease_inner.raw.compact(); - let inner = InnerIndex { words, schema, ranked_map, raw }; + let inner = InnerIndex { words, synonyms, schema, ranked_map, raw }; self.inner.0.store(Arc::new(inner)); Ok(()) diff --git a/meilidb-data/src/database/documents_deletion.rs b/meilidb-data/src/database/documents_deletion.rs index e89923199..9813afe3c 100644 --- a/meilidb-data/src/database/documents_deletion.rs +++ b/meilidb-data/src/database/documents_deletion.rs @@ -119,11 +119,12 @@ impl<'a> DocumentsDeletion<'a> { // update the "consistent" view of the Index let ranked_map = lease_inner.ranked_map.clone(); + let synonyms = fst::Set::from_bytes(lease_inner.synonyms.as_fst().to_vec()).unwrap(); // clone() let schema = lease_inner.schema.clone(); let raw = lease_inner.raw.clone(); lease_inner.raw.compact(); - let inner = InnerIndex { words, schema, ranked_map, raw }; + let inner = InnerIndex { words, synonyms, schema, ranked_map, raw }; self.inner.0.store(Arc::new(inner)); Ok(()) diff --git a/meilidb-data/src/database/index.rs b/meilidb-data/src/database/index.rs index 4cc6d7acb..886d31118 100644 --- a/meilidb-data/src/database/index.rs +++ b/meilidb-data/src/database/index.rs @@ -13,7 +13,11 @@ use crate::ranked_map::RankedMap; use crate::serde::Deserializer; use super::{Error, CustomSettings}; -use super::{RawIndex, DocumentsAddition, DocumentsDeletion}; +use super::{ + RawIndex, + DocumentsAddition, DocumentsDeletion, + SynonymsAddition, SynonymsDeletion, +}; #[derive(Copy, Clone)] pub struct IndexStats { @@ -27,6 +31,7 @@ pub struct Index(pub ArcSwap); pub struct InnerIndex { pub words: fst::Set, + pub synonyms: fst::Set, pub schema: Schema, pub ranked_map: RankedMap, pub raw: RawIndex, // TODO this will be a snapshot in the future @@ -39,6 +44,11 @@ impl Index { None => fst::Set::default(), }; + let synonyms = match raw.main.synonyms_set()? { + Some(synonyms) => synonyms, + None => fst::Set::default(), + }; + let schema = match raw.main.schema()? { Some(schema) => schema, None => return Err(Error::SchemaMissing), @@ -49,7 +59,7 @@ impl Index { None => RankedMap::default(), }; - let inner = InnerIndex { words, schema, ranked_map, raw }; + let inner = InnerIndex { words, synonyms, schema, ranked_map, raw }; let index = Index(ArcSwap::new(Arc::new(inner))); Ok(index) @@ -101,6 +111,14 @@ impl Index { DocumentsDeletion::new(self, ranked_map) } + pub fn synonyms_addition(&self) -> SynonymsAddition { + SynonymsAddition::new(self) + } + + pub fn synonyms_deletion(&self) -> SynonymsDeletion { + SynonymsDeletion::new(self) + } + pub fn document( &self, fields: Option<&HashSet<&str>>, @@ -141,4 +159,12 @@ impl Store for IndexLease { fn word_indexes(&self, word: &[u8]) -> Result>, Self::Error> { Ok(self.0.raw.words.doc_indexes(word)?) } + + fn synonyms(&self) -> Result<&fst::Set, Self::Error> { + Ok(&self.0.synonyms) + } + + fn alternatives_to(&self, word: &[u8]) -> Result, Self::Error> { + Ok(self.0.raw.synonyms.alternatives_to(word)?) + } } diff --git a/meilidb-data/src/database/main_index.rs b/meilidb-data/src/database/main_index.rs index 7b3b98479..d7d4e1fbd 100644 --- a/meilidb-data/src/database/main_index.rs +++ b/meilidb-data/src/database/main_index.rs @@ -44,6 +44,22 @@ impl MainIndex { self.0.set("words", value.as_fst().as_bytes()).map_err(Into::into) } + pub fn synonyms_set(&self) -> Result, Error> { + match self.0.get_pinned("synonyms")? { + Some(bytes) => { + let len = bytes.len(); + let value = Arc::from(bytes.as_ref()); + let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?; + Ok(Some(fst::Set::from(fst))) + }, + None => Ok(None), + } + } + + pub fn set_synonyms_set(&self, value: &fst::Set) -> Result<(), Error> { + self.0.set("synonyms", value.as_fst().as_bytes()).map_err(Into::into) + } + pub fn ranked_map(&self) -> Result, Error> { match self.0.get_pinned("ranked-map")? { Some(bytes) => { diff --git a/meilidb-data/src/database/mod.rs b/meilidb-data/src/database/mod.rs index b9df6fc0b..2edf774e0 100644 --- a/meilidb-data/src/database/mod.rs +++ b/meilidb-data/src/database/mod.rs @@ -13,6 +13,9 @@ mod error; mod index; mod main_index; mod raw_index; +mod synonyms_addition; +mod synonyms_deletion; +mod synonyms_index; mod words_index; pub use self::error::Error; @@ -22,11 +25,14 @@ pub use self::custom_settings::CustomSettings; use self::docs_words_index::DocsWordsIndex; use self::documents_addition::DocumentsAddition; use self::documents_deletion::DocumentsDeletion; +use self::synonyms_addition::SynonymsAddition; +use self::synonyms_deletion::SynonymsDeletion; use self::documents_index::DocumentsIndex; use self::index::InnerIndex; use self::main_index::MainIndex; use self::raw_index::{RawIndex, InnerRawIndex}; use self::words_index::WordsIndex; +use self::synonyms_index::SynonymsIndex; pub struct Database { cache: RwLock>>, @@ -99,6 +105,12 @@ impl Database { MainIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(name))) }; + let synonyms = { + let cf_name = format!("{}-synonyms", name); + self.inner.cf_handle(&cf_name).expect("cf not found"); + SynonymsIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name))) + }; + let words = { let cf_name = format!("{}-words", name); self.inner.cf_handle(&cf_name).expect("cf not found"); @@ -123,7 +135,7 @@ impl Database { CustomSettings(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name))) }; - let raw_index = RawIndex { main, words, docs_words, documents, custom }; + let raw_index = RawIndex { main, synonyms, words, docs_words, documents, custom }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() @@ -154,6 +166,12 @@ impl Database { main.set_schema(&schema)?; + let synonyms = { + let cf_name = format!("{}-synonyms", name); + self.inner.create_cf(&cf_name, &rocksdb::Options::default())?; + SynonymsIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name))) + }; + let words = { let cf_name = format!("{}-words", name); self.inner.create_cf(&cf_name, &rocksdb::Options::default())?; @@ -182,7 +200,7 @@ impl Database { indexes.insert(name.to_string()); self.set_indexes(&indexes)?; - let raw_index = RawIndex { main, words, docs_words, documents, custom }; + let raw_index = RawIndex { main, synonyms, words, docs_words, documents, custom }; let index = Index::from_raw(raw_index)?; vacant.insert(Arc::new(index)).clone() diff --git a/meilidb-data/src/database/raw_index.rs b/meilidb-data/src/database/raw_index.rs index 8c129ac2d..612fb0df1 100644 --- a/meilidb-data/src/database/raw_index.rs +++ b/meilidb-data/src/database/raw_index.rs @@ -1,9 +1,10 @@ use std::sync::Arc; -use super::{MainIndex, WordsIndex, DocsWordsIndex, DocumentsIndex, CustomSettings}; +use super::{MainIndex, SynonymsIndex, WordsIndex, DocsWordsIndex, DocumentsIndex, CustomSettings}; #[derive(Clone)] pub struct RawIndex { pub main: MainIndex, + pub synonyms: SynonymsIndex, pub words: WordsIndex, pub docs_words: DocsWordsIndex, pub documents: DocumentsIndex, @@ -13,6 +14,7 @@ pub struct RawIndex { impl RawIndex { pub(crate) fn compact(&self) { self.main.0.compact_range(None::<&[u8]>, None::<&[u8]>); + self.synonyms.0.compact_range(None::<&[u8]>, None::<&[u8]>); self.words.0.compact_range(None::<&[u8]>, None::<&[u8]>); self.docs_words.0.compact_range(None::<&[u8]>, None::<&[u8]>); self.documents.0.compact_range(None::<&[u8]>, None::<&[u8]>); diff --git a/meilidb-data/src/database/synonyms_addition.rs b/meilidb-data/src/database/synonyms_addition.rs new file mode 100644 index 000000000..755c11710 --- /dev/null +++ b/meilidb-data/src/database/synonyms_addition.rs @@ -0,0 +1,83 @@ +use std::collections::BTreeMap; +use std::sync::Arc; + +use fst::{SetBuilder, set::OpBuilder}; +use sdset::SetBuf; + +use crate::database::index::InnerIndex; +use super::{Error, Index}; + +pub struct SynonymsAddition<'a> { + inner: &'a Index, + synonyms: BTreeMap>, +} + +impl<'a> SynonymsAddition<'a> { + pub fn new(inner: &'a Index) -> SynonymsAddition<'a> { + SynonymsAddition { inner, synonyms: BTreeMap::new() } + } + + pub fn add_synonym(&mut self, synonym: String, alternatives: I) + where I: Iterator, + { + self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives); + } + + pub fn finalize(self) -> Result<(), Error> { + let lease_inner = self.inner.lease_inner(); + let synonyms = &lease_inner.raw.synonyms; + let main = &lease_inner.raw.main; + + let mut synonyms_builder = SetBuilder::memory(); + + for (synonym, mut alternatives) in self.synonyms { + synonyms_builder.insert(&synonym).unwrap(); + + let alternatives = { + alternatives.iter_mut().for_each(|s| *s = s.to_lowercase()); + let alternatives = SetBuf::from_dirty(alternatives); + + let mut alternatives_builder = SetBuilder::memory(); + alternatives_builder.extend_iter(alternatives).unwrap(); + alternatives_builder.into_inner().unwrap() + }; + synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?; + } + + let delta_synonyms = synonyms_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + let synonyms = match main.synonyms_set()? { + Some(synonyms) => { + let op = OpBuilder::new() + .add(synonyms.stream()) + .add(delta_synonyms.stream()) + .r#union(); + + let mut synonyms_builder = SetBuilder::memory(); + synonyms_builder.extend_stream(op).unwrap(); + synonyms_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }, + None => delta_synonyms, + }; + + main.set_synonyms_set(&synonyms)?; + + // update the "consistent" view of the Index + let words = main.words_set()?.unwrap_or_default(); + let ranked_map = lease_inner.ranked_map.clone();; + let schema = lease_inner.schema.clone(); + let raw = lease_inner.raw.clone(); + lease_inner.raw.compact(); + + let inner = InnerIndex { words, synonyms, schema, ranked_map, raw }; + self.inner.0.store(Arc::new(inner)); + + Ok(()) + } +} diff --git a/meilidb-data/src/database/synonyms_deletion.rs b/meilidb-data/src/database/synonyms_deletion.rs new file mode 100644 index 000000000..8720d4b5c --- /dev/null +++ b/meilidb-data/src/database/synonyms_deletion.rs @@ -0,0 +1,71 @@ +use std::collections::BTreeSet; +use std::sync::Arc; + +use fst::{SetBuilder, set::OpBuilder}; + +use crate::database::index::InnerIndex; +use super::{Error, Index}; + +pub struct SynonymsDeletion<'a> { + inner: &'a Index, + synonyms: BTreeSet, +} + +impl<'a> SynonymsDeletion<'a> { + pub fn new(inner: &'a Index) -> SynonymsDeletion<'a> { + SynonymsDeletion { inner, synonyms: BTreeSet::new() } + } + + pub fn delete_alternatives_of(&mut self, synonym: String) { + self.synonyms.insert(synonym); + } + + pub fn finalize(self) -> Result<(), Error> { + let lease_inner = self.inner.lease_inner(); + let synonyms = &lease_inner.raw.synonyms; + let main = &lease_inner.raw.main; + + let mut synonyms_builder = SetBuilder::memory(); + + for synonym in self.synonyms { + synonyms_builder.insert(&synonym).unwrap(); + synonyms.del_alternatives_of(synonym.as_bytes())?; + } + + let delta_synonyms = synonyms_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap(); + + let synonyms = match main.synonyms_set()? { + Some(synonyms) => { + let op = OpBuilder::new() + .add(synonyms.stream()) + .add(delta_synonyms.stream()) + .difference(); + + let mut synonyms_builder = SetBuilder::memory(); + synonyms_builder.extend_stream(op).unwrap(); + synonyms_builder + .into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }, + None => fst::Set::default(), + }; + + main.set_synonyms_set(&synonyms)?; + + // update the "consistent" view of the Index + let words = main.words_set()?.unwrap_or_default(); + let ranked_map = lease_inner.ranked_map.clone(); + let schema = lease_inner.schema.clone(); + let raw = lease_inner.raw.clone(); + lease_inner.raw.compact(); + + let inner = InnerIndex { words, synonyms, schema, ranked_map, raw }; + self.inner.0.store(Arc::new(inner)); + + Ok(()) + } +} diff --git a/meilidb-data/src/database/synonyms_index.rs b/meilidb-data/src/database/synonyms_index.rs new file mode 100644 index 000000000..dfc0182e4 --- /dev/null +++ b/meilidb-data/src/database/synonyms_index.rs @@ -0,0 +1,23 @@ +use crate::database::raw_index::InnerRawIndex; + +#[derive(Clone)] +pub struct SynonymsIndex(pub(crate) InnerRawIndex); + +impl SynonymsIndex { + pub fn alternatives_to(&self, word: &[u8]) -> Result, rocksdb::Error> { + match self.0.get(word)? { + Some(vector) => Ok(Some(fst::Set::from_bytes(vector.to_vec()).unwrap())), + None => Ok(None), + } + } + + pub fn set_alternatives_to(&self, word: &[u8], value: Vec) -> Result<(), rocksdb::Error> { + self.0.set(word, value)?; + Ok(()) + } + + pub fn del_alternatives_of(&self, word: &[u8]) -> Result<(), rocksdb::Error> { + self.0.delete(word)?; + Ok(()) + } +} From 8f044c6853c0a8fe4107d92374309101ca291205 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 18 Jun 2019 15:18:16 +0200 Subject: [PATCH 11/16] fix: Only create non-prefix DFA when generating synonyms alternatives --- meilidb-core/src/query_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 088d2bc2e..5325e6eb7 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -45,7 +45,7 @@ fn generate_automatons(query: &str, store: &S) -> Result Date: Tue, 18 Jun 2019 15:47:47 +0200 Subject: [PATCH 12/16] feat: Trigger synonym replacement only when the last word is tipped --- meilidb-core/src/query_builder.rs | 40 ++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 5325e6eb7..ce427698a 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -31,18 +31,25 @@ fn generate_automatons(query: &str, store: &S) -> Result(query: &str, store: &S) -> Result { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); assert_matches!(iter.next(), None); let builder = QueryBuilder::new(&store); let results = builder.query("bonj blabla", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); assert_matches!(iter.next(), None); } From 43f11e929dde94bc05c49cec0fdb4b50aaa43484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 19 Jun 2019 13:46:48 +0200 Subject: [PATCH 13/16] fix: Do not trigger a synonym when its not the last word and is a prefix --- meilidb-core/src/query_builder.rs | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index ce427698a..decffc3b6 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -31,11 +31,14 @@ fn generate_automatons(query: &str, store: &S) -> Result(query: &str, store: &S) -> Result { - assert_eq!(matches.len(), 1); - let match_ = matches[0]; - assert_eq!(match_.query_index, 0); - assert_eq!(match_.word_index, 0); - }); assert_matches!(iter.next(), None); let builder = QueryBuilder::new(&store); let results = builder.query("bonj blabla", 0..20).unwrap(); let mut iter = results.into_iter(); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { - assert_eq!(matches.len(), 1); - let match_ = matches[0]; - assert_eq!(match_.query_index, 0); - assert_eq!(match_.word_index, 0); - }); assert_matches!(iter.next(), None); } From 3dcbc737f358c6fb87b1ac79218ef0f8c9aa62d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 19 Jun 2019 14:10:21 +0200 Subject: [PATCH 14/16] feat: Make synonyms be not considered like exact matches --- meilidb-core/src/query_builder.rs | 106 ++++++++++++++++++------------ 1 file changed, 63 insertions(+), 43 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index decffc3b6..3036283f2 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -20,7 +20,24 @@ use crate::{Match, DocumentId, Store, RawDocument, Document}; const NGRAMS: usize = 3; -fn generate_automatons(query: &str, store: &S) -> Result, S::Error> { +struct Automaton { + index: usize, + is_synonym: bool, + number_words: usize, + dfa: DfaExt, +} + +impl Automaton { + fn synonym(index: usize, number_words: usize, dfa: DfaExt) -> Automaton { + Automaton { index, is_synonym: true, number_words, dfa } + } + + fn original(index: usize, number_words: usize, dfa: DfaExt) -> Automaton { + Automaton { index, is_synonym: false, number_words, dfa } + } +} + +fn generate_automatons(query: &str, store: &S) -> Result, S::Error> { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); let mut automatons = Vec::new(); @@ -54,25 +71,28 @@ fn generate_automatons(query: &str, store: &S) -> Result { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY @@ -724,18 +750,18 @@ mod tests { assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway assert_matches!(iter.next(), None); // position rewritten ^ }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway - assert_matches!(iter.next(), None); - }); assert_matches!(iter.next(), None); let builder = QueryBuilder::new(&store); let results = builder.query("NYC subway", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY @@ -744,12 +770,6 @@ mod tests { assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway assert_matches!(iter.next(), None); // position rewritten ^ }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway - assert_matches!(iter.next(), None); - }); assert_matches!(iter.next(), None); } @@ -776,6 +796,12 @@ mod tests { let results = builder.query("NY subway", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY @@ -784,18 +810,18 @@ mod tests { assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway assert_matches!(iter.next(), None); // position rewritten ^ }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway - assert_matches!(iter.next(), None); - }); assert_matches!(iter.next(), None); let builder = QueryBuilder::new(&store); let results = builder.query("NYC subway", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY @@ -804,12 +830,6 @@ mod tests { assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway assert_matches!(iter.next(), None); // position rewritten ^ }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway - assert_matches!(iter.next(), None); - }); assert_matches!(iter.next(), None); } @@ -860,6 +880,12 @@ mod tests { let results = builder.query("NYC subway", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY @@ -869,12 +895,6 @@ mod tests { assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway assert_matches!(iter.next(), None); // position rewritten ^ }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway - assert_matches!(iter.next(), None); - }); assert_matches!(iter.next(), None); } From 0a5d4eb7ed48dea81a8bccccc52a07956c155039 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 20 Jun 2019 16:25:14 +0200 Subject: [PATCH 15/16] feat: Normalize synonym strings and query strings to search for synonyms --- meilidb-core/Cargo.toml | 1 + meilidb-core/src/lib.rs | 2 +- meilidb-core/src/query_builder.rs | 79 ++++++++++++++++++- .../src/database/synonyms_addition.rs | 8 +- 4 files changed, 84 insertions(+), 6 deletions(-) diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index 8c1d4ee36..037a7788c 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -6,6 +6,7 @@ edition = "2018" [dependencies] byteorder = "1.3.1" +deunicode = "1.0.0" hashbrown = "0.2.2" lazy_static = "1.2.0" log = "0.4.6" diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index 3235cd6af..72435ea46 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -15,7 +15,7 @@ use serde::{Serialize, Deserialize}; use slice_group_by::GroupBy; use zerocopy::{AsBytes, FromBytes}; -pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder}; +pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str}; pub use self::store::Store; /// Represent an internally generated document unique identifier. diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 3036283f2..147908906 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -37,6 +37,16 @@ impl Automaton { } } +pub fn normalize_str(string: &str) -> String { + let mut string = string.to_lowercase(); + + if !string.contains(is_cjk) { + string = deunicode::deunicode_with_tofu(&string, ""); + } + + string +} + fn generate_automatons(query: &str, store: &S) -> Result, S::Error> { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); @@ -55,7 +65,10 @@ fn generate_automatons(query: &str, store: &S) -> Result(query: &str, store: &S) -> Result) -> Set { let mut builder = fst::SetBuilder::memory(); - let set = SetBuf::from_dirty(set.into_iter().map(|s| s.to_lowercase()).collect()); + let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect()); builder.extend_iter(set.into_iter()).unwrap(); builder.into_inner().and_then(Set::from_bytes).unwrap() } @@ -953,4 +967,65 @@ mod tests { }); assert_matches!(iter.next(), None); } + + #[test] + fn deunicoded_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + ("iPhone", &[doc_index(0, 0)][..]), + ("telephone", &[doc_index(1, 0)][..]), // meilidb-data indexes the unidecoded + ("téléphone", &[doc_index(1, 0)][..]), // and the original words with the same DocIndex + ]); + + store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iPhone"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("telephone", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, .. })); + assert_matches!(iter.next(), Some(Match { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("téléphone", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, .. })); + assert_matches!(iter.next(), Some(Match { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("télephone", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, .. })); + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } } diff --git a/meilidb-data/src/database/synonyms_addition.rs b/meilidb-data/src/database/synonyms_addition.rs index 755c11710..d0f6c2160 100644 --- a/meilidb-data/src/database/synonyms_addition.rs +++ b/meilidb-data/src/database/synonyms_addition.rs @@ -2,6 +2,8 @@ use std::collections::BTreeMap; use std::sync::Arc; use fst::{SetBuilder, set::OpBuilder}; +use meilidb_tokenizer::is_cjk; +use meilidb_core::normalize_str; use sdset::SetBuf; use crate::database::index::InnerIndex; @@ -20,6 +22,8 @@ impl<'a> SynonymsAddition<'a> { pub fn add_synonym(&mut self, synonym: String, alternatives: I) where I: Iterator, { + let mut synonym = normalize_str(&synonym); + let alternatives = alternatives.map(|s| s.to_lowercase()); self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives); } @@ -30,13 +34,11 @@ impl<'a> SynonymsAddition<'a> { let mut synonyms_builder = SetBuilder::memory(); - for (synonym, mut alternatives) in self.synonyms { + for (synonym, alternatives) in self.synonyms { synonyms_builder.insert(&synonym).unwrap(); let alternatives = { - alternatives.iter_mut().for_each(|s| *s = s.to_lowercase()); let alternatives = SetBuf::from_dirty(alternatives); - let mut alternatives_builder = SetBuilder::memory(); alternatives_builder.extend_iter(alternatives).unwrap(); alternatives_builder.into_inner().unwrap() From b249b2a81b7c666727316ab680d649cdf404d18a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 24 Jun 2019 14:19:39 +0200 Subject: [PATCH 16/16] feat: Support removing specific synonym alternatives --- .../src/database/synonyms_addition.rs | 11 +-- .../src/database/synonyms_deletion.rs | 78 ++++++++++++++++--- 2 files changed, 74 insertions(+), 15 deletions(-) diff --git a/meilidb-data/src/database/synonyms_addition.rs b/meilidb-data/src/database/synonyms_addition.rs index d0f6c2160..6e16ab97b 100644 --- a/meilidb-data/src/database/synonyms_addition.rs +++ b/meilidb-data/src/database/synonyms_addition.rs @@ -2,7 +2,6 @@ use std::collections::BTreeMap; use std::sync::Arc; use fst::{SetBuilder, set::OpBuilder}; -use meilidb_tokenizer::is_cjk; use meilidb_core::normalize_str; use sdset::SetBuf; @@ -19,11 +18,13 @@ impl<'a> SynonymsAddition<'a> { SynonymsAddition { inner, synonyms: BTreeMap::new() } } - pub fn add_synonym(&mut self, synonym: String, alternatives: I) - where I: Iterator, + pub fn add_synonym(&mut self, synonym: S, alternatives: I) + where S: AsRef, + T: AsRef, + I: Iterator, { - let mut synonym = normalize_str(&synonym); - let alternatives = alternatives.map(|s| s.to_lowercase()); + let synonym = normalize_str(synonym.as_ref()); + let alternatives = alternatives.map(|s| s.as_ref().to_lowercase()); self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives); } diff --git a/meilidb-data/src/database/synonyms_deletion.rs b/meilidb-data/src/database/synonyms_deletion.rs index 8720d4b5c..c07e92859 100644 --- a/meilidb-data/src/database/synonyms_deletion.rs +++ b/meilidb-data/src/database/synonyms_deletion.rs @@ -1,23 +1,41 @@ -use std::collections::BTreeSet; +use std::collections::BTreeMap; +use std::iter::FromIterator; use std::sync::Arc; use fst::{SetBuilder, set::OpBuilder}; +use meilidb_core::normalize_str; +use sdset::SetBuf; use crate::database::index::InnerIndex; use super::{Error, Index}; pub struct SynonymsDeletion<'a> { inner: &'a Index, - synonyms: BTreeSet, + synonyms: BTreeMap>>, } impl<'a> SynonymsDeletion<'a> { pub fn new(inner: &'a Index) -> SynonymsDeletion<'a> { - SynonymsDeletion { inner, synonyms: BTreeSet::new() } + SynonymsDeletion { inner, synonyms: BTreeMap::new() } } - pub fn delete_alternatives_of(&mut self, synonym: String) { - self.synonyms.insert(synonym); + pub fn delete_all_alternatives_of>(&mut self, synonym: S) { + let synonym = normalize_str(synonym.as_ref()); + self.synonyms.insert(synonym, None); + } + + pub fn delete_specific_alternatives_of(&mut self, synonym: S, alternatives: I) + where S: AsRef, + T: AsRef, + I: Iterator, + { + let synonym = normalize_str(synonym.as_ref()); + let value = self.synonyms.entry(synonym).or_insert(None); + let alternatives = alternatives.map(|s| s.as_ref().to_lowercase()); + match value { + Some(v) => v.extend(alternatives), + None => *value = Some(Vec::from_iter(alternatives)), + } } pub fn finalize(self) -> Result<(), Error> { @@ -25,14 +43,54 @@ impl<'a> SynonymsDeletion<'a> { let synonyms = &lease_inner.raw.synonyms; let main = &lease_inner.raw.main; - let mut synonyms_builder = SetBuilder::memory(); + let mut delete_whole_synonym_builder = SetBuilder::memory(); - for synonym in self.synonyms { - synonyms_builder.insert(&synonym).unwrap(); - synonyms.del_alternatives_of(synonym.as_bytes())?; + for (synonym, alternatives) in self.synonyms { + match alternatives { + Some(alternatives) => { + let prev_alternatives = synonyms.alternatives_to(synonym.as_bytes())?; + let prev_alternatives = match prev_alternatives { + Some(alternatives) => alternatives, + None => continue, + }; + + let delta_alternatives = { + let alternatives = SetBuf::from_dirty(alternatives); + let mut builder = SetBuilder::memory(); + builder.extend_iter(alternatives).unwrap(); + builder.into_inner() + .and_then(fst::Set::from_bytes) + .unwrap() + }; + + let op = OpBuilder::new() + .add(prev_alternatives.stream()) + .add(delta_alternatives.stream()) + .difference(); + + let (alternatives, empty_alternatives) = { + let mut builder = SetBuilder::memory(); + let len = builder.get_ref().len(); + builder.extend_stream(op).unwrap(); + let is_empty = len == builder.get_ref().len(); + let alternatives = builder.into_inner().unwrap(); + (alternatives, is_empty) + }; + + if empty_alternatives { + delete_whole_synonym_builder.insert(synonym.as_bytes())?; + } else { + synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?; + } + }, + None => { + delete_whole_synonym_builder.insert(&synonym).unwrap(); + synonyms.del_alternatives_of(synonym.as_bytes())?; + } + } } - let delta_synonyms = synonyms_builder + let delta_synonyms = delete_whole_synonym_builder .into_inner() .and_then(fst::Set::from_bytes) .unwrap();