diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml
index 8790889e2..037a7788c 100644
--- a/meilidb-core/Cargo.toml
+++ b/meilidb-core/Cargo.toml
@@ -6,6 +6,7 @@ edition = "2018"
 [dependencies]
 byteorder = "1.3.1"
+deunicode = "1.0.0"
 hashbrown = "0.2.2"
 lazy_static = "1.2.0"
 log = "0.4.6"
@@ -25,6 +26,9 @@ git = "https://github.com/Kerollmops/levenshtein-automata.git"
 branch = "arc-byte-slice"
 features = ["fst_automaton"]
 
+[dev-dependencies]
+assert_matches = "1.3"
+
 [features]
 i128 = ["byteorder/i128"]
 nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
diff --git a/meilidb-core/src/criterion/mod.rs b/meilidb-core/src/criterion/mod.rs
index 9b2962bdb..6ce42007c 100644
--- a/meilidb-core/src/criterion/mod.rs
+++ b/meilidb-core/src/criterion/mod.rs
@@ -113,7 +113,7 @@ impl<'a> Default for Criteria<'a> {
     }
 }
 
-impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
+impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
     fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
         &self.inner
     }
diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs
index e61fa543b..72435ea46 100644
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@@ -1,3 +1,6 @@
+#[cfg(test)]
+#[macro_use] extern crate assert_matches;
+
 mod automaton;
 mod distinct_map;
 mod query_builder;
@@ -7,12 +10,12 @@ pub mod criterion;
 use std::fmt;
 use std::sync::Arc;
 
-use rayon::slice::ParallelSliceMut;
+use sdset::SetBuf;
 use serde::{Serialize, Deserialize};
 use slice_group_by::GroupBy;
 use zerocopy::{AsBytes, FromBytes};
 
-pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
+pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
 pub use self::store::Store;
 
 /// Represent an internally generated document unique identifier.
@@ -226,12 +229,10 @@ impl fmt::Debug for RawDocument {
     }
 }
 
-pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
-    let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
+pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, Match)>) -> Vec<RawDocument> {
+    let mut docs_ranges = Vec::<(_, Range)>::new();
     let mut matches2 = Matches::with_capacity(matches.len());
 
-    matches.par_sort_unstable();
-
     for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
         let id = group[0].0;
         let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
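Since `raw_documents_from_matches` no longer sorts its input, callers now have to hand it an already sorted, deduplicated set; that is exactly what `sdset::SetBuf::from_dirty` produces. A minimal sketch of that contract, with toy tuples standing in for real `(DocumentId, Match)` pairs:

```rust
use sdset::SetBuf;

fn main() {
    // SetBuf::from_dirty sorts and deduplicates its input, which is the
    // contract raw_documents_from_matches now relies on instead of
    // calling par_sort_unstable itself.
    let dirty = vec![(3u64, 1u16), (1, 2), (3, 1)];
    let set: SetBuf<(u64, u16)> = SetBuf::from_dirty(dirty);
    assert_eq!(set.as_slice(), &[(1, 2), (3, 1)]);
}
```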
diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index f9f51dba2..147908906 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -4,35 +4,111 @@ use std::rc::Rc;
 use std::time::Instant;
 use std::{cmp, mem};
 
-use rayon::slice::ParallelSliceMut;
-use slice_group_by::GroupByMut;
-use meilidb_tokenizer::{is_cjk, split_query_string};
+use fst::{Streamer, IntoStreamer};
 use hashbrown::{HashMap, HashSet};
-use fst::Streamer;
 use log::info;
+use meilidb_tokenizer::{is_cjk, split_query_string};
+use rayon::slice::ParallelSliceMut;
+use sdset::SetBuf;
+use slice_group_by::GroupByMut;
 
-use crate::automaton::{self, DfaExt, AutomatonExt};
+use crate::automaton::{DfaExt, AutomatonExt, build_dfa, build_prefix_dfa};
 use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
 use crate::criterion::Criteria;
 use crate::raw_documents_from_matches;
 use crate::{Match, DocumentId, Store, RawDocument, Document};
 
-fn generate_automatons(query: &str) -> Vec<DfaExt> {
-    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
-    let mut groups = split_query_string(query).map(str::to_lowercase).peekable();
-    let mut automatons = Vec::new();
-
-    while let Some(word) = groups.next() {
-        let has_following_word = groups.peek().is_some();
-        let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
-            automaton::build_dfa(&word)
-        } else {
-            automaton::build_prefix_dfa(&word)
-        };
-        automatons.push(lev);
-    }
-
-    automatons
-}
+const NGRAMS: usize = 3;
+
+struct Automaton {
+    index: usize,
+    is_synonym: bool,
+    number_words: usize,
+    dfa: DfaExt,
+}
+
+impl Automaton {
+    fn synonym(index: usize, number_words: usize, dfa: DfaExt) -> Automaton {
+        Automaton { index, is_synonym: true, number_words, dfa }
+    }
+
+    fn original(index: usize, number_words: usize, dfa: DfaExt) -> Automaton {
+        Automaton { index, is_synonym: false, number_words, dfa }
+    }
+}
+
+pub fn normalize_str(string: &str) -> String {
+    let mut string = string.to_lowercase();
+
+    if !string.contains(is_cjk) {
+        string = deunicode::deunicode_with_tofu(&string, "");
+    }
+
+    string
+}
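A quick sanity check of what `normalize_str` is meant to produce, as a hedged sketch (it assumes `meilidb_tokenizer::is_cjk` classifies 東 and 京 as CJK; `deunicode_with_tofu(s, "")` transliterates to ASCII and drops anything it cannot map):

```rust
assert_eq!(normalize_str("Téléphone"), "telephone"); // lowercased, then deunicoded
assert_eq!(normalize_str("CAFÉ"), "cafe");
assert_eq!(normalize_str("東京"), "東京"); // contains CJK: the deunicode pass is skipped
```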
+
+fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
+    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
+    let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
+    let mut automatons = Vec::new();
+
+    let synonyms = store.synonyms()?;
+
+    for n in 1..=NGRAMS {
+        let mut index = 0;
+        let mut ngrams = query_words.windows(n).peekable();
+
+        while let Some(ngram) = ngrams.next() {
+            let ngram_nb_words = ngram.len();
+            let ngram = ngram.join(" ");
+
+            let has_following_word = ngrams.peek().is_some();
+            let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
+
+            let lev = {
+                let normalized = normalize_str(&ngram);
+                if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }
+            };
+
+            let mut stream = synonyms.search(&lev).into_stream();
+            while let Some(base) = stream.next() {
+
+                // only trigger alternatives when the last word has been typed,
+                // i.e. "new " does not trigger alternatives but "new yo" triggers those of "new york"
+                let base = std::str::from_utf8(base).unwrap();
+                let base_nb_words = split_query_string(base).count();
+                if ngram_nb_words != base_nb_words { continue }
+
+                if let Some(synonyms) = store.alternatives_to(base.as_bytes())? {
+
+                    let mut stream = synonyms.into_stream();
+                    while let Some(synonyms) = stream.next() {
+
+                        let synonyms = std::str::from_utf8(synonyms).unwrap();
+                        let nb_synonym_words = split_query_string(synonyms).count();
+                        for synonym in split_query_string(synonyms) {
+                            let lev = build_dfa(synonym);
+                            let automaton = Automaton::synonym(index, nb_synonym_words, lev);
+                            automatons.push((automaton, synonym.to_owned()));
+                        }
+                    }
+                }
+            }
+
+            if n == 1 {
+                let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
+                let automaton = Automaton::original(index, ngram_nb_words, lev);
+                automatons.push((automaton, ngram));
+            }
+
+            index += 1;
+        }
+    }
+
+    automatons.sort_unstable_by(|a, b| (a.0.index, &a.1).cmp(&(b.0.index, &b.1)));
+    automatons.dedup_by(|a, b| (a.0.index, &a.1) == (b.0.index, &b.1));
+    let automatons = automatons.into_iter().map(|(a, _)| a).collect();
+
+    Ok(automatons)
+}
 
 pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> {
@@ -61,7 +137,7 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI>
             store: self.store,
             criteria: self.criteria,
             searchable_attrs: self.searchable_attrs,
-            filter: Some(function)
+            filter: Some(function),
         }
     }
@@ -82,13 +158,13 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI>
 where S: Store,
 {
     fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
-        let automatons = generate_automatons(query);
+        let automatons = generate_automatons(query, &self.store)?;
         let words = self.store.words()?.as_fst();
 
         let mut stream = {
             let mut op_builder = fst::raw::OpBuilder::new();
-            for automaton in &automatons {
-                let stream = words.search(automaton);
+            for Automaton { dfa, .. } in &automatons {
+                let stream = words.search(dfa);
                 op_builder.push(stream);
             }
             op_builder.r#union()
@@ -98,9 +174,9 @@ where S: Store,
 
         while let Some((input, indexed_values)) = stream.next() {
             for iv in indexed_values {
-                let automaton = &automatons[iv.index];
-                let distance = automaton.eval(input).to_u8();
-                let is_exact = distance == 0 && input.len() == automaton.query_len();
+                let Automaton { index, is_synonym, number_words, ref dfa } = automatons[iv.index];
+                let distance = dfa.eval(input).to_u8();
+                let is_exact = (is_synonym && number_words == 1) || (!is_synonym && distance == 0 && input.len() == dfa.query_len());
 
                 let doc_indexes = self.store.word_indexes(input)?;
                 let doc_indexes = match doc_indexes {
@@ -111,8 +187,8 @@ where S: Store,
                 for di in doc_indexes.as_slice() {
                     if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
                         let match_ = Match {
-                            query_index: iv.index as u32,
+                            query_index: index as u32,
                             distance,
                             attribute: di.attribute,
                             word_index: di.word_index,
                             is_exact,
@@ -125,8 +201,22 @@ where S: Store,
             }
         }
 
+        matches.par_sort_unstable();
+
+        for document_matches in matches.linear_group_by_mut(|(a, _), (b, _)| a == b) {
+            let mut offset = 0;
+            for query_indexes in document_matches.linear_group_by_mut(|(_, a), (_, b)| a.query_index == b.query_index) {
+                let word_index = query_indexes[0].1.word_index - offset as u16;
+                for (_, match_) in query_indexes.iter_mut() {
+                    match_.word_index = word_index;
+                }
+                offset += query_indexes.len() - 1;
+            }
+        }
+
         let total_matches = matches.len();
-        let raw_documents = raw_documents_from_matches(matches);
+        let padded_matches = SetBuf::from_dirty(matches);
+        let raw_documents = raw_documents_from_matches(padded_matches);
 
         info!("{} total documents to classify", raw_documents.len());
         info!("{} total matches to classify", total_matches);
@@ -321,3 +411,621 @@ where S: Store,
         Ok(out_documents)
     }
 }
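Two things are worth spelling out before the tests: every window of 1..=NGRAMS query words is joined and looked up in the synonyms fst, and the rewrite pass above then collapses the word_index of all matches sharing a query_index, so a multi-word alternative counts as a single position (the "position rewritten" assertions below rely on this). A small sketch of the windowing, on a hypothetical query:

```rust
fn main() {
    // How "new york subway" is cut into ngrams before the synonyms lookup.
    let words = ["new", "york", "subway"];
    for n in 1..=3 {
        let grams: Vec<String> = words.windows(n).map(|w| w.join(" ")).collect();
        println!("{}-grams: {:?}", n, grams);
    }
    // 1-grams: ["new", "york", "subway"]
    // 2-grams: ["new york", "york subway"]
    // 3-grams: ["new york subway"]
}
```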
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::collections::{BTreeSet, HashMap};
+    use std::iter::FromIterator;
+
+    use sdset::SetBuf;
+    use fst::{Set, IntoStreamer};
+
+    use crate::DocIndex;
+    use crate::store::Store;
+
+    #[derive(Default)]
+    struct InMemorySetStore {
+        set: Set,
+        synonyms: Set,
+        indexes: HashMap<Vec<u8>, SetBuf<DocIndex>>,
+        alternatives: HashMap<Vec<u8>, Set>,
+    }
+
+    fn set_from_stream<'f, I, S>(stream: I) -> Set
+    where
+        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=&'a [u8]>,
+        S: 'f + for<'a> fst::Streamer<'a, Item=&'a [u8]>,
+    {
+        let mut builder = fst::SetBuilder::memory();
+        builder.extend_stream(stream).unwrap();
+        builder.into_inner().and_then(Set::from_bytes).unwrap()
+    }
+
+    fn insert_key(set: &Set, key: &[u8]) -> Set {
+        let unique_key = {
+            let mut builder = fst::SetBuilder::memory();
+            builder.insert(key).unwrap();
+            builder.into_inner().and_then(Set::from_bytes).unwrap()
+        };
+
+        let union_ = set.op().add(unique_key.into_stream()).r#union();
+
+        set_from_stream(union_)
+    }
+
+    fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set {
+        let mut builder = fst::SetBuilder::memory();
+        let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect());
+        builder.extend_iter(set.into_iter()).unwrap();
+        builder.into_inner().and_then(Set::from_bytes).unwrap()
+    }
+
+    impl InMemorySetStore {
+        pub fn add_synonym(&mut self, word: &str, new: SetBuf<&str>) {
+            let word = word.to_lowercase();
+            let alternatives = self.alternatives.entry(word.as_bytes().to_vec()).or_default();
+            let new = sdset_into_fstset(&new);
+            *alternatives = set_from_stream(alternatives.op().add(new.into_stream()).r#union());
+
+            self.synonyms = insert_key(&self.synonyms, word.as_bytes());
+        }
+    }
+
+    impl<'a> FromIterator<(&'a str, &'a [DocIndex])> for InMemorySetStore {
+        fn from_iter<I: IntoIterator<Item=(&'a str, &'a [DocIndex])>>(iter: I) -> Self {
+            let mut tree = BTreeSet::new();
+            let mut map = HashMap::new();
+
+            for (word, indexes) in iter {
+                let word = word.to_lowercase().into_bytes();
+                tree.insert(word.clone());
+                map.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
+            }
+
+            InMemorySetStore {
+                set: Set::from_iter(tree).unwrap(),
+                synonyms: Set::default(),
+                indexes: map.into_iter().map(|(k, v)| (k, SetBuf::from_dirty(v))).collect(),
+                alternatives: HashMap::new(),
+            }
+        }
+    }
+
+    impl Store for InMemorySetStore {
+        type Error = std::io::Error;
+
+        fn words(&self) -> Result<&Set, Self::Error> {
+            Ok(&self.set)
+        }
+
+        fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
+            Ok(self.indexes.get(word).cloned())
+        }
+
+        fn synonyms(&self) -> Result<&Set, Self::Error> {
+            Ok(&self.synonyms)
+        }
+
+        fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error> {
+            Ok(self.alternatives.get(word).map(|s| Set::from_bytes(s.as_fst().to_vec()).unwrap()))
+        }
+    }
+
+    const fn doc_index(document_id: u64, word_index: u16) -> DocIndex {
+        DocIndex {
+            document_id: DocumentId(document_id),
+            attribute: 0,
+            word_index,
+            char_index: 0,
+            char_length: 0,
+        }
+    }
+
+    const fn doc_char_index(document_id: u64, word_index: u16, char_index: u16) -> DocIndex {
+        DocIndex {
+            document_id: DocumentId(document_id),
+            attribute: 0,
+            word_index,
+            char_index,
+            char_length: 0,
+        }
+    }
+
+    #[test]
+    fn simple_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("hello", &[doc_index(0, 0)][..]),
+        ]);
+
+        store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("hello", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("bonjour", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    fn prefix_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("hello", &[doc_index(0, 0)][..]),
+        ]);
+
+        store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
+        store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("sal", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("bonj", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("sal blabla", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("bonj blabla", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    fn levenshtein_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("hello", &[doc_index(0, 0)][..]),
+        ]);
+
+        store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("salutution", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("saluttion", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    fn harder_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("hello",   &[doc_index(0, 0)][..]),
+            ("bonjour", &[doc_index(1, 3)]),
+            ("salut",   &[doc_index(2, 5)]),
+        ]);
+
+        store.add_synonym("hello", SetBuf::from_dirty(vec!["bonjour", "salut"]));
+        store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello", "salut"]));
+        store.add_synonym("salut", SetBuf::from_dirty(vec!["hello", "bonjour"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("hello", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 3);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 5);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("bonjour", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 3);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 5);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("salut", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 3);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 5);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    /// Unique word has multi-word synonyms
+    fn unique_to_multiword_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("new",    &[doc_char_index(0, 0, 0)][..]),
+            ("york",   &[doc_char_index(0, 1, 1)][..]),
+            ("city",   &[doc_char_index(0, 2, 2)][..]),
+            ("subway", &[doc_char_index(0, 3, 3)][..]),
+
+            ("NY",     &[doc_char_index(1, 0, 0)][..]),
+            ("subway", &[doc_char_index(1, 1, 1)][..]),
+        ]);
+
+        store.add_synonym("NY",  SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
+        store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY",  "new york", "new york city"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("NY subway", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new  = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
+            assert_matches!(iter.next(), None);              // position rewritten ^
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("NYC subway", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new  = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
+            assert_matches!(iter.next(), None);              // position rewritten ^
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    /// Unique word has multi-word synonyms
+    fn harder_unique_to_multiword_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("new",    &[doc_char_index(0, 0, 0)][..]),
+            ("york",   &[doc_char_index(0, 1, 1)][..]),
+            ("city",   &[doc_char_index(0, 2, 2)][..]),
+            ("yellow", &[doc_char_index(0, 3, 3)][..]),
+            ("subway", &[doc_char_index(0, 4, 4)][..]),
+            ("broken", &[doc_char_index(0, 5, 5)][..]),
+
+            ("NY",     &[doc_char_index(1, 0, 0)][..]),
+            ("blue",   &[doc_char_index(1, 1, 1)][..]),
+            ("subway", &[doc_char_index(1, 2, 2)][..]),
+        ]);
+
+        store.add_synonym("NY",  SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
+        store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY",  "new york", "new york city"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("NY subway", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new  = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
+            assert_matches!(iter.next(), None);              // position rewritten ^
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("NYC subway", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new  = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
+            assert_matches!(iter.next(), None);              // position rewritten ^
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    /// Unique word has multi-word synonyms
+    fn even_harder_unique_to_multiword_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("new",         &[doc_char_index(0, 0, 0)][..]),
+            ("york",        &[doc_char_index(0, 1, 1)][..]),
+            ("city",        &[doc_char_index(0, 2, 2)][..]),
+            ("yellow",      &[doc_char_index(0, 3, 3)][..]),
+            ("underground", &[doc_char_index(0, 4, 4)][..]),
+            ("train",       &[doc_char_index(0, 5, 5)][..]),
+            ("broken",      &[doc_char_index(0, 6, 6)][..]),
+
+            ("NY",     &[doc_char_index(1, 0, 0)][..]),
+            ("blue",   &[doc_char_index(1, 1, 1)][..]),
+            ("subway", &[doc_char_index(1, 2, 2)][..]),
+        ]);
+
+        store.add_synonym("NY",     SetBuf::from_dirty(vec!["NYC", "new york", "new york city"]));
+        store.add_synonym("NYC",    SetBuf::from_dirty(vec!["NY",  "new york", "new york city"]));
+        store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("NY subway broken", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new         = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york        = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city        = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // underground = subway
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train       = subway
+            assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 3, .. })); // broken
+            assert_matches!(iter.next(), None);              // position rewritten ^
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("NYC subway", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new         = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york        = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city        = NY
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // underground = subway
+            assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train       = subway
+            assert_matches!(iter.next(), None);              // position rewritten ^
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    /// Multi-word has multi-word synonyms
+    fn multiword_to_multiword_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("NY",     &[doc_char_index(0, 0, 0)][..]),
+            ("subway", &[doc_char_index(0, 1, 1)][..]),
+
+            ("NYC",    &[doc_char_index(1, 0, 0)][..]),
+            ("blue",   &[doc_char_index(1, 1, 1)][..]),
+            ("subway", &[doc_char_index(1, 2, 2)][..]),
+            ("broken", &[doc_char_index(1, 3, 3)][..]),
+        ]);
+
+        store.add_synonym("new york",          SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]));
+        store.add_synonym("new york city",     SetBuf::from_dirty(vec!["NYC", "NY", "new york"]));
+        store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("new york underground train broken", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC    = new york
+            assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 2, .. })); // subway = underground train
+            assert_matches!(iter.next(), Some(Match { query_index: 4, word_index: 3, .. })); // broken
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY     = new york
+            assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 1, .. })); // subway = underground train
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("new york city underground train broken", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC    = new york city
+            assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 2, .. })); // subway = underground train
+            assert_matches!(iter.next(), Some(Match { query_index: 5, word_index: 3, .. })); // broken
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY     = new york city
+            assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 1, .. })); // subway = underground train
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    fn deunicoded_synonyms() {
+        let mut store = InMemorySetStore::from_iter(vec![
+            ("iPhone",    &[doc_index(0, 0)][..]),
+            ("telephone", &[doc_index(1, 0)][..]), // meilidb-data indexes the unidecoded
+            ("téléphone", &[doc_index(1, 0)][..]), // and the original words with the same DocIndex
+        ]);
+
+        store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iPhone"]));
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("telephone", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("téléphone", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("télephone", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
+}
diff --git a/meilidb-core/src/store.rs b/meilidb-core/src/store.rs
index 14e95f0cc..6e429a1b4 100644
--- a/meilidb-core/src/store.rs
+++ b/meilidb-core/src/store.rs
@@ -8,6 +8,9 @@ pub trait Store {
     fn words(&self) -> Result<&Set, Self::Error>;
 
     fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error>;
+
+    fn synonyms(&self) -> Result<&Set, Self::Error>;
+    fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error>;
 }
 
 impl<T> Store for &'_ T where T: Store {
@@ -20,4 +23,12 @@ impl<T> Store for &'_ T where T: Store {
     fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
         (*self).word_indexes(word)
     }
+
+    fn synonyms(&self) -> Result<&Set, Self::Error> {
+        (*self).synonyms()
+    }
+
+    fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error> {
+        (*self).alternatives_to(word)
+    }
 }
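The two document-update paths below have to carry the current synonyms fst into the refreshed InnerIndex, and this version of fst::Set exposes no Clone impl, hence the byte round-trips marked with `// clone()` comments. A minimal sketch of that idiom:

```rust
// "Clone" an fst::Set by round-tripping its raw bytes; from_bytes
// re-validates the format, so the unwrap only fires on corruption.
fn clone_set(set: &fst::Set) -> fst::Set {
    fst::Set::from_bytes(set.as_fst().to_vec()).unwrap()
}
```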
diff --git a/meilidb-data/src/database/documents_addition.rs b/meilidb-data/src/database/documents_addition.rs
index 177d1975c..15323be70 100644
--- a/meilidb-data/src/database/documents_addition.rs
+++ b/meilidb-data/src/database/documents_addition.rs
@@ -120,11 +120,12 @@ impl<'a> DocumentsAddition<'a> {
 
         // update the "consistent" view of the Index
         let ranked_map = self.ranked_map;
+        let synonyms = fst::Set::from_bytes(lease_inner.synonyms.as_fst().to_vec()).unwrap(); // clone()
         let schema = lease_inner.schema.clone();
         let raw = lease_inner.raw.clone();
         lease_inner.raw.compact();
 
-        let inner = InnerIndex { words, schema, ranked_map, raw };
+        let inner = InnerIndex { words, synonyms, schema, ranked_map, raw };
         self.inner.0.store(Arc::new(inner));
 
         Ok(())
diff --git a/meilidb-data/src/database/documents_deletion.rs b/meilidb-data/src/database/documents_deletion.rs
index e89923199..9813afe3c 100644
--- a/meilidb-data/src/database/documents_deletion.rs
+++ b/meilidb-data/src/database/documents_deletion.rs
@@ -119,11 +119,12 @@ impl<'a> DocumentsDeletion<'a> {
 
         // update the "consistent" view of the Index
         let ranked_map = lease_inner.ranked_map.clone();
+        let synonyms = fst::Set::from_bytes(lease_inner.synonyms.as_fst().to_vec()).unwrap(); // clone()
         let schema = lease_inner.schema.clone();
         let raw = lease_inner.raw.clone();
         lease_inner.raw.compact();
 
-        let inner = InnerIndex { words, schema, ranked_map, raw };
+        let inner = InnerIndex { words, synonyms, schema, ranked_map, raw };
         self.inner.0.store(Arc::new(inner));
 
         Ok(())
diff --git a/meilidb-data/src/database/index.rs b/meilidb-data/src/database/index.rs
index 4cc6d7acb..886d31118 100644
--- a/meilidb-data/src/database/index.rs
+++ b/meilidb-data/src/database/index.rs
@@ -13,7 +13,11 @@ use crate::ranked_map::RankedMap;
 use crate::serde::Deserializer;
 
 use super::{Error, CustomSettings};
-use super::{RawIndex, DocumentsAddition, DocumentsDeletion};
+use super::{
+    RawIndex,
+    DocumentsAddition, DocumentsDeletion,
+    SynonymsAddition, SynonymsDeletion,
+};
 
 #[derive(Copy, Clone)]
 pub struct IndexStats {
@@ -27,6 +31,7 @@ pub struct Index(pub ArcSwap<InnerIndex>);
 
 pub struct InnerIndex {
     pub words: fst::Set,
+    pub synonyms: fst::Set,
     pub schema: Schema,
     pub ranked_map: RankedMap,
     pub raw: RawIndex, // TODO this will be a snapshot in the future
@@ -39,6 +44,11 @@ impl Index {
             None => fst::Set::default(),
         };
 
+        let synonyms = match raw.main.synonyms_set()? {
+            Some(synonyms) => synonyms,
+            None => fst::Set::default(),
+        };
+
         let schema = match raw.main.schema()? {
             Some(schema) => schema,
             None => return Err(Error::SchemaMissing),
        };
@@ -49,7 +59,7 @@ impl Index {
             None => RankedMap::default(),
         };
 
-        let inner = InnerIndex { words, schema, ranked_map, raw };
+        let inner = InnerIndex { words, synonyms, schema, ranked_map, raw };
         let index = Index(ArcSwap::new(Arc::new(inner)));
 
         Ok(index)
@@ -101,6 +111,14 @@ impl Index {
         DocumentsDeletion::new(self, ranked_map)
     }
 
+    pub fn synonyms_addition(&self) -> SynonymsAddition {
+        SynonymsAddition::new(self)
+    }
+
+    pub fn synonyms_deletion(&self) -> SynonymsDeletion {
+        SynonymsDeletion::new(self)
+    }
+
     pub fn document(
         &self,
         fields: Option<&HashSet<&str>>,
@@ -141,4 +159,12 @@ impl Store for IndexLease {
     fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
         Ok(self.0.raw.words.doc_indexes(word)?)
     }
+
+    fn synonyms(&self) -> Result<&fst::Set, Self::Error> {
+        Ok(&self.0.synonyms)
+    }
+
+    fn alternatives_to(&self, word: &[u8]) -> Result<Option<fst::Set>, Self::Error> {
+        Ok(self.0.raw.synonyms.alternatives_to(word)?)
+    }
 }
diff --git a/meilidb-data/src/database/main_index.rs b/meilidb-data/src/database/main_index.rs
index 7b3b98479..d7d4e1fbd 100644
--- a/meilidb-data/src/database/main_index.rs
+++ b/meilidb-data/src/database/main_index.rs
@@ -44,6 +44,22 @@ impl MainIndex {
         self.0.set("words", value.as_fst().as_bytes()).map_err(Into::into)
     }
 
+    pub fn synonyms_set(&self) -> Result<Option<fst::Set>, Error> {
+        match self.0.get_pinned("synonyms")? {
+            Some(bytes) => {
+                let len = bytes.len();
+                let value = Arc::from(bytes.as_ref());
+                let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
+                Ok(Some(fst::Set::from(fst)))
+            },
+            None => Ok(None),
+        }
+    }
+
+    pub fn set_synonyms_set(&self, value: &fst::Set) -> Result<(), Error> {
+        self.0.set("synonyms", value.as_fst().as_bytes()).map_err(Into::into)
+    }
+
     pub fn ranked_map(&self) -> Result<Option<RankedMap>, Error> {
         match self.0.get_pinned("ranked-map")? {
             Some(bytes) => {
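The new synonyms_set/set_synonyms_set pair persists the whole synonyms set as a single fst blob under the "synonyms" key of the main index. A hedged sketch of that round trip, with a plain HashMap standing in for the rocksdb-backed main index:

```rust
use std::collections::HashMap;

// Store and reload the synonyms fst as raw bytes under one key.
fn demo() {
    let mut main: HashMap<&str, Vec<u8>> = HashMap::new();

    let synonyms = fst::Set::from_iter(vec!["ny", "subway"]).unwrap(); // input must be sorted
    main.insert("synonyms", synonyms.as_fst().as_bytes().to_vec());

    let reloaded = main.get("synonyms")
        .map(|bytes| fst::Set::from_bytes(bytes.clone()).unwrap());
    assert!(reloaded.unwrap().contains("subway"));
}
```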
diff --git a/meilidb-data/src/database/mod.rs b/meilidb-data/src/database/mod.rs
index b9df6fc0b..2edf774e0 100644
--- a/meilidb-data/src/database/mod.rs
+++ b/meilidb-data/src/database/mod.rs
@@ -13,6 +13,9 @@ mod error;
 mod index;
 mod main_index;
 mod raw_index;
+mod synonyms_addition;
+mod synonyms_deletion;
+mod synonyms_index;
 mod words_index;
 
 pub use self::error::Error;
@@ -22,11 +25,14 @@ pub use self::custom_settings::CustomSettings;
 use self::docs_words_index::DocsWordsIndex;
 use self::documents_addition::DocumentsAddition;
 use self::documents_deletion::DocumentsDeletion;
+use self::synonyms_addition::SynonymsAddition;
+use self::synonyms_deletion::SynonymsDeletion;
 use self::documents_index::DocumentsIndex;
 use self::index::InnerIndex;
 use self::main_index::MainIndex;
 use self::raw_index::{RawIndex, InnerRawIndex};
 use self::words_index::WordsIndex;
+use self::synonyms_index::SynonymsIndex;
 
 pub struct Database {
     cache: RwLock<HashMap<String, Arc<Index>>>,
@@ -99,6 +105,12 @@ impl Database {
                     MainIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(name)))
                 };
 
+                let synonyms = {
+                    let cf_name = format!("{}-synonyms", name);
+                    self.inner.cf_handle(&cf_name).expect("cf not found");
+                    SynonymsIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name)))
+                };
+
                 let words = {
                     let cf_name = format!("{}-words", name);
                     self.inner.cf_handle(&cf_name).expect("cf not found");
@@ -123,7 +135,7 @@ impl Database {
                     CustomSettings(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name)))
                 };
 
-                let raw_index = RawIndex { main, words, docs_words, documents, custom };
+                let raw_index = RawIndex { main, synonyms, words, docs_words, documents, custom };
                 let index = Index::from_raw(raw_index)?;
 
                 vacant.insert(Arc::new(index)).clone()
@@ -154,6 +166,12 @@ impl Database {
 
                 main.set_schema(&schema)?;
 
+                let synonyms = {
+                    let cf_name = format!("{}-synonyms", name);
+                    self.inner.create_cf(&cf_name, &rocksdb::Options::default())?;
+                    SynonymsIndex(InnerRawIndex::new(self.inner.clone(), Arc::from(cf_name)))
+                };
+
                 let words = {
                     let cf_name = format!("{}-words", name);
                     self.inner.create_cf(&cf_name, &rocksdb::Options::default())?;
@@ -182,7 +200,7 @@ impl Database {
                 indexes.insert(name.to_string());
                 self.set_indexes(&indexes)?;
 
-                let raw_index = RawIndex { main, words, docs_words, documents, custom };
+                let raw_index = RawIndex { main, synonyms, words, docs_words, documents, custom };
                 let index = Index::from_raw(raw_index)?;
 
                 vacant.insert(Arc::new(index)).clone()
diff --git a/meilidb-data/src/database/raw_index.rs b/meilidb-data/src/database/raw_index.rs
index 8c129ac2d..612fb0df1 100644
--- a/meilidb-data/src/database/raw_index.rs
+++ b/meilidb-data/src/database/raw_index.rs
@@ -1,9 +1,10 @@
 use std::sync::Arc;
 
-use super::{MainIndex, WordsIndex, DocsWordsIndex, DocumentsIndex, CustomSettings};
+use super::{MainIndex, SynonymsIndex, WordsIndex, DocsWordsIndex, DocumentsIndex, CustomSettings};
 
 #[derive(Clone)]
 pub struct RawIndex {
     pub main: MainIndex,
+    pub synonyms: SynonymsIndex,
     pub words: WordsIndex,
     pub docs_words: DocsWordsIndex,
     pub documents: DocumentsIndex,
@@ -13,6 +14,7 @@ pub struct RawIndex {
 impl RawIndex {
     pub(crate) fn compact(&self) {
         self.main.0.compact_range(None::<&[u8]>, None::<&[u8]>);
+        self.synonyms.0.compact_range(None::<&[u8]>, None::<&[u8]>);
         self.words.0.compact_range(None::<&[u8]>, None::<&[u8]>);
         self.docs_words.0.compact_range(None::<&[u8]>, None::<&[u8]>);
         self.documents.0.compact_range(None::<&[u8]>, None::<&[u8]>);
diff --git a/meilidb-data/src/database/synonyms_addition.rs b/meilidb-data/src/database/synonyms_addition.rs
new file mode 100644
index 000000000..6e16ab97b
--- /dev/null
+++ b/meilidb-data/src/database/synonyms_addition.rs
@@ -0,0 +1,86 @@
+use std::collections::BTreeMap;
+use std::sync::Arc;
+
+use fst::{SetBuilder, set::OpBuilder};
+use meilidb_core::normalize_str;
+use sdset::SetBuf;
+
+use crate::database::index::InnerIndex;
+use super::{Error, Index};
+
+pub struct SynonymsAddition<'a> {
+    inner: &'a Index,
+    synonyms: BTreeMap<String, Vec<String>>,
+}
+
+impl<'a> SynonymsAddition<'a> {
+    pub fn new(inner: &'a Index) -> SynonymsAddition<'a> {
+        SynonymsAddition { inner, synonyms: BTreeMap::new() }
+    }
+
+    pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
+    where S: AsRef<str>,
+          T: AsRef<str>,
+          I: Iterator<Item=T>,
+    {
+        let synonym = normalize_str(synonym.as_ref());
+        let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
+        self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
+    }
+
+    pub fn finalize(self) -> Result<(), Error> {
+        let lease_inner = self.inner.lease_inner();
+        let synonyms = &lease_inner.raw.synonyms;
+        let main = &lease_inner.raw.main;
+
+        let mut synonyms_builder = SetBuilder::memory();
+
+        for (synonym, alternatives) in self.synonyms {
+            synonyms_builder.insert(&synonym).unwrap();
+
+            let alternatives = {
+                let alternatives = SetBuf::from_dirty(alternatives);
+                let mut alternatives_builder = SetBuilder::memory();
+                alternatives_builder.extend_iter(alternatives).unwrap();
+                alternatives_builder.into_inner().unwrap()
+            };
+            synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?;
+        }
+
+        let delta_synonyms = synonyms_builder
+            .into_inner()
+            .and_then(fst::Set::from_bytes)
+            .unwrap();
+
+        let synonyms = match main.synonyms_set()? {
+            Some(synonyms) => {
+                let op = OpBuilder::new()
+                    .add(synonyms.stream())
+                    .add(delta_synonyms.stream())
+                    .r#union();
+
+                let mut synonyms_builder = SetBuilder::memory();
+                synonyms_builder.extend_stream(op).unwrap();
+                synonyms_builder
+                    .into_inner()
+                    .and_then(fst::Set::from_bytes)
+                    .unwrap()
+            },
+            None => delta_synonyms,
+        };
+
+        main.set_synonyms_set(&synonyms)?;
+
+        // update the "consistent" view of the Index
+        let words = main.words_set()?.unwrap_or_default();
+        let ranked_map = lease_inner.ranked_map.clone();
+        let schema = lease_inner.schema.clone();
+        let raw = lease_inner.raw.clone();
+        lease_inner.raw.compact();
+
+        let inner = InnerIndex { words, synonyms, schema, ranked_map, raw };
+        self.inner.0.store(Arc::new(inner));
+
+        Ok(())
+    }
+}
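A hedged usage sketch of the addition API (the index handle and the words are illustrative, inside a function returning Result<(), Error>):

```rust
// Assumes `index` is a meilidb-data Index whose "*-synonyms" column
// family exists; add_synonym takes any Iterator of AsRef<str> items.
let mut addition = index.synonyms_addition();
addition.add_synonym("NY", vec!["NYC", "new york"].into_iter());
addition.add_synonym("subway", vec!["underground train"].into_iter());
addition.finalize()?;
```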
diff --git a/meilidb-data/src/database/synonyms_deletion.rs b/meilidb-data/src/database/synonyms_deletion.rs
new file mode 100644
index 000000000..c07e92859
--- /dev/null
+++ b/meilidb-data/src/database/synonyms_deletion.rs
@@ -0,0 +1,129 @@
+use std::collections::BTreeMap;
+use std::iter::FromIterator;
+use std::sync::Arc;
+
+use fst::{SetBuilder, set::OpBuilder};
+use meilidb_core::normalize_str;
+use sdset::SetBuf;
+
+use crate::database::index::InnerIndex;
+use super::{Error, Index};
+
+pub struct SynonymsDeletion<'a> {
+    inner: &'a Index,
+    synonyms: BTreeMap<String, Option<Vec<String>>>,
+}
+
+impl<'a> SynonymsDeletion<'a> {
+    pub fn new(inner: &'a Index) -> SynonymsDeletion<'a> {
+        SynonymsDeletion { inner, synonyms: BTreeMap::new() }
+    }
+
+    pub fn delete_all_alternatives_of<S: AsRef<str>>(&mut self, synonym: S) {
+        let synonym = normalize_str(synonym.as_ref());
+        self.synonyms.insert(synonym, None);
+    }
+
+    pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
+    where S: AsRef<str>,
+          T: AsRef<str>,
+          I: Iterator<Item=T>,
+    {
+        let synonym = normalize_str(synonym.as_ref());
+        let value = self.synonyms.entry(synonym).or_insert(None);
+        let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
+        match value {
+            Some(v) => v.extend(alternatives),
+            None => *value = Some(Vec::from_iter(alternatives)),
+        }
+    }
+
+    pub fn finalize(self) -> Result<(), Error> {
+        let lease_inner = self.inner.lease_inner();
+        let synonyms = &lease_inner.raw.synonyms;
+        let main = &lease_inner.raw.main;
+
+        let mut delete_whole_synonym_builder = SetBuilder::memory();
+
+        for (synonym, alternatives) in self.synonyms {
+            match alternatives {
+                Some(alternatives) => {
+                    let prev_alternatives = synonyms.alternatives_to(synonym.as_bytes())?;
+                    let prev_alternatives = match prev_alternatives {
+                        Some(alternatives) => alternatives,
+                        None => continue,
+                    };
+
+                    let delta_alternatives = {
+                        let alternatives = SetBuf::from_dirty(alternatives);
+                        let mut builder = SetBuilder::memory();
+                        builder.extend_iter(alternatives).unwrap();
+                        builder.into_inner()
+                            .and_then(fst::Set::from_bytes)
+                            .unwrap()
+                    };
+
+                    let op = OpBuilder::new()
+                        .add(prev_alternatives.stream())
+                        .add(delta_alternatives.stream())
+                        .difference();
+
+                    let (alternatives, empty_alternatives) = {
+                        let mut builder = SetBuilder::memory();
+                        let len = builder.get_ref().len();
+                        builder.extend_stream(op).unwrap();
+                        let is_empty = len == builder.get_ref().len();
+                        let alternatives = builder.into_inner().unwrap();
+                        (alternatives, is_empty)
+                    };
+
+                    if empty_alternatives {
+                        delete_whole_synonym_builder.insert(synonym.as_bytes())?;
+                    } else {
+                        synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?;
+                    }
+                },
+                None => {
+                    delete_whole_synonym_builder.insert(&synonym).unwrap();
+                    synonyms.del_alternatives_of(synonym.as_bytes())?;
+                }
+            }
+        }
+
+        let delta_synonyms = delete_whole_synonym_builder
+            .into_inner()
+            .and_then(fst::Set::from_bytes)
+            .unwrap();
+
+        let synonyms = match main.synonyms_set()? {
+            Some(synonyms) => {
+                let op = OpBuilder::new()
+                    .add(synonyms.stream())
+                    .add(delta_synonyms.stream())
+                    .difference();
+
+                let mut synonyms_builder = SetBuilder::memory();
+                synonyms_builder.extend_stream(op).unwrap();
+                synonyms_builder
+                    .into_inner()
+                    .and_then(fst::Set::from_bytes)
+                    .unwrap()
+            },
+            None => fst::Set::default(),
+        };
+
+        main.set_synonyms_set(&synonyms)?;
+
+        // update the "consistent" view of the Index
+        let words = main.words_set()?.unwrap_or_default();
+        let ranked_map = lease_inner.ranked_map.clone();
+        let schema = lease_inner.schema.clone();
+        let raw = lease_inner.raw.clone();
+        lease_inner.raw.compact();
+
+        let inner = InnerIndex { words, synonyms, schema, ranked_map, raw };
+        self.inner.0.store(Arc::new(inner));
+
+        Ok(())
+    }
+}
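And the counterpart, a sketch of removing either specific alternatives or a whole entry (same illustrative index handle as above):

```rust
// delete_specific_alternatives_of trims the stored set; once it becomes
// empty (or with delete_all_alternatives_of) the base word itself is
// removed from the synonyms fst on finalize().
let mut deletion = index.synonyms_deletion();
deletion.delete_specific_alternatives_of("NY", vec!["NYC"].into_iter());
deletion.delete_all_alternatives_of("subway");
deletion.finalize()?;
```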
diff --git a/meilidb-data/src/database/synonyms_index.rs b/meilidb-data/src/database/synonyms_index.rs
new file mode 100644
index 000000000..dfc0182e4
--- /dev/null
+++ b/meilidb-data/src/database/synonyms_index.rs
@@ -0,0 +1,23 @@
+use crate::database::raw_index::InnerRawIndex;
+
+#[derive(Clone)]
+pub struct SynonymsIndex(pub(crate) InnerRawIndex);
+
+impl SynonymsIndex {
+    pub fn alternatives_to(&self, word: &[u8]) -> Result<Option<fst::Set>, rocksdb::Error> {
+        match self.0.get(word)? {
+            Some(vector) => Ok(Some(fst::Set::from_bytes(vector.to_vec()).unwrap())),
+            None => Ok(None),
+        }
+    }
+
+    pub fn set_alternatives_to(&self, word: &[u8], value: Vec<u8>) -> Result<(), rocksdb::Error> {
+        self.0.set(word, value)?;
+        Ok(())
+    }
+
+    pub fn del_alternatives_of(&self, word: &[u8]) -> Result<(), rocksdb::Error> {
+        self.0.delete(word)?;
+        Ok(())
+    }
+}
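For context, a sketch of the layout this column family implements, with a HashMap standing in for rocksdb: one key per normalized base word, the value being the raw fst bytes of its alternatives set.

```rust
use std::collections::HashMap;

fn demo() {
    let mut cf: HashMap<Vec<u8>, Vec<u8>> = HashMap::new();

    // value = raw fst bytes of the alternatives set (input must be sorted)
    let alternatives = fst::Set::from_iter(vec!["new york", "nyc"]).unwrap();
    cf.insert(b"ny".to_vec(), alternatives.as_fst().as_bytes().to_vec());

    let read = cf.get(&b"ny"[..]).map(|bytes| fst::Set::from_bytes(bytes.clone()).unwrap());
    assert!(read.unwrap().contains("new york"));
}
```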