From e8b2e860074bb0d1d22665147266c1118391f4ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 13 Jun 2019 15:47:49 +0200
Subject: [PATCH] feat: Introduce a basic way to handle synonyms
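
Each query word is now expanded with its synonyms before the DFAs are
built: for every word we push one automaton per synonym plus one for the
word itself, and tag each automaton with the index of the originating
query word, so a match found through a synonym reports the same query
position as the original word would. The synonym map is hardcoded inside
query_all() for now (hello/bonjour/salut), a placeholder that is just
enough to exercise the two new tests.

An illustrative expansion, assuming the hardcoded map below:

    // query: "hello"
    // synonyms: "hello" => ["bonjour", "salut"]
    // automatons built (synonyms first, then the word itself),
    // all sharing query index 0:
    //   [(0, dfa("bonjour")), (0, dfa("salut")), (0, dfa("hello"))]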
---
 meilidb-core/Cargo.toml           |   3 +
 meilidb-core/src/lib.rs           |   3 +
 meilidb-core/src/query_builder.rs | 212 ++++++++++++++++++++++++++++--
 3 files changed, 207 insertions(+), 11 deletions(-)

diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml
index 8790889e2..8c1d4ee36 100644
--- a/meilidb-core/Cargo.toml
+++ b/meilidb-core/Cargo.toml
@@ -25,6 +25,9 @@ git = "https://github.com/Kerollmops/levenshtein-automata.git"
 branch = "arc-byte-slice"
 features = ["fst_automaton"]
 
+[dev-dependencies]
+assert_matches = "1.3"
+
 [features]
 i128 = ["byteorder/i128"]
 nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs
index e61fa543b..bb2de2dec 100644
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@@ -1,3 +1,6 @@
+#[cfg(test)]
+#[macro_use] extern crate assert_matches;
+
 mod automaton;
 mod distinct_map;
 mod query_builder;
diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index f9f51dba2..6e8194e2f 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -17,19 +17,28 @@ use crate::criterion::Criteria;
 use crate::raw_documents_from_matches;
 use crate::{Match, DocumentId, Store, RawDocument, Document};
 
-fn generate_automatons(query: &str) -> Vec<DfaExt> {
+fn generate_automatons(query: &str, synonyms: &HashMap<&str, &[&str]>) -> Vec<(usize, DfaExt)> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
     let mut groups = split_query_string(query).map(str::to_lowercase).peekable();
     let mut automatons = Vec::new();
+    let mut index = 0;
 
     while let Some(word) = groups.next() {
+        let word = word.as_str();
         let has_following_word = groups.peek().is_some();
-        let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
-            automaton::build_dfa(&word)
-        } else {
-            automaton::build_prefix_dfa(&word)
-        };
-        automatons.push(lev);
+        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
+        let words = synonyms.get(word).cloned().unwrap_or_default().iter().chain(Some(&word));
+
+        for word in words {
+            let lev = if not_prefix_dfa {
+                automaton::build_dfa(word)
+            } else {
+                automaton::build_prefix_dfa(word)
+            };
+            automatons.push((index, lev));
+        }
+
+        index += 1;
     }
 
     automatons
@@ -82,12 +91,22 @@ impl<'c, S, FI> QueryBuilder<'c, S, FI>
 where S: Store,
 {
     fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
-        let automatons = generate_automatons(query);
+        let map = {
+            let mut map = HashMap::new();
+
+            map.insert("hello", &["bonjour", "salut"][..]);
+            map.insert("bonjour", &["hello", "salut"]);
+            map.insert("salut", &["hello", "bonjour"]);
+
+            map
+        };
+
+        let automatons = generate_automatons(query, &map);
         let words = self.store.words()?.as_fst();
 
         let mut stream = {
             let mut op_builder = fst::raw::OpBuilder::new();
-            for automaton in &automatons {
+            for (_index, automaton) in &automatons {
                 let stream = words.search(automaton);
                 op_builder.push(stream);
             }
@@ -98,7 +117,7 @@ where S: Store,
 
         while let Some((input, indexed_values)) = stream.next() {
             for iv in indexed_values {
-                let automaton = &automatons[iv.index];
+                let (index, automaton) = &automatons[iv.index];
                 let distance = automaton.eval(input).to_u8();
                 let is_exact = distance == 0 && input.len() == automaton.query_len();
 
@@ -111,7 +130,7 @@ where S: Store,
             for di in doc_indexes.as_slice() {
                 if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
                     let match_ = Match {
-                        query_index: iv.index as u32,
+                        query_index: *index as u32,
                         distance,
                         attribute: di.attribute,
                         word_index: di.word_index,
@@ -321,3 +340,174 @@ where S: Store,
         Ok(out_documents)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::collections::{BTreeSet, HashMap};
+    use std::iter::FromIterator;
+
+    use sdset::SetBuf;
+    use fst::Set;
+
+    use crate::DocIndex;
+    use crate::store::Store;
+
+    #[derive(Default)]
+    struct InMemorySetStore {
+        set: Set,
+        indexes: HashMap<Vec<u8>, SetBuf<DocIndex>>,
+    }
+
+    impl Store for InMemorySetStore {
+        type Error = std::io::Error;
+
+        fn words(&self) -> Result<&Set, Self::Error> {
+            Ok(&self.set)
+        }
+
+        fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
+            Ok(self.indexes.get(word).cloned())
+        }
+    }
+
+    impl<'a> FromIterator<(&'a [u8], &'a [DocIndex])> for InMemorySetStore {
+        fn from_iter<I: IntoIterator<Item=(&'a [u8], &'a [DocIndex])>>(iter: I) -> Self {
+            let mut tree = BTreeSet::new();
+            let mut map = HashMap::new();
+
+            for (word, indexes) in iter {
+                tree.insert(word);
+                map.insert(word.to_vec(), SetBuf::from_dirty(indexes.to_vec()));
+            }
+
+            InMemorySetStore {
+                set: Set::from_iter(tree).unwrap(),
+                indexes: map,
+            }
+        }
+    }
+
+    const fn doc_index(document_id: u64, word_index: u16) -> DocIndex {
+        DocIndex {
+            document_id: DocumentId(document_id),
+            attribute: 0,
+            word_index,
+            char_index: 0,
+            char_length: 0,
+        }
+    }
+
+    #[test]
+    fn simple_synonyms() {
+        let store = InMemorySetStore::from_iter(vec![
+            (&b"hello"[..], &[doc_index(0, 0)][..]),
+        ]);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("hello", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("bonjour", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), None);
+    }
+
+    #[test]
+    fn harder_synonyms() {
+        let store = InMemorySetStore::from_iter(vec![
+            (&b"hello"[..], &[doc_index(0, 0)][..]),
+            (&b"bonjour"[..], &[doc_index(1, 3)]),
+            (&b"salut"[..], &[doc_index(2, 5)]),
+        ]);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("hello", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 0);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 3);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
+            assert_eq!(matches.len(), 1);
+            let match_ = matches[0];
+            assert_eq!(match_.query_index, 0);
+            assert_eq!(match_.word_index, 5);
+        });
+        assert_matches!(iter.next(), None);
+
+        let builder = QueryBuilder::new(&store);
+        let results = builder.query("bonjour", 0..20).unwrap();
builder.query("bonjour", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 3); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 5); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("salut", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 0); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 3); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => { + assert_eq!(matches.len(), 1); + let match_ = matches[0]; + assert_eq!(match_.query_index, 0); + assert_eq!(match_.word_index, 5); + }); + assert_matches!(iter.next(), None); + } +}