From f5d52396f5e375883728f49b71df4230ef09dffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 27 Jun 2019 15:23:19 +0200 Subject: [PATCH] feat: Support query words splits --- meilidb-core/src/query_builder.rs | 148 ++++++++++++++++++++++++++---- 1 file changed, 132 insertions(+), 16 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 5448c537b..b02caa30c 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -67,6 +67,29 @@ pub fn normalize_str(string: &str) -> String { string } +fn split_best_frequency<'a, S: Store>( + word: &'a str, + store: &S, +) -> Result, S::Error> +{ + let chars = word.char_indices().skip(1); + let mut best = None; + + for (i, _) in chars { + let (left, right) = word.split_at(i); + + let left_freq = store.word_indexes(left.as_bytes())?.map_or(0, |i| i.len()); + let right_freq = store.word_indexes(right.as_bytes())?.map_or(0, |i| i.len()); + let min_freq = cmp::min(left_freq, right_freq); + + if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { + best = Some((min_freq, left, right)); + } + } + + Ok(best.map(|(_, l, r)| (l, r))) +} + fn generate_automatons(query: &str, store: &S) -> Result, S::Error> { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); @@ -81,13 +104,6 @@ fn generate_automatons(query: &str, store: &S) -> Result(query: &str, store: &S) -> Result { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 1, .. })); + assert_matches!(matches.next(), Some(Match { query_index: 2, word_index: 2, .. })); + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), None); + } + #[test] fn simple_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ @@ -754,12 +812,12 @@ mod tests { /// Unique word has multi-word synonyms fn unique_to_multiword_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ - ("new", &[doc_char_index(0, 0, 0)][..]), - ("york", &[doc_char_index(0, 1, 1)][..]), - ("city", &[doc_char_index(0, 2, 2)][..]), + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), ("subway", &[doc_char_index(0, 3, 3)][..]), - ("NY", &[doc_char_index(1, 0, 0)][..]), + ("NY", &[doc_char_index(1, 0, 0)][..]), ("subway", &[doc_char_index(1, 1, 1)][..]), ]); @@ -943,6 +1001,12 @@ mod tests { ("blue", &[doc_char_index(1, 1, 1)][..]), ("subway", &[doc_char_index(1, 2, 2)][..]), ("broken", &[doc_char_index(1, 3, 3)][..]), + + ("new", &[doc_char_index(2, 0, 0)][..]), + ("york", &[doc_char_index(2, 1, 1)][..]), + ("underground", &[doc_char_index(2, 2, 2)][..]), + ("train", &[doc_char_index(2, 3, 3)][..]), + ("broken", &[doc_char_index(2, 4, 4)][..]), ]); store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"])); @@ -953,6 +1017,16 @@ mod tests { let results = builder.query("new york underground train broken", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // york + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // new + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 0, char_index: 1, .. })); // york + assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 1, char_index: 2, .. })); // underground + assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 2, char_index: 3, .. })); // train + assert_matches!(iter.next(), Some(Match { query_index: 4, word_index: 3, char_index: 4, .. })); // broken + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york @@ -972,6 +1046,16 @@ mod tests { let results = builder.query("new york city underground train broken", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // york + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // new + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 0, char_index: 1, .. })); // york + assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 1, char_index: 2, .. })); // underground + assert_matches!(iter.next(), Some(Match { query_index: 4, word_index: 2, char_index: 3, .. })); // train + assert_matches!(iter.next(), Some(Match { query_index: 5, word_index: 3, char_index: 4, .. })); // broken + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york city @@ -1043,9 +1127,7 @@ mod tests { }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 0, .. })); // téléphone - assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 1, .. })); // telephone - assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 2, .. })); // télephone + assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 1, .. })); // téléphone assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1071,4 +1153,38 @@ mod tests { }); assert_matches!(iter.next(), None); } + + #[test] + fn simple_split() { + let store = InMemorySetStore::from_iter(vec![ + ("porte", &[doc_char_index(0, 0, 0)][..]), + ("feuille", &[doc_char_index(0, 1, 1)][..]), + ("search", &[doc_char_index(1, 0, 0)][..]), + ("engine", &[doc_char_index(1, 1, 1)][..]), + ]); + + let builder = QueryBuilder::new(&store); + let results = builder.query("portefeuille", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // porte + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // feuille + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("searchengine", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // search + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // engine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } }