mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-27 07:14:26 +01:00
feat: Support query words splits
This commit is contained in:
parent
9cc154da05
commit
f5d52396f5
@ -67,6 +67,29 @@ pub fn normalize_str(string: &str) -> String {
|
|||||||
string
|
string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn split_best_frequency<'a, S: Store>(
|
||||||
|
word: &'a str,
|
||||||
|
store: &S,
|
||||||
|
) -> Result<Option<(&'a str, &'a str)>, S::Error>
|
||||||
|
{
|
||||||
|
let chars = word.char_indices().skip(1);
|
||||||
|
let mut best = None;
|
||||||
|
|
||||||
|
for (i, _) in chars {
|
||||||
|
let (left, right) = word.split_at(i);
|
||||||
|
|
||||||
|
let left_freq = store.word_indexes(left.as_bytes())?.map_or(0, |i| i.len());
|
||||||
|
let right_freq = store.word_indexes(right.as_bytes())?.map_or(0, |i| i.len());
|
||||||
|
let min_freq = cmp::min(left_freq, right_freq);
|
||||||
|
|
||||||
|
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
||||||
|
best = Some((min_freq, left, right));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(best.map(|(_, l, r)| (l, r)))
|
||||||
|
}
|
||||||
|
|
||||||
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
|
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
|
||||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||||
@ -81,13 +104,6 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
|
|||||||
while let Some(ngram_slice) = ngrams.next() {
|
while let Some(ngram_slice) = ngrams.next() {
|
||||||
let ngram_nb_words = ngram_slice.len();
|
let ngram_nb_words = ngram_slice.len();
|
||||||
let ngram = ngram_slice.join(" ");
|
let ngram = ngram_slice.join(" ");
|
||||||
let concat = ngram_slice.concat();
|
|
||||||
|
|
||||||
// automaton of concatenation of query words
|
|
||||||
let normalized = normalize_str(&concat);
|
|
||||||
let lev = build_dfa(&normalized);
|
|
||||||
let automaton = Automaton::exact(query_index, lev);
|
|
||||||
automatons.push((automaton, normalized));
|
|
||||||
|
|
||||||
let has_following_word = ngrams.peek().is_some();
|
let has_following_word = ngrams.peek().is_some();
|
||||||
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||||
@ -125,9 +141,29 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
|
|||||||
}
|
}
|
||||||
|
|
||||||
if n == 1 {
|
if n == 1 {
|
||||||
let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
|
// TODO we do not support "phrase query" in other words:
|
||||||
let automaton = Automaton::exact(query_index, lev);
|
// first term *must* follow the second term
|
||||||
|
if let Some((left, right)) = split_best_frequency(&ngram, store)? {
|
||||||
|
let automaton = Automaton::exact(query_index, left);
|
||||||
|
automatons.push((automaton, left.to_owned()));
|
||||||
|
|
||||||
|
let automaton = Automaton::exact(query_index, right);
|
||||||
|
automatons.push((automaton, right.to_owned()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let automaton = if not_prefix_dfa {
|
||||||
|
Automaton::exact(query_index, &ngram)
|
||||||
|
} else {
|
||||||
|
Automaton::prefix_exact(query_index, &ngram)
|
||||||
|
};
|
||||||
automatons.push((automaton, ngram));
|
automatons.push((automaton, ngram));
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// automaton of concatenation of query words
|
||||||
|
let concat = ngram_slice.concat();
|
||||||
|
let normalized = normalize_str(&concat);
|
||||||
|
let automaton = Automaton::exact(query_index, &normalized);
|
||||||
|
automatons.push((automaton, normalized));
|
||||||
}
|
}
|
||||||
|
|
||||||
query_index += 1;
|
query_index += 1;
|
||||||
@ -568,6 +604,28 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn simple() {
|
||||||
|
let store = InMemorySetStore::from_iter(vec![
|
||||||
|
("iphone", &[doc_char_index(0, 0, 0)][..]),
|
||||||
|
("from", &[doc_char_index(0, 1, 1)][..]),
|
||||||
|
("apple", &[doc_char_index(0, 2, 2)][..]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let builder = QueryBuilder::new(&store);
|
||||||
|
let results = builder.query("iphone from apple", 0..20).unwrap();
|
||||||
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
|
let mut matches = matches.into_iter();
|
||||||
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
|
assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 1, .. }));
|
||||||
|
assert_matches!(matches.next(), Some(Match { query_index: 2, word_index: 2, .. }));
|
||||||
|
assert_matches!(matches.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn simple_synonyms() {
|
fn simple_synonyms() {
|
||||||
let mut store = InMemorySetStore::from_iter(vec![
|
let mut store = InMemorySetStore::from_iter(vec![
|
||||||
@ -943,6 +1001,12 @@ mod tests {
|
|||||||
("blue", &[doc_char_index(1, 1, 1)][..]),
|
("blue", &[doc_char_index(1, 1, 1)][..]),
|
||||||
("subway", &[doc_char_index(1, 2, 2)][..]),
|
("subway", &[doc_char_index(1, 2, 2)][..]),
|
||||||
("broken", &[doc_char_index(1, 3, 3)][..]),
|
("broken", &[doc_char_index(1, 3, 3)][..]),
|
||||||
|
|
||||||
|
("new", &[doc_char_index(2, 0, 0)][..]),
|
||||||
|
("york", &[doc_char_index(2, 1, 1)][..]),
|
||||||
|
("underground", &[doc_char_index(2, 2, 2)][..]),
|
||||||
|
("train", &[doc_char_index(2, 3, 3)][..]),
|
||||||
|
("broken", &[doc_char_index(2, 4, 4)][..]),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]));
|
store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]));
|
||||||
@ -953,6 +1017,16 @@ mod tests {
|
|||||||
let results = builder.query("new york underground train broken", 0..20).unwrap();
|
let results = builder.query("new york underground train broken", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // york
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // new
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 0, char_index: 1, .. })); // york
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 1, char_index: 2, .. })); // underground
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 2, char_index: 3, .. })); // train
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 4, word_index: 3, char_index: 4, .. })); // broken
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york
|
||||||
@ -972,6 +1046,16 @@ mod tests {
|
|||||||
let results = builder.query("new york city underground train broken", 0..20).unwrap();
|
let results = builder.query("new york city underground train broken", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // york
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // new
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 0, char_index: 1, .. })); // york
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 1, char_index: 2, .. })); // underground
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 4, word_index: 2, char_index: 3, .. })); // train
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 5, word_index: 3, char_index: 4, .. })); // broken
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york city
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york city
|
||||||
@ -1043,9 +1127,7 @@ mod tests {
|
|||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 0, .. })); // téléphone
|
assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 1, .. })); // téléphone
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 1, .. })); // telephone
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 2, .. })); // télephone
|
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
@ -1071,4 +1153,38 @@ mod tests {
|
|||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn simple_split() {
|
||||||
|
let store = InMemorySetStore::from_iter(vec![
|
||||||
|
("porte", &[doc_char_index(0, 0, 0)][..]),
|
||||||
|
("feuille", &[doc_char_index(0, 1, 1)][..]),
|
||||||
|
("search", &[doc_char_index(1, 0, 0)][..]),
|
||||||
|
("engine", &[doc_char_index(1, 1, 1)][..]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let builder = QueryBuilder::new(&store);
|
||||||
|
let results = builder.query("portefeuille", 0..20).unwrap();
|
||||||
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // porte
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // feuille
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
|
let builder = QueryBuilder::new(&store);
|
||||||
|
let results = builder.query("searchengine", 0..20).unwrap();
|
||||||
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // search
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // engine
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user