mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-27 07:14:26 +01:00
feat: Make synonyms be not considered like exact matches
This commit is contained in:
parent
43f11e929d
commit
3dcbc737f3
@ -20,7 +20,24 @@ use crate::{Match, DocumentId, Store, RawDocument, Document};
|
|||||||
|
|
||||||
const NGRAMS: usize = 3;
|
const NGRAMS: usize = 3;
|
||||||
|
|
||||||
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<(usize, DfaExt)>, S::Error> {
|
struct Automaton {
|
||||||
|
index: usize,
|
||||||
|
is_synonym: bool,
|
||||||
|
number_words: usize,
|
||||||
|
dfa: DfaExt,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Automaton {
|
||||||
|
fn synonym(index: usize, number_words: usize, dfa: DfaExt) -> Automaton {
|
||||||
|
Automaton { index, is_synonym: true, number_words, dfa }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn original(index: usize, number_words: usize, dfa: DfaExt) -> Automaton {
|
||||||
|
Automaton { index, is_synonym: false, number_words, dfa }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
|
||||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||||
let mut automatons = Vec::new();
|
let mut automatons = Vec::new();
|
||||||
@ -54,25 +71,28 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<(usize, D
|
|||||||
while let Some(synonyms) = stream.next() {
|
while let Some(synonyms) = stream.next() {
|
||||||
|
|
||||||
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
||||||
|
let nb_synonym_words = split_query_string(synonyms).count();
|
||||||
for synonym in split_query_string(synonyms) {
|
for synonym in split_query_string(synonyms) {
|
||||||
let lev = build_dfa(synonym);
|
let lev = build_dfa(synonym);
|
||||||
automatons.push((index, synonym.to_owned(), lev));
|
let automaton = Automaton::synonym(index, nb_synonym_words, lev);
|
||||||
|
automatons.push((automaton, synonym.to_owned()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if n == 1 {
|
if n == 1 {
|
||||||
automatons.push((index, ngram, lev));
|
let automaton = Automaton::original(index, ngram_nb_words, lev);
|
||||||
|
automatons.push((automaton, ngram));
|
||||||
}
|
}
|
||||||
|
|
||||||
index += 1;
|
index += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
automatons.sort_unstable_by(|a, b| (a.0, &a.1).cmp(&(b.0, &b.1)));
|
automatons.sort_unstable_by(|a, b| (a.0.index, &a.1).cmp(&(b.0.index, &b.1)));
|
||||||
automatons.dedup_by(|a, b| (a.0, &a.1) == (b.0, &b.1));
|
automatons.dedup_by(|a, b| (a.0.index, &a.1) == (b.0.index, &b.1));
|
||||||
let automatons = automatons.into_iter().map(|(i, _, a)| (i, a)).collect();
|
let automatons = automatons.into_iter().map(|(a, _)| a).collect();
|
||||||
|
|
||||||
Ok(automatons)
|
Ok(automatons)
|
||||||
}
|
}
|
||||||
@ -129,8 +149,8 @@ where S: Store,
|
|||||||
|
|
||||||
let mut stream = {
|
let mut stream = {
|
||||||
let mut op_builder = fst::raw::OpBuilder::new();
|
let mut op_builder = fst::raw::OpBuilder::new();
|
||||||
for (_index, automaton) in &automatons {
|
for Automaton { dfa, .. } in &automatons {
|
||||||
let stream = words.search(automaton);
|
let stream = words.search(dfa);
|
||||||
op_builder.push(stream);
|
op_builder.push(stream);
|
||||||
}
|
}
|
||||||
op_builder.r#union()
|
op_builder.r#union()
|
||||||
@ -140,9 +160,9 @@ where S: Store,
|
|||||||
|
|
||||||
while let Some((input, indexed_values)) = stream.next() {
|
while let Some((input, indexed_values)) = stream.next() {
|
||||||
for iv in indexed_values {
|
for iv in indexed_values {
|
||||||
let (index, automaton) = &automatons[iv.index];
|
let Automaton { index, is_synonym, number_words, ref dfa } = automatons[iv.index];
|
||||||
let distance = automaton.eval(input).to_u8();
|
let distance = dfa.eval(input).to_u8();
|
||||||
let is_exact = distance == 0 && input.len() == automaton.query_len();
|
let is_exact = (is_synonym && number_words == 1) || (!is_synonym && distance == 0 && input.len() == dfa.query_len());
|
||||||
|
|
||||||
let doc_indexes = self.store.word_indexes(input)?;
|
let doc_indexes = self.store.word_indexes(input)?;
|
||||||
let doc_indexes = match doc_indexes {
|
let doc_indexes = match doc_indexes {
|
||||||
@ -153,8 +173,8 @@ where S: Store,
|
|||||||
for di in doc_indexes.as_slice() {
|
for di in doc_indexes.as_slice() {
|
||||||
if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
|
if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
|
||||||
let match_ = Match {
|
let match_ = Match {
|
||||||
query_index: *index as u32,
|
query_index: index as u32,
|
||||||
distance,
|
distance: distance,
|
||||||
attribute: di.attribute,
|
attribute: di.attribute,
|
||||||
word_index: di.word_index,
|
word_index: di.word_index,
|
||||||
is_exact,
|
is_exact,
|
||||||
@ -716,6 +736,12 @@ mod tests {
|
|||||||
let results = builder.query("NY subway", 0..20).unwrap();
|
let results = builder.query("NY subway", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
|
||||||
@ -724,18 +750,18 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
|
||||||
let mut iter = matches.into_iter();
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
|
|
||||||
assert_matches!(iter.next(), None);
|
|
||||||
});
|
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
let builder = QueryBuilder::new(&store);
|
let builder = QueryBuilder::new(&store);
|
||||||
let results = builder.query("NYC subway", 0..20).unwrap();
|
let results = builder.query("NYC subway", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
|
||||||
@ -744,12 +770,6 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
|
||||||
let mut iter = matches.into_iter();
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
|
|
||||||
assert_matches!(iter.next(), None);
|
|
||||||
});
|
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -776,6 +796,12 @@ mod tests {
|
|||||||
let results = builder.query("NY subway", 0..20).unwrap();
|
let results = builder.query("NY subway", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
|
||||||
@ -784,18 +810,18 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
|
||||||
let mut iter = matches.into_iter();
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
|
|
||||||
assert_matches!(iter.next(), None);
|
|
||||||
});
|
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
let builder = QueryBuilder::new(&store);
|
let builder = QueryBuilder::new(&store);
|
||||||
let results = builder.query("NYC subway", 0..20).unwrap();
|
let results = builder.query("NYC subway", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
|
||||||
@ -804,12 +830,6 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
|
||||||
let mut iter = matches.into_iter();
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
|
|
||||||
assert_matches!(iter.next(), None);
|
|
||||||
});
|
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -860,6 +880,12 @@ mod tests {
|
|||||||
let results = builder.query("NYC subway", 0..20).unwrap();
|
let results = builder.query("NYC subway", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
|
||||||
@ -869,12 +895,6 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
|
||||||
let mut iter = matches.into_iter();
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
|
|
||||||
assert_matches!(iter.next(), None);
|
|
||||||
});
|
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user