mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 21:50:07 +01:00
Matching words fixes
This commit is contained in:
parent
e7bb8c940f
commit
7ab48ed8c7
@ -1,8 +1,6 @@
|
|||||||
// #[cfg(test)]
|
// #[cfg(test)]
|
||||||
pub mod detailed;
|
pub mod detailed;
|
||||||
|
|
||||||
pub mod test_logger;
|
|
||||||
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::interner::{Interned, MappedInterner};
|
use super::interner::{Interned, MappedInterner};
|
||||||
|
@ -5,9 +5,7 @@ use std::ops::RangeInclusive;
|
|||||||
use charabia::Token;
|
use charabia::Token;
|
||||||
|
|
||||||
use super::super::interner::Interned;
|
use super::super::interner::Interned;
|
||||||
use super::super::query_term::{
|
use super::super::query_term::LocatedQueryTerm;
|
||||||
Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm,
|
|
||||||
};
|
|
||||||
use super::super::{DedupInterner, Phrase};
|
use super::super::{DedupInterner, Phrase};
|
||||||
use crate::SearchContext;
|
use crate::SearchContext;
|
||||||
|
|
||||||
@ -33,68 +31,16 @@ pub struct MatchingWords {
|
|||||||
words: Vec<LocatedMatchingWords>,
|
words: Vec<LocatedMatchingWords>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract and centralize the different phrases and words to match stored in a QueryTerm.
|
|
||||||
fn extract_matching_terms(term: &QueryTerm) -> (Vec<Interned<Phrase>>, Vec<Interned<String>>) {
|
|
||||||
let mut matching_words = Vec::new();
|
|
||||||
let mut matching_phrases = Vec::new();
|
|
||||||
|
|
||||||
// the structure is exhaustively extracted to ensure that no field is missing.
|
|
||||||
let QueryTerm {
|
|
||||||
original: _,
|
|
||||||
is_multiple_words: _,
|
|
||||||
max_nbr_typos: _,
|
|
||||||
is_prefix: _,
|
|
||||||
zero_typo,
|
|
||||||
one_typo,
|
|
||||||
two_typo,
|
|
||||||
} = term;
|
|
||||||
|
|
||||||
// the structure is exhaustively extracted to ensure that no field is missing.
|
|
||||||
let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo;
|
|
||||||
|
|
||||||
// zero typo
|
|
||||||
if let Some(phrase) = phrase {
|
|
||||||
matching_phrases.push(*phrase);
|
|
||||||
}
|
|
||||||
if let Some(zero_typo) = zero_typo {
|
|
||||||
matching_words.push(*zero_typo);
|
|
||||||
}
|
|
||||||
for synonym in synonyms {
|
|
||||||
matching_phrases.push(*synonym);
|
|
||||||
}
|
|
||||||
|
|
||||||
// one typo
|
|
||||||
// the structure is exhaustively extracted to ensure that no field is missing.
|
|
||||||
if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo {
|
|
||||||
if let Some(split_words) = split_words {
|
|
||||||
matching_phrases.push(*split_words);
|
|
||||||
}
|
|
||||||
for one_typo in one_typo {
|
|
||||||
matching_words.push(*one_typo);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// two typos
|
|
||||||
// the structure is exhaustively extracted to ensure that no field is missing.
|
|
||||||
if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo {
|
|
||||||
for two_typos in two_typos {
|
|
||||||
matching_words.push(*two_typos);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
(matching_phrases, matching_words)
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MatchingWords {
|
impl MatchingWords {
|
||||||
pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
|
pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
|
||||||
let mut phrases = Vec::new();
|
let mut phrases = Vec::new();
|
||||||
let mut words = Vec::new();
|
let mut words = Vec::new();
|
||||||
|
|
||||||
// Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms
|
// Extract and centralize the different phrases and words to match stored in a QueryTerm
|
||||||
// and wrap them in dedicated structures.
|
// and wrap them in dedicated structures.
|
||||||
for located_term in located_terms {
|
for located_term in located_terms {
|
||||||
let term = ctx.term_interner.get(located_term.value);
|
let term = ctx.term_interner.get(located_term.value);
|
||||||
let (matching_phrases, matching_words) = extract_matching_terms(term);
|
let (matching_words, matching_phrases) = term.all_computed_derivations();
|
||||||
|
|
||||||
for matching_phrase in matching_phrases {
|
for matching_phrase in matching_phrases {
|
||||||
phrases.push(LocatedMatchingPhrase {
|
phrases.push(LocatedMatchingPhrase {
|
||||||
@ -106,8 +52,8 @@ impl MatchingWords {
|
|||||||
words.push(LocatedMatchingWords {
|
words.push(LocatedMatchingWords {
|
||||||
value: matching_words,
|
value: matching_words,
|
||||||
positions: located_term.positions.clone(),
|
positions: located_term.positions.clone(),
|
||||||
is_prefix: term.is_prefix,
|
is_prefix: term.is_cached_prefix(),
|
||||||
original_char_count: ctx.word_interner.get(term.original).chars().count(),
|
original_char_count: term.original_word(&ctx).chars().count(),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -137,7 +137,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||||||
}
|
}
|
||||||
// partial match is now full, we keep this matches and we advance positions
|
// partial match is now full, we keep this matches and we advance positions
|
||||||
Some(MatchType::Full { char_len, ids }) => {
|
Some(MatchType::Full { char_len, ids }) => {
|
||||||
let ids: Vec<_> = ids.clone().into_iter().collect();
|
let ids: Vec<_> = ids.clone().collect();
|
||||||
// save previously matched tokens as matches.
|
// save previously matched tokens as matches.
|
||||||
let iter = potential_matches.into_iter().map(
|
let iter = potential_matches.into_iter().map(
|
||||||
|(token_position, word_position, match_len)| Match {
|
|(token_position, word_position, match_len)| Match {
|
||||||
@ -192,7 +192,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||||||
// we match, we save the current token as a match,
|
// we match, we save the current token as a match,
|
||||||
// then we continue the rest of the tokens.
|
// then we continue the rest of the tokens.
|
||||||
MatchType::Full { char_len, ids } => {
|
MatchType::Full { char_len, ids } => {
|
||||||
let ids: Vec<_> = ids.clone().into_iter().collect();
|
let ids: Vec<_> = ids.clone().collect();
|
||||||
matches.push(Match {
|
matches.push(Match {
|
||||||
match_len: char_len,
|
match_len: char_len,
|
||||||
ids,
|
ids,
|
||||||
|
@ -35,20 +35,20 @@ pub use logger::detailed::DetailedSearchLogger;
|
|||||||
pub use logger::{DefaultSearchLogger, SearchLogger};
|
pub use logger::{DefaultSearchLogger, SearchLogger};
|
||||||
use query_graph::{QueryGraph, QueryNode};
|
use query_graph::{QueryGraph, QueryNode};
|
||||||
use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm};
|
use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm};
|
||||||
use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
|
use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
|
||||||
use resolve_query_graph::PhraseDocIdsCache;
|
use resolve_query_graph::PhraseDocIdsCache;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use words::Words;
|
use words::Words;
|
||||||
|
|
||||||
use self::bucket_sort::BucketSortOutput;
|
|
||||||
use self::exact_attribute::ExactAttribute;
|
|
||||||
use self::graph_based_ranking_rule::Exactness;
|
|
||||||
use self::interner::Interner;
|
|
||||||
use self::ranking_rules::{BoxRankingRule, RankingRule};
|
|
||||||
use self::resolve_query_graph::compute_query_graph_docids;
|
|
||||||
use self::sort::Sort;
|
|
||||||
use crate::search::new::distinct::apply_distinct_rule;
|
use crate::search::new::distinct::apply_distinct_rule;
|
||||||
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
|
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
|
||||||
|
use bucket_sort::BucketSortOutput;
|
||||||
|
use exact_attribute::ExactAttribute;
|
||||||
|
use graph_based_ranking_rule::Exactness;
|
||||||
|
use interner::Interner;
|
||||||
|
use ranking_rules::{BoxRankingRule, RankingRule};
|
||||||
|
use resolve_query_graph::compute_query_graph_docids;
|
||||||
|
use sort::Sort;
|
||||||
|
|
||||||
/// A structure used throughout the execution of a search query.
|
/// A structure used throughout the execution of a search query.
|
||||||
pub struct SearchContext<'ctx> {
|
pub struct SearchContext<'ctx> {
|
||||||
@ -361,6 +361,7 @@ pub fn execute_search(
|
|||||||
Ok(PartialSearchResult {
|
Ok(PartialSearchResult {
|
||||||
candidates: all_candidates,
|
candidates: all_candidates,
|
||||||
documents_ids: docids,
|
documents_ids: docids,
|
||||||
|
located_query_terms,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -188,7 +188,8 @@ impl QueryTermSubset {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let original = ctx.term_interner.get_mut(self.original);
|
let original = ctx.term_interner.get_mut(self.original);
|
||||||
if !self.zero_typo_subset.is_empty() {
|
match &self.zero_typo_subset {
|
||||||
|
NTypoTermSubset::All => {
|
||||||
let ZeroTypoTerm {
|
let ZeroTypoTerm {
|
||||||
phrase: _,
|
phrase: _,
|
||||||
exact: zero_typo,
|
exact: zero_typo,
|
||||||
@ -198,7 +199,24 @@ impl QueryTermSubset {
|
|||||||
} = &original.zero_typo;
|
} = &original.zero_typo;
|
||||||
result.extend(zero_typo.iter().copied());
|
result.extend(zero_typo.iter().copied());
|
||||||
result.extend(prefix_of.iter().copied());
|
result.extend(prefix_of.iter().copied());
|
||||||
};
|
}
|
||||||
|
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||||
|
let ZeroTypoTerm {
|
||||||
|
phrase: _,
|
||||||
|
exact: zero_typo,
|
||||||
|
prefix_of,
|
||||||
|
synonyms: _,
|
||||||
|
use_prefix_db: _,
|
||||||
|
} = &original.zero_typo;
|
||||||
|
if let Some(zero_typo) = zero_typo {
|
||||||
|
if words.contains(zero_typo) {
|
||||||
|
result.insert(*zero_typo);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.extend(prefix_of.intersection(words).copied());
|
||||||
|
}
|
||||||
|
NTypoTermSubset::Nothing => {}
|
||||||
|
}
|
||||||
|
|
||||||
match &self.one_typo_subset {
|
match &self.one_typo_subset {
|
||||||
NTypoTermSubset::All => {
|
NTypoTermSubset::All => {
|
||||||
@ -248,12 +266,25 @@ impl QueryTermSubset {
|
|||||||
result.extend(phrase.iter().copied());
|
result.extend(phrase.iter().copied());
|
||||||
result.extend(synonyms.iter().copied());
|
result.extend(synonyms.iter().copied());
|
||||||
|
|
||||||
if !self.one_typo_subset.is_empty() {
|
match &self.one_typo_subset {
|
||||||
|
NTypoTermSubset::All => {
|
||||||
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
|
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
|
||||||
panic!();
|
panic!();
|
||||||
};
|
};
|
||||||
result.extend(split_words.iter().copied());
|
result.extend(split_words.iter().copied());
|
||||||
}
|
}
|
||||||
|
NTypoTermSubset::Subset { phrases, .. } => {
|
||||||
|
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
|
||||||
|
panic!();
|
||||||
|
};
|
||||||
|
if let Some(split_words) = split_words {
|
||||||
|
if phrases.contains(split_words) {
|
||||||
|
result.insert(*split_words);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
NTypoTermSubset::Nothing => {}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
@ -368,3 +399,34 @@ impl LocatedQueryTerm {
|
|||||||
interner.get(self.value).is_empty()
|
interner.get(self.value).is_empty()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl QueryTerm {
|
||||||
|
pub fn is_cached_prefix(&self) -> bool {
|
||||||
|
self.zero_typo.use_prefix_db.is_some()
|
||||||
|
}
|
||||||
|
pub fn original_word(&self, ctx: &SearchContext) -> String {
|
||||||
|
ctx.word_interner.get(self.original).clone()
|
||||||
|
}
|
||||||
|
pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
|
||||||
|
let mut words = BTreeSet::new();
|
||||||
|
let mut phrases = BTreeSet::new();
|
||||||
|
|
||||||
|
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
|
||||||
|
&self.zero_typo;
|
||||||
|
words.extend(zero_typo.iter().copied());
|
||||||
|
words.extend(prefix_of.iter().copied());
|
||||||
|
phrases.extend(phrase.iter().copied());
|
||||||
|
phrases.extend(synonyms.iter().copied());
|
||||||
|
|
||||||
|
if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = &self.one_typo {
|
||||||
|
words.extend(one_typo.iter().copied());
|
||||||
|
phrases.extend(split_words.iter().copied());
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Lazy::Init(TwoTypoTerm { two_typos }) = &self.two_typo {
|
||||||
|
words.extend(two_typos.iter().copied());
|
||||||
|
};
|
||||||
|
|
||||||
|
(words.into_iter().collect(), phrases.into_iter().collect())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user