Don't compute split_words for phrases
This commit is contained in:
parent a37da36766
commit 5758268866
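The change in one sentence: split-word derivations (recombining a single word into two indexed words, e.g. "sunflower" into "sun flower") make no sense for quoted phrases, which must match verbatim, so phrase terms now skip that computation; as a side effect, the old max_nbr_typos accessor becomes max_typo_cost, because a term with a zero edit-distance budget that still allows split words pays a typo cost of 1. A minimal sketch of the new guard, using simplified stand-in types rather than milli's actual interned structs:

    // Simplified stand-ins for milli's QueryTerm/ZeroTypoTerm (the real ones
    // hold interned ids and lazily computed derivation sets).
    #[derive(Default)]
    struct ZeroTypoTerm {
        // Set for terms built from a quoted phrase.
        phrase: Option<String>,
    }

    struct QueryTerm {
        zero_typo: ZeroTypoTerm,
        max_levenshtein_distance: u8,
    }

    impl QueryTerm {
        // The guard added by this commit: phrases must match verbatim, so
        // deriving split words for them would be wasted work.
        fn allows_split_words(&self) -> bool {
            self.zero_typo.phrase.is_none()
        }
    }

    fn main() {
        let word = QueryTerm { zero_typo: ZeroTypoTerm::default(), max_levenshtein_distance: 1 };
        let phrase = QueryTerm {
            zero_typo: ZeroTypoTerm { phrase: Some("sun flower".into()) },
            max_levenshtein_distance: 0,
        };
        assert!(word.allows_split_words());
        assert!(!phrase.allows_split_words());
    }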
@@ -462,7 +462,7 @@ fill: \"#B6E2D3\"
 shape: class
 max_nbr_typo: {}",
                 term_subset.description(ctx),
-                term_subset.max_nbr_typos(ctx)
+                term_subset.max_typo_cost(ctx)
             )?;
 
             for w in term_subset.all_single_words_except_prefix_db(ctx)? {
@@ -28,14 +28,14 @@ pub enum ZeroOrOneTypo {
 impl Interned<QueryTerm> {
     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
         let s = ctx.term_interner.get_mut(self);
-        if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() {
+        if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() {
             assert!(s.two_typo.is_uninit());
             // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
             self.initialize_one_typo_subterm(ctx)?;
             let s = ctx.term_interner.get_mut(self);
             assert!(s.one_typo.is_init());
             s.two_typo = Lazy::Init(TwoTypoTerm::default());
-        } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
+        } else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() {
             assert!(s.two_typo.is_uninit());
             self.initialize_one_and_two_typo_subterm(ctx)?;
             let s = ctx.term_interner.get_mut(self);
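The one_typo and two_typo slots touched here are lazily initialized; the comment explains that one_typo must be filled even with a zero typo budget, because split-word derivations are stored in the one-typo slot. A stand-in for the Lazy type, reconstructed from its usage in this diff (the real definition lives elsewhere in milli and may differ):

    // Two-state lazy slot, reconstructed from how it is used in this diff.
    enum Lazy<T> {
        Uninit,
        Init(T),
    }

    impl<T> Lazy<T> {
        fn is_uninit(&self) -> bool {
            matches!(self, Lazy::Uninit)
        }
        fn is_init(&self) -> bool {
            matches!(self, Lazy::Init(_))
        }
    }

    fn main() {
        // compute_fully_if_needed fills one_typo even with a 0-distance
        // budget, because that is where split-word derivations live.
        let mut one_typo: Lazy<Vec<String>> = Lazy::Uninit;
        assert!(one_typo.is_uninit());
        one_typo = Lazy::Init(vec!["sun flower".to_string()]);
        assert!(one_typo.is_init());
    }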
@@ -185,7 +185,7 @@ pub fn partially_initialized_term_from_word(
             original: ctx.word_interner.insert(word.to_owned()),
             ngram_words: None,
             is_prefix: false,
-            max_nbr_typos: 0,
+            max_levenshtein_distance: 0,
             zero_typo: <_>::default(),
             one_typo: Lazy::Init(<_>::default()),
             two_typo: Lazy::Init(<_>::default()),
@@ -256,7 +256,7 @@ pub fn partially_initialized_term_from_word(
     Ok(QueryTerm {
         original: word_interned,
         ngram_words: None,
-        max_nbr_typos: max_typo,
+        max_levenshtein_distance: max_typo,
         is_prefix,
         zero_typo,
         one_typo: Lazy::Uninit,
@@ -275,7 +275,16 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern
 impl Interned<QueryTerm> {
     fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
         let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut;
+
+        let allows_split_words = self_mut.allows_split_words();
+        let QueryTerm {
+            original,
+            is_prefix,
+            one_typo,
+            max_levenshtein_distance: max_nbr_typos,
+            ..
+        } = self_mut;
+
         let original = *original;
         let is_prefix = *is_prefix;
         // let original_str = ctx.word_interner.get(*original).to_owned();
@@ -300,13 +309,17 @@ impl Interned<QueryTerm> {
             })?;
         }
 
-        let original_str = ctx.word_interner.get(original).to_owned();
-        let split_words = find_split_words(ctx, original_str.as_str())?;
+        let split_words = if allows_split_words {
+            let original_str = ctx.word_interner.get(original).to_owned();
+            find_split_words(ctx, original_str.as_str())?
+        } else {
+            None
+        };
 
         let self_mut = ctx.term_interner.get_mut(self);
 
         // Only add the split words to the derivations if:
-        // 1. the term is not an ngram; OR
+        // 1. the term is neither an ngram nor a phrase; OR
        // 2. the term is an ngram, but the split words are different from the ngram's component words
         let split_words = if let Some((ngram_words, split_words)) =
             self_mut.ngram_words.as_ref().zip(split_words.as_ref())
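The two numbered comments above describe a dedup rule that predates this commit (only its wording changed): an ngram such as "sunflower", built from ["sun", "flower"], would produce its own component words as the split-word derivation, so that duplicate is dropped. A hypothetical standalone version of the rule, with plain string slices in place of interned ids:

    fn keep_split_words<'a>(
        ngram_words: Option<&[&'a str]>,
        split_words: Option<(&'a str, &'a str)>,
    ) -> Option<(&'a str, &'a str)> {
        match (ngram_words, split_words) {
            // A split identical to the ngram's own components adds nothing.
            (Some(ngram), Some((a, b))) if *ngram == [a, b] => None,
            (_, sw) => sw,
        }
    }

    fn main() {
        let ngram: Option<&[&str]> = Some(&["sun", "flower"]);
        assert_eq!(keep_split_words(ngram, Some(("sun", "flower"))), None);
        assert_eq!(
            keep_split_words(None, Some(("sun", "flower"))),
            Some(("sun", "flower"))
        );
    }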
@@ -328,7 +341,13 @@ impl Interned<QueryTerm> {
     }
     fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
         let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut;
+        let QueryTerm {
+            original,
+            is_prefix,
+            two_typo,
+            max_levenshtein_distance: max_nbr_typos,
+            ..
+        } = self_mut;
         let original_str = ctx.word_interner.get(*original).to_owned();
         if two_typo.is_init() {
             return Ok(());
@@ -43,7 +43,7 @@ pub struct QueryTermSubset {
 pub struct QueryTerm {
     original: Interned<String>,
     ngram_words: Option<Vec<Interned<String>>>,
-    max_nbr_typos: u8,
+    max_levenshtein_distance: u8,
     is_prefix: bool,
     zero_typo: ZeroTypoTerm,
     // May not be computed yet
@@ -342,10 +342,16 @@ impl QueryTermSubset {
         }
         None
     }
-    pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
+    pub fn max_typo_cost(&self, ctx: &SearchContext) -> u8 {
         let t = ctx.term_interner.get(self.original);
-        match t.max_nbr_typos {
-            0 => 0,
+        match t.max_levenshtein_distance {
+            0 => {
+                if t.allows_split_words() {
+                    1
+                } else {
+                    0
+                }
+            }
             1 => {
                 if self.one_typo_subset.is_empty() {
                     0
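Why the rename: the value returned here is now a ranking cost rather than the raw edit-distance budget. The new 0-arm in isolation (a hypothetical free function; the 1- and 2-distance arms are unchanged by this commit and truncated in the hunk):

    // A term with a zero edit-distance budget can still yield a split-word
    // derivation, and the typo ranking rule bills that derivation as one typo.
    fn zero_distance_typo_cost(allows_split_words: bool) -> u8 {
        if allows_split_words { 1 } else { 0 }
    }

    fn main() {
        assert_eq!(zero_distance_typo_cost(true), 1);  // ordinary exact word
        assert_eq!(zero_distance_typo_cost(false), 0); // quoted phrase
    }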
@@ -438,6 +444,9 @@ impl QueryTerm {
 
         self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
     }
+    fn allows_split_words(&self) -> bool {
+        self.zero_typo.phrase.is_none()
+    }
 }
 
 impl Interned<QueryTerm> {
@@ -217,7 +217,7 @@ pub fn make_ngram(
         original: ngram_str_interned,
         ngram_words: Some(words_interned),
         is_prefix,
-        max_nbr_typos,
+        max_levenshtein_distance: max_nbr_typos,
         zero_typo: term.zero_typo,
         one_typo: Lazy::Uninit,
         two_typo: Lazy::Uninit,
@@ -271,7 +271,7 @@ impl PhraseBuilder {
         QueryTerm {
             original: ctx.word_interner.insert(phrase_desc),
             ngram_words: None,
-            max_nbr_typos: 0,
+            max_levenshtein_distance: 0,
             is_prefix: false,
             zero_typo: ZeroTypoTerm {
                 phrase: Some(phrase),
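Note that the phrase terms built here always set zero_typo.phrase, so they are exactly the terms for which the new allows_split_words() returns false: they keep a typo cost of 0 and never reach find_split_words.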
@@ -50,7 +50,7 @@ impl RankingRuleGraphTrait for TypoGraph {
         // 3-gram -> equivalent to 2 typos
         let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
 
-        for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) {
+        for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) {
             let mut term = term.clone();
             match nbr_typos {
                 0 => {
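Effect on the typo graph: with the loop bound above, an exact-match word that still allows split words now gets edges for costs 0 and 1, while a phrase keeps a single cost-0 edge. A toy illustration (edge_costs is hypothetical, not a milli function):

    fn edge_costs(max_typo_cost: u8) -> Vec<u8> {
        (0..=max_typo_cost).collect()
    }

    fn main() {
        assert_eq!(edge_costs(0), vec![0]);    // phrase: exact match only
        assert_eq!(edge_costs(1), vec![0, 1]); // word with a split-word derivation
    }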