3742: Compute split-word derivations of terms that don't accept typos r=ManyTheFish a=loiclec

Allows looking for the split-word derivations of short words in the user's query (like `the -> "t he"` or `door -> "do or"`), as well as for 3grams.

Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
meili-bors[bot] 2023-05-10 12:12:52 +00:00 committed by GitHub
commit ad5f25d880
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 109 additions and 50 deletions

View File

@ -28,11 +28,9 @@ pub enum ZeroOrOneTypo {
impl Interned<QueryTerm> { impl Interned<QueryTerm> {
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
let s = ctx.term_interner.get_mut(self); let s = ctx.term_interner.get_mut(self);
if s.max_nbr_typos == 0 { if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() {
s.one_typo = Lazy::Init(OneTypoTerm::default());
s.two_typo = Lazy::Init(TwoTypoTerm::default());
} else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() {
assert!(s.two_typo.is_uninit()); assert!(s.two_typo.is_uninit());
// Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
self.initialize_one_typo_subterm(ctx)?; self.initialize_one_typo_subterm(ctx)?;
let s = ctx.term_interner.get_mut(self); let s = ctx.term_interner.get_mut(self);
assert!(s.one_typo.is_init()); assert!(s.one_typo.is_init());
@ -277,7 +275,7 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern
impl Interned<QueryTerm> { impl Interned<QueryTerm> {
fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
let self_mut = ctx.term_interner.get_mut(self); let self_mut = ctx.term_interner.get_mut(self);
let QueryTerm { original, is_prefix, one_typo, .. } = self_mut; let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut;
let original = *original; let original = *original;
let is_prefix = *is_prefix; let is_prefix = *is_prefix;
// let original_str = ctx.word_interner.get(*original).to_owned(); // let original_str = ctx.word_interner.get(*original).to_owned();
@ -286,19 +284,22 @@ impl Interned<QueryTerm> {
} }
let mut one_typo_words = BTreeSet::new(); let mut one_typo_words = BTreeSet::new();
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { if *max_nbr_typos > 0 {
match nbr_typos { find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
ZeroOrOneTypo::Zero => {} match nbr_typos {
ZeroOrOneTypo::One => { ZeroOrOneTypo::Zero => {}
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { ZeroOrOneTypo::One => {
one_typo_words.insert(derived_word); if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
} else { one_typo_words.insert(derived_word);
return Ok(ControlFlow::Break(())); } else {
return Ok(ControlFlow::Break(()));
}
} }
} }
} Ok(ControlFlow::Continue(()))
Ok(ControlFlow::Continue(())) })?;
})?; }
let original_str = ctx.word_interner.get(original).to_owned(); let original_str = ctx.word_interner.get(original).to_owned();
let split_words = find_split_words(ctx, original_str.as_str())?; let split_words = find_split_words(ctx, original_str.as_str())?;
@ -327,7 +328,7 @@ impl Interned<QueryTerm> {
} }
fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
let self_mut = ctx.term_interner.get_mut(self); let self_mut = ctx.term_interner.get_mut(self);
let QueryTerm { original, is_prefix, two_typo, .. } = self_mut; let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut;
let original_str = ctx.word_interner.get(*original).to_owned(); let original_str = ctx.word_interner.get(*original).to_owned();
if two_typo.is_init() { if two_typo.is_init() {
return Ok(()); return Ok(());
@ -335,34 +336,37 @@ impl Interned<QueryTerm> {
let mut one_typo_words = BTreeSet::new(); let mut one_typo_words = BTreeSet::new();
let mut two_typo_words = BTreeSet::new(); let mut two_typo_words = BTreeSet::new();
find_zero_one_two_typo_derivations( if *max_nbr_typos > 0 {
*original, find_zero_one_two_typo_derivations(
*is_prefix, *original,
ctx.index.words_fst(ctx.txn)?, *is_prefix,
&mut ctx.word_interner, ctx.index.words_fst(ctx.txn)?,
|derived_word, nbr_typos| { &mut ctx.word_interner,
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT |derived_word, nbr_typos| {
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
{ && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
// No chance we will add either one- or two-typo derivations anymore, stop iterating. {
return Ok(ControlFlow::Break(())); // No chance we will add either one- or two-typo derivations anymore, stop iterating.
} return Ok(ControlFlow::Break(()));
match nbr_typos { }
NumberOfTypos::Zero => {} match nbr_typos {
NumberOfTypos::One => { NumberOfTypos::Zero => {}
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { NumberOfTypos::One => {
one_typo_words.insert(derived_word); if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
one_typo_words.insert(derived_word);
}
}
NumberOfTypos::Two => {
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
two_typo_words.insert(derived_word);
}
} }
} }
NumberOfTypos::Two => { Ok(ControlFlow::Continue(()))
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { },
two_typo_words.insert(derived_word); )?;
} }
}
}
Ok(ControlFlow::Continue(()))
},
)?;
let split_words = find_split_words(ctx, original_str.as_str())?; let split_words = find_split_words(ctx, original_str.as_str())?;
let self_mut = ctx.term_interner.get_mut(self); let self_mut = ctx.term_interner.get_mut(self);

View File

@ -3,9 +3,9 @@ This module tests the following properties:
1. Two consecutive words from a query can be combined into a "2gram" 1. Two consecutive words from a query can be combined into a "2gram"
2. Three consecutive words from a query can be combined into a "3gram" 2. Three consecutive words from a query can be combined into a "3gram"
3. A word from the query can be split into two consecutive words (split words) 3. A word from the query can be split into two consecutive words (split words), no matter how short it is
4. A 2gram can be split into two words 4. A 2gram can be split into two words
5. A 3gram cannot be split into two words 5. A 3gram can be split into two words
6. 2grams can contain up to 1 typo 6. 2grams can contain up to 1 typo
7. 3grams cannot have typos 7. 3grams cannot have typos
8. 2grams and 3grams can be prefix tolerant 8. 2grams and 3grams can be prefix tolerant
@ -14,6 +14,7 @@ This module tests the following properties:
11. Disabling typo tolerance does not disable ngram tolerance 11. Disabling typo tolerance does not disable ngram tolerance
12. Prefix tolerance is disabled for the last word if a space follows it 12. Prefix tolerance is disabled for the last word if a space follows it
13. Ngrams cannot be formed by combining a phrase and a word or two phrases 13. Ngrams cannot be formed by combining a phrase and a word or two phrases
14. Split words are not disabled by the `disableOnAttribute` or `disableOnWords` typo settings
*/ */
use crate::index::tests::TempIndex; use crate::index::tests::TempIndex;
@ -56,6 +57,10 @@ fn create_index() -> TempIndex {
{ {
"id": 5, "id": 5,
"text": "sunflowering is not a verb" "text": "sunflowering is not a verb"
},
{
"id": 6,
"text": "xy z"
} }
])) ]))
.unwrap(); .unwrap();
@ -263,10 +268,11 @@ fn test_disable_split_words() {
s.query("sunflower "); s.query("sunflower ");
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
// no document containing `sun flower` // no document containing `sun flower`
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids); let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###" insta::assert_debug_snapshot!(texts, @r###"
[ [
"\"the sun flower is tall\"",
"\"the sunflower is tall\"", "\"the sunflower is tall\"",
] ]
"###); "###);
@ -307,10 +313,11 @@ fn test_3gram_no_split_words() {
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
// no document with `sun flower` // no document with `sun flower`
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 5]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids); let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###" insta::assert_debug_snapshot!(texts, @r###"
[ [
"\"the sun flower is tall\"",
"\"the sunflowers are pretty\"", "\"the sunflowers are pretty\"",
"\"the sunflower is tall\"", "\"the sunflower is tall\"",
"\"sunflowering is not a verb\"", "\"sunflowering is not a verb\"",
@ -369,3 +376,50 @@ fn test_no_ngram_phrases() {
] ]
"###); "###);
} }
#[test]
fn test_short_split_words() {
let index = create_index();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("xyz");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"xy z\"",
]
"###);
}
#[test]
fn test_split_words_never_disabled() {
let index = create_index();
index
.update_settings(|s| {
s.set_exact_words(["sunflower"].iter().map(ToString::to_string).collect());
s.set_exact_attributes(["text"].iter().map(ToString::to_string).collect());
})
.unwrap();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("the sunflower is tall");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sun flower is tall\"",
"\"the sunflower is tall\"",
]
"###);
}

View File

@ -9,7 +9,7 @@ This module tests the following properties:
6. A typo on the first letter of a word counts as two typos 6. A typo on the first letter of a word counts as two typos
7. Phrases are not typo tolerant 7. Phrases are not typo tolerant
8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos` 8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos`
9. 3grams are not typo tolerant 9. 3grams are not typo tolerant (but they can be split into two words)
10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly 10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly
if `words` doesn't exist before it. if `words` doesn't exist before it.
11. The `typo` ranking rule places documents with the same number of typos in the same bucket 11. The `typo` ranking rule places documents with the same number of typos in the same bucket
@ -287,16 +287,17 @@ fn test_typo_exact_word() {
] ]
"###); "###);
// exact words do not disable prefix (sunflowering OK, but no sunflowar or sun flower) // exact words do not disable prefix (sunflowering OK, but no sunflowar)
let mut s = Search::new(&txn, &index); let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All); s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("network interconnection sunflower"); s.query("network interconnection sunflower");
let SearchResult { documents_ids, .. } = s.execute().unwrap(); let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18]"); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 17, 18]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids); let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###" insta::assert_debug_snapshot!(texts, @r###"
[ [
"\"network interconnection sunflower\"", "\"network interconnection sunflower\"",
"\"network interconnection sun flower\"",
"\"network interconnection sunflowering\"", "\"network interconnection sunflowering\"",
] ]
"###); "###);