mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Merge #3742
3742: Compute split words derivations of terms that don't accept typos (r=ManyTheFish, a=loiclec). Allows looking for the split-word derivation of short words in the user's query (like `the -> "t he"` or `door -> "do or"`), as well as of 3grams. Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
commit
ad5f25d880
@ -28,11 +28,9 @@ pub enum ZeroOrOneTypo {
|
|||||||
impl Interned<QueryTerm> {
|
impl Interned<QueryTerm> {
|
||||||
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
|
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
|
||||||
let s = ctx.term_interner.get_mut(self);
|
let s = ctx.term_interner.get_mut(self);
|
||||||
if s.max_nbr_typos == 0 {
|
if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() {
|
||||||
s.one_typo = Lazy::Init(OneTypoTerm::default());
|
|
||||||
s.two_typo = Lazy::Init(TwoTypoTerm::default());
|
|
||||||
} else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() {
|
|
||||||
assert!(s.two_typo.is_uninit());
|
assert!(s.two_typo.is_uninit());
|
||||||
|
// Initialize the one_typo subterm even if max_nbr_typos is 0, because split words are stored in it
|
||||||
self.initialize_one_typo_subterm(ctx)?;
|
self.initialize_one_typo_subterm(ctx)?;
|
||||||
let s = ctx.term_interner.get_mut(self);
|
let s = ctx.term_interner.get_mut(self);
|
||||||
assert!(s.one_typo.is_init());
|
assert!(s.one_typo.is_init());
|
||||||
@ -277,7 +275,7 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern
|
|||||||
impl Interned<QueryTerm> {
|
impl Interned<QueryTerm> {
|
||||||
fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
|
fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
|
||||||
let self_mut = ctx.term_interner.get_mut(self);
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
let QueryTerm { original, is_prefix, one_typo, .. } = self_mut;
|
let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut;
|
||||||
let original = *original;
|
let original = *original;
|
||||||
let is_prefix = *is_prefix;
|
let is_prefix = *is_prefix;
|
||||||
// let original_str = ctx.word_interner.get(*original).to_owned();
|
// let original_str = ctx.word_interner.get(*original).to_owned();
|
||||||
@ -286,19 +284,22 @@ impl Interned<QueryTerm> {
|
|||||||
}
|
}
|
||||||
let mut one_typo_words = BTreeSet::new();
|
let mut one_typo_words = BTreeSet::new();
|
||||||
|
|
||||||
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
|
if *max_nbr_typos > 0 {
|
||||||
match nbr_typos {
|
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
|
||||||
ZeroOrOneTypo::Zero => {}
|
match nbr_typos {
|
||||||
ZeroOrOneTypo::One => {
|
ZeroOrOneTypo::Zero => {}
|
||||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
ZeroOrOneTypo::One => {
|
||||||
one_typo_words.insert(derived_word);
|
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||||
} else {
|
one_typo_words.insert(derived_word);
|
||||||
return Ok(ControlFlow::Break(()));
|
} else {
|
||||||
|
return Ok(ControlFlow::Break(()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
Ok(ControlFlow::Continue(()))
|
||||||
Ok(ControlFlow::Continue(()))
|
})?;
|
||||||
})?;
|
}
|
||||||
|
|
||||||
let original_str = ctx.word_interner.get(original).to_owned();
|
let original_str = ctx.word_interner.get(original).to_owned();
|
||||||
let split_words = find_split_words(ctx, original_str.as_str())?;
|
let split_words = find_split_words(ctx, original_str.as_str())?;
|
||||||
|
|
||||||
@ -327,7 +328,7 @@ impl Interned<QueryTerm> {
|
|||||||
}
|
}
|
||||||
fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
|
fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
|
||||||
let self_mut = ctx.term_interner.get_mut(self);
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
let QueryTerm { original, is_prefix, two_typo, .. } = self_mut;
|
let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut;
|
||||||
let original_str = ctx.word_interner.get(*original).to_owned();
|
let original_str = ctx.word_interner.get(*original).to_owned();
|
||||||
if two_typo.is_init() {
|
if two_typo.is_init() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
@ -335,34 +336,37 @@ impl Interned<QueryTerm> {
|
|||||||
let mut one_typo_words = BTreeSet::new();
|
let mut one_typo_words = BTreeSet::new();
|
||||||
let mut two_typo_words = BTreeSet::new();
|
let mut two_typo_words = BTreeSet::new();
|
||||||
|
|
||||||
find_zero_one_two_typo_derivations(
|
if *max_nbr_typos > 0 {
|
||||||
*original,
|
find_zero_one_two_typo_derivations(
|
||||||
*is_prefix,
|
*original,
|
||||||
ctx.index.words_fst(ctx.txn)?,
|
*is_prefix,
|
||||||
&mut ctx.word_interner,
|
ctx.index.words_fst(ctx.txn)?,
|
||||||
|derived_word, nbr_typos| {
|
&mut ctx.word_interner,
|
||||||
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
|
|derived_word, nbr_typos| {
|
||||||
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
|
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
|
||||||
{
|
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
|
||||||
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
|
{
|
||||||
return Ok(ControlFlow::Break(()));
|
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
|
||||||
}
|
return Ok(ControlFlow::Break(()));
|
||||||
match nbr_typos {
|
}
|
||||||
NumberOfTypos::Zero => {}
|
match nbr_typos {
|
||||||
NumberOfTypos::One => {
|
NumberOfTypos::Zero => {}
|
||||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
NumberOfTypos::One => {
|
||||||
one_typo_words.insert(derived_word);
|
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||||
|
one_typo_words.insert(derived_word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
NumberOfTypos::Two => {
|
||||||
|
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
|
||||||
|
two_typo_words.insert(derived_word);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NumberOfTypos::Two => {
|
Ok(ControlFlow::Continue(()))
|
||||||
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
|
},
|
||||||
two_typo_words.insert(derived_word);
|
)?;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(ControlFlow::Continue(()))
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
let split_words = find_split_words(ctx, original_str.as_str())?;
|
let split_words = find_split_words(ctx, original_str.as_str())?;
|
||||||
let self_mut = ctx.term_interner.get_mut(self);
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
|
|
||||||
|
@ -3,9 +3,9 @@ This module tests the following properties:
|
|||||||
|
|
||||||
1. Two consecutive words from a query can be combined into a "2gram"
|
1. Two consecutive words from a query can be combined into a "2gram"
|
||||||
2. Three consecutive words from a query can be combined into a "3gram"
|
2. Three consecutive words from a query can be combined into a "3gram"
|
||||||
3. A word from the query can be split into two consecutive words (split words)
|
3. A word from the query can be split into two consecutive words (split words), no matter how short it is
|
||||||
4. A 2gram can be split into two words
|
4. A 2gram can be split into two words
|
||||||
5. A 3gram cannot be split into two words
|
5. A 3gram can be split into two words
|
||||||
6. 2grams can contain up to 1 typo
|
6. 2grams can contain up to 1 typo
|
||||||
7. 3grams cannot have typos
|
7. 3grams cannot have typos
|
||||||
8. 2grams and 3grams can be prefix tolerant
|
8. 2grams and 3grams can be prefix tolerant
|
||||||
@ -14,6 +14,7 @@ This module tests the following properties:
|
|||||||
11. Disabling typo tolerance does not disable ngram tolerance
|
11. Disabling typo tolerance does not disable ngram tolerance
|
||||||
12. Prefix tolerance is disabled for the last word if a space follows it
|
12. Prefix tolerance is disabled for the last word if a space follows it
|
||||||
13. Ngrams cannot be formed by combining a phrase and a word or two phrases
|
13. Ngrams cannot be formed by combining a phrase and a word or two phrases
|
||||||
|
14. Split words are not disabled by the `disableOnAttributes` or `disableOnWords` typo settings
|
||||||
*/
|
*/
|
||||||
|
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
@ -56,6 +57,10 @@ fn create_index() -> TempIndex {
|
|||||||
{
|
{
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"text": "sunflowering is not a verb"
|
"text": "sunflowering is not a verb"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"text": "xy z"
|
||||||
}
|
}
|
||||||
]))
|
]))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@ -263,10 +268,11 @@ fn test_disable_split_words() {
|
|||||||
s.query("sunflower ");
|
s.query("sunflower ");
|
||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
// no document containing `sun flower`
|
// the document containing `sun flower` is returned as well (split words)
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
|
||||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
insta::assert_debug_snapshot!(texts, @r###"
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
[
|
[
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
"\"the sunflower is tall\"",
|
"\"the sunflower is tall\"",
|
||||||
]
|
]
|
||||||
"###);
|
"###);
|
||||||
@ -307,10 +313,11 @@ fn test_3gram_no_split_words() {
|
|||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
// no document with `sun flower`
|
// the document with `sun flower` is returned as well
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 5]");
|
||||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
insta::assert_debug_snapshot!(texts, @r###"
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
[
|
[
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
"\"the sunflowers are pretty\"",
|
"\"the sunflowers are pretty\"",
|
||||||
"\"the sunflower is tall\"",
|
"\"the sunflower is tall\"",
|
||||||
"\"sunflowering is not a verb\"",
|
"\"sunflowering is not a verb\"",
|
||||||
@ -369,3 +376,50 @@ fn test_no_ngram_phrases() {
|
|||||||
]
|
]
|
||||||
"###);
|
"###);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_short_split_words() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("xyz");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"xy z\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_words_never_disabled() {
|
||||||
|
let index = create_index();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_exact_words(["sunflower"].iter().map(ToString::to_string).collect());
|
||||||
|
s.set_exact_attributes(["text"].iter().map(ToString::to_string).collect());
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the sunflower is tall");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
|
"\"the sunflower is tall\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
@ -9,7 +9,7 @@ This module tests the following properties:
|
|||||||
6. A typo on the first letter of a word counts as two typos
|
6. A typo on the first letter of a word counts as two typos
|
||||||
7. Phrases are not typo tolerant
|
7. Phrases are not typo tolerant
|
||||||
8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos`
|
8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos`
|
||||||
9. 3grams are not typo tolerant
|
9. 3grams are not typo tolerant (but they can be split into two words)
|
||||||
10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly
|
10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly
|
||||||
if `words` doesn't exist before it.
|
if `words` doesn't exist before it.
|
||||||
11. The `typo` ranking rule places documents with the same number of typos in the same bucket
|
11. The `typo` ranking rule places documents with the same number of typos in the same bucket
|
||||||
@ -287,16 +287,17 @@ fn test_typo_exact_word() {
|
|||||||
]
|
]
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
// exact words do not disable prefix (sunflowering OK, but no sunflowar or sun flower)
|
// exact words do not disable prefix (sunflowering OK, but no sunflowar)
|
||||||
let mut s = Search::new(&txn, &index);
|
let mut s = Search::new(&txn, &index);
|
||||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
s.query("network interconnection sunflower");
|
s.query("network interconnection sunflower");
|
||||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18]");
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 17, 18]");
|
||||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
insta::assert_debug_snapshot!(texts, @r###"
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
[
|
[
|
||||||
"\"network interconnection sunflower\"",
|
"\"network interconnection sunflower\"",
|
||||||
|
"\"network interconnection sun flower\"",
|
||||||
"\"network interconnection sunflowering\"",
|
"\"network interconnection sunflowering\"",
|
||||||
]
|
]
|
||||||
"###);
|
"###);
|
||||||
|
Loading…
Reference in New Issue
Block a user