diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs
new file mode 100644
index 000000000..eec4c62ec
--- /dev/null
+++ b/milli/src/search/new/tests/mod.rs
@@ -0,0 +1,3 @@
+pub mod ngram_split_words;
+pub mod typo;
+pub mod words_tms;
diff --git a/milli/src/search/new/tests/ngram_split_words.rs b/milli/src/search/new/tests/ngram_split_words.rs
new file mode 100644
index 000000000..06c49274c
--- /dev/null
+++ b/milli/src/search/new/tests/ngram_split_words.rs
@@ -0,0 +1,255 @@
+/*!
+This module tests the following properties:
+
+1. Two consecutive words from a query can be combined into a "2gram"
+2. Three consecutive words from a query can be combined into a "3gram"
+3. A word from the query can be split into two consecutive words (split words)
+4. A 2gram can be split into two words
+5. A 3gram cannot be split into two words
+6. 2grams can contain up to 1 typo
+7. 3grams cannot have typos
+8. 2grams and 3grams can be prefix tolerant
+9. Disabling typo tolerance also disables the split words feature
+10. Disabling typo tolerance does not disable prefix tolerance
+11. Disabling typo tolerance does not disable ngram tolerance
+12. Prefix tolerance is disabled for the last word if a space follows it
+13. Ngrams cannot be formed by combining a phrase and a word or two phrases
+*/
+
+use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "the sun flowers are pretty"
+            },
+            {
+                "id": 1,
+                "text": "the sun flower is tall"
+            },
+            {
+                "id": 2,
+                "text": "the sunflowers are pretty"
+            },
+            {
+                "id": 3,
+                "text": "the sunflower is tall"
+            }
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_2gram_simple() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // will also match documents with "sunflower", via the 2gram
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
+}
+#[test]
+fn test_3gram_simple() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower s are");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2]");
+}
+
+#[test]
+fn test_2gram_typo() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flawer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
+}
+
+#[test]
+fn test_no_disable_ngrams() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower ");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // documents containing `sun flower` or the 2gram `sunflower`, with no prefix tolerance
+    // because of the trailing space
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
+}
+
+#[test]
+fn test_2gram_prefix() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flow");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // documents containing words beginning with `sunflow`
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
+}
+
+#[test]
+fn test_3gram_prefix() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("su nf l");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // documents containing a word beginning with sunfl
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]");
+}
+
+#[test]
+fn test_split_words() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sunflower ");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // all the documents with either `sunflower` or `sun flower`
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]");
+}
+
+#[test]
+fn test_disable_split_words() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sunflower ");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // no document containing `sun flower`
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
+}
+
+#[test]
+fn test_2gram_split_words() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sunf lower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // all the documents with "sunflower", "sun flower", or (sunflower + 1 typo)
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]");
+}
+
+#[test]
+fn test_3gram_no_split_words() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sunf lo wer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // no document with `sun flower`
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]");
+}
+
+#[test]
+fn test_3gram_no_typos() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sunf la wer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+}
+
+#[test]
+fn test_no_ngram_phrases() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("\"sun\" flower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("\"sun\" \"flower\"");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]");
+}
diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs
new file mode 100644
index 000000000..6ac8f5516
--- /dev/null
+++ b/milli/src/search/new/tests/typo.rs
@@ -0,0 +1,363 @@
+/*!
+This module tests the following properties:
+
+1. The `words` ranking rule is typo-tolerant
+2. Typo-tolerance handles missing letters, extra letters, replaced letters, and swapped letters (at least)
+3. Words which are < `min_word_len_one_typo` are not typo tolerant
+4. Words which are >= `min_word_len_one_typo` but < `min_word_len_two_typos` can have one typo
+5. Words which are >= `min_word_len_two_typos` can have two typos
+6. A typo on the first letter of a word counts as two typos
+7. Phrases are not typo tolerant
+8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos`
+9. 3grams are not typo tolerant
+10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly
+if `words` doesn't exist before it.
+11. The `typo` ranking rule places documents with the same number of typos in the same bucket
+12. Prefix tolerance costs nothing according to the typo ranking rule
+13. Split words cost 1 typo according to the typo ranking rule
+14. Synonyms cost nothing according to the typo ranking rule
+*/
+
+use std::collections::HashMap;
+
+use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "the quick brown fox jumps over the lazy dog"
+            },
+            {
+                "id": 1,
+                "text": "the quick brown foxes jump over the lazy dog"
+            },
+            {
+                "id": 2,
+                "text": "the quick brown fax sends a letter to the dog"
+            },
+            {
+                "id": 3,
+                "text": "the quickest brownest fox jumps over the laziest dog"
+            },
+            {
+                "id": 4,
+                "text": "a fox doesn't quack, that crown goes to the duck."
+            },
+            {
+                "id": 5,
+                "text": "the quicker browner fox jumped over the lazier dog"
+            },
+            {
+                "id": 6,
+                "text": "the extravagant fox skyrocketed over the languorous dog" // thanks thesaurus
+            },
+            {
+                "id": 7,
+                "text": "the quick brown fox jumps over the lazy"
+            },
+            {
+                "id": 8,
+                "text": "the quick brown fox jumps over the"
+            },
+            {
+                "id": 9,
+                "text": "the quick brown fox jumps over"
+            },
+            {
+                "id": 10,
+                "text": "the quick brown fox jumps"
+            },
+            {
+                "id": 11,
+                "text": "the quick brown fox"
+            },
+            {
+                "id": 12,
+                "text": "the quick brown"
+            },
+            {
+                "id": 13,
+                "text": "the quick"
+            },
+            {
+                "id": 14,
+                "text": "netwolk interconections sunflawar"
+            },
+            {
+                "id": 15,
+                "text": "network interconnections sunflawer"
+            },
+            {
+                "id": 16,
+                "text": "network interconnection sunflower"
+            },
+            {
+                "id": 17,
+                "text": "network interconnection sun flower"
+            },
+            {
+                "id": 18,
+                "text": "network interconnection sunflowering"
+            },
+            {
+                "id": 19,
+                "text": "network interconnection sun flowering"
+            },
+            {
+                "id": 20,
+                "text": "network interconnection sunflowar"
+            },
+            {
+                "id": 21,
+                "text": "the fast brownish fox jumps over the lackadaisical dog"
+            },
+            {
+                "id": 22,
+                "text": "the quick brown fox jumps over the lackadaisical dog"
+            },
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_no_typo() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+}
+
+#[test]
+fn test_default_typo() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let ot = index.min_word_len_one_typo(&txn).unwrap();
+    let tt = index.min_word_len_two_typos(&txn).unwrap();
+    insta::assert_debug_snapshot!(ot, @"5");
+    insta::assert_debug_snapshot!(tt, @"9");
+
+    // 0 typo
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+
+    // 1 typo on one word, replaced letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quack brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+
+    // 1 typo on one word, missing letter, extra letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quicest brownest fox jummps over the laziest dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
+
+    // 1 typo on one word, swapped letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quikc borwn fox jupms over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+
+    // 1 first letter typo on a word <5 bytes, replaced letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the nuick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+
+    // 1 first letter typo on a word <5 bytes, missing letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the uick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+
+    // 1 typo on all words >=5 bytes, replaced letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quack brawn fox junps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+
+    // 2 typos on words < 9 bytes
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quckest brawnert fox jumps over the aziest dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+
+    // 2 typos on words >= 9 bytes: missing letters, missing first letter, replaced letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the extravant fox kyrocketed over the lamguorout dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+
+    // 2 typos on words >= 9 bytes: 2 extra letters in a single word, swapped letters + extra letter, replaced letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the extravaganttt fox sktyrocnketed over the lagnuorrous dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+}
+
+#[test]
+fn test_phrase_no_typo_allowed() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the \"quick brewn\" fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+}
+
+#[test]
+fn test_ngram_typos() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the extra lagant fox skyrocketed over the languorous dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the ex tra lagant fox skyrocketed over the languorous dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+}
+#[test]
+fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Typo]);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids: ids_1, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]");
+
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Words, Criterion::Typo]);
+        })
+        .unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids: ids_2, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]");
+
+    assert_eq!(ids_1, ids_2);
+}
+
+#[test]
+fn test_typo_bucketing() {
+    let index = create_index();
+
+    let txn = index.read_txn().unwrap();
+
+    // First do the search with just the Words ranking rule
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("network interconnection sunflower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 15, 16, 17, 18, 20]");
+
+    // Then with the typo ranking rule
+    drop(txn);
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Typo]);
+        })
+        .unwrap();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("network interconnection sunflower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18, 17, 20, 15, 14]");
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("network interconnection sun flower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[17, 19, 16, 18, 20, 15]");
+}
+
+#[test]
+fn test_typo_synonyms() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Typo]);
+
+            let mut synonyms = HashMap::new();
+            synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]);
+            synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]);
+
+            s.set_synonyms(synonyms);
+        })
+        .unwrap();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quick brown fox jumps over the lackadaisical dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]");
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the fast brownish fox jumps over the lackadaisical dog");
+
+    // TODO: is this correct? interaction of ngrams + synonyms means that the
+    // multi-word synonyms end up having a typo cost. This is probably not what we want.
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]");
+}
diff --git a/milli/src/search/new/tests/words_tms.rs b/milli/src/search/new/tests/words_tms.rs
new file mode 100644
index 000000000..8b5c0153f
--- /dev/null
+++ b/milli/src/search/new/tests/words_tms.rs
@@ -0,0 +1,266 @@
+/*!
+This module tests the following properties:
+
+1. The `last` term matching strategy removes terms from the query, starting from the end,
+if no more results match it.
+2. Phrases are never deleted by the `last` term matching strategy
+3. Duplicate words don't affect the ranking of a document according to the `words` ranking rule
+4. The proximity of the first and last word of a phrase to its adjacent terms is taken into
+account by the proximity ranking rule.
+5. Unclosed double quotes still make a phrase
+6. The `all` term matching strategy does not remove any term from the query
+7. The search is capable of returning no results if no documents match the query
+*/
+
+use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy};
+
+fn create_quick_brown_fox_trivial_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "",
+            },
+            {
+                "id": 1,
+                "text": "the",
+            },
+            {
+                "id": 2,
+                "text": "the quick",
+            },
+            {
+                "id": 3,
+                "text": "the quick brown",
+            },
+            {
+                "id": 4,
+                "text": "the quick brown fox",
+            },
+            {
+                "id": 5,
+                "text": "the quick brown fox jumps",
+            },
+            {
+                "id": 6,
+                "text": "the quick brown fox jumps over",
+            },
+            {
+                "id": 7,
+                "text": "the quick brown fox jumps over the",
+            },
+            {
+                "id": 8,
+                "text": "the quick brown fox jumps over the lazy",
+            },
+            {
+                "id": 9,
+                "text": "the quick brown fox jumps over the lazy dog",
+            },
+            {
+                "id": 10,
+                "text": "the brown quick fox jumps over the lazy dog",
+            },
+            {
+                "id": 11,
+                "text": "the quick brown fox talks to the lazy and slow dog",
+            },
+            {
+                "id": 12,
+                "text": "the quick brown fox talks to the lazy dog",
+            },
+            {
+                "id": 13,
+                "text": "the mighty and quick brown fox jumps over the lazy dog",
+            },
+            {
+                "id": 14,
+                "text": "the great quick brown fox jumps over the lazy dog",
+            },
+            {
+                "id": 15,
+                "text": "this quick brown and very scary fox jumps over the lazy dog",
+            },
+            {
+                "id": 16,
+                "text": "this quick brown and scary fox jumps over the lazy dog",
+            },
+            {
+                "id": 17,
+                "text": "the quick brown fox jumps over the really lazy dog",
+            },
+            {
+                "id": 18,
+                "text": "the brown quick fox jumps over the really lazy dog",
+            },
+            {
+                "id": 19,
+                "text": "the brown quick fox immediately jumps over the really lazy dog",
+            },
+            {
+                "id": 20,
+                "text": "the brown quick fox immediately jumps over the really lazy blue dog",
+            },
+            {
+                "id": 21,
+                "text": "the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.",
+            },
+            {
+                "id": 22,
+                "text": "the, quick, brown, fox, jumps, over, the, lazy, dog"
+            }
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_words_tms_last_simple() {
+    let index = create_quick_brown_fox_trivial_index();
+
+    let txn = index.read_txn().unwrap();
+    let mut s = Search::new(&txn, &index);
+    s.query("the quick brown fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // 6 and 7 have the same score because "the" appears twice
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 8, 6, 7, 5, 4, 11, 12, 3]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("extravagant the quick brown fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+}
+
+#[test]
+fn test_words_tms_last_phrase() {
+    let index = create_quick_brown_fox_trivial_index();
+
+    let txn = index.read_txn().unwrap();
+    let mut s = Search::new(&txn, &index);
+    s.query("\"the quick brown fox\" jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // "The quick brown fox" is a phrase, not deleted by this term matching strategy
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 6, 7, 5, 4, 11, 12]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("\"the quick brown fox\" jumps over the \"lazy\" dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // "lazy" is a phrase, not deleted by this term matching strategy
+    // but words before it can be deleted
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 11, 12]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("\"the quick brown fox jumps over the lazy dog\"");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // The whole query is a phrase, no terms are removed
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("\"the quick brown fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // The whole query is still a phrase, even without closing quotes, so no terms are removed
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]");
+}
+
+#[test]
+fn test_words_proximity_tms_last_simple() {
+    let index = create_quick_brown_fox_trivial_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+    let mut s = Search::new(&txn, &index);
+    s.query("the quick brown fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // 7 is better than 6 because of the proximity between "the" and its surrounding terms
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("the brown quick fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // 10 is better than 9 because of the proximity between "quick" and "brown"
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
+}
+
+#[test]
+fn test_words_proximity_tms_last_phrase() {
+    let index = create_quick_brown_fox_trivial_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+    let mut s = Search::new(&txn, &index);
+    s.query("the \"quick brown\" fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // "quick brown" is a phrase. The proximity of its first and last words
+    // to their adjacent query words should be taken into account
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5, 4, 11, 12, 3]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("the \"quick brown\" \"fox jumps\" over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // "quick brown" is a phrase. The proximity of its first and last words
+    // to their adjacent query words should be taken into account.
+    // The same applies to `fox jumps`.
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5]");
+}
+
+#[test]
+fn test_words_tms_all() {
+    let index = create_quick_brown_fox_trivial_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+    let mut s = Search::new(&txn, &index);
+    s.query("the quick brown fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("extravagant");
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+}