Add more search tests

This commit is contained in:
Loïc Lecrenier 2023-04-05 11:20:04 +02:00
parent ce328c329d
commit c69cbec64a
5 changed files with 766 additions and 27 deletions

View File

@ -18,5 +18,5 @@ fn test_kanji_language_detection() {
search.query("東京");
let SearchResult { documents_ids, .. } = search.execute().unwrap();
assert_eq!(documents_ids, vec![1]);
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]");
}

View File

@ -16,7 +16,10 @@ This module tests the following properties:
13. Ngrams cannot be formed by combining a phrase and a word or two phrases
*/
use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy};
use crate::{
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
SearchResult, TermsMatchingStrategy,
};
fn create_index() -> TempIndex {
let index = TempIndex::new();
@ -46,6 +49,14 @@ fn create_index() -> TempIndex {
{
"id": 3,
"text": "the sunflower is tall"
},
{
"id": 4,
"text": "the sunflawer is tall"
},
{
"id": 5,
"text": "sunflowering is not a verb"
}
]))
.unwrap();
@ -67,8 +78,18 @@ fn test_2gram_simple() {
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("sun flower");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
// will also match documents with "sun flower"
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
// will also match documents with "sunflower" + prefix tolerance
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sun flowers are pretty\"",
"\"the sun flower is tall\"",
"\"the sunflowers are pretty\"",
"\"the sunflower is tall\"",
"\"sunflowering is not a verb\"",
]
"###);
}
#[test]
fn test_3gram_simple() {
@ -87,6 +108,13 @@ fn test_3gram_simple() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sun flowers are pretty\"",
"\"the sunflowers are pretty\"",
]
"###);
}
#[test]
@ -99,7 +127,18 @@ fn test_2gram_typo() {
s.query("sun flawer");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sun flowers are pretty\"",
"\"the sun flower is tall\"",
"\"the sunflowers are pretty\"",
"\"the sunflower is tall\"",
"\"the sunflawer is tall\"",
"\"sunflowering is not a verb\"",
]
"###);
}
#[test]
@ -119,6 +158,13 @@ fn test_no_disable_ngrams() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
// documents containing `sunflower`
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sun flower is tall\"",
"\"the sunflower is tall\"",
]
"###);
}
#[test]
@ -137,7 +183,17 @@ fn test_2gram_prefix() {
s.query("sun flow");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
// documents containing words beginning with `sunflow`
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sun flowers are pretty\"",
"\"the sun flower is tall\"",
"\"the sunflowers are pretty\"",
"\"the sunflower is tall\"",
"\"sunflowering is not a verb\"",
]
"###);
}
#[test]
@ -157,7 +213,16 @@ fn test_3gram_prefix() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
// documents containing a word beginning with sunfl
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 5]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sunflowers are pretty\"",
"\"the sunflower is tall\"",
"\"the sunflawer is tall\"",
"\"sunflowering is not a verb\"",
]
"###);
}
#[test]
@ -170,8 +235,17 @@ fn test_split_words() {
s.query("sunflower ");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
// all the documents with either `sunflower` or `sun flower`
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]");
// all the documents with either `sunflower` or `sun flower`, possibly with a typo
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sun flower is tall\"",
"\"the sunflowers are pretty\"",
"\"the sunflower is tall\"",
"\"the sunflawer is tall\"",
]
"###);
}
#[test]
@ -191,6 +265,12 @@ fn test_disable_split_words() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
// no document containing `sun flower`
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sunflower is tall\"",
]
"###);
}
#[test]
@ -203,8 +283,18 @@ fn test_2gram_split_words() {
s.query("sunf lower");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
// all the documents with "sunflower", "sun flower", or (sunflower + 1 typo)
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]");
// all the documents with "sunflower", "sun flower", (sunflower + 1 typo), or (sunflower as prefix)
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4, 5]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sun flower is tall\"",
"\"the sunflowers are pretty\"",
"\"the sunflower is tall\"",
"\"the sunflawer is tall\"",
"\"sunflowering is not a verb\"",
]
"###);
}
#[test]
@ -218,7 +308,15 @@ fn test_3gram_no_split_words() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
// no document with `sun flower`
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sunflowers are pretty\"",
"\"the sunflower is tall\"",
"\"sunflowering is not a verb\"",
]
"###);
}
#[test]
@ -231,7 +329,13 @@ fn test_3gram_no_typos() {
s.query("sunf la wer");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sunflawer is tall\"",
]
"###);
}
#[test]
@ -245,6 +349,13 @@ fn test_no_ngram_phrases() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sun flowers are pretty\"",
"\"the sun flower is tall\"",
]
"###);
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
@ -252,4 +363,10 @@ fn test_no_ngram_phrases() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the sun flower is tall\"",
]
"###);
}

View File

@ -0,0 +1,317 @@
/*!
This module tests the Proximity ranking rule:
1. A proximity of >7 always has the same cost.
2. Phrase terms can be in proximity to other terms via their start and end words,
but we need to make sure that the phrase exists in the document that meets this
proximity condition. This is especially relevant with split words and synonyms.
3. An ngram has the same proximity cost as its component words being consecutive.
e.g. `sunflower` equivalent to `sun flower`.
4. The prefix databases can be used to find the proximity between two words, but
they store fewer proximities than the regular word proximity DB.
*/
use std::collections::HashMap;
use crate::{
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
SearchResult, TermsMatchingStrategy,
};
/// Builds the index queried by `test_proximity_simple`.
///
/// The primary key is `id`, `text` is the only searchable field, and the
/// ranking rules are `Words` followed by `Proximity`. Every document is a
/// variation on "the quick brown fox jumps over the lazy dog" (extra words,
/// punctuation, reordered or merged words, a typo, or a missing last word).
fn create_simple_index() -> TempIndex {
    let index = TempIndex::new();

    index
        .update_settings(|s| {
            s.set_primary_key("id".to_owned());
            s.set_searchable_fields(vec!["text".to_owned()]);
            s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
        })
        .unwrap();

    index
        .add_documents(documents!([
            {
                "id": 0,
                "text": "the very quick dark brown and smart fox did jump over the terribly lazy and small dog"
            },
            {
                "id": 1,
                "text": "the. quick brown fox jumps over the lazy. dog"
            },
            {
                "id": 2,
                "text": "the quick brown fox jumps over the lazy. dog"
            },
            {
                "id": 3,
                "text": "dog the quick brown fox jumps over the lazy"
            },
            {
                "id": 4,
                "text": "the quickbrown fox jumps over the lazy dog"
            },
            {
                "id": 5,
                "text": "brown quick fox jumps over the lazy dog"
            },
            {
                "id": 6,
                "text": "the really quick brown fox jumps over the very lazy dog"
            },
            {
                "id": 7,
                "text": "the really quick brown fox jumps over the lazy dog"
            },
            {
                "id": 8,
                "text": "the quick brown fox jumps over the lazy"
            },
            {
                "id": 9,
                "text": "the quack brown fox jumps over the lazy dog"
            },
            {
                "id": 10,
                "text": "the quick brown fox jumps over the lazy dog"
            },
            {
                // This document used to be declared with "id": 9 as well, so it was
                // overwritten by the other id-9 document and never indexed. It now has
                // its own id.
                // NOTE(review): it is appended last (not inserted in place) so that the
                // documents above keep their position in the batch; presumably the ids
                // shown in the snapshots follow insertion order. Like document 8 it has
                // no "dog", so it is not expected to match the `All` query in
                // `test_proximity_simple`. Confirm by running that test.
                "id": 11,
                "text": "the quack brown fox jumps over the lazy"
            }
        ]))
        .unwrap();

    index
}
/// Builds the index shared by `test_proximity_split_word` and
/// `test_proximity_prefix_db`.
///
/// Settings: primary key `id`, searchable field `text`, ranking rules `Words`
/// then `Proximity`. The documents fall into three groups: one large document
/// made of many short words starting with `s`, five `sun flower` / `sunflower`
/// documents, and eight `best` / `summer` documents at various distances.
fn create_edge_cases_index() -> TempIndex {
    let index = TempIndex::new();

    index
        .update_settings(|settings| {
            settings.set_primary_key("id".to_owned());
            settings.set_searchable_fields(vec!["text".to_owned()]);
            settings.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
        })
        .unwrap();

    let docs = documents!([
        {
            // The purpose of this document is to get "s" inserted in the prefix database.
            "id": 0,
            "text": "
saa sab sac sae saf sag sah sai saj sak sal sam san sao sap saq sar sasa sat sau sav saw sax say saz
sba sbb sbc sbe sbf sbg sbh sbi sbj sbk sbl sbm sbn sbo sbp sbq sbr sbsb sbt sbu sbv sbw sbx sby sbz
sca scb scc sce scf scg sch sci scj sck scl scm scn sco scp scq scr scsc sct scu scv scw scx scy scz
sda sdb sdc sde sdf sdg sdh sdi sdj sdk sdl sdm sdn sdo sdp sdq sdr sdsd sdt sdu sdv sdw sdx sdy sdz
sea seb sec see sef seg seh sei sej sek sel sem sen seo sep seq ser sese set seu sev sew sex sey sez
sfa sfb sfc sfe sff sfg sfh sfi sfj sfk sfl sfm sfn sfo sfp sfq sfr sfsf sft sfu sfv sfw sfx sfy sfz
sga sgb sgc sge sgf sgg sgh sgi sgj sgk sgl sgm sgn sgo sgp sgq sgr sgsg sgt sgu sgv sgw sgx sgy sgz
ska skb skc ske skf skg skh ski skj skk skl skm skn sko skp skq skr sksk skt sku skv skw skx sky skz
sla slb slc sle slf slg slh sli slj slk sll slm sln slo slp slq slr slsl slt slu slv slw slx sly slz
sma smb smc sme smf smg smh smi smj smk sml smm smn smo smp smq smr smsm smt smu smv smw smx smy smz
sna snb snc sne snf sng snh sni snj snk snl snm snn sno snp snq snr snsn snt snu snv snw snx sny snz
soa sob soc soe sof sog soh soi soj sok sol som son soo sop soq sor soso sot sou sov sow sox soy soz
spa spb spc spe spf spg sph spi spj spk spl spm spn spo spp spq spr spsp spt spu spv spw spx spy spz
sqa sqb sqc sqe sqf sqg sqh sqi sqj sqk sql sqm sqn sqo sqp sqq sqr sqsq sqt squ sqv sqw sqx sqy sqz
sra srb src sre srf srg srh sri srj srk srl srm srn sro srp srq srr srsr srt sru srv srw srx sry srz
ssa ssb ssc sse ssf ssg ssh ssi ssj ssk ssl ssm ssn sso ssp ssq ssr ssss sst ssu ssv ssw ssx ssy ssz
sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz
"
        },
        // Documents 1 to 5 set a trap around `sun flower`, which can be reached as a
        // split word, a phrase, or a synonym.
        // For the query "sunflower", the split word "Sun Flower" matches some documents.
        // For the query `sunflower wilting`, the proximity condition
        // `flower wilting: prox N` should ideally come together with the condition
        // `sun wilting: prox N+1`. TODO: that is not the exact condition used for now.
        // Currently we only check that the phrase `sun flower` exists and that
        // `flower wilting: prox N` holds, which is better than nothing but not ideal.
        {
            "id": 1,
            "text": "Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat."
        },
        {
            "id": 2,
            "text": "Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat."
        },
        {
            // Matches `sunflower wilting`, but the proximity between `sunflower` and
            // `wilting` must not be computed through the split word `Sun Flower`:
            // that would boil down to only `flower` and `wilting` being close.
            "id": 3,
            "text": "A flower wilting under the sun, unlike a sunflower"
        },
        {
            // Expected to be the best document for `sunflower wilting`.
            "id": 4,
            "text": "sun flower wilting under the heat"
        },
        {
            // Equally the best document for `sunflower wilting`.
            "id": 5,
            "text": "sunflower wilting under the heat"
        },
        {
            // `best` and the `s` prefix: prox MAX
            "id": 6,
            "text": "this is the best meal I have ever had in such a beautiful summer day"
        },
        {
            // `best` and the `s` prefix: prox 5
            "id": 7,
            "text": "this is the best cooked meal of the summer"
        },
        {
            // `best` and the `s` prefix: prox 4
            "id": 8,
            "text": "this is the best meal of the summer"
        },
        {
            // `best` and the `s` prefix: prox 3
            "id": 9,
            "text": "this is the best meal of summer"
        },
        {
            // `best` and the `s` prefix: prox 1
            "id": 10,
            "text": "this is the best summer meal"
        },
        {
            // `best` and the `s` prefix: reverse prox 3
            "id": 11,
            "text": "summer x y best"
        },
        {
            // `best` and the `s` prefix: reverse prox 2
            "id": 12,
            "text": "summer x best"
        },
        {
            // `best` and the `s` prefix: reverse prox 1
            "id": 13,
            "text": "summer best"
        },
    ]);
    index.add_documents(docs).unwrap();

    index
}
/// Runs the full sentence "the quick brown fox jumps over the lazy dog"
/// against the simple index with `TermsMatchingStrategy::All`, and pins both
/// the returned ids and the matching `text` values in their ranked order.
#[test]
fn test_proximity_simple() {
    let index = create_simple_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.query("the quick brown fox jumps over the lazy dog");
    let SearchResult { documents_ids: ranked_ids, .. } = search.execute().unwrap();

    insta::assert_snapshot!(format!("{ranked_ids:?}"), @"[4, 9, 10, 7, 6, 5, 2, 3, 0, 1]");

    // Same ranking, shown as document text so the snapshot is readable on its own.
    let ranked_texts = collect_field_values(&index, &rtxn, "text", &ranked_ids);
    insta::assert_debug_snapshot!(ranked_texts, @r###"
[
"\"the quickbrown fox jumps over the lazy dog\"",
"\"the quack brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy dog\"",
"\"the really quick brown fox jumps over the lazy dog\"",
"\"the really quick brown fox jumps over the very lazy dog\"",
"\"brown quick fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy. dog\"",
"\"dog the quick brown fox jumps over the lazy\"",
"\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
"\"the. quick brown fox jumps over the lazy. dog\"",
]
"###);
}
/// Pins the ranking of the `sun flower` documents of the edge-cases index for
/// three queries: the single word `sunflower`, the quoted phrase
/// `"sun flower"`, and `xyz` once it has been declared as a synonym of
/// `sun flower`. Each query is followed by `wilting` and uses
/// `TermsMatchingStrategy::All`.
#[test]
fn test_proximity_split_word() {
    let index = create_edge_cases_index();
    let rtxn = index.read_txn().unwrap();

    // 1. Single word query.
    let mut word_search = Search::new(&rtxn, &index);
    word_search.terms_matching_strategy(TermsMatchingStrategy::All);
    word_search.query("sunflower wilting");
    let SearchResult { documents_ids: word_ids, .. } = word_search.execute().unwrap();
    insta::assert_snapshot!(format!("{word_ids:?}"), @"[2, 4, 5, 1, 3]");
    let word_texts = collect_field_values(&index, &rtxn, "text", &word_ids);
    // TODO: ideally documents "2" and "4" would be swapped
    insta::assert_debug_snapshot!(word_texts, @r###"
[
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
"\"sun flower wilting under the heat\"",
"\"sunflower wilting under the heat\"",
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
"\"A flower wilting under the sun, unlike a sunflower\"",
]
"###);

    // 2. Phrase query.
    let mut phrase_search = Search::new(&rtxn, &index);
    phrase_search.terms_matching_strategy(TermsMatchingStrategy::All);
    phrase_search.query("\"sun flower\" wilting");
    let SearchResult { documents_ids: phrase_ids, .. } = phrase_search.execute().unwrap();
    insta::assert_snapshot!(format!("{phrase_ids:?}"), @"[2, 4, 1]");
    let phrase_texts = collect_field_values(&index, &rtxn, "text", &phrase_ids);
    // TODO: ideally documents "2" and "4" would be swapped
    insta::assert_debug_snapshot!(phrase_texts, @r###"
[
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
"\"sun flower wilting under the heat\"",
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
]
"###);

    // 3. Synonym query: the read transaction is released before the settings
    // are updated, and a new one is opened afterwards.
    drop(rtxn);
    index
        .update_settings(|settings| {
            let synonyms = HashMap::from([("xyz".to_owned(), vec!["sun flower".to_owned()])]);
            settings.set_synonyms(synonyms);
        })
        .unwrap();
    let rtxn = index.read_txn().unwrap();

    let mut synonym_search = Search::new(&rtxn, &index);
    synonym_search.terms_matching_strategy(TermsMatchingStrategy::All);
    synonym_search.query("xyz wilting");
    let SearchResult { documents_ids: synonym_ids, .. } = synonym_search.execute().unwrap();
    insta::assert_snapshot!(format!("{synonym_ids:?}"), @"[2, 4, 1]");
    let synonym_texts = collect_field_values(&index, &rtxn, "text", &synonym_ids);
    // TODO: ideally documents "2" and "4" would be swapped
    insta::assert_debug_snapshot!(synonym_texts, @r###"
[
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
"\"sun flower wilting under the heat\"",
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
]
"###);
}
/// Queries `best s` against the edge-cases index with
/// `TermsMatchingStrategy::All` and pins the resulting order of the
/// `best` / `summer` documents.
#[test]
fn test_proximity_prefix_db() {
    let index = create_edge_cases_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.query("best s");
    let SearchResult { documents_ids: ranked_ids, .. } = search.execute().unwrap();

    insta::assert_snapshot!(format!("{ranked_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11]");

    let ranked_texts = collect_field_values(&index, &rtxn, "text", &ranked_ids);
    // The order below illustrates the precision lost by relying on the prefix DB.
    insta::assert_debug_snapshot!(ranked_texts, @r###"
[
"\"this is the best summer meal\"",
"\"summer best\"",
"\"this is the best meal of summer\"",
"\"summer x best\"",
"\"this is the best meal of the summer\"",
"\"this is the best meal I have ever had in such a beautiful summer day\"",
"\"this is the best cooked meal of the summer\"",
"\"summer x y best\"",
]
"###);
}

View File

@ -21,8 +21,8 @@ if `words` doesn't exist before it.
use std::collections::HashMap;
use crate::{
index::tests::TempIndex, Criterion,
Search, SearchResult, TermsMatchingStrategy,
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
SearchResult, TermsMatchingStrategy,
};
fn create_index() -> TempIndex {
@ -130,6 +130,10 @@ fn create_index() -> TempIndex {
"id": 22,
"text": "the quick brown fox jumps over the lackadaisical dog"
},
{
"id": 23,
"text": "the quivk brown fox jumps over the lazy dog"
},
]))
.unwrap();
index
@ -151,6 +155,12 @@ fn test_no_typo() {
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
]
"###);
}
#[test]
@ -168,7 +178,14 @@ fn test_default_typo() {
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 23]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quivk brown fox jumps over the lazy dog\"",
]
"###);
// 1 typo on one word, replaced letter
let mut s = Search::new(&txn, &index);
@ -176,6 +193,12 @@ fn test_default_typo() {
s.query("the quack brown fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
]
"###);
// 1 typo on one word, missing letter, extra letter
let mut s = Search::new(&txn, &index);
@ -183,6 +206,12 @@ fn test_default_typo() {
s.query("the quicest brownest fox jummps over the laziest dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quickest brownest fox jumps over the laziest dog\"",
]
"###);
// 1 typo on one word, swapped letters
let mut s = Search::new(&txn, &index);
@ -190,6 +219,12 @@ fn test_default_typo() {
s.query("the quikc borwn fox jupms over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
]
"###);
// 1 first letter typo on a word <5 bytes, replaced letter
let mut s = Search::new(&txn, &index);
@ -211,6 +246,12 @@ fn test_default_typo() {
s.query("the quack brawn fox junps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
]
"###);
// 2 typos on words < 9 bytes
let mut s = Search::new(&txn, &index);
@ -225,6 +266,12 @@ fn test_default_typo() {
s.query("the extravant fox kyrocketed over the lamguorout dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the extravagant fox skyrocketed over the languorous dog\"",
]
"###);
// 2 typos on words >= 9 bytes: 2 extra letters in a single word, swapped letters + extra letter, replaced letters
let mut s = Search::new(&txn, &index);
@ -232,6 +279,12 @@ fn test_default_typo() {
s.query("the extravaganttt fox sktyrocnketed over the lagnuorrous dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the extravagant fox skyrocketed over the languorous dog\"",
]
"###);
}
#[test]
@ -244,6 +297,8 @@ fn test_phrase_no_typo_allowed() {
s.query("the \"quick brewn\" fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @"[]");
}
#[test]
@ -256,12 +311,20 @@ fn test_ngram_typos() {
s.query("the extra lagant fox skyrocketed over the languorous dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the extravagant fox skyrocketed over the languorous dog\"",
]
"###);
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("the ex tra lagant fox skyrocketed over the languorous dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @"[]");
}
#[test]
fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() {
@ -278,7 +341,29 @@ fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() {
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids: ids_1, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]");
insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 23, 7, 8, 9, 22, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]");
let texts = collect_field_values(&index, &txn, "text", &ids_1);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quivk brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
"\"the quick brown fox jumps over\"",
"\"the quick brown fox jumps over the lackadaisical dog\"",
"\"the quick brown fox jumps\"",
"\"the quick brown fox\"",
"\"the quick brown foxes jump over the lazy dog\"",
"\"the quick brown fax sends a letter to the dog\"",
"\"the quick brown\"",
"\"the quick\"",
"\"a fox doesn't quack, that crown goes to the duck.\"",
"\"the quickest brownest fox jumps over the laziest dog\"",
"\"the quicker browner fox jumped over the lazier dog\"",
"\"the extravagant fox skyrocketed over the languorous dog\"",
"\"the fast brownish fox jumps over the lackadaisical dog\"",
]
"###);
index
.update_settings(|s| {
@ -290,7 +375,7 @@ fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() {
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids: ids_2, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]");
insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 23, 7, 8, 9, 22, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]");
assert_eq!(ids_1, ids_2);
}
@ -307,6 +392,17 @@ fn test_typo_bucketing() {
s.query("network interconnection sunflower");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 15, 16, 17, 18, 20]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"netwolk interconections sunflawar\"",
"\"network interconnections sunflawer\"",
"\"network interconnection sunflower\"",
"\"network interconnection sun flower\"",
"\"network interconnection sunflowering\"",
"\"network interconnection sunflowar\"",
]
"###);
// Then with the typo ranking rule
drop(txn);
@ -322,12 +418,34 @@ fn test_typo_bucketing() {
s.query("network interconnection sunflower");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18, 17, 20, 15, 14]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"network interconnection sunflower\"",
"\"network interconnection sunflowering\"",
"\"network interconnection sun flower\"",
"\"network interconnection sunflowar\"",
"\"network interconnections sunflawer\"",
"\"netwolk interconections sunflawar\"",
]
"###);
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("network interconnection sun flower");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[17, 19, 16, 18, 20, 15]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"network interconnection sun flower\"",
"\"network interconnection sun flowering\"",
"\"network interconnection sunflower\"",
"\"network interconnection sunflowering\"",
"\"network interconnection sunflowar\"",
"\"network interconnections sunflawer\"",
]
"###);
}
#[test]
@ -350,7 +468,15 @@ fn test_typo_synonyms() {
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("the quick brown fox jumps over the lackadaisical dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 22, 23]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lackadaisical dog\"",
"\"the quivk brown fox jumps over the lazy dog\"",
]
"###);
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
@ -359,5 +485,13 @@ fn test_typo_synonyms() {
// TODO: is this correct? interaction of ngrams + synonyms means that the
// multi-word synonyms end up having a typo cost. This is probably not what we want.
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the fast brownish fox jumps over the lackadaisical dog\"",
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lackadaisical dog\"",
]
"###);
}

View File

@ -12,9 +12,12 @@ account by the proximity ranking rule.
7. The search is capable of returning no results if no documents match the query
*/
use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy};
use crate::{
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
SearchResult, TermsMatchingStrategy,
};
fn create_quick_brown_fox_trivial_index() -> TempIndex {
fn create_index() -> TempIndex {
let index = TempIndex::new();
index
@ -126,7 +129,7 @@ fn create_quick_brown_fox_trivial_index() -> TempIndex {
#[test]
fn test_words_tms_last_simple() {
let index = create_quick_brown_fox_trivial_index();
let index = create_index();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
@ -136,6 +139,31 @@ fn test_words_tms_last_simple() {
// 6 and 7 have the same score because "the" appears twice
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 8, 6, 7, 5, 4, 11, 12, 3]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the lazy dog\"",
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"the great quick brown fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the brown quick fox jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over\"",
"\"the quick brown fox jumps over the\"",
"\"the quick brown fox jumps\"",
"\"the quick brown fox\"",
"\"the quick brown fox talks to the lazy and slow dog\"",
"\"the quick brown fox talks to the lazy dog\"",
"\"the quick brown\"",
]
"###);
let mut s = Search::new(&txn, &index);
s.query("extravagant the quick brown fox jumps over the lazy dog");
@ -146,7 +174,7 @@ fn test_words_tms_last_simple() {
// Snapshot test for quoted phrases in a query whose plain words may be dropped:
// each assertion pins the ranked document ids, followed by the `text` values of
// those documents.
// The terms matching strategy is configured in lines not shown here; the test
// name suggests `TermsMatchingStrategy::Last` (TODO confirm against the full file).
// NOTE(review): this text is a rendered diff. Lines starting with `@ -a,b +c,d @@`
// are hunk headers marking elided context, and the two consecutive `let index = ...`
// statements are the old and new form of the same line, not two real statements.
#[test]
fn test_words_tms_last_phrase() {
let index = create_quick_brown_fox_trivial_index();
let index = create_index();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
@ -156,6 +184,21 @@ fn test_words_tms_last_phrase() {
// "The quick brown fox" is a phrase, not deleted by this term matching strategy
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 6, 7, 5, 4, 11, 12]");
// Also snapshot the `text` field of the hits so the id list above can be read
// without the fixture (values presumably follow the order of `documents_ids`).
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over\"",
"\"the quick brown fox jumps over the\"",
"\"the quick brown fox jumps\"",
"\"the quick brown fox\"",
"\"the quick brown fox talks to the lazy and slow dog\"",
"\"the quick brown fox talks to the lazy dog\"",
]
"###);
// Second scenario: two phrases in one query ("the quick brown fox" and "lazy").
let mut s = Search::new(&txn, &index);
s.query("\"the quick brown fox\" jumps over the \"lazy\" dog");
@ -165,6 +208,17 @@ fn test_words_tms_last_phrase() {
// "lazy" is a phrase, not deleted by this term matching strategy
// but words before it can be deleted
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 11, 12]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox talks to the lazy and slow dog\"",
"\"the quick brown fox talks to the lazy dog\"",
]
"###);
// Third scenario: the entire query is one quoted phrase.
let mut s = Search::new(&txn, &index);
s.query("\"the quick brown fox jumps over the lazy dog\"");
@ -173,6 +227,12 @@ fn test_words_tms_last_phrase() {
// The whole query is a phrase, no terms are removed
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
]
"###);
// Fourth scenario: same phrase, but the closing quote is missing from the query.
let mut s = Search::new(&txn, &index);
s.query("\"the quick brown fox jumps over the lazy dog");
@ -181,11 +241,17 @@ fn test_words_tms_last_phrase() {
// The whole query is still a phrase, even without closing quotes, so no terms are removed
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
]
"###);
}
// Snapshot test for plain-word queries ranked with the `Words` and `Proximity`
// criteria: each assertion pins the ranked document ids, followed by the `text`
// values of those documents.
// The terms matching strategy and the first query are set in lines not shown
// here; the test name suggests `TermsMatchingStrategy::Last` (TODO confirm).
// NOTE(review): this text is a rendered diff. Lines starting with `@ -a,b +c,d @@`
// are hunk headers marking elided context, and the two consecutive `let index = ...`
// statements are the old and new form of the same line, not two real statements.
#[test]
fn test_words_proximity_tms_last_simple() {
let index = create_quick_brown_fox_trivial_index();
let index = create_index();
// Restrict ranking to the `Words` and `Proximity` criteria, in that order.
index
.update_settings(|s| {
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
@ -200,6 +266,31 @@ fn test_words_proximity_tms_last_simple() {
// 7 is better than 6 because of the proximity between "the" and its surrounding terms
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
// Also snapshot the `text` field of the hits so the id list above can be read
// without the fixture (values presumably follow the order of `documents_ids`).
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
"\"the great quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
"\"the quick brown fox jumps over\"",
"\"the quick brown fox jumps\"",
"\"the quick brown fox\"",
"\"the quick brown fox talks to the lazy and slow dog\"",
"\"the quick brown fox talks to the lazy dog\"",
"\"the quick brown\"",
]
"###);
// Second scenario: the query has "brown" and "quick" swapped.
let mut s = Search::new(&txn, &index);
s.query("the brown quick fox jumps over the lazy dog");
@ -208,11 +299,36 @@ fn test_words_proximity_tms_last_simple() {
// 10 is better than 9 because of the proximity between "quick" and "brown"
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the brown quick fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"the quick brown fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
"\"the great quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
"\"the quick brown fox jumps over\"",
"\"the quick brown fox jumps\"",
"\"the quick brown fox\"",
"\"the quick brown fox talks to the lazy and slow dog\"",
"\"the quick brown fox talks to the lazy dog\"",
"\"the quick brown\"",
]
"###);
}
// Snapshot test for queries containing quoted phrases, ranked with the `Words`
// and `Proximity` criteria: each assertion pins the ranked document ids, followed
// by the `text` values of those documents.
// The terms matching strategy and the first query are set in lines not shown
// here; the test name suggests `TermsMatchingStrategy::Last` (TODO confirm).
// NOTE(review): this text is a rendered diff. Lines starting with `@ -a,b +c,d @@`
// are hunk headers marking elided context, and the two consecutive `let index = ...`
// statements are the old and new form of the same line, not two real statements.
#[test]
fn test_words_proximity_tms_last_phrase() {
let index = create_quick_brown_fox_trivial_index();
let index = create_index();
// Restrict ranking to the `Words` and `Proximity` criteria, in that order.
index
.update_settings(|s| {
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
@ -228,6 +344,26 @@ fn test_words_proximity_tms_last_phrase() {
// "quick brown" is a phrase. The proximity of its first and last words
// to their adjacent query words should be taken into account
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5, 4, 11, 12, 3]");
// Also snapshot the `text` field of the hits so the id list above can be read
// without the fixture (values presumably follow the order of `documents_ids`).
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
"\"the great quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
"\"the quick brown fox jumps over\"",
"\"the quick brown fox jumps\"",
"\"the quick brown fox\"",
"\"the quick brown fox talks to the lazy and slow dog\"",
"\"the quick brown fox talks to the lazy dog\"",
"\"the quick brown\"",
]
"###);
// Second scenario: two adjacent phrases, "quick brown" and "fox jumps".
let mut s = Search::new(&txn, &index);
s.query("the \"quick brown\" \"fox jumps\" over the lazy dog");
@ -238,11 +374,27 @@ fn test_words_proximity_tms_last_phrase() {
// to their adjacent query words should be taken into account.
// The same applies to `fox jumps`.
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
"\"the great quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
"\"the quick brown fox jumps over\"",
"\"the quick brown fox jumps\"",
]
"###);
}
// Snapshot test ranked with the `Words` and `Proximity` criteria: the first
// assertion pins a non-empty ranked result, the second pins an empty result for
// the query "extravagant".
// The terms matching strategy and the first query are set in lines not shown
// here; the test name suggests `TermsMatchingStrategy::All` (TODO confirm).
// NOTE(review): this text is a rendered diff. Lines starting with `@ -a,b +c,d @@`
// are hunk headers marking elided context, and the two consecutive `let index = ...`
// statements are the old and new form of the same line, not two real statements.
#[test]
fn test_words_tms_all() {
let index = create_quick_brown_fox_trivial_index();
let index = create_index();
// Restrict ranking to the `Words` and `Proximity` criteria, in that order.
index
.update_settings(|s| {
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
@ -256,6 +408,23 @@ fn test_words_tms_all() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]");
// Also snapshot the `text` field of the hits so the id list above can be read
// without the fixture (values presumably follow the order of `documents_ids`).
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
"\"the great quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
]
"###);
// Second scenario: a single-word query that is expected to return no documents.
let mut s = Search::new(&txn, &index);
s.query("extravagant");
@ -263,4 +432,6 @@ fn test_words_tms_all() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @"[]");
}