From c8af57269748d430c7b92e88ef9ee18895b056b3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 26 Apr 2023 16:10:26 +0200 Subject: [PATCH 1/2] Add tests for exact words and exact attributes --- milli/src/search/new/tests/typo.rs | 242 ++++++++++++++++++++--------- 1 file changed, 165 insertions(+), 77 deletions(-) diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs index 4df340e9b..33b165a94 100644 --- a/milli/src/search/new/tests/typo.rs +++ b/milli/src/search/new/tests/typo.rs @@ -20,10 +20,9 @@ if `words` doesn't exist before it. use std::collections::HashMap; -use crate::{ - index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, - SearchResult, TermsMatchingStrategy, -}; +use crate::index::tests::TempIndex; +use crate::search::new::tests::collect_field_values; +use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -134,6 +133,14 @@ fn create_index() -> TempIndex { "id": 23, "text": "the quivk brown fox jumps over the lazy dog" }, + { + "id": 24, + "tolerant_text": "the quick brown fox jumps over the lazy dog", + }, + { + "id": 25, + "tolerant_text": "the quivk brown fox jumps over the lazy dog", + }, ])) .unwrap(); index @@ -212,79 +219,6 @@ fn test_default_typo() { "\"the quickest brownest fox jumps over the laziest dog\"", ] "###); - - // 1 typo on one word, swapped letters - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the quikc borwn fox jupms over the lazy dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); - let texts = collect_field_values(&index, &txn, "text", &documents_ids); - insta::assert_debug_snapshot!(texts, @r###" - [ - "\"the quick brown fox jumps over the lazy dog\"", - ] - "###); - - // 1 first letter typo on a word <5 bytes, replaced letter - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the nuick brown fox jumps over the lazy dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); - - // 1 first letter typo on a word <5 bytes, missing letter - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the uick brown fox jumps over the lazy dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); - - // 1 typo on all words >=5 bytes, replaced letters - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the quack brawn fox junps over the lazy dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); - let texts = collect_field_values(&index, &txn, "text", &documents_ids); - insta::assert_debug_snapshot!(texts, @r###" - [ - "\"the quick brown fox jumps over the lazy dog\"", - ] - "###); - - // 2 typos on words < 9 bytes - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the quckest brawnert fox jumps over the aziest dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); - - // 2 typos on words >= 9 bytes: missing letters, missing first letter, replaced letters - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the extravant fox kyrocketed over the lamguorout dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); - let texts = collect_field_values(&index, &txn, "text", &documents_ids); - insta::assert_debug_snapshot!(texts, @r###" - [ - "\"the extravagant fox skyrocketed over the languorous dog\"", - ] - "###); - - // 2 typos on words >= 9 bytes: 2 extra letters in a single word, swapped letters + extra letter, replaced letters - let mut s = Search::new(&txn, &index); - s.terms_matching_strategy(TermsMatchingStrategy::All); - s.query("the extravaganttt fox sktyrocnketed over the lagnuorrous dog"); - let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); - let texts = collect_field_values(&index, &txn, "text", &documents_ids); - insta::assert_debug_snapshot!(texts, @r###" - [ - "\"the extravagant fox skyrocketed over the languorous dog\"", - ] - "###); } #[test] @@ -301,6 +235,160 @@ fn test_phrase_no_typo_allowed() { insta::assert_debug_snapshot!(texts, @"[]"); } +#[test] +fn test_typo_exact_word() { + let index = create_index(); + + index + .update_settings(|s| { + s.set_exact_words( + ["quick", "quack", "sunflower"].iter().map(ToString::to_string).collect(), + ) + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let ot = index.min_word_len_one_typo(&txn).unwrap(); + let tt = index.min_word_len_two_typos(&txn).unwrap(); + insta::assert_debug_snapshot!(ot, @"5"); + insta::assert_debug_snapshot!(tt, @"9"); + + // don't match quivk + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); + + // Don't match quick + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quack brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + + // words not in exact_words (quicest, jummps) have normal typo handling + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quicest brownest fox jummps over the laziest dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quickest brownest fox jumps over the laziest dog\"", + ] + "###); + + // exact words do not disable prefix (sunflowering OK, but no sunflowar or sun flower) + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("network interconnection sunflower"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"network interconnection sunflower\"", + "\"network interconnection sunflowering\"", + ] + "###); +} + +#[test] +fn test_typo_exact_attribute() { + let index = create_index(); + + index + .update_settings(|s| { + s.set_exact_attributes(["text"].iter().map(ToString::to_string).collect()); + s.set_searchable_fields( + ["text", "tolerant_text"].iter().map(ToString::to_string).collect(), + ); + s.set_exact_words(["quivk"].iter().map(ToString::to_string).collect()) + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let ot = index.min_word_len_one_typo(&txn).unwrap(); + let tt = index.min_word_len_two_typos(&txn).unwrap(); + insta::assert_debug_snapshot!(ot, @"5"); + insta::assert_debug_snapshot!(tt, @"9"); + + // Exact match returns both exact attributes and tolerant ones. + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 24, 25]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let texts = collect_field_values(&index, &txn, "tolerant_text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "__does_not_exist__", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quivk brown fox jumps over the lazy dog\"", + ] + "###); + + // 1 typo only returns the tolerant attribute + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quidk brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[24, 25]"); + let texts = collect_field_values(&index, &txn, "tolerant_text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quivk brown fox jumps over the lazy dog\"", + ] + "###); + + // combine with exact words + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quivk brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 25]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quivk brown fox jumps over the lazy dog\"", + "__does_not_exist__", + ] + "###); + let texts = collect_field_values(&index, &txn, "tolerant_text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "__does_not_exist__", + "\"the quivk brown fox jumps over the lazy dog\"", + ] + "###); + + // No result in tolerant attribute + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quicest brownest fox jummps over the laziest dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} + #[test] fn test_ngram_typos() { let index = create_index(); From b41a6cbd7a30071d842cb0ac019d7ec454148842 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 26 Apr 2023 16:28:17 +0200 Subject: [PATCH 2/2] Check sort criteria also in placeholder search --- milli/src/search/new/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index e9518bad5..375f7c774 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -330,6 +330,8 @@ pub fn execute_search( ctx.index.documents_ids(ctx.txn)? }; + check_sort_criteria(ctx, sort_criteria.as_ref())?; + let mut located_query_terms = None; let bucket_sort_output = if let Some(query) = query { // We make sure that the analyzer is aware of the stop words @@ -352,8 +354,6 @@ pub fn execute_search( let graph = QueryGraph::from_query(ctx, &query_terms)?; located_query_terms = Some(query_terms); - check_sort_criteria(ctx, sort_criteria.as_ref())?; - let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?;