Rename TermMatchingPolicies

This commit is contained in:
ManyTheFish 2022-08-18 17:36:08 +02:00
parent 60a7221827
commit 9640976c79
10 changed files with 222 additions and 125 deletions

View File

@ -42,7 +42,7 @@ pub use self::heed_codec::{
pub use self::index::Index; pub use self::index::Index;
pub use self::search::{ pub use self::search::{
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord,
MatchingWords, Search, SearchResult, DEFAULT_VALUES_PER_FACET, MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
}; };
pub type Result<T> = std::result::Result<T, error::Error>; pub type Result<T> = std::result::Result<T, error::Error>;

View File

@ -44,7 +44,7 @@ pub struct Search<'a> {
offset: usize, offset: usize,
limit: usize, limit: usize,
sort_criteria: Option<Vec<AscDesc>>, sort_criteria: Option<Vec<AscDesc>>,
optional_words: bool, optional_words: TermsMatchingStrategy,
authorize_typos: bool, authorize_typos: bool,
words_limit: usize, words_limit: usize,
rtxn: &'a heed::RoTxn<'a>, rtxn: &'a heed::RoTxn<'a>,
@ -59,7 +59,7 @@ impl<'a> Search<'a> {
offset: 0, offset: 0,
limit: 20, limit: 20,
sort_criteria: None, sort_criteria: None,
optional_words: true, optional_words: TermsMatchingStrategy::default(),
authorize_typos: true, authorize_typos: true,
words_limit: 10, words_limit: 10,
rtxn, rtxn,
@ -87,7 +87,7 @@ impl<'a> Search<'a> {
self self
} }
pub fn optional_words(&mut self, value: bool) -> &mut Search<'a> { pub fn optional_words(&mut self, value: TermsMatchingStrategy) -> &mut Search<'a> {
self.optional_words = value; self.optional_words = value;
self self
} }
@ -286,6 +286,28 @@ pub struct SearchResult {
pub documents_ids: Vec<DocumentId>, pub documents_ids: Vec<DocumentId>,
} }
/// Selects which query words may be dropped, and in which order, when
/// building the query tree for a search.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TermsMatchingStrategy {
    /// Drop words starting from the end of the query.
    Last,
    /// Drop words starting from the beginning of the query.
    First,
    /// Drop the most frequent words first.
    Frequency,
    /// Drop the shortest words first.
    Size,
    /// A single query word is enough for a document to match.
    Any,
    /// Every query word is required; nothing is dropped.
    All,
}
impl Default for TermsMatchingStrategy {
fn default() -> Self {
Self::Last
}
}
pub type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>; pub type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>;
pub fn word_derivations<'c>( pub fn word_derivations<'c>(

View File

@ -1,4 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::cmp::min;
use std::{cmp, fmt, mem}; use std::{cmp, fmt, mem};
use charabia::classifier::ClassifiedTokenIter; use charabia::classifier::ClassifiedTokenIter;
@ -8,6 +9,7 @@ use roaring::RoaringBitmap;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId}; use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
use crate::search::TermsMatchingStrategy;
use crate::{Index, MatchingWords, Result}; use crate::{Index, MatchingWords, Result};
type IsOptionalWord = bool; type IsOptionalWord = bool;
@ -62,6 +64,13 @@ impl Operation {
if ops.len() == 1 { if ops.len() == 1 {
ops.pop().unwrap() ops.pop().unwrap()
} else { } else {
let ops = ops
.into_iter()
.flat_map(|o| match o {
Operation::Or(wb, children) if wb == word_branch => children,
op => vec![op],
})
.collect();
Self::Or(word_branch, ops) Self::Or(word_branch, ops)
} }
} }
@ -153,7 +162,7 @@ trait Context {
pub struct QueryTreeBuilder<'a> { pub struct QueryTreeBuilder<'a> {
rtxn: &'a heed::RoTxn<'a>, rtxn: &'a heed::RoTxn<'a>,
index: &'a Index, index: &'a Index,
optional_words: bool, optional_words: TermsMatchingStrategy,
authorize_typos: bool, authorize_typos: bool,
words_limit: Option<usize>, words_limit: Option<usize>,
exact_words: Option<fst::Set<Cow<'a, [u8]>>>, exact_words: Option<fst::Set<Cow<'a, [u8]>>>,
@ -190,7 +199,7 @@ impl<'a> QueryTreeBuilder<'a> {
Ok(Self { Ok(Self {
rtxn, rtxn,
index, index,
optional_words: true, optional_words: TermsMatchingStrategy::default(),
authorize_typos: true, authorize_typos: true,
words_limit: None, words_limit: None,
exact_words: index.exact_words(rtxn)?, exact_words: index.exact_words(rtxn)?,
@ -201,7 +210,7 @@ impl<'a> QueryTreeBuilder<'a> {
/// generated forcing all query words to be present in each matching documents /// generated forcing all query words to be present in each matching documents
/// (the criterion `words` will be ignored). /// (the criterion `words` will be ignored).
/// default value if not called: `true` /// default value if not called: `true`
pub fn optional_words(&mut self, optional_words: bool) -> &mut Self { pub fn optional_words(&mut self, optional_words: TermsMatchingStrategy) -> &mut Self {
self.optional_words = optional_words; self.optional_words = optional_words;
self self
} }
@ -323,7 +332,7 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operat
/// Main function that creates the final query tree from the primitive query. /// Main function that creates the final query tree from the primitive query.
fn create_query_tree( fn create_query_tree(
ctx: &impl Context, ctx: &impl Context,
optional_words: bool, optional_words: TermsMatchingStrategy,
authorize_typos: bool, authorize_typos: bool,
query: &[PrimitiveQueryPart], query: &[PrimitiveQueryPart],
) -> Result<Operation> { ) -> Result<Operation> {
@ -363,6 +372,7 @@ fn create_query_tree(
ctx: &impl Context, ctx: &impl Context,
authorize_typos: bool, authorize_typos: bool,
query: &[PrimitiveQueryPart], query: &[PrimitiveQueryPart],
any_words: bool,
) -> Result<Operation> { ) -> Result<Operation> {
const MAX_NGRAM: usize = 3; const MAX_NGRAM: usize = 3;
let mut op_children = Vec::new(); let mut op_children = Vec::new();
@ -415,57 +425,93 @@ fn create_query_tree(
} }
if !is_last { if !is_last {
let ngrams = ngrams(ctx, authorize_typos, tail)?; let ngrams = ngrams(ctx, authorize_typos, tail, any_words)?;
and_op_children.push(ngrams); and_op_children.push(ngrams);
} }
or_op_children.push(Operation::and(and_op_children));
if any_words {
or_op_children.push(Operation::or(false, and_op_children));
} else {
or_op_children.push(Operation::and(and_op_children));
}
} }
} }
op_children.push(Operation::or(false, or_op_children)); op_children.push(Operation::or(false, or_op_children));
} }
Ok(Operation::and(op_children)) if any_words {
} Ok(Operation::or(false, op_children))
} else {
/// Create a new branch removing the last non-phrase query parts. Ok(Operation::and(op_children))
fn optional_word(
ctx: &impl Context,
authorize_typos: bool,
query: PrimitiveQuery,
) -> Result<Operation> {
let number_phrases = query.iter().filter(|p| p.is_phrase()).count();
let mut operation_children = Vec::new();
let start = number_phrases + (number_phrases == 0) as usize;
for len in start..=query.len() {
let mut word_count = len - number_phrases;
let query: Vec<_> = query
.iter()
.filter(|p| {
if p.is_phrase() {
true
} else if word_count != 0 {
word_count -= 1;
true
} else {
false
}
})
.cloned()
.collect();
let ngrams = ngrams(ctx, authorize_typos, &query)?;
operation_children.push(ngrams);
} }
Ok(Operation::or(true, operation_children))
} }
if optional_words { let number_phrases = query.iter().filter(|p| p.is_phrase()).count();
optional_word(ctx, authorize_typos, query.to_vec()) let remove_count = query.len() - min(number_phrases, 1);
} else { if remove_count == 0 {
ngrams(ctx, authorize_typos, query) return ngrams(ctx, authorize_typos, query, false);
} }
let mut operation_children = Vec::new();
let mut query = query.to_vec();
for _ in 0..remove_count {
let pos = match optional_words {
TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false),
TermsMatchingStrategy::Any => {
let operation = Operation::Or(
true,
vec![
// branch allowing matching documents to contain any of the query words.
ngrams(ctx, authorize_typos, &query, true)?,
// branch forcing matching documents to contain all the query words,
// keeping these documents at the top of the resulting list.
ngrams(ctx, authorize_typos, &query, false)?,
],
);
return Ok(operation);
}
TermsMatchingStrategy::Last => query
.iter()
.enumerate()
.filter(|(_, part)| !part.is_phrase())
.last()
.map(|(pos, _)| pos),
TermsMatchingStrategy::First => {
query.iter().enumerate().find(|(_, part)| !part.is_phrase()).map(|(pos, _)| pos)
}
TermsMatchingStrategy::Size => query
.iter()
.enumerate()
.filter(|(_, part)| !part.is_phrase())
.min_by_key(|(_, part)| match part {
PrimitiveQueryPart::Word(s, _) => s.len(),
_ => unreachable!(),
})
.map(|(pos, _)| pos),
TermsMatchingStrategy::Frequency => query
.iter()
.enumerate()
.filter(|(_, part)| !part.is_phrase())
.max_by_key(|(_, part)| match part {
PrimitiveQueryPart::Word(s, _) => {
ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value())
}
_ => unreachable!(),
})
.map(|(pos, _)| pos),
};
// compute and push the current branch on the front
operation_children.insert(0, ngrams(ctx, authorize_typos, &query, false)?);
// remove word from query before creating an new branch
match pos {
Some(pos) => query.remove(pos),
None => break,
};
}
Ok(Operation::Or(true, operation_children))
} }
/// Main function that matchings words used for crop and highlight. /// Main function that matchings words used for crop and highlight.
@ -750,7 +796,7 @@ mod test {
impl TestContext { impl TestContext {
fn build<A: AsRef<[u8]>>( fn build<A: AsRef<[u8]>>(
&self, &self,
optional_words: bool, optional_words: TermsMatchingStrategy,
authorize_typos: bool, authorize_typos: bool,
words_limit: Option<usize>, words_limit: Option<usize>,
query: ClassifiedTokenIter<A>, query: ClassifiedTokenIter<A>,
@ -852,8 +898,10 @@ mod test {
let query = "hey friends"; let query = "hey friends";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
OR OR
@ -869,8 +917,10 @@ mod test {
let query = "hey friends "; let query = "hey friends ";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
OR OR
@ -886,8 +936,10 @@ mod test {
let query = "hello world "; let query = "hello world ";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
OR OR
@ -911,8 +963,10 @@ mod test {
let query = "new york city "; let query = "new york city ";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
OR OR
@ -932,12 +986,11 @@ mod test {
Exact { word: "city" } Exact { word: "city" }
Tolerant { word: "newyork", max typo: 1 } Tolerant { word: "newyork", max typo: 1 }
Exact { word: "city" } Exact { word: "city" }
OR Exact { word: "nyc" }
Exact { word: "nyc" } AND
AND Exact { word: "new" }
Exact { word: "new" } Exact { word: "york" }
Exact { word: "york" } Tolerant { word: "newyorkcity", max typo: 1 }
Tolerant { word: "newyorkcity", max typo: 1 }
"###); "###);
} }
@ -946,8 +999,10 @@ mod test {
let query = "n grams "; let query = "n grams ";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
OR OR
@ -963,8 +1018,10 @@ mod test {
let query = "wordsplit fish "; let query = "wordsplit fish ";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
OR OR
@ -982,8 +1039,10 @@ mod test {
let query = "\"hey friends\" \" \" \"wooop"; let query = "\"hey friends\" \" \" \"wooop";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
AND AND
@ -997,8 +1056,10 @@ mod test {
let query = "\"hey friends. wooop wooop\""; let query = "\"hey friends. wooop wooop\"";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
AND AND
@ -1012,8 +1073,10 @@ mod test {
let query = "hey my friend "; let query = "hey my friend ";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::default(), true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
OR(WORD) OR(WORD)
@ -1043,8 +1106,10 @@ mod test {
let query = "\"hey my\""; let query = "\"hey my\"";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::default(), true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
PHRASE ["hey", "my"] PHRASE ["hey", "my"]
@ -1056,8 +1121,10 @@ mod test {
let query = r#""hey" my good "friend""#; let query = r#""hey" my good "friend""#;
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::default(), true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
OR(WORD) OR(WORD)
@ -1084,8 +1151,10 @@ mod test {
let query = "hey friends "; let query = "hey friends ";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::All, false, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
OR OR
@ -1101,8 +1170,10 @@ mod test {
let query = "\"hey my\" good friend"; let query = "\"hey my\" good friend";
let tokens = query.tokenize(); let tokens = query.tokenize();
let (query_tree, _) = let (query_tree, _) = TestContext::default()
TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); .build(TermsMatchingStrategy::All, false, Some(2), tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###" insta::assert_debug_snapshot!(query_tree, @r###"
AND AND
@ -1145,7 +1216,8 @@ mod test {
let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
let context = TestContext { exact_words, ..Default::default() }; let context = TestContext { exact_words, ..Default::default() };
let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap(); let (query_tree, _) =
context.build(TermsMatchingStrategy::All, true, Some(2), tokens).unwrap().unwrap();
assert!(matches!( assert!(matches!(
query_tree, query_tree,

View File

@ -613,6 +613,7 @@ mod tests {
use super::*; use super::*;
use crate::documents::documents_batch_reader_from_objects; use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex; use crate::index::tests::TempIndex;
use crate::search::TermsMatchingStrategy;
use crate::update::DeleteDocuments; use crate::update::DeleteDocuments;
use crate::BEU16; use crate::BEU16;
@ -1207,7 +1208,7 @@ mod tests {
let mut search = crate::Search::new(&rtxn, &index); let mut search = crate::Search::new(&rtxn, &index);
search.query("document"); search.query("document");
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
// all documents should be returned // all documents should be returned
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
assert_eq!(documents_ids.len(), 4); assert_eq!(documents_ids.len(), 4);
@ -1313,7 +1314,7 @@ mod tests {
let mut search = crate::Search::new(&rtxn, &index); let mut search = crate::Search::new(&rtxn, &index);
search.query("document"); search.query("document");
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
// all documents should be returned // all documents should be returned
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
assert_eq!(documents_ids.len(), 4); assert_eq!(documents_ids.len(), 4);
@ -1512,7 +1513,7 @@ mod tests {
let mut search = crate::Search::new(&rtxn, &index); let mut search = crate::Search::new(&rtxn, &index);
search.query("化妆包"); search.query("化妆包");
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
// only 1 document should be returned // only 1 document should be returned
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();

View File

@ -2,7 +2,7 @@ use std::collections::HashSet;
use big_s::S; use big_s::S;
use milli::update::Settings; use milli::update::Settings;
use milli::{Criterion, Search, SearchResult}; use milli::{Criterion, Search, SearchResult, TermsMatchingStrategy};
use Criterion::*; use Criterion::*;
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
@ -28,24 +28,25 @@ macro_rules! test_distinct {
search.query(search::TEST_QUERY); search.query(search::TEST_QUERY);
search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.limit(EXTERNAL_DOCUMENTS_IDS.len());
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let SearchResult { documents_ids, candidates, .. } = search.execute().unwrap(); let SearchResult { documents_ids, candidates, .. } = search.execute().unwrap();
assert_eq!(candidates.len(), $n_res); assert_eq!(candidates.len(), $n_res);
let mut distinct_values = HashSet::new(); let mut distinct_values = HashSet::new();
let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) let expected_external_ids: Vec<_> =
.into_iter() search::expected_order(&criteria, true, TermsMatchingStrategy::default(), &[])
.filter_map(|d| { .into_iter()
if distinct_values.contains(&d.$distinct) { .filter_map(|d| {
None if distinct_values.contains(&d.$distinct) {
} else { None
distinct_values.insert(d.$distinct.to_owned()); } else {
Some(d.id) distinct_values.insert(d.$distinct.to_owned());
} Some(d.id)
}) }
.collect(); })
.collect();
let documents_ids = search::internal_to_external_ids(&index, &documents_ids); let documents_ids = search::internal_to_external_ids(&index, &documents_ids);
assert_eq!(documents_ids, expected_external_ids); assert_eq!(documents_ids, expected_external_ids);

View File

@ -1,5 +1,5 @@
use either::{Either, Left, Right}; use either::{Either, Left, Right};
use milli::{Criterion, Filter, Search, SearchResult}; use milli::{Criterion, Filter, Search, SearchResult, TermsMatchingStrategy};
use Criterion::*; use Criterion::*;
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
@ -19,16 +19,17 @@ macro_rules! test_filter {
search.query(search::TEST_QUERY); search.query(search::TEST_QUERY);
search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.limit(EXTERNAL_DOCUMENTS_IDS.len());
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
search.filter(filter_conditions); search.filter(filter_conditions);
let SearchResult { documents_ids, .. } = search.execute().unwrap(); let SearchResult { documents_ids, .. } = search.execute().unwrap();
let filtered_ids = search::expected_filtered_ids($filter); let filtered_ids = search::expected_filtered_ids($filter);
let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) let expected_external_ids: Vec<_> =
.into_iter() search::expected_order(&criteria, true, TermsMatchingStrategy::default(), &[])
.filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None }) .into_iter()
.collect(); .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None })
.collect();
let documents_ids = search::internal_to_external_ids(&index, &documents_ids); let documents_ids = search::internal_to_external_ids(&index, &documents_ids);
assert_eq!(documents_ids, expected_external_ids); assert_eq!(documents_ids, expected_external_ids);

View File

@ -8,7 +8,7 @@ use heed::EnvOpenOptions;
use maplit::{hashmap, hashset}; use maplit::{hashmap, hashset};
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object}; use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
use serde::{Deserialize, Deserializer}; use serde::{Deserialize, Deserializer};
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
@ -96,7 +96,7 @@ pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> V
pub fn expected_order( pub fn expected_order(
criteria: &[Criterion], criteria: &[Criterion],
authorize_typo: bool, authorize_typo: bool,
optional_words: bool, optional_words: TermsMatchingStrategy,
sort_by: &[AscDesc], sort_by: &[AscDesc],
) -> Vec<TestDocument> { ) -> Vec<TestDocument> {
let dataset = let dataset =
@ -155,9 +155,9 @@ pub fn expected_order(
groups = std::mem::take(&mut new_groups); groups = std::mem::take(&mut new_groups);
} }
if authorize_typo && optional_words { if authorize_typo && optional_words == TermsMatchingStrategy::default() {
groups.into_iter().flatten().collect() groups.into_iter().flatten().collect()
} else if optional_words { } else if optional_words == TermsMatchingStrategy::default() {
groups.into_iter().flatten().filter(|d| d.typo_rank == 0).collect() groups.into_iter().flatten().filter(|d| d.typo_rank == 0).collect()
} else if authorize_typo { } else if authorize_typo {
groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect() groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect()

View File

@ -7,7 +7,7 @@ use itertools::Itertools;
use maplit::hashset; use maplit::hashset;
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy};
use rand::Rng; use rand::Rng;
use Criterion::*; use Criterion::*;
@ -15,8 +15,8 @@ use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
const ALLOW_TYPOS: bool = true; const ALLOW_TYPOS: bool = true;
const DISALLOW_TYPOS: bool = false; const DISALLOW_TYPOS: bool = false;
const ALLOW_OPTIONAL_WORDS: bool = true; const ALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::Last;
const DISALLOW_OPTIONAL_WORDS: bool = false; const DISALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::All;
const ASC_DESC_CANDIDATES_THRESHOLD: usize = 1000; const ASC_DESC_CANDIDATES_THRESHOLD: usize = 1000;
macro_rules! test_criterion { macro_rules! test_criterion {
@ -359,7 +359,7 @@ fn criteria_mixup() {
let SearchResult { documents_ids, .. } = search.execute().unwrap(); let SearchResult { documents_ids, .. } = search.execute().unwrap();
let expected_external_ids: Vec<_> = let expected_external_ids: Vec<_> =
search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, &[]) search::expected_order(&criteria, ALLOW_TYPOS, ALLOW_OPTIONAL_WORDS, &[])
.into_iter() .into_iter()
.map(|d| d.id) .map(|d| d.id)
.collect(); .collect();

View File

@ -1,6 +1,6 @@
use big_s::S; use big_s::S;
use milli::Criterion::{Attribute, Exactness, Proximity, Typo, Words}; use milli::Criterion::{Attribute, Exactness, Proximity, Typo, Words};
use milli::{AscDesc, Error, Member, Search, UserError}; use milli::{AscDesc, Error, Member, Search, TermsMatchingStrategy, UserError};
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
@ -15,7 +15,7 @@ fn sort_ranking_rule_missing() {
search.query(search::TEST_QUERY); search.query(search::TEST_QUERY);
search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.limit(EXTERNAL_DOCUMENTS_IDS.len());
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
search.sort_criteria(vec![AscDesc::Asc(Member::Field(S("tag")))]); search.sort_criteria(vec![AscDesc::Asc(Member::Field(S("tag")))]);
let result = search.execute(); let result = search.execute();

View File

@ -2,7 +2,7 @@ use std::collections::BTreeSet;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{Criterion, Index, Search}; use milli::{Criterion, Index, Search, TermsMatchingStrategy};
use serde_json::json; use serde_json::json;
use tempfile::tempdir; use tempfile::tempdir;
use Criterion::*; use Criterion::*;
@ -20,7 +20,7 @@ fn test_typo_tolerance_one_typo() {
search.query("zeal"); search.query("zeal");
search.limit(10); search.limit(10);
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let result = search.execute().unwrap(); let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1); assert_eq!(result.documents_ids.len(), 1);
@ -29,7 +29,7 @@ fn test_typo_tolerance_one_typo() {
search.query("zean"); search.query("zean");
search.limit(10); search.limit(10);
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let result = search.execute().unwrap(); let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 0); assert_eq!(result.documents_ids.len(), 0);
@ -47,7 +47,7 @@ fn test_typo_tolerance_one_typo() {
search.query("zean"); search.query("zean");
search.limit(10); search.limit(10);
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let result = search.execute().unwrap(); let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1); assert_eq!(result.documents_ids.len(), 1);
@ -66,7 +66,7 @@ fn test_typo_tolerance_two_typo() {
search.query("zealand"); search.query("zealand");
search.limit(10); search.limit(10);
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let result = search.execute().unwrap(); let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1); assert_eq!(result.documents_ids.len(), 1);
@ -75,7 +75,7 @@ fn test_typo_tolerance_two_typo() {
search.query("zealemd"); search.query("zealemd");
search.limit(10); search.limit(10);
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let result = search.execute().unwrap(); let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 0); assert_eq!(result.documents_ids.len(), 0);
@ -93,7 +93,7 @@ fn test_typo_tolerance_two_typo() {
search.query("zealemd"); search.query("zealemd");
search.limit(10); search.limit(10);
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let result = search.execute().unwrap(); let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1); assert_eq!(result.documents_ids.len(), 1);
@ -142,7 +142,7 @@ fn test_typo_disabled_on_word() {
search.query("zealand"); search.query("zealand");
search.limit(10); search.limit(10);
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let result = search.execute().unwrap(); let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 2); assert_eq!(result.documents_ids.len(), 2);
@ -162,7 +162,7 @@ fn test_typo_disabled_on_word() {
search.query("zealand"); search.query("zealand");
search.limit(10); search.limit(10);
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let result = search.execute().unwrap(); let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1); assert_eq!(result.documents_ids.len(), 1);
@ -182,7 +182,7 @@ fn test_disable_typo_on_attribute() {
search.query("antebelum"); search.query("antebelum");
search.limit(10); search.limit(10);
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let result = search.execute().unwrap(); let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1); assert_eq!(result.documents_ids.len(), 1);
@ -200,7 +200,7 @@ fn test_disable_typo_on_attribute() {
search.query("antebelum"); search.query("antebelum");
search.limit(10); search.limit(10);
search.authorize_typos(true); search.authorize_typos(true);
search.optional_words(true); search.optional_words(TermsMatchingStrategy::default());
let result = search.execute().unwrap(); let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 0); assert_eq!(result.documents_ids.len(), 0);