Mirror of https://github.com/meilisearch/MeiliSearch (synced 2025-07-03 20:07:09 +02:00)

Commit d1fdbb63da (parent a7a0891210)

Make all search tests pass, fix distinctAttribute bug

17 changed files with 465 additions and 327 deletions
@@ -28,7 +28,6 @@ pub struct Search<'a> {
     limit: usize,
     sort_criteria: Option<Vec<AscDesc>>,
     terms_matching_strategy: TermsMatchingStrategy,
-    authorize_typos: bool,
     words_limit: usize,
     exhaustive_number_hits: bool,
     rtxn: &'a heed::RoTxn<'a>,
@@ -44,7 +43,6 @@ impl<'a> Search<'a> {
             limit: 20,
             sort_criteria: None,
             terms_matching_strategy: TermsMatchingStrategy::default(),
-            authorize_typos: true,
             exhaustive_number_hits: false,
             words_limit: 10,
             rtxn,
@@ -77,11 +75,6 @@ impl<'a> Search<'a> {
         self
     }
 
-    pub fn authorize_typos(&mut self, value: bool) -> &mut Search<'a> {
-        self.authorize_typos = value;
-        self
-    }
-
     pub fn words_limit(&mut self, value: usize) -> &mut Search<'a> {
         self.words_limit = value;
         self
@@ -99,13 +92,6 @@ impl<'a> Search<'a> {
         self
     }
 
-    // TODO!
-    fn _is_typo_authorized(&self) -> Result<bool> {
-        let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?;
-        // only authorize typos if both the index and the query allow it.
-        Ok(self.authorize_typos && index_authorizes_typos)
-    }
-
     pub fn execute(&self) -> Result<SearchResult> {
        let mut ctx = SearchContext::new(self.index, self.rtxn);
        let PartialSearchResult { located_query_terms, candidates, documents_ids } =
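Note: for reference, the deleted helper's rule was a plain conjunction, typos are allowed only when both the index-level setting and the per-search flag say so. A trivial standalone restatement (index and transaction plumbing elided; the function name mirrors the removed one, everything else is illustrative):

fn is_typo_authorized(index_authorizes_typos: bool, search_authorizes_typos: bool) -> bool {
    // Mirrors the removed _is_typo_authorized: only authorize typos if both
    // the index and the query allow it.
    search_authorizes_typos && index_authorizes_typos
}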
@@ -142,7 +128,6 @@ impl fmt::Debug for Search<'_> {
             limit,
             sort_criteria,
             terms_matching_strategy,
-            authorize_typos,
             words_limit,
             exhaustive_number_hits,
             rtxn: _,
@@ -155,7 +140,6 @@ impl fmt::Debug for Search<'_> {
             .field("limit", limit)
             .field("sort_criteria", sort_criteria)
             .field("terms_matching_strategy", terms_matching_strategy)
-            .field("authorize_typos", authorize_typos)
             .field("exhaustive_number_hits", exhaustive_number_hits)
             .field("words_limit", words_limit)
             .finish()
@@ -231,92 +215,4 @@ mod test {
 
         assert_eq!(documents_ids, vec![1]);
     }
-
-    // #[test]
-    // fn test_is_authorized_typos() {
-    //     let index = TempIndex::new();
-    //     let mut txn = index.write_txn().unwrap();
-
-    //     let mut search = Search::new(&txn, &index);
-
-    //     // default is authorized
-    //     assert!(search.is_typo_authorized().unwrap());
-
-    //     search.authorize_typos(false);
-    //     assert!(!search.is_typo_authorized().unwrap());
-
-    //     index.put_authorize_typos(&mut txn, false).unwrap();
-    //     txn.commit().unwrap();
-
-    //     let txn = index.read_txn().unwrap();
-    //     let mut search = Search::new(&txn, &index);
-
-    //     assert!(!search.is_typo_authorized().unwrap());
-
-    //     search.authorize_typos(true);
-    //     assert!(!search.is_typo_authorized().unwrap());
-    // }
-
-    // #[test]
-    // fn test_one_typos_tolerance() {
-    //     let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-    //     let mut cache = HashMap::new();
-    //     let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap();
-
-    //     assert_eq!(found, &[("zealand".to_string(), 1)]);
-    // }
-
-    // #[test]
-    // fn test_one_typos_first_letter() {
-    //     let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-    //     let mut cache = HashMap::new();
-    //     let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap();
-
-    //     assert_eq!(found, &[]);
-    // }
-
-    // #[test]
-    // fn test_two_typos_tolerance() {
-    //     let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-    //     let mut cache = HashMap::new();
-    //     let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap();
-
-    //     assert_eq!(found, &[("zealand".to_string(), 2)]);
-    // }
-
-    // #[test]
-    // fn test_two_typos_first_letter() {
-    //     let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-    //     let mut cache = HashMap::new();
-    //     let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap();
-
-    //     assert_eq!(found, &[("zealand".to_string(), 2)]);
-    // }
-
-    // #[test]
-    // fn test_prefix() {
-    //     let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-    //     let mut cache = HashMap::new();
-    //     let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap();
-
-    //     assert_eq!(found, &[("zealand".to_string(), 0)]);
-    // }
-
-    // #[test]
-    // fn test_bad_prefix() {
-    //     let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-    //     let mut cache = HashMap::new();
-    //     let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap();
-
-    //     assert_eq!(found, &[]);
-    // }
-
-    // #[test]
-    // fn test_prefix_with_typo() {
-    //     let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-    //     let mut cache = HashMap::new();
-    //     let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap();
-
-    //     assert_eq!(found, &[("zealand".to_string(), 1)]);
-    // }
 }
@@ -88,7 +88,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
         };
     }
 
-    let mut all_candidates = RoaringBitmap::new();
+    let mut all_candidates = universe.clone();
     let mut valid_docids = vec![];
    let mut cur_offset = 0usize;
 
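Note: this line is the heart of the distinctAttribute fix. all_candidates now starts as the full universe and documents excluded by the distinct rule are subtracted, instead of being accumulated bucket by bucket, which silently dropped distinct-excluded documents from the reported candidate set. A reduced sketch of the accounting, assuming only the roaring crate; the function name is illustrative:

use roaring::RoaringBitmap;

// Hypothetical reduction of the fix: start from the whole universe and
// subtract what the distinct rule excludes, rather than OR-ing in only the
// buckets that happened to be visited.
fn candidate_accounting(universe: &RoaringBitmap, excluded: &RoaringBitmap) -> RoaringBitmap {
    let mut all_candidates = universe.clone(); // after the fix
    all_candidates -= excluded;                // distinct-excluded docids drop out
    all_candidates
}

fn main() {
    let universe: RoaringBitmap = (0..10u32).collect();
    let excluded: RoaringBitmap = [3u32, 7].into_iter().collect();
    assert_eq!(candidate_accounting(&universe, &excluded).len(), 8);
}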
@@ -162,8 +162,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
         )?;
     }
 
-    all_candidates |= &ranking_rule_universes[0];
-
     Ok(BucketSortOutput { docids: valid_docids, all_candidates })
 }
 
@@ -193,12 +191,14 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
             apply_distinct_rule(ctx, distinct_fid, &candidates)?;
         for universe in ranking_rule_universes.iter_mut() {
             *universe -= &excluded;
+            *all_candidates -= &excluded;
         }
         remaining
     } else {
         candidates.clone()
     };
+    *all_candidates |= &candidates;
 
     // if the candidates are empty, there is nothing to do;
     if candidates.is_empty() {
         return Ok(());
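Note: apply_distinct_rule splits a bucket into the documents to keep (one per distinct value) and the ones to exclude, and the hunk above now propagates excluded into every remaining ranking-rule universe as well as all_candidates. A self-contained sketch of that split, with a hypothetical value_of lookup standing in for the facet-database read the real helper performs:

use std::collections::HashSet;
use roaring::RoaringBitmap;

struct DistinctOutput {
    remaining: RoaringBitmap, // first document seen for each distinct value
    excluded: RoaringBitmap,  // everything else; must leave all universes
}

// `value_of` is a stand-in: the real code resolves the distinct field's value
// for each docid from the index's facet databases.
fn apply_distinct(candidates: &RoaringBitmap, value_of: impl Fn(u32) -> u64) -> DistinctOutput {
    let mut seen = HashSet::new();
    let mut out = DistinctOutput { remaining: RoaringBitmap::new(), excluded: RoaringBitmap::new() };
    for docid in candidates {
        if seen.insert(value_of(docid)) {
            out.remaining.insert(docid);
        } else {
            out.excluded.insert(docid);
        }
    }
    out
}

Subtracting excluded from all_candidates as well is what keeps the reported candidate set consistent with the universe.clone() initialization above.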
@@ -216,8 +216,8 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
         );
     } else {
         // otherwise, skip some of the documents and add some of the rest, in order of ids
-        let all_candidates = candidates.iter().collect::<Vec<_>>();
-        let (skipped_candidates, candidates) = all_candidates.split_at(from - *cur_offset);
+        let candidates_vec = candidates.iter().collect::<Vec<_>>();
+        let (skipped_candidates, candidates) = candidates_vec.split_at(from - *cur_offset);
 
         logger.skip_bucket_ranking_rule(
             cur_ranking_rule_index,
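Note: the rename matters because the temporary previously shadowed the outer all_candidates bitmap that the fix now mutates. The offset bookkeeping itself is just a split at from - cur_offset; a minimal sketch (slice types simplified, names illustrative):

// Sketch of the in-bucket pagination: documents below the requested `from`
// offset are skipped, the rest are kept, all in docid order.
fn split_bucket(bucket: &[u32], from: usize, cur_offset: usize) -> (&[u32], &[u32]) {
    bucket.split_at(from - cur_offset)
}

fn main() {
    let bucket = [10, 11, 12, 13];
    let (skipped, kept) = split_bucket(&bucket, 6, 4); // offset 6, bucket starts at 4
    assert_eq!((skipped, kept), (&bucket[..2], &bucket[2..]));
}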
@@ -243,7 +243,7 @@ pub(crate) mod tests {
         let temp_index = TempIndex::new();
         temp_index
             .add_documents(documents!([
-                { "id": 1, "name": "split this world westfali westfalia the" },
+                { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
             ]))
             .unwrap();
         temp_index
@@ -305,7 +305,7 @@ pub(crate) mod tests {
                     ..Default::default()
                 })
                 .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+            None
         );
         assert_eq!(
             matching_words
@@ -599,7 +599,7 @@ mod tests {
         // no crop should return complete text with highlighted matches.
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"<em>Ŵôřlḑ</em>ôle"
+            @"<em>Ŵôřlḑôle</em>"
         );
 
         // Text containing unicode match.
@@ -621,7 +621,7 @@ mod tests {
         // no crop should return complete text with highlighted matches.
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"<em>Westfáli</em>a"
+            @"<em>Westfália</em>"
         );
     }
 
@@ -184,11 +184,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
     for rr in settings_ranking_rules {
         // Add Words before any of: typo, proximity, attribute, exactness
         match rr {
-            crate::Criterion::Typo
-            | crate::Criterion::Attribute
-            | crate::Criterion::Proximity
-            // TODO: no exactness
-            | crate::Criterion::Exactness => {
+            crate::Criterion::Typo | crate::Criterion::Attribute | crate::Criterion::Proximity => {
                 if !words {
                     ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
                     words = true;
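Note: the loop inserts the words rule once, just before the first of typo, attribute, or proximity; after this hunk, exactness no longer triggers the insertion. A standalone sketch of that ordering pass, with a simplified local Criterion enum standing in for crate::Criterion and strings standing in for boxed ranking rules:

// Simplified stand-in for milli's Criterion enum.
#[derive(Debug)]
enum Criterion { Typo, Proximity, Attribute, Exactness }

fn plan_rules(settings: &[Criterion]) -> Vec<String> {
    let mut rules = Vec::new();
    let mut words = false;
    for rr in settings {
        match rr {
            Criterion::Typo | Criterion::Attribute | Criterion::Proximity => {
                if !words {
                    // Ensure `words` always runs before these rules.
                    rules.push("words".to_string());
                    words = true;
                }
                rules.push(format!("{rr:?}").to_lowercase());
            }
            other => rules.push(format!("{other:?}").to_lowercase()),
        }
    }
    rules
}

fn main() {
    // Exactness no longer forces `words` in; typo still does.
    let planned = plan_rules(&[Criterion::Typo, Criterion::Exactness]);
    assert_eq!(planned, ["words", "typo", "exactness"]);
}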
@@ -339,6 +335,8 @@ pub fn execute_search(
 
     check_sort_criteria(ctx, sort_criteria.as_ref())?;
 
+    // TODO: if the exactness criterion is the first one, then
+    // use a different strategy to find the universe (union of any term)
     universe = resolve_maximally_reduced_query_graph(
         ctx,
         &universe,
@@ -56,8 +56,13 @@ impl RankingRuleGraphTrait for PositionGraph {
             }
 
             for phrase in term.term_subset.all_phrases(ctx)? {
-                for &word in phrase.words(ctx).iter().flatten() {
-                    let positions = ctx.get_db_word_positions(word)?;
+                // Only check the position of the first word in the phrase
+                // this is not correct, but it is the best we can do, since
+                // it is difficult/impossible to know the expected position
+                // of a word in a phrase.
+                // There is probably a more correct way to do it though.
+                if let Some(word) = phrase.words(ctx).iter().flatten().next() {
+                    let positions = ctx.get_db_word_positions(*word)?;
                     all_positions.extend(positions);
                 }
             }
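Note: since a phrase's in-document position cannot be derived cheaply, only its first word's positions are collected as an approximation. A reduced sketch of that heuristic; the word list and the position lookup are stubbed, while the real code goes through SearchContext:

// Phrase words are Options in milli (None marks a removed stop word);
// here they are &str options for brevity, and `positions_of` is hypothetical.
fn first_word_positions(
    phrase_words: &[Option<&str>],
    positions_of: impl Fn(&str) -> Vec<u16>,
) -> Vec<u16> {
    // Take the first real word and use its positions for the whole phrase.
    phrase_words
        .iter()
        .flatten()
        .next()
        .map(|word| positions_of(word))
        .unwrap_or_default()
}

fn main() {
    let lookup = |w: &str| if w == "new" { vec![0, 4] } else { vec![9] };
    assert_eq!(first_word_positions(&[None, Some("new"), Some("york")], lookup), vec![0, 4]);
}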
@@ -79,11 +79,6 @@ pub fn compute_docids(
     //
     // This is an optimisation to avoid checking for an excessive number of
     // pairs.
-    // WAIT, NO.
-    // This should only be done once per node.
-    // Here, we'll potentially do is.. 16 times?
-    // Maybe we should do it at edge-build time instead.
-    // Same for the future attribute ranking rule.
     let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?;
     if right_derivs.len() > 1 {
         let universe = &universe;
@@ -190,11 +185,6 @@ fn compute_non_prefix_edges(
     docids: &mut RoaringBitmap,
     universe: &RoaringBitmap,
 ) -> Result<()> {
-    let mut used_left_phrases = BTreeSet::new();
-    let mut used_right_phrases = BTreeSet::new();
-    let mut used_left_words = BTreeSet::new();
-    let mut used_right_words = BTreeSet::new();
-
     let mut universe = universe.clone();
 
     for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
@@ -204,25 +194,19 @@ fn compute_non_prefix_edges(
             return Ok(());
         }
     }
-    if let Some(left_phrase) = left_phrase {
-        used_left_phrases.insert(left_phrase);
-    }
-    if let Some(right_phrase) = right_phrase {
-        used_right_phrases.insert(right_phrase);
-    }
 
     if let Some(new_docids) =
         ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
     {
         let new_docids = &universe & new_docids;
         if !new_docids.is_empty() {
-            used_left_words.insert(word1);
-            used_right_words.insert(word2);
             *docids |= new_docids;
         }
     }
     if backward_proximity >= 1
-        // no swapping when either term is a phrase
+        // TODO: for now, we don't do any swapping when either term is a phrase
+        // but maybe we should. We'd need to look at the first/last word of the phrase
+        // depending on the context.
         && left_phrase.is_none() && right_phrase.is_none()
     {
         if let Some(new_docids) =
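Note: stripped of the used_* bookkeeping that this commit deletes, the pair resolution reduces to: intersect the (word1, word2) posting list at the forward proximity with the current universe, and only try the swapped (word2, word1) pair when a backward proximity exists and neither side is a phrase. A sketch with the database read stubbed as a closure (lookup stands in for ctx.get_db_word_pair_proximity_docids):

use roaring::RoaringBitmap;

fn pair_proximity_docids(
    universe: &RoaringBitmap,
    lookup: impl Fn(&str, &str, u8) -> Option<RoaringBitmap>,
    (word1, word2): (&str, &str),
    (forward_proximity, backward_proximity): (u8, u8),
    any_side_is_phrase: bool,
) -> RoaringBitmap {
    let mut docids = RoaringBitmap::new();
    if let Some(new_docids) = lookup(word1, word2, forward_proximity) {
        // Keep only pair matches that are still in the current universe.
        docids |= universe & &new_docids;
    }
    // No swapping when either term is a phrase (see the TODO in the hunk).
    if backward_proximity >= 1 && !any_side_is_phrase {
        if let Some(new_docids) = lookup(word2, word1, backward_proximity) {
            docids |= universe & &new_docids;
        }
    }
    docids
}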
@@ -230,8 +214,6 @@ fn compute_non_prefix_edges(
     {
         let new_docids = &universe & new_docids;
         if !new_docids.is_empty() {
-            used_left_words.insert(word2);
-            used_right_words.insert(word1);
             *docids |= new_docids;
         }
     }
@@ -69,11 +69,16 @@ pub fn compute_query_term_subset_docids_within_field_id(
     }
 
     for phrase in term.all_phrases(ctx)? {
-        for &word in phrase.words(ctx).iter().flatten() {
-            if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word, fid)? {
-                docids |= word_fid_docids;
+        let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone();
+        // There may be false positives when resolving a phrase, so we're not
+        // guaranteed that all of its words are within a single fid.
+        // TODO: fix this?
+        if let Some(word) = phrase.words(ctx).iter().flatten().next() {
+            if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
+                phrase_docids &= word_fid_docids;
             }
         }
+        docids |= phrase_docids;
     }
 
     if let Some(word_prefix) = term.use_prefix_db(ctx) {
@@ -104,11 +109,16 @@ pub fn compute_query_term_subset_docids_within_position(
     }
 
     for phrase in term.all_phrases(ctx)? {
-        for &word in phrase.words(ctx).iter().flatten() {
-            if let Some(word_position_docids) = ctx.get_db_word_position_docids(word, position)? {
-                docids |= word_position_docids;
+        let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone();
+        // It's difficult to know the expected position of the words in the phrase,
+        // so instead we just check the first one.
+        // TODO: fix this?
+        if let Some(word) = phrase.words(ctx).iter().flatten().next() {
+            if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
+                phrase_docids &= word_position_docids;
            }
        }
+        docids |= phrase_docids;
     }
 
     if let Some(word_prefix) = term.use_prefix_db(ctx) {
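Note: both hunks above share one shape. Resolve the phrase's docids, then narrow them by the first word's posting list for the wanted fid or position, accepting possible false positives. A generic sketch over the posting-list lookup; the closure stands in for either get_db_word_fid_docids or get_db_word_position_docids:

use roaring::RoaringBitmap;

// `first_word` is the phrase's first non-stop word, if any; `posting_list`
// abstracts over the word+fid or word+position database read.
fn narrow_phrase_docids(
    phrase_docids: &RoaringBitmap,
    first_word: Option<&str>,
    posting_list: impl Fn(&str) -> Option<RoaringBitmap>,
) -> RoaringBitmap {
    let mut docids = phrase_docids.clone();
    if let Some(word) = first_word {
        if let Some(word_docids) = posting_list(word) {
            // Keep only phrase matches whose first word occurs in the wanted
            // fid/position; false positives can survive, as the TODOs note.
            docids &= word_docids;
        }
    }
    docids
}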
@@ -1229,7 +1229,6 @@ mod tests {
         // testing the simple query search
         let mut search = crate::Search::new(&rtxn, &index);
         search.query("document");
-        search.authorize_typos(true);
         search.terms_matching_strategy(TermsMatchingStrategy::default());
         // all documents should be returned
         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
@@ -1335,7 +1334,6 @@ mod tests {
         // testing the simple query search
         let mut search = crate::Search::new(&rtxn, &index);
         search.query("document");
-        search.authorize_typos(true);
         search.terms_matching_strategy(TermsMatchingStrategy::default());
         // all documents should be returned
         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
@@ -1582,7 +1580,6 @@ mod tests {
 
         let mut search = crate::Search::new(&rtxn, &index);
         search.query("化妆包");
-        search.authorize_typos(true);
         search.terms_matching_strategy(TermsMatchingStrategy::default());
 
         // only 1 document should be returned