mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 14:04:31 +01:00
Merge remote-tracking branch 'origin/search-refactor-tests-doc' into search-refactor
This commit is contained in:
commit
f7d90ad19f
195
milli/src/search/new/bucket_sort.rs
Normal file
195
milli/src/search/new/bucket_sort.rs
Normal file
@ -0,0 +1,195 @@
|
|||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use super::logger::SearchLogger;
|
||||||
|
use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait};
|
||||||
|
use super::SearchContext;
|
||||||
|
use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput};
|
||||||
|
use crate::Result;
|
||||||
|
|
||||||
|
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
||||||
|
ctx: &mut SearchContext<'ctx>,
|
||||||
|
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
|
||||||
|
query: &Q,
|
||||||
|
universe: &RoaringBitmap,
|
||||||
|
from: usize,
|
||||||
|
length: usize,
|
||||||
|
logger: &mut dyn SearchLogger<Q>,
|
||||||
|
) -> Result<Vec<u32>> {
|
||||||
|
logger.initial_query(query);
|
||||||
|
logger.ranking_rules(&ranking_rules);
|
||||||
|
logger.initial_universe(universe);
|
||||||
|
|
||||||
|
let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? {
|
||||||
|
ctx.index.fields_ids_map(ctx.txn)?.id(field)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
if universe.len() < from as u64 {
|
||||||
|
return Ok(vec![]);
|
||||||
|
}
|
||||||
|
if ranking_rules.is_empty() {
|
||||||
|
if let Some(distinct_fid) = distinct_fid {
|
||||||
|
let mut excluded = RoaringBitmap::new();
|
||||||
|
let mut results = vec![];
|
||||||
|
for docid in universe.iter() {
|
||||||
|
if results.len() >= from + length {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if excluded.contains(docid) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
|
||||||
|
results.push(docid);
|
||||||
|
}
|
||||||
|
return Ok(results);
|
||||||
|
} else {
|
||||||
|
return Ok(universe.iter().skip(from).take(length).collect());
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
let ranking_rules_len = ranking_rules.len();
|
||||||
|
|
||||||
|
logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe);
|
||||||
|
ranking_rules[0].start_iteration(ctx, logger, universe, query)?;
|
||||||
|
|
||||||
|
let mut ranking_rule_universes: Vec<RoaringBitmap> =
|
||||||
|
vec![RoaringBitmap::default(); ranking_rules_len];
|
||||||
|
ranking_rule_universes[0] = universe.clone();
|
||||||
|
|
||||||
|
let mut cur_ranking_rule_index = 0;
|
||||||
|
|
||||||
|
/// Finish iterating over the current ranking rule, yielding
|
||||||
|
/// control to the parent (or finishing the search if not possible).
|
||||||
|
/// Update the candidates accordingly and inform the logger.
|
||||||
|
macro_rules! back {
|
||||||
|
() => {
|
||||||
|
assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty());
|
||||||
|
logger.end_iteration_ranking_rule(
|
||||||
|
cur_ranking_rule_index,
|
||||||
|
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||||
|
&ranking_rule_universes[cur_ranking_rule_index],
|
||||||
|
);
|
||||||
|
ranking_rule_universes[cur_ranking_rule_index].clear();
|
||||||
|
ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
|
||||||
|
if cur_ranking_rule_index == 0 {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
cur_ranking_rule_index -= 1;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut results = vec![];
|
||||||
|
let mut cur_offset = 0usize;
|
||||||
|
|
||||||
|
/// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
|
||||||
|
/// into account and inform the logger.
|
||||||
|
macro_rules! maybe_add_to_results {
|
||||||
|
($candidates:expr) => {
|
||||||
|
// First apply the distinct rule on the candidates, reducing the universes if necessary
|
||||||
|
let candidates = if let Some(distinct_fid) = distinct_fid {
|
||||||
|
let DistinctOutput { remaining, excluded } = apply_distinct_rule(ctx, distinct_fid, $candidates)?;
|
||||||
|
for universe in ranking_rule_universes.iter_mut() {
|
||||||
|
*universe -= &excluded;
|
||||||
|
}
|
||||||
|
remaining
|
||||||
|
} else {
|
||||||
|
$candidates.clone()
|
||||||
|
};
|
||||||
|
let len = candidates.len();
|
||||||
|
// if the candidates are empty, there is nothing to do;
|
||||||
|
if !candidates.is_empty() {
|
||||||
|
// if we still haven't reached the first document to return
|
||||||
|
if cur_offset < from {
|
||||||
|
// and if no document from this bucket can be returned
|
||||||
|
if cur_offset + (candidates.len() as usize) < from {
|
||||||
|
// then just skip the bucket
|
||||||
|
logger.skip_bucket_ranking_rule(
|
||||||
|
cur_ranking_rule_index,
|
||||||
|
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||||
|
&candidates,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
// otherwise, skip some of the documents and add some of the rest, in order of ids
|
||||||
|
let all_candidates = candidates.iter().collect::<Vec<_>>();
|
||||||
|
let (skipped_candidates, candidates) =
|
||||||
|
all_candidates.split_at(from - cur_offset);
|
||||||
|
logger.skip_bucket_ranking_rule(
|
||||||
|
cur_ranking_rule_index,
|
||||||
|
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||||
|
&skipped_candidates.into_iter().collect(),
|
||||||
|
);
|
||||||
|
let candidates = candidates
|
||||||
|
.iter()
|
||||||
|
.take(length - results.len())
|
||||||
|
.copied()
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
logger.add_to_results(&candidates);
|
||||||
|
results.extend(&candidates);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// if we have passed the offset already, add some of the documents (up to the limit)
|
||||||
|
let candidates =
|
||||||
|
candidates.iter().take(length - results.len()).collect::<Vec<u32>>();
|
||||||
|
logger.add_to_results(&candidates);
|
||||||
|
results.extend(&candidates);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cur_offset += len as usize;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
while results.len() < length {
|
||||||
|
// The universe for this bucket is zero or one element, so we don't need to sort
|
||||||
|
// anything, just extend the results and go back to the parent ranking rule.
|
||||||
|
if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 {
|
||||||
|
maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]);
|
||||||
|
ranking_rule_universes[cur_ranking_rule_index].clear();
|
||||||
|
back!();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? else {
|
||||||
|
back!();
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
logger.next_bucket_ranking_rule(
|
||||||
|
cur_ranking_rule_index,
|
||||||
|
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||||
|
&ranking_rule_universes[cur_ranking_rule_index],
|
||||||
|
&next_bucket.candidates,
|
||||||
|
);
|
||||||
|
|
||||||
|
debug_assert!(
|
||||||
|
ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)
|
||||||
|
);
|
||||||
|
ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
|
||||||
|
|
||||||
|
if cur_ranking_rule_index == ranking_rules_len - 1
|
||||||
|
|| next_bucket.candidates.len() <= 1
|
||||||
|
|| cur_offset + (next_bucket.candidates.len() as usize) < from
|
||||||
|
{
|
||||||
|
maybe_add_to_results!(&next_bucket.candidates);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
cur_ranking_rule_index += 1;
|
||||||
|
ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone();
|
||||||
|
logger.start_iteration_ranking_rule(
|
||||||
|
cur_ranking_rule_index,
|
||||||
|
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||||
|
&next_bucket.query,
|
||||||
|
&ranking_rule_universes[cur_ranking_rule_index],
|
||||||
|
);
|
||||||
|
ranking_rules[cur_ranking_rule_index].start_iteration(
|
||||||
|
ctx,
|
||||||
|
logger,
|
||||||
|
&next_bucket.candidates,
|
||||||
|
&next_bucket.query,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(results)
|
||||||
|
}
|
@ -1,3 +1,4 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
use std::hash::Hash;
|
use std::hash::Hash;
|
||||||
|
|
||||||
@ -24,6 +25,8 @@ pub struct DatabaseCache<'ctx> {
|
|||||||
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||||
|
|
||||||
|
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
|
||||||
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||||
pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||||
}
|
}
|
||||||
@ -51,6 +54,16 @@ impl<'ctx> DatabaseCache<'ctx> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl<'ctx> SearchContext<'ctx> {
|
impl<'ctx> SearchContext<'ctx> {
|
||||||
|
pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
|
||||||
|
if let Some(fst) = self.db_cache.words_fst.clone() {
|
||||||
|
Ok(fst)
|
||||||
|
} else {
|
||||||
|
let fst = self.index.words_fst(self.txn)?;
|
||||||
|
self.db_cache.words_fst = Some(fst.clone());
|
||||||
|
Ok(fst)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Retrieve or insert the given value in the `word_docids` database.
|
/// Retrieve or insert the given value in the `word_docids` database.
|
||||||
pub fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'ctx [u8]>> {
|
pub fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'ctx [u8]>> {
|
||||||
DatabaseCache::get_value(
|
DatabaseCache::get_value(
|
||||||
|
@ -41,7 +41,7 @@ pub fn apply_distinct_rule(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id.
|
/// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id.
|
||||||
fn distinct_single_docid(
|
pub fn distinct_single_docid(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
txn: &RoTxn,
|
txn: &RoTxn,
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
|
@ -176,6 +176,9 @@ impl<T> Interner<T> {
|
|||||||
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
|
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
|
||||||
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||||
}
|
}
|
||||||
|
pub fn freeze(self) -> FixedSizeInterner<T> {
|
||||||
|
FixedSizeInterner { stable_store: self.stable_store }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A store of values of type `T`, each linked to a value of type `From`
|
/// A store of values of type `T`, each linked to a value of type `From`
|
||||||
|
@ -8,9 +8,7 @@ use roaring::RoaringBitmap;
|
|||||||
|
|
||||||
use crate::search::new::interner::{Interned, MappedInterner};
|
use crate::search::new::interner::{Interned, MappedInterner};
|
||||||
use crate::search::new::query_graph::QueryNodeData;
|
use crate::search::new::query_graph::QueryNodeData;
|
||||||
use crate::search::new::query_term::{
|
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||||
Lazy, LocatedQueryTermSubset, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm,
|
|
||||||
};
|
|
||||||
use crate::search::new::ranking_rule_graph::{
|
use crate::search::new::ranking_rule_graph::{
|
||||||
DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph,
|
DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph,
|
||||||
RankingRuleGraphTrait, TypoCondition, TypoGraph,
|
RankingRuleGraphTrait, TypoCondition, TypoGraph,
|
||||||
@ -439,87 +437,26 @@ results.{cur_ranking_rule}{cur_activated_id} {{
|
|||||||
positions: _,
|
positions: _,
|
||||||
term_ids: _,
|
term_ids: _,
|
||||||
}) => {
|
}) => {
|
||||||
let QueryTerm {
|
|
||||||
original,
|
|
||||||
is_ngram: _,
|
|
||||||
is_prefix: _,
|
|
||||||
max_nbr_typos,
|
|
||||||
zero_typo,
|
|
||||||
one_typo,
|
|
||||||
two_typo,
|
|
||||||
} = ctx.term_interner.get(term_subset.original);
|
|
||||||
|
|
||||||
let original = ctx.word_interner.get(*original);
|
|
||||||
writeln!(
|
writeln!(
|
||||||
file,
|
file,
|
||||||
"{node_idx} : \"{original}\" {{
|
"{node_idx} : \"{}\" {{
|
||||||
shape: class
|
shape: class
|
||||||
max_nbr_typo: {max_nbr_typos}"
|
max_nbr_typo: {}",
|
||||||
|
term_subset.description(ctx),
|
||||||
|
term_subset.max_nbr_typos(ctx)
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } =
|
for w in term_subset.all_single_words_except_prefix_db(ctx).unwrap() {
|
||||||
zero_typo;
|
let w = ctx.word_interner.get(w);
|
||||||
|
writeln!(file, "{w}: word").unwrap();
|
||||||
for w in zero_typo.iter().copied() {
|
|
||||||
if term_subset.zero_typo_subset.contains_word(w) {
|
|
||||||
let w = ctx.word_interner.get(w);
|
|
||||||
writeln!(file, "\"{w}\" : 0").unwrap();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for w in prefix_of.iter().copied() {
|
for p in term_subset.all_phrases(ctx).unwrap() {
|
||||||
if term_subset.zero_typo_subset.contains_word(w) {
|
writeln!(file, "{}: phrase", p.description(ctx)).unwrap();
|
||||||
let w = ctx.word_interner.get(w);
|
|
||||||
writeln!(file, "\"{w}\" : 0P").unwrap();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
if let Some(w) = term_subset.use_prefix_db(ctx) {
|
||||||
if let Some(phrase) = phrase {
|
let w = ctx.word_interner.get(w);
|
||||||
if term_subset.zero_typo_subset.contains_phrase(*phrase) {
|
writeln!(file, "{w}: prefix db").unwrap();
|
||||||
let phrase = ctx.phrase_interner.get(*phrase);
|
|
||||||
let phrase_str = phrase.description(&ctx.word_interner);
|
|
||||||
writeln!(file, "\"{phrase_str}\" : phrase").unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for synonym in synonyms.iter().copied() {
|
|
||||||
if term_subset.zero_typo_subset.contains_phrase(synonym) {
|
|
||||||
let phrase = ctx.phrase_interner.get(synonym);
|
|
||||||
let phrase_str = phrase.description(&ctx.word_interner);
|
|
||||||
writeln!(file, "\"{phrase_str}\" : synonym").unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if let Some(use_prefix_db) = use_prefix_db {
|
|
||||||
if term_subset.zero_typo_subset.contains_word(*use_prefix_db) {
|
|
||||||
let p = ctx.word_interner.get(*use_prefix_db);
|
|
||||||
writeln!(file, "use prefix DB : {p}").unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if let Lazy::Init(one_typo) = one_typo {
|
|
||||||
let OneTypoTerm { split_words, one_typo } = one_typo;
|
|
||||||
|
|
||||||
for w in one_typo.iter().copied() {
|
|
||||||
if term_subset.one_typo_subset.contains_word(w) {
|
|
||||||
let w = ctx.word_interner.get(w);
|
|
||||||
writeln!(file, "\"{w}\" : 1").unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if let Some(split_words) = split_words {
|
|
||||||
if term_subset.one_typo_subset.contains_phrase(*split_words) {
|
|
||||||
let phrase = ctx.phrase_interner.get(*split_words);
|
|
||||||
let phrase_str = phrase.description(&ctx.word_interner);
|
|
||||||
writeln!(file, "\"{phrase_str}\" : split_words").unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if let Lazy::Init(two_typo) = two_typo {
|
|
||||||
let TwoTypoTerm { two_typos } = two_typo;
|
|
||||||
for w in two_typos.iter().copied() {
|
|
||||||
if term_subset.two_typo_subset.contains_word(w) {
|
|
||||||
let w = ctx.word_interner.get(w);
|
|
||||||
writeln!(file, "\"{w}\" : 2").unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
writeln!(file, "}}").unwrap();
|
writeln!(file, "}}").unwrap();
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
// #[cfg(test)]
|
// #[cfg(test)]
|
||||||
pub mod detailed;
|
pub mod detailed;
|
||||||
|
|
||||||
|
pub mod test_logger;
|
||||||
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::interner::{Interned, MappedInterner};
|
use super::interner::{Interned, MappedInterner};
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
mod bucket_sort;
|
||||||
mod db_cache;
|
mod db_cache;
|
||||||
mod distinct;
|
mod distinct;
|
||||||
mod graph_based_ranking_rule;
|
mod graph_based_ranking_rule;
|
||||||
@ -18,6 +19,9 @@ mod sort;
|
|||||||
// TODO: documentation + comments
|
// TODO: documentation + comments
|
||||||
mod words;
|
mod words;
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests;
|
||||||
|
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
|
||||||
use charabia::TokenizerBuilder;
|
use charabia::TokenizerBuilder;
|
||||||
@ -29,7 +33,8 @@ pub use logger::detailed::DetailedSearchLogger;
|
|||||||
pub use logger::{DefaultSearchLogger, SearchLogger};
|
pub use logger::{DefaultSearchLogger, SearchLogger};
|
||||||
use query_graph::{QueryGraph, QueryNode};
|
use query_graph::{QueryGraph, QueryNode};
|
||||||
use query_term::{located_query_terms_from_string, Phrase, QueryTerm};
|
use query_term::{located_query_terms_from_string, Phrase, QueryTerm};
|
||||||
use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
|
use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
|
||||||
|
use bucket_sort::bucket_sort;
|
||||||
use resolve_query_graph::PhraseDocIdsCache;
|
use resolve_query_graph::PhraseDocIdsCache;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use words::Words;
|
use words::Words;
|
||||||
|
@ -1,11 +1,10 @@
|
|||||||
use super::interner::{FixedSizeInterner, Interned};
|
use super::interner::{FixedSizeInterner, Interned};
|
||||||
use super::query_term::{
|
use super::query_term::{
|
||||||
self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, NTypoTermSubset,
|
self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset,
|
||||||
QueryTermSubset,
|
|
||||||
};
|
};
|
||||||
use super::small_bitmap::SmallBitmap;
|
use super::small_bitmap::SmallBitmap;
|
||||||
use super::SearchContext;
|
use super::SearchContext;
|
||||||
use crate::search::new::interner::DedupInterner;
|
use crate::search::new::interner::Interner;
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
@ -107,12 +106,7 @@ impl QueryGraph {
|
|||||||
let new_node_idx = add_node(
|
let new_node_idx = add_node(
|
||||||
&mut nodes_data,
|
&mut nodes_data,
|
||||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||||
term_subset: QueryTermSubset {
|
term_subset: QueryTermSubset::full(Interned::from_raw(term_idx as u16)),
|
||||||
original: Interned::from_raw(term_idx as u16),
|
|
||||||
zero_typo_subset: NTypoTermSubset::All,
|
|
||||||
one_typo_subset: NTypoTermSubset::All,
|
|
||||||
two_typo_subset: NTypoTermSubset::All,
|
|
||||||
},
|
|
||||||
positions: terms[term_idx].positions.clone(),
|
positions: terms[term_idx].positions.clone(),
|
||||||
term_ids: term_idx as u8..=term_idx as u8,
|
term_ids: term_idx as u8..=term_idx as u8,
|
||||||
}),
|
}),
|
||||||
@ -126,12 +120,7 @@ impl QueryGraph {
|
|||||||
let ngram_idx = add_node(
|
let ngram_idx = add_node(
|
||||||
&mut nodes_data,
|
&mut nodes_data,
|
||||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||||
term_subset: QueryTermSubset {
|
term_subset: QueryTermSubset::full(ngram.value),
|
||||||
original: ngram.value,
|
|
||||||
zero_typo_subset: NTypoTermSubset::All,
|
|
||||||
one_typo_subset: NTypoTermSubset::All,
|
|
||||||
two_typo_subset: NTypoTermSubset::All,
|
|
||||||
},
|
|
||||||
positions: ngram.positions,
|
positions: ngram.positions,
|
||||||
term_ids: term_idx as u8 - 1..=term_idx as u8,
|
term_ids: term_idx as u8 - 1..=term_idx as u8,
|
||||||
}),
|
}),
|
||||||
@ -146,12 +135,7 @@ impl QueryGraph {
|
|||||||
let ngram_idx = add_node(
|
let ngram_idx = add_node(
|
||||||
&mut nodes_data,
|
&mut nodes_data,
|
||||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||||
term_subset: QueryTermSubset {
|
term_subset: QueryTermSubset::full(ngram.value),
|
||||||
original: ngram.value,
|
|
||||||
zero_typo_subset: NTypoTermSubset::All,
|
|
||||||
one_typo_subset: NTypoTermSubset::All,
|
|
||||||
two_typo_subset: NTypoTermSubset::All,
|
|
||||||
},
|
|
||||||
positions: ngram.positions,
|
positions: ngram.positions,
|
||||||
term_ids: term_idx as u8 - 2..=term_idx as u8,
|
term_ids: term_idx as u8 - 2..=term_idx as u8,
|
||||||
}),
|
}),
|
||||||
@ -329,7 +313,7 @@ impl QueryGraph {
|
|||||||
let mut at_least_one_phrase = false;
|
let mut at_least_one_phrase = false;
|
||||||
for (node_id, node) in self.nodes.iter() {
|
for (node_id, node) in self.nodes.iter() {
|
||||||
let QueryNodeData::Term(t) = &node.data else { continue };
|
let QueryNodeData::Term(t) = &node.data else { continue };
|
||||||
if ctx.term_interner.get(t.term_subset.original).zero_typo.phrase.is_some() {
|
if t.term_subset.original_phrase(ctx).is_some() {
|
||||||
at_least_one_phrase = true;
|
at_least_one_phrase = true;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -361,29 +345,13 @@ impl QueryGraph {
|
|||||||
Build a query graph from a list of paths
|
Build a query graph from a list of paths
|
||||||
|
|
||||||
The paths are composed of source and dest terms.
|
The paths are composed of source and dest terms.
|
||||||
If the source term is `None`, then the last dest term is used
|
|
||||||
as the predecessor of the dest term. If the source is Some(_),
|
|
||||||
then an edge is built between the last dest term and the source,
|
|
||||||
and between the source and new dest term.
|
|
||||||
|
|
||||||
Note that the resulting graph will not correspond to a perfect
|
|
||||||
representation of the set of paths.
|
|
||||||
For example, consider the following paths:
|
For example, consider the following paths:
|
||||||
```txt
|
```txt
|
||||||
PATH 1 : a -> b1 -> c1 -> d -> e1
|
PATH 1 : a -> b1 -> c1 -> d -> e1
|
||||||
PATH 2 : a -> b2 -> c2 -> d -> e2
|
PATH 2 : a -> b2 -> c2 -> d -> e2
|
||||||
```
|
```
|
||||||
Then the resulting graph will be:
|
Then the resulting graph will be:
|
||||||
```txt
|
|
||||||
┌────┐ ┌────┐ ┌────┐
|
|
||||||
┌──│ b1 │──│ c1 │─┐ ┌──│ e1 │
|
|
||||||
┌────┐ │ └────┘ └────┘ │ ┌────┐ │ └────┘
|
|
||||||
│ a │─┤ ├─│ d │─┤
|
|
||||||
└────┘ │ ┌────┐ ┌────┐ │ └────┘ │ ┌────┐
|
|
||||||
└──│ b2 │──│ c2 │─┘ └──│ e2 │
|
|
||||||
└────┘ └────┘ └────┘
|
|
||||||
```
|
|
||||||
which is different from the fully correct representation:
|
|
||||||
```txt
|
```txt
|
||||||
┌────┐ ┌────┐ ┌────┐ ┌────┐
|
┌────┐ ┌────┐ ┌────┐ ┌────┐
|
||||||
┌──│ b1 │──│ c1 │───│ d │───│ e1 │
|
┌──│ b1 │──│ c1 │───│ d │───│ e1 │
|
||||||
@ -399,21 +367,51 @@ impl QueryGraph {
|
|||||||
pub fn build_from_paths(
|
pub fn build_from_paths(
|
||||||
paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
|
paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let mut node_data = DedupInterner::default();
|
let mut node_data = Interner::default();
|
||||||
let root_node = node_data.insert(QueryNodeData::Start);
|
let root_node = node_data.push(QueryNodeData::Start);
|
||||||
let end_node = node_data.insert(QueryNodeData::End);
|
let end_node = node_data.push(QueryNodeData::End);
|
||||||
|
|
||||||
|
let mut paths_with_single_terms = vec![];
|
||||||
|
|
||||||
|
for path in paths {
|
||||||
|
let mut processed_path = vec![];
|
||||||
|
let mut prev_dest_term: Option<LocatedQueryTermSubset> = None;
|
||||||
|
for (start_term, dest_term) in path {
|
||||||
|
if let Some(prev_dest_term) = prev_dest_term.take() {
|
||||||
|
if let Some(mut start_term) = start_term {
|
||||||
|
if start_term.term_ids == prev_dest_term.term_ids {
|
||||||
|
start_term.term_subset.intersect(&prev_dest_term.term_subset);
|
||||||
|
processed_path.push(start_term);
|
||||||
|
} else {
|
||||||
|
processed_path.push(prev_dest_term);
|
||||||
|
processed_path.push(start_term);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
processed_path.push(prev_dest_term);
|
||||||
|
}
|
||||||
|
} else if let Some(start_term) = start_term {
|
||||||
|
processed_path.push(start_term);
|
||||||
|
}
|
||||||
|
prev_dest_term = Some(dest_term);
|
||||||
|
}
|
||||||
|
if let Some(prev_dest_term) = prev_dest_term {
|
||||||
|
processed_path.push(prev_dest_term);
|
||||||
|
}
|
||||||
|
paths_with_single_terms.push(processed_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: make a prefix tree of the processed paths to avoid uselessly duplicating nodes
|
||||||
|
|
||||||
let mut paths_with_ids = vec![];
|
let mut paths_with_ids = vec![];
|
||||||
for path in paths {
|
for path in paths_with_single_terms {
|
||||||
let mut path_with_ids = vec![];
|
let mut path_with_ids = vec![];
|
||||||
for node in path {
|
for term in path {
|
||||||
let (start_term, end_term) = node;
|
let id = node_data.push(QueryNodeData::Term(term));
|
||||||
let src_node_id = start_term.map(|x| node_data.insert(QueryNodeData::Term(x)));
|
path_with_ids.push(Interned::from_raw(id.into_raw()));
|
||||||
let dest_node_id = node_data.insert(QueryNodeData::Term(end_term));
|
|
||||||
path_with_ids.push((src_node_id, dest_node_id));
|
|
||||||
}
|
}
|
||||||
paths_with_ids.push(path_with_ids);
|
paths_with_ids.push(path_with_ids);
|
||||||
}
|
}
|
||||||
|
|
||||||
let nodes_data = node_data.freeze();
|
let nodes_data = node_data.freeze();
|
||||||
let nodes_data_len = nodes_data.len();
|
let nodes_data_len = nodes_data.len();
|
||||||
let mut nodes = nodes_data.map_move(|n| QueryNode {
|
let mut nodes = nodes_data.map_move(|n| QueryNode {
|
||||||
@ -422,28 +420,22 @@ impl QueryGraph {
|
|||||||
successors: SmallBitmap::new(nodes_data_len),
|
successors: SmallBitmap::new(nodes_data_len),
|
||||||
});
|
});
|
||||||
|
|
||||||
let root_node = Interned::from_raw(root_node.into_raw());
|
let root_node = Interned::<QueryNode>::from_raw(root_node.into_raw());
|
||||||
let end_node = Interned::from_raw(end_node.into_raw());
|
let end_node = Interned::<QueryNode>::from_raw(end_node.into_raw());
|
||||||
|
|
||||||
for path in paths_with_ids {
|
for path in paths_with_ids {
|
||||||
let mut prev_node = root_node;
|
let mut prev_node_id = root_node;
|
||||||
for node in path {
|
for node_id in path {
|
||||||
let (start_term, dest_term) = node;
|
let prev_node = nodes.get_mut(prev_node_id);
|
||||||
let end_term = Interned::from_raw(dest_term.into_raw());
|
prev_node.successors.insert(node_id);
|
||||||
let src = if let Some(start_term) = start_term {
|
let node = nodes.get_mut(node_id);
|
||||||
let start_term = Interned::from_raw(start_term.into_raw());
|
node.predecessors.insert(prev_node_id);
|
||||||
nodes.get_mut(prev_node).successors.insert(start_term);
|
prev_node_id = node_id;
|
||||||
nodes.get_mut(start_term).predecessors.insert(prev_node);
|
|
||||||
start_term
|
|
||||||
} else {
|
|
||||||
prev_node
|
|
||||||
};
|
|
||||||
nodes.get_mut(src).successors.insert(end_term);
|
|
||||||
nodes.get_mut(end_term).predecessors.insert(src);
|
|
||||||
prev_node = end_term;
|
|
||||||
}
|
}
|
||||||
nodes.get_mut(prev_node).successors.insert(end_node);
|
let prev_node = nodes.get_mut(prev_node_id);
|
||||||
nodes.get_mut(end_node).predecessors.insert(prev_node);
|
prev_node.successors.insert(end_node);
|
||||||
|
let node = nodes.get_mut(end_node);
|
||||||
|
node.predecessors.insert(prev_node_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
QueryGraph { root_node, end_node, nodes }
|
QueryGraph { root_node, end_node, nodes }
|
||||||
|
File diff suppressed because it is too large
Load Diff
381
milli/src/search/new/query_term/compute_derivations.rs
Normal file
381
milli/src/search/new/query_term/compute_derivations.rs
Normal file
@ -0,0 +1,381 @@
|
|||||||
|
use fst::automaton::Str;
|
||||||
|
use fst::{Automaton, IntoStreamer, Streamer};
|
||||||
|
use heed::types::DecodeIgnore;
|
||||||
|
use heed::BytesDecode;
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use std::collections::BTreeSet;
|
||||||
|
use std::ops::ControlFlow;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||||
|
use crate::search::new::query_term::TwoTypoTerm;
|
||||||
|
use crate::search::new::{limits, SearchContext};
|
||||||
|
use crate::search::{build_dfa, get_first};
|
||||||
|
use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
|
pub enum NumberOfTypos {
|
||||||
|
Zero,
|
||||||
|
One,
|
||||||
|
Two,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum ZeroOrOneTypo {
|
||||||
|
Zero,
|
||||||
|
One,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Interned<QueryTerm> {
|
||||||
|
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
|
||||||
|
let s = ctx.term_interner.get_mut(self);
|
||||||
|
if s.max_nbr_typos == 0 {
|
||||||
|
s.one_typo = Lazy::Init(OneTypoTerm::default());
|
||||||
|
s.two_typo = Lazy::Init(TwoTypoTerm::default());
|
||||||
|
} else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() {
|
||||||
|
assert!(s.two_typo.is_uninit());
|
||||||
|
self.initialize_one_typo_subterm(ctx)?;
|
||||||
|
let s = ctx.term_interner.get_mut(self);
|
||||||
|
assert!(s.one_typo.is_init());
|
||||||
|
s.two_typo = Lazy::Init(TwoTypoTerm::default());
|
||||||
|
} else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
|
||||||
|
assert!(s.two_typo.is_uninit());
|
||||||
|
self.initialize_one_and_two_typo_subterm(ctx)?;
|
||||||
|
let s = ctx.term_interner.get_mut(self);
|
||||||
|
assert!(s.one_typo.is_init() && s.two_typo.is_init());
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_zero_typo_prefix_derivations(
|
||||||
|
word_interned: Interned<String>,
|
||||||
|
fst: fst::Set<Cow<[u8]>>,
|
||||||
|
word_interner: &mut DedupInterner<String>,
|
||||||
|
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let word = word_interner.get(word_interned).to_owned();
|
||||||
|
let word = word.as_str();
|
||||||
|
let prefix = Str::new(word).starts_with();
|
||||||
|
let mut stream = fst.search(prefix).into_stream();
|
||||||
|
|
||||||
|
while let Some(derived_word) = stream.next() {
|
||||||
|
let derived_word = std::str::from_utf8(derived_word)?.to_owned();
|
||||||
|
let derived_word_interned = word_interner.insert(derived_word);
|
||||||
|
if derived_word_interned != word_interned {
|
||||||
|
let cf = visit(derived_word_interned)?;
|
||||||
|
if cf.is_break() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_zero_one_typo_derivations(
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
word_interned: Interned<String>,
|
||||||
|
is_prefix: bool,
|
||||||
|
mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let fst = ctx.get_words_fst()?;
|
||||||
|
let word = ctx.word_interner.get(word_interned).to_owned();
|
||||||
|
let word = word.as_str();
|
||||||
|
|
||||||
|
let dfa = build_dfa(word, 1, is_prefix);
|
||||||
|
let starts = StartsWith(Str::new(get_first(word)));
|
||||||
|
let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
|
||||||
|
|
||||||
|
while let Some((derived_word, state)) = stream.next() {
|
||||||
|
let derived_word = std::str::from_utf8(derived_word)?;
|
||||||
|
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
|
||||||
|
let d = dfa.distance(state.1);
|
||||||
|
match d.to_u8() {
|
||||||
|
0 => {
|
||||||
|
if derived_word != word_interned {
|
||||||
|
let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
|
||||||
|
if cf.is_break() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
1 => {
|
||||||
|
let cf = visit(derived_word, ZeroOrOneTypo::One)?;
|
||||||
|
if cf.is_break() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
unreachable!("One typo dfa produced multiple typos")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_zero_one_two_typo_derivations(
|
||||||
|
word_interned: Interned<String>,
|
||||||
|
is_prefix: bool,
|
||||||
|
fst: fst::Set<Cow<[u8]>>,
|
||||||
|
word_interner: &mut DedupInterner<String>,
|
||||||
|
mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let word = word_interner.get(word_interned).to_owned();
|
||||||
|
let word = word.as_str();
|
||||||
|
|
||||||
|
let starts = StartsWith(Str::new(get_first(word)));
|
||||||
|
let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
|
||||||
|
let second_dfa = build_dfa(word, 2, is_prefix);
|
||||||
|
let second = Intersection(&second_dfa, &starts);
|
||||||
|
let automaton = Union(first, &second);
|
||||||
|
|
||||||
|
let mut stream = fst.search_with_state(automaton).into_stream();
|
||||||
|
|
||||||
|
while let Some((derived_word, state)) = stream.next() {
|
||||||
|
let derived_word = std::str::from_utf8(derived_word)?;
|
||||||
|
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
||||||
|
// in the case the typo is on the first letter, we know the number of typo
|
||||||
|
// is two
|
||||||
|
if get_first(derived_word) != get_first(word) {
|
||||||
|
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
|
||||||
|
if cf.is_break() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Else, we know that it is the second dfa that matched and compute the
|
||||||
|
// correct distance
|
||||||
|
let d = second_dfa.distance((state.1).0);
|
||||||
|
match d.to_u8() {
|
||||||
|
0 => {
|
||||||
|
if derived_word_interned != word_interned {
|
||||||
|
let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
|
||||||
|
if cf.is_break() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
1 => {
|
||||||
|
let cf = visit(derived_word_interned, NumberOfTypos::One)?;
|
||||||
|
if cf.is_break() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
2 => {
|
||||||
|
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
|
||||||
|
if cf.is_break() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => unreachable!("2 typos DFA produced a distance greater than 2"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partially_initialized_term_from_word(
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
word: &str,
|
||||||
|
max_typo: u8,
|
||||||
|
is_prefix: bool,
|
||||||
|
) -> Result<QueryTerm> {
|
||||||
|
let word_interned = ctx.word_interner.insert(word.to_owned());
|
||||||
|
|
||||||
|
if word.len() > MAX_WORD_LENGTH {
|
||||||
|
return Ok({
|
||||||
|
QueryTerm {
|
||||||
|
original: ctx.word_interner.insert(word.to_owned()),
|
||||||
|
ngram_words: None,
|
||||||
|
is_prefix: false,
|
||||||
|
max_nbr_typos: 0,
|
||||||
|
zero_typo: <_>::default(),
|
||||||
|
one_typo: Lazy::Init(<_>::default()),
|
||||||
|
two_typo: Lazy::Init(<_>::default()),
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let fst = ctx.index.words_fst(ctx.txn)?;
|
||||||
|
|
||||||
|
let use_prefix_db = is_prefix
|
||||||
|
&& ctx
|
||||||
|
.index
|
||||||
|
.word_prefix_docids
|
||||||
|
.remap_data_type::<DecodeIgnore>()
|
||||||
|
.get(ctx.txn, word)?
|
||||||
|
.is_some();
|
||||||
|
let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
|
||||||
|
|
||||||
|
let mut zero_typo = None;
|
||||||
|
let mut prefix_of = BTreeSet::new();
|
||||||
|
|
||||||
|
if fst.contains(word) {
|
||||||
|
zero_typo = Some(word_interned);
|
||||||
|
}
|
||||||
|
|
||||||
|
if is_prefix && use_prefix_db.is_none() {
|
||||||
|
find_zero_typo_prefix_derivations(
|
||||||
|
word_interned,
|
||||||
|
fst,
|
||||||
|
&mut ctx.word_interner,
|
||||||
|
|derived_word| {
|
||||||
|
if prefix_of.len() < limits::MAX_PREFIX_COUNT {
|
||||||
|
prefix_of.insert(derived_word);
|
||||||
|
Ok(ControlFlow::Continue(()))
|
||||||
|
} else {
|
||||||
|
Ok(ControlFlow::Break(()))
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
let synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||||
|
let mut synonym_word_count = 0;
|
||||||
|
let synonyms = synonyms
|
||||||
|
.get(&vec![word.to_owned()])
|
||||||
|
.cloned()
|
||||||
|
.unwrap_or_default()
|
||||||
|
.into_iter()
|
||||||
|
.take(limits::MAX_SYNONYM_PHRASE_COUNT)
|
||||||
|
.filter_map(|words| {
|
||||||
|
if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
synonym_word_count += words.len();
|
||||||
|
let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
|
||||||
|
Some(ctx.phrase_interner.insert(Phrase { words }))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let zero_typo =
|
||||||
|
ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db };
|
||||||
|
|
||||||
|
Ok(QueryTerm {
|
||||||
|
original: word_interned,
|
||||||
|
ngram_words: None,
|
||||||
|
max_nbr_typos: max_typo,
|
||||||
|
is_prefix,
|
||||||
|
zero_typo,
|
||||||
|
one_typo: Lazy::Uninit,
|
||||||
|
two_typo: Lazy::Uninit,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> {
|
||||||
|
if let Some((l, r)) = split_best_frequency(ctx, word)? {
|
||||||
|
Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Interned<QueryTerm> {
|
||||||
|
fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
|
||||||
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
|
let QueryTerm { original, is_prefix, one_typo, .. } = self_mut;
|
||||||
|
let original = *original;
|
||||||
|
let is_prefix = *is_prefix;
|
||||||
|
// let original_str = ctx.word_interner.get(*original).to_owned();
|
||||||
|
if one_typo.is_init() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let mut one_typo_words = BTreeSet::new();
|
||||||
|
|
||||||
|
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
|
||||||
|
match nbr_typos {
|
||||||
|
ZeroOrOneTypo::Zero => {}
|
||||||
|
ZeroOrOneTypo::One => {
|
||||||
|
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||||
|
one_typo_words.insert(derived_word);
|
||||||
|
} else {
|
||||||
|
return Ok(ControlFlow::Break(()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(ControlFlow::Continue(()))
|
||||||
|
})?;
|
||||||
|
let original_str = ctx.word_interner.get(original).to_owned();
|
||||||
|
let split_words = find_split_words(ctx, original_str.as_str())?;
|
||||||
|
let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
|
||||||
|
|
||||||
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
|
self_mut.one_typo = Lazy::Init(one_typo);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
|
||||||
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
|
let QueryTerm { original, is_prefix, two_typo, .. } = self_mut;
|
||||||
|
let original_str = ctx.word_interner.get(*original).to_owned();
|
||||||
|
if two_typo.is_init() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let mut one_typo_words = BTreeSet::new();
|
||||||
|
let mut two_typo_words = BTreeSet::new();
|
||||||
|
|
||||||
|
find_zero_one_two_typo_derivations(
|
||||||
|
*original,
|
||||||
|
*is_prefix,
|
||||||
|
ctx.index.words_fst(ctx.txn)?,
|
||||||
|
&mut ctx.word_interner,
|
||||||
|
|derived_word, nbr_typos| {
|
||||||
|
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
|
||||||
|
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
|
||||||
|
{
|
||||||
|
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
|
||||||
|
return Ok(ControlFlow::Break(()));
|
||||||
|
}
|
||||||
|
match nbr_typos {
|
||||||
|
NumberOfTypos::Zero => {}
|
||||||
|
NumberOfTypos::One => {
|
||||||
|
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||||
|
one_typo_words.insert(derived_word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
NumberOfTypos::Two => {
|
||||||
|
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
|
||||||
|
two_typo_words.insert(derived_word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(ControlFlow::Continue(()))
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
let split_words = find_split_words(ctx, original_str.as_str())?;
|
||||||
|
let self_mut = ctx.term_interner.get_mut(self);
|
||||||
|
|
||||||
|
let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
|
||||||
|
|
||||||
|
let two_typo = TwoTypoTerm { two_typos: two_typo_words };
|
||||||
|
|
||||||
|
self_mut.one_typo = Lazy::Init(one_typo);
|
||||||
|
self_mut.two_typo = Lazy::Init(two_typo);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Split the original word into the two words that appear the
|
||||||
|
/// most next to each other in the index.
|
||||||
|
///
|
||||||
|
/// Return `None` if the original word cannot be split.
|
||||||
|
fn split_best_frequency(
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
original: &str,
|
||||||
|
) -> Result<Option<(Interned<String>, Interned<String>)>> {
|
||||||
|
let chars = original.char_indices().skip(1);
|
||||||
|
let mut best = None;
|
||||||
|
|
||||||
|
for (i, _) in chars {
|
||||||
|
let (left, right) = original.split_at(i);
|
||||||
|
let left = ctx.word_interner.insert(left.to_owned());
|
||||||
|
let right = ctx.word_interner.insert(right.to_owned());
|
||||||
|
|
||||||
|
if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? {
|
||||||
|
let frequency =
|
||||||
|
CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?;
|
||||||
|
if best.map_or(true, |(old, _, _)| frequency > old) {
|
||||||
|
best = Some((frequency, left, right));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(best.map(|(_, left, right)| (left, right)))
|
||||||
|
}
|
370
milli/src/search/new/query_term/mod.rs
Normal file
370
milli/src/search/new/query_term/mod.rs
Normal file
@ -0,0 +1,370 @@
|
|||||||
|
mod compute_derivations;
|
||||||
|
mod ntypo_subset;
|
||||||
|
mod parse_query;
|
||||||
|
mod phrase;
|
||||||
|
|
||||||
|
use super::interner::{DedupInterner, Interned};
|
||||||
|
use super::{limits, SearchContext};
|
||||||
|
use crate::Result;
|
||||||
|
use std::collections::BTreeSet;
|
||||||
|
use std::ops::RangeInclusive;
|
||||||
|
|
||||||
|
use either::Either;
|
||||||
|
pub use ntypo_subset::NTypoTermSubset;
|
||||||
|
pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
|
||||||
|
pub use phrase::Phrase;
|
||||||
|
|
||||||
|
use compute_derivations::partially_initialized_term_from_word;
|
||||||
|
|
||||||
|
/// A set of word derivations attached to a location in the search query.
|
||||||
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub struct LocatedQueryTermSubset {
|
||||||
|
pub term_subset: QueryTermSubset,
|
||||||
|
pub positions: RangeInclusive<u16>,
|
||||||
|
pub term_ids: RangeInclusive<u8>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub struct QueryTermSubset {
|
||||||
|
original: Interned<QueryTerm>,
|
||||||
|
zero_typo_subset: NTypoTermSubset,
|
||||||
|
one_typo_subset: NTypoTermSubset,
|
||||||
|
two_typo_subset: NTypoTermSubset,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub struct QueryTerm {
|
||||||
|
original: Interned<String>,
|
||||||
|
ngram_words: Option<Vec<Interned<String>>>,
|
||||||
|
max_nbr_typos: u8,
|
||||||
|
is_prefix: bool,
|
||||||
|
zero_typo: ZeroTypoTerm,
|
||||||
|
// May not be computed yet
|
||||||
|
one_typo: Lazy<OneTypoTerm>,
|
||||||
|
// May not be computed yet
|
||||||
|
two_typo: Lazy<TwoTypoTerm>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// SubTerms will be in a dedup interner
|
||||||
|
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||||
|
struct ZeroTypoTerm {
|
||||||
|
/// The original phrase, if any
|
||||||
|
phrase: Option<Interned<Phrase>>,
|
||||||
|
/// A single word equivalent to the original term, with zero typos
|
||||||
|
exact: Option<Interned<String>>,
|
||||||
|
/// All the words that contain the original word as prefix
|
||||||
|
prefix_of: BTreeSet<Interned<String>>,
|
||||||
|
/// All the synonyms of the original word or phrase
|
||||||
|
synonyms: BTreeSet<Interned<Phrase>>,
|
||||||
|
/// A prefix in the prefix databases matching the original word
|
||||||
|
use_prefix_db: Option<Interned<String>>,
|
||||||
|
}
|
||||||
|
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||||
|
struct OneTypoTerm {
|
||||||
|
/// The original word split into multiple consecutive words
|
||||||
|
split_words: Option<Interned<Phrase>>,
|
||||||
|
/// Words that are 1 typo away from the original word
|
||||||
|
one_typo: BTreeSet<Interned<String>>,
|
||||||
|
}
|
||||||
|
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||||
|
struct TwoTypoTerm {
|
||||||
|
/// Words that are 2 typos away from the original word
|
||||||
|
two_typos: BTreeSet<Interned<String>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub enum Lazy<T> {
|
||||||
|
Uninit,
|
||||||
|
Init(T),
|
||||||
|
}
|
||||||
|
impl<T> Lazy<T> {
|
||||||
|
pub fn is_init(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
Lazy::Uninit => false,
|
||||||
|
Lazy::Init(_) => true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn is_uninit(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
Lazy::Uninit => true,
|
||||||
|
Lazy::Init(_) => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub enum ExactTerm {
|
||||||
|
Phrase(Interned<Phrase>),
|
||||||
|
Word(Interned<String>),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExactTerm {
|
||||||
|
pub fn interned_words<'ctx>(
|
||||||
|
&self,
|
||||||
|
ctx: &'ctx SearchContext<'ctx>,
|
||||||
|
) -> impl Iterator<Item = Option<Interned<String>>> + 'ctx {
|
||||||
|
match *self {
|
||||||
|
ExactTerm::Phrase(phrase) => {
|
||||||
|
let phrase = ctx.phrase_interner.get(phrase);
|
||||||
|
Either::Left(phrase.words.iter().copied())
|
||||||
|
}
|
||||||
|
ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl QueryTermSubset {
|
||||||
|
pub fn exact_term(&self, ctx: &SearchContext) -> Option<ExactTerm> {
|
||||||
|
let full_query_term = ctx.term_interner.get(self.original);
|
||||||
|
if full_query_term.ngram_words.is_some() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// TODO: included in subset
|
||||||
|
if let Some(phrase) = full_query_term.zero_typo.phrase {
|
||||||
|
self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
|
||||||
|
} else if let Some(word) = full_query_term.zero_typo.exact {
|
||||||
|
self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn empty(for_term: Interned<QueryTerm>) -> Self {
|
||||||
|
Self {
|
||||||
|
original: for_term,
|
||||||
|
zero_typo_subset: NTypoTermSubset::Nothing,
|
||||||
|
one_typo_subset: NTypoTermSubset::Nothing,
|
||||||
|
two_typo_subset: NTypoTermSubset::Nothing,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn full(for_term: Interned<QueryTerm>) -> Self {
|
||||||
|
Self {
|
||||||
|
original: for_term,
|
||||||
|
zero_typo_subset: NTypoTermSubset::All,
|
||||||
|
one_typo_subset: NTypoTermSubset::All,
|
||||||
|
two_typo_subset: NTypoTermSubset::All,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn union(&mut self, other: &Self) {
|
||||||
|
assert!(self.original == other.original);
|
||||||
|
self.zero_typo_subset.union(&other.zero_typo_subset);
|
||||||
|
self.one_typo_subset.union(&other.one_typo_subset);
|
||||||
|
self.two_typo_subset.union(&other.two_typo_subset);
|
||||||
|
}
|
||||||
|
pub fn intersect(&mut self, other: &Self) {
|
||||||
|
assert!(self.original == other.original);
|
||||||
|
self.zero_typo_subset.intersect(&other.zero_typo_subset);
|
||||||
|
self.one_typo_subset.intersect(&other.one_typo_subset);
|
||||||
|
self.two_typo_subset.intersect(&other.two_typo_subset);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> {
|
||||||
|
let original = ctx.term_interner.get(self.original);
|
||||||
|
let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
|
||||||
|
return None
|
||||||
|
};
|
||||||
|
match &self.zero_typo_subset {
|
||||||
|
NTypoTermSubset::All => Some(use_prefix_db),
|
||||||
|
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||||
|
// TODO: use a subset of prefix words instead
|
||||||
|
if words.contains(&use_prefix_db) {
|
||||||
|
Some(use_prefix_db)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
NTypoTermSubset::Nothing => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn all_single_words_except_prefix_db(
|
||||||
|
&self,
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
) -> Result<BTreeSet<Interned<String>>> {
|
||||||
|
let mut result = BTreeSet::default();
|
||||||
|
// TODO: a compute_partially funtion
|
||||||
|
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
|
||||||
|
self.original.compute_fully_if_needed(ctx)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let original = ctx.term_interner.get_mut(self.original);
|
||||||
|
if !self.zero_typo_subset.is_empty() {
|
||||||
|
let ZeroTypoTerm {
|
||||||
|
phrase: _,
|
||||||
|
exact: zero_typo,
|
||||||
|
prefix_of,
|
||||||
|
synonyms: _,
|
||||||
|
use_prefix_db: _,
|
||||||
|
} = &original.zero_typo;
|
||||||
|
result.extend(zero_typo.iter().copied());
|
||||||
|
result.extend(prefix_of.iter().copied());
|
||||||
|
};
|
||||||
|
|
||||||
|
match &self.one_typo_subset {
|
||||||
|
NTypoTermSubset::All => {
|
||||||
|
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
||||||
|
panic!()
|
||||||
|
};
|
||||||
|
result.extend(one_typo.iter().copied())
|
||||||
|
}
|
||||||
|
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||||
|
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
||||||
|
panic!()
|
||||||
|
};
|
||||||
|
result.extend(one_typo.intersection(words));
|
||||||
|
}
|
||||||
|
NTypoTermSubset::Nothing => {}
|
||||||
|
};
|
||||||
|
|
||||||
|
match &self.two_typo_subset {
|
||||||
|
NTypoTermSubset::All => {
|
||||||
|
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
||||||
|
panic!()
|
||||||
|
};
|
||||||
|
result.extend(two_typos.iter().copied());
|
||||||
|
}
|
||||||
|
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||||
|
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
||||||
|
panic!()
|
||||||
|
};
|
||||||
|
result.extend(two_typos.intersection(words));
|
||||||
|
}
|
||||||
|
NTypoTermSubset::Nothing => {}
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
|
||||||
|
let mut result = BTreeSet::default();
|
||||||
|
|
||||||
|
if !self.one_typo_subset.is_empty() {
|
||||||
|
// TODO: compute less than fully if possible
|
||||||
|
self.original.compute_fully_if_needed(ctx)?;
|
||||||
|
}
|
||||||
|
let original = ctx.term_interner.get_mut(self.original);
|
||||||
|
|
||||||
|
let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } =
|
||||||
|
&original.zero_typo;
|
||||||
|
result.extend(phrase.iter().copied());
|
||||||
|
result.extend(synonyms.iter().copied());
|
||||||
|
|
||||||
|
if !self.one_typo_subset.is_empty() {
|
||||||
|
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
|
||||||
|
panic!();
|
||||||
|
};
|
||||||
|
result.extend(split_words.iter().copied());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn original_phrase(&self, ctx: &SearchContext) -> Option<Interned<Phrase>> {
|
||||||
|
let t = ctx.term_interner.get(self.original);
|
||||||
|
if let Some(p) = t.zero_typo.phrase {
|
||||||
|
if self.zero_typo_subset.contains_phrase(p) {
|
||||||
|
return Some(p);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
|
||||||
|
let t = ctx.term_interner.get(self.original);
|
||||||
|
match t.max_nbr_typos {
|
||||||
|
0 => 0,
|
||||||
|
1 => {
|
||||||
|
if self.one_typo_subset.is_empty() {
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
2 => {
|
||||||
|
if self.two_typo_subset.is_empty() {
|
||||||
|
if self.one_typo_subset.is_empty() {
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
1
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => panic!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn clear_zero_typo_subset(&mut self) {
|
||||||
|
self.zero_typo_subset = NTypoTermSubset::Nothing;
|
||||||
|
}
|
||||||
|
pub fn clear_one_typo_subset(&mut self) {
|
||||||
|
self.one_typo_subset = NTypoTermSubset::Nothing;
|
||||||
|
}
|
||||||
|
pub fn clear_two_typo_subset(&mut self) {
|
||||||
|
self.two_typo_subset = NTypoTermSubset::Nothing;
|
||||||
|
}
|
||||||
|
pub fn description(&self, ctx: &SearchContext) -> String {
|
||||||
|
let t = ctx.term_interner.get(self.original);
|
||||||
|
ctx.word_interner.get(t.original).to_owned()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ZeroTypoTerm {
|
||||||
|
fn is_empty(&self) -> bool {
|
||||||
|
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self;
|
||||||
|
phrase.is_none()
|
||||||
|
&& zero_typo.is_none()
|
||||||
|
&& prefix_of.is_empty()
|
||||||
|
&& synonyms.is_empty()
|
||||||
|
&& use_prefix_db.is_none()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl OneTypoTerm {
|
||||||
|
fn is_empty(&self) -> bool {
|
||||||
|
let OneTypoTerm { split_words, one_typo } = self;
|
||||||
|
one_typo.is_empty() && split_words.is_none()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl TwoTypoTerm {
|
||||||
|
fn is_empty(&self) -> bool {
|
||||||
|
let TwoTypoTerm { two_typos } = self;
|
||||||
|
two_typos.is_empty()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl QueryTerm {
|
||||||
|
fn is_empty(&self) -> bool {
|
||||||
|
let Lazy::Init(one_typo) = &self.one_typo else {
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
let Lazy::Init(two_typo) = &self.two_typo else {
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
|
||||||
|
self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Interned<QueryTerm> {
|
||||||
|
/// Return the original word from the given query term
|
||||||
|
fn original_single_word(self, ctx: &SearchContext) -> Option<Interned<String>> {
|
||||||
|
let self_ = ctx.term_interner.get(self);
|
||||||
|
if self_.ngram_words.is_some() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(self_.original)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A query term coupled with its position in the user's search query.
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct LocatedQueryTerm {
|
||||||
|
pub value: Interned<QueryTerm>,
|
||||||
|
pub positions: RangeInclusive<u16>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LocatedQueryTerm {
|
||||||
|
/// Return `true` iff the term is empty
|
||||||
|
pub fn is_empty(&self, interner: &DedupInterner<QueryTerm>) -> bool {
|
||||||
|
interner.get(self.value).is_empty()
|
||||||
|
}
|
||||||
|
}
|
80
milli/src/search/new/query_term/ntypo_subset.rs
Normal file
80
milli/src/search/new/query_term/ntypo_subset.rs
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
|
use crate::search::new::interner::Interned;
|
||||||
|
|
||||||
|
use super::Phrase;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub enum NTypoTermSubset {
|
||||||
|
All,
|
||||||
|
Subset {
|
||||||
|
words: BTreeSet<Interned<String>>,
|
||||||
|
phrases: BTreeSet<Interned<Phrase>>,
|
||||||
|
// TODO: prefixes: BTreeSet<Interned<String>>,
|
||||||
|
},
|
||||||
|
Nothing,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NTypoTermSubset {
|
||||||
|
pub fn contains_word(&self, word: Interned<String>) -> bool {
|
||||||
|
match self {
|
||||||
|
NTypoTermSubset::All => true,
|
||||||
|
NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word),
|
||||||
|
NTypoTermSubset::Nothing => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn contains_phrase(&self, phrase: Interned<Phrase>) -> bool {
|
||||||
|
match self {
|
||||||
|
NTypoTermSubset::All => true,
|
||||||
|
NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase),
|
||||||
|
NTypoTermSubset::Nothing => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
NTypoTermSubset::All => false,
|
||||||
|
NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(),
|
||||||
|
NTypoTermSubset::Nothing => true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn union(&mut self, other: &Self) {
|
||||||
|
match self {
|
||||||
|
Self::All => {}
|
||||||
|
Self::Subset { words, phrases } => match other {
|
||||||
|
Self::All => {
|
||||||
|
*self = Self::All;
|
||||||
|
}
|
||||||
|
Self::Subset { words: w2, phrases: p2 } => {
|
||||||
|
words.extend(w2);
|
||||||
|
phrases.extend(p2);
|
||||||
|
}
|
||||||
|
Self::Nothing => {}
|
||||||
|
},
|
||||||
|
Self::Nothing => {
|
||||||
|
*self = other.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn intersect(&mut self, other: &Self) {
|
||||||
|
match self {
|
||||||
|
Self::All => *self = other.clone(),
|
||||||
|
Self::Subset { words, phrases } => match other {
|
||||||
|
Self::All => {}
|
||||||
|
Self::Subset { words: w2, phrases: p2 } => {
|
||||||
|
let mut ws = BTreeSet::new();
|
||||||
|
for w in words.intersection(w2) {
|
||||||
|
ws.insert(*w);
|
||||||
|
}
|
||||||
|
let mut ps = BTreeSet::new();
|
||||||
|
for p in phrases.intersection(p2) {
|
||||||
|
ps.insert(*p);
|
||||||
|
}
|
||||||
|
*words = ws;
|
||||||
|
*phrases = ps;
|
||||||
|
}
|
||||||
|
Self::Nothing => *self = Self::Nothing,
|
||||||
|
},
|
||||||
|
Self::Nothing => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
281
milli/src/search/new/query_term/parse_query.rs
Normal file
281
milli/src/search/new/query_term/parse_query.rs
Normal file
@ -0,0 +1,281 @@
|
|||||||
|
use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind};
|
||||||
|
|
||||||
|
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
/// Convert the tokenised search query into a list of located query terms.
|
||||||
|
// TODO: checking if the positions are correct for phrases, separators, ngrams
|
||||||
|
pub fn located_query_terms_from_string(
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
query: NormalizedTokenIter<&[u8]>,
|
||||||
|
words_limit: Option<usize>,
|
||||||
|
) -> Result<Vec<LocatedQueryTerm>> {
|
||||||
|
let nbr_typos = number_of_typos_allowed(ctx)?;
|
||||||
|
|
||||||
|
let mut located_terms = Vec::new();
|
||||||
|
|
||||||
|
let mut phrase: Option<PhraseBuilder> = None;
|
||||||
|
|
||||||
|
let parts_limit = words_limit.unwrap_or(usize::MAX);
|
||||||
|
|
||||||
|
// start with the last position as we will wrap around to position 0 at the beginning of the loop below.
|
||||||
|
let mut position = u16::MAX;
|
||||||
|
|
||||||
|
let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable();
|
||||||
|
while let Some(token) = peekable.next() {
|
||||||
|
// early return if word limit is exceeded
|
||||||
|
if located_terms.len() >= parts_limit {
|
||||||
|
return Ok(located_terms);
|
||||||
|
}
|
||||||
|
|
||||||
|
match token.kind {
|
||||||
|
TokenKind::Word | TokenKind::StopWord => {
|
||||||
|
// On first loop, goes from u16::MAX to 0, then normal increment.
|
||||||
|
position = position.wrapping_add(1);
|
||||||
|
|
||||||
|
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
|
||||||
|
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
|
||||||
|
// 3. if the word is the last token of the query we push it as a prefix word.
|
||||||
|
if let Some(phrase) = &mut phrase {
|
||||||
|
phrase.push_word(ctx, &token, position)
|
||||||
|
} else if peekable.peek().is_some() {
|
||||||
|
match token.kind {
|
||||||
|
TokenKind::Word => {
|
||||||
|
let word = token.lemma();
|
||||||
|
let term = partially_initialized_term_from_word(
|
||||||
|
ctx,
|
||||||
|
word,
|
||||||
|
nbr_typos(word),
|
||||||
|
false,
|
||||||
|
)?;
|
||||||
|
let located_term = LocatedQueryTerm {
|
||||||
|
value: ctx.term_interner.push(term),
|
||||||
|
positions: position..=position,
|
||||||
|
};
|
||||||
|
located_terms.push(located_term);
|
||||||
|
}
|
||||||
|
TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let word = token.lemma();
|
||||||
|
let term =
|
||||||
|
partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?;
|
||||||
|
let located_term = LocatedQueryTerm {
|
||||||
|
value: ctx.term_interner.push(term),
|
||||||
|
positions: position..=position,
|
||||||
|
};
|
||||||
|
located_terms.push(located_term);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
TokenKind::Separator(separator_kind) => {
|
||||||
|
match separator_kind {
|
||||||
|
SeparatorKind::Hard => {
|
||||||
|
position += 1;
|
||||||
|
}
|
||||||
|
SeparatorKind::Soft => {
|
||||||
|
position += 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
phrase = 'phrase: {
|
||||||
|
let phrase = phrase.take();
|
||||||
|
|
||||||
|
// If we have a hard separator inside a phrase, we immediately start a new phrase
|
||||||
|
let phrase = if separator_kind == SeparatorKind::Hard {
|
||||||
|
if let Some(phrase) = phrase {
|
||||||
|
if let Some(located_query_term) = phrase.build(ctx) {
|
||||||
|
located_terms.push(located_query_term)
|
||||||
|
}
|
||||||
|
Some(PhraseBuilder::empty())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
phrase
|
||||||
|
};
|
||||||
|
|
||||||
|
// We close and start a new phrase depending on the number of double quotes
|
||||||
|
let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
|
||||||
|
if quote_count == 0 {
|
||||||
|
break 'phrase phrase;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consume the closing quote and the phrase
|
||||||
|
if let Some(phrase) = phrase {
|
||||||
|
// Per the check above, quote_count > 0
|
||||||
|
quote_count -= 1;
|
||||||
|
if let Some(located_query_term) = phrase.build(ctx) {
|
||||||
|
located_terms.push(located_query_term)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start new phrase if the token ends with an opening quote
|
||||||
|
(quote_count % 2 == 1).then_some(PhraseBuilder::empty())
|
||||||
|
};
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If a quote is never closed, we consider all of the end of the query as a phrase.
|
||||||
|
if let Some(phrase) = phrase.take() {
|
||||||
|
if let Some(located_query_term) = phrase.build(ctx) {
|
||||||
|
located_terms.push(located_query_term);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(located_terms)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn number_of_typos_allowed<'ctx>(
|
||||||
|
ctx: &SearchContext<'ctx>,
|
||||||
|
) -> Result<impl Fn(&str) -> u8 + 'ctx> {
|
||||||
|
let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
|
||||||
|
let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
|
||||||
|
let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
|
||||||
|
|
||||||
|
// TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
|
||||||
|
let exact_words = ctx.index.exact_words(ctx.txn)?;
|
||||||
|
|
||||||
|
Ok(Box::new(move |word: &str| {
|
||||||
|
if !authorize_typos
|
||||||
|
|| word.len() < min_len_one_typo as usize
|
||||||
|
|| exact_words.as_ref().map_or(false, |fst| fst.contains(word))
|
||||||
|
{
|
||||||
|
0
|
||||||
|
} else if word.len() < min_len_two_typos as usize {
|
||||||
|
1
|
||||||
|
} else {
|
||||||
|
2
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn make_ngram(
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
terms: &[LocatedQueryTerm],
|
||||||
|
number_of_typos_allowed: &impl Fn(&str) -> u8,
|
||||||
|
) -> Result<Option<LocatedQueryTerm>> {
|
||||||
|
assert!(!terms.is_empty());
|
||||||
|
for t in terms {
|
||||||
|
if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for ts in terms.windows(2) {
|
||||||
|
let [t1, t2] = ts else { panic!() };
|
||||||
|
if *t1.positions.end() != t2.positions.start() - 1 {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let mut words_interned = vec![];
|
||||||
|
for term in terms {
|
||||||
|
if let Some(original_term_word) = term.value.original_single_word(ctx) {
|
||||||
|
words_interned.push(original_term_word);
|
||||||
|
} else {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let words =
|
||||||
|
words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let start = *terms.first().as_ref().unwrap().positions.start();
|
||||||
|
let end = *terms.last().as_ref().unwrap().positions.end();
|
||||||
|
let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix;
|
||||||
|
let ngram_str = words.join("");
|
||||||
|
if ngram_str.len() > MAX_WORD_LENGTH {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone());
|
||||||
|
|
||||||
|
let max_nbr_typos =
|
||||||
|
number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
|
||||||
|
|
||||||
|
let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?;
|
||||||
|
|
||||||
|
// Now add the synonyms
|
||||||
|
let index_synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||||
|
|
||||||
|
term.zero_typo.synonyms.extend(
|
||||||
|
index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| {
|
||||||
|
let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
|
||||||
|
ctx.phrase_interner.insert(Phrase { words })
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
let term = QueryTerm {
|
||||||
|
original: ngram_str_interned,
|
||||||
|
ngram_words: Some(words_interned),
|
||||||
|
is_prefix,
|
||||||
|
max_nbr_typos,
|
||||||
|
zero_typo: term.zero_typo,
|
||||||
|
one_typo: Lazy::Uninit,
|
||||||
|
two_typo: Lazy::Uninit,
|
||||||
|
};
|
||||||
|
|
||||||
|
let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end };
|
||||||
|
|
||||||
|
Ok(Some(term))
|
||||||
|
}
|
||||||
|
|
||||||
|
struct PhraseBuilder {
|
||||||
|
words: Vec<Option<Interned<String>>>,
|
||||||
|
start: u16,
|
||||||
|
end: u16,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PhraseBuilder {
|
||||||
|
fn empty() -> Self {
|
||||||
|
Self { words: Default::default(), start: u16::MAX, end: u16::MAX }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_empty(&self) -> bool {
|
||||||
|
self.words.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
// precondition: token has kind Word or StopWord
|
||||||
|
fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) {
|
||||||
|
if self.is_empty() {
|
||||||
|
self.start = position;
|
||||||
|
}
|
||||||
|
self.end = position;
|
||||||
|
if let TokenKind::StopWord = token.kind {
|
||||||
|
self.words.push(None);
|
||||||
|
} else {
|
||||||
|
// token has kind Word
|
||||||
|
let word = ctx.word_interner.insert(token.lemma().to_string());
|
||||||
|
// TODO: in a phrase, check that every word exists
|
||||||
|
// otherwise return an empty term
|
||||||
|
self.words.push(Some(word));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build(self, ctx: &mut SearchContext) -> Option<LocatedQueryTerm> {
|
||||||
|
if self.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(LocatedQueryTerm {
|
||||||
|
value: ctx.term_interner.push({
|
||||||
|
let phrase = ctx.phrase_interner.insert(Phrase { words: self.words });
|
||||||
|
let phrase_desc = phrase.description(ctx);
|
||||||
|
QueryTerm {
|
||||||
|
original: ctx.word_interner.insert(phrase_desc),
|
||||||
|
ngram_words: None,
|
||||||
|
max_nbr_typos: 0,
|
||||||
|
is_prefix: false,
|
||||||
|
zero_typo: ZeroTypoTerm {
|
||||||
|
phrase: Some(phrase),
|
||||||
|
exact: None,
|
||||||
|
prefix_of: BTreeSet::default(),
|
||||||
|
synonyms: BTreeSet::default(),
|
||||||
|
use_prefix_db: None,
|
||||||
|
},
|
||||||
|
one_typo: Lazy::Uninit,
|
||||||
|
two_typo: Lazy::Uninit,
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
positions: self.start..=self.end,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
16
milli/src/search/new/query_term/phrase.rs
Normal file
16
milli/src/search/new/query_term/phrase.rs
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
use itertools::Itertools;
|
||||||
|
|
||||||
|
use crate::{search::new::interner::Interned, SearchContext};
|
||||||
|
|
||||||
|
/// A phrase in the user's search query, consisting of several words
|
||||||
|
/// that must appear side-by-side in the search results.
|
||||||
|
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub struct Phrase {
|
||||||
|
pub words: Vec<Option<Interned<String>>>,
|
||||||
|
}
|
||||||
|
impl Interned<Phrase> {
|
||||||
|
pub fn description(self, ctx: &SearchContext) -> String {
|
||||||
|
let p = ctx.phrase_interner.get(self);
|
||||||
|
p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ")
|
||||||
|
}
|
||||||
|
}
|
@ -57,9 +57,7 @@ impl RankingRuleGraphTrait for ProximityGraph {
|
|||||||
Ok(format!("{cost}: cost"))
|
Ok(format!("{cost}: cost"))
|
||||||
}
|
}
|
||||||
ProximityCondition::Term { term } => {
|
ProximityCondition::Term { term } => {
|
||||||
let original_term = ctx.term_interner.get(term.term_subset.original);
|
Ok(format!("{} : exists", term.term_subset.description(ctx)))
|
||||||
let original_word = ctx.word_interner.get(original_term.original);
|
|
||||||
Ok(format!("{original_word} : exists"))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,7 +3,7 @@ use roaring::RoaringBitmap;
|
|||||||
use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
||||||
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
|
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
|
||||||
use crate::search::new::logger::SearchLogger;
|
use crate::search::new::logger::SearchLogger;
|
||||||
use crate::search::new::query_term::{LocatedQueryTermSubset, NTypoTermSubset};
|
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||||
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
|
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
@ -43,8 +43,7 @@ impl RankingRuleGraphTrait for TypoGraph {
|
|||||||
_from: Option<&LocatedQueryTermSubset>,
|
_from: Option<&LocatedQueryTermSubset>,
|
||||||
to_term: &LocatedQueryTermSubset,
|
to_term: &LocatedQueryTermSubset,
|
||||||
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||||
let term = to_term; // LocatedQueryTermSubset { term_subset, positions: _, term_ids } = to_term;
|
let term = to_term;
|
||||||
let original_full_term = ctx.term_interner.get(term.term_subset.original);
|
|
||||||
|
|
||||||
let mut edges = vec![];
|
let mut edges = vec![];
|
||||||
// Ngrams have a base typo cost
|
// Ngrams have a base typo cost
|
||||||
@ -52,20 +51,20 @@ impl RankingRuleGraphTrait for TypoGraph {
|
|||||||
// 3-gram -> equivalent to 2 typos
|
// 3-gram -> equivalent to 2 typos
|
||||||
let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
|
let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
|
||||||
|
|
||||||
for nbr_typos in 0..=original_full_term.max_nbr_typos {
|
for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) {
|
||||||
let mut term = term.clone();
|
let mut term = term.clone();
|
||||||
match nbr_typos {
|
match nbr_typos {
|
||||||
0 => {
|
0 => {
|
||||||
term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
|
term.term_subset.clear_one_typo_subset();
|
||||||
term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
|
term.term_subset.clear_two_typo_subset();
|
||||||
}
|
}
|
||||||
1 => {
|
1 => {
|
||||||
term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
|
term.term_subset.clear_zero_typo_subset();
|
||||||
term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
|
term.term_subset.clear_two_typo_subset();
|
||||||
}
|
}
|
||||||
2 => {
|
2 => {
|
||||||
term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
|
term.term_subset.clear_zero_typo_subset();
|
||||||
term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
|
term.term_subset.clear_one_typo_subset();
|
||||||
}
|
}
|
||||||
_ => panic!(),
|
_ => panic!(),
|
||||||
};
|
};
|
||||||
@ -92,9 +91,6 @@ impl RankingRuleGraphTrait for TypoGraph {
|
|||||||
|
|
||||||
fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
|
fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
|
||||||
let TypoCondition { term, nbr_typos } = condition;
|
let TypoCondition { term, nbr_typos } = condition;
|
||||||
let original_term = ctx.term_interner.get(term.term_subset.original);
|
Ok(format!("{}: {nbr_typos}", term.term_subset.description(ctx)))
|
||||||
let original = ctx.word_interner.get(original_term.original);
|
|
||||||
|
|
||||||
Ok(format!("{original}: {nbr_typos}"))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,8 +2,6 @@ use roaring::RoaringBitmap;
|
|||||||
|
|
||||||
use super::logger::SearchLogger;
|
use super::logger::SearchLogger;
|
||||||
use super::{QueryGraph, SearchContext};
|
use super::{QueryGraph, SearchContext};
|
||||||
// use crate::search::new::sort::Sort;
|
|
||||||
use crate::search::new::distinct::{apply_distinct_rule, DistinctOutput};
|
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
/// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`]
|
/// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`]
|
||||||
@ -69,171 +67,3 @@ pub struct RankingRuleOutput<Q> {
|
|||||||
/// The allowed candidates for the child ranking rule
|
/// The allowed candidates for the child ranking rule
|
||||||
pub candidates: RoaringBitmap,
|
pub candidates: RoaringBitmap,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
|
||||||
ctx: &mut SearchContext<'ctx>,
|
|
||||||
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
|
|
||||||
query: &Q,
|
|
||||||
universe: &RoaringBitmap,
|
|
||||||
from: usize,
|
|
||||||
length: usize,
|
|
||||||
logger: &mut dyn SearchLogger<Q>,
|
|
||||||
) -> Result<Vec<u32>> {
|
|
||||||
logger.initial_query(query);
|
|
||||||
logger.ranking_rules(&ranking_rules);
|
|
||||||
logger.initial_universe(universe);
|
|
||||||
|
|
||||||
let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? {
|
|
||||||
ctx.index.fields_ids_map(ctx.txn)?.id(field)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
if universe.len() < from as u64 {
|
|
||||||
return Ok(vec![]);
|
|
||||||
}
|
|
||||||
|
|
||||||
let ranking_rules_len = ranking_rules.len();
|
|
||||||
logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe);
|
|
||||||
ranking_rules[0].start_iteration(ctx, logger, universe, query)?;
|
|
||||||
|
|
||||||
let mut ranking_rule_universes: Vec<RoaringBitmap> =
|
|
||||||
vec![RoaringBitmap::default(); ranking_rules_len];
|
|
||||||
ranking_rule_universes[0] = universe.clone();
|
|
||||||
|
|
||||||
let mut cur_ranking_rule_index = 0;
|
|
||||||
|
|
||||||
/// Finish iterating over the current ranking rule, yielding
|
|
||||||
/// control to the parent (or finishing the search if not possible).
|
|
||||||
/// Update the candidates accordingly and inform the logger.
|
|
||||||
macro_rules! back {
|
|
||||||
() => {
|
|
||||||
assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty());
|
|
||||||
logger.end_iteration_ranking_rule(
|
|
||||||
cur_ranking_rule_index,
|
|
||||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
|
||||||
&ranking_rule_universes[cur_ranking_rule_index],
|
|
||||||
);
|
|
||||||
ranking_rule_universes[cur_ranking_rule_index].clear();
|
|
||||||
ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
|
|
||||||
if cur_ranking_rule_index == 0 {
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
cur_ranking_rule_index -= 1;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut results = vec![];
|
|
||||||
let mut cur_offset = 0usize;
|
|
||||||
|
|
||||||
/// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
|
|
||||||
/// into account and inform the logger.
|
|
||||||
macro_rules! maybe_add_to_results {
|
|
||||||
($candidates:expr) => {
|
|
||||||
// First apply the distinct rule on the candidates, reducing the universes if necessary
|
|
||||||
let candidates = if let Some(distinct_fid) = distinct_fid {
|
|
||||||
let DistinctOutput { remaining, excluded } = apply_distinct_rule(ctx, distinct_fid, $candidates)?;
|
|
||||||
for universe in ranking_rule_universes.iter_mut() {
|
|
||||||
*universe -= &excluded;
|
|
||||||
}
|
|
||||||
remaining
|
|
||||||
} else {
|
|
||||||
$candidates.clone()
|
|
||||||
};
|
|
||||||
let len = candidates.len();
|
|
||||||
// if the candidates are empty, there is nothing to do;
|
|
||||||
if !candidates.is_empty() {
|
|
||||||
// if we still haven't reached the first document to return
|
|
||||||
if cur_offset < from {
|
|
||||||
// and if no document from this bucket can be returned
|
|
||||||
if cur_offset + (candidates.len() as usize) < from {
|
|
||||||
// then just skip the bucket
|
|
||||||
logger.skip_bucket_ranking_rule(
|
|
||||||
cur_ranking_rule_index,
|
|
||||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
|
||||||
&candidates,
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
// otherwise, skip some of the documents and add some of the rest, in order of ids
|
|
||||||
let all_candidates = candidates.iter().collect::<Vec<_>>();
|
|
||||||
let (skipped_candidates, candidates) =
|
|
||||||
all_candidates.split_at(from - cur_offset);
|
|
||||||
logger.skip_bucket_ranking_rule(
|
|
||||||
cur_ranking_rule_index,
|
|
||||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
|
||||||
&skipped_candidates.into_iter().collect(),
|
|
||||||
);
|
|
||||||
let candidates = candidates
|
|
||||||
.iter()
|
|
||||||
.take(length - results.len())
|
|
||||||
.copied()
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
logger.add_to_results(&candidates);
|
|
||||||
results.extend(&candidates);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// if we have passed the offset already, add some of the documents (up to the limit)
|
|
||||||
let candidates =
|
|
||||||
candidates.iter().take(length - results.len()).collect::<Vec<u32>>();
|
|
||||||
logger.add_to_results(&candidates);
|
|
||||||
results.extend(&candidates);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cur_offset += len as usize;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
while results.len() < length {
|
|
||||||
// The universe for this bucket is zero or one element, so we don't need to sort
|
|
||||||
// anything, just extend the results and go back to the parent ranking rule.
|
|
||||||
if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 {
|
|
||||||
maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]);
|
|
||||||
ranking_rule_universes[cur_ranking_rule_index].clear();
|
|
||||||
back!();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? else {
|
|
||||||
back!();
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
|
|
||||||
logger.next_bucket_ranking_rule(
|
|
||||||
cur_ranking_rule_index,
|
|
||||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
|
||||||
&ranking_rule_universes[cur_ranking_rule_index],
|
|
||||||
&next_bucket.candidates,
|
|
||||||
);
|
|
||||||
|
|
||||||
debug_assert!(
|
|
||||||
ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)
|
|
||||||
);
|
|
||||||
ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
|
|
||||||
|
|
||||||
if cur_ranking_rule_index == ranking_rules_len - 1
|
|
||||||
|| next_bucket.candidates.len() <= 1
|
|
||||||
|| cur_offset + (next_bucket.candidates.len() as usize) < from
|
|
||||||
{
|
|
||||||
maybe_add_to_results!(&next_bucket.candidates);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
cur_ranking_rule_index += 1;
|
|
||||||
ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone();
|
|
||||||
logger.start_iteration_ranking_rule(
|
|
||||||
cur_ranking_rule_index,
|
|
||||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
|
||||||
&next_bucket.query,
|
|
||||||
&ranking_rule_universes[cur_ranking_rule_index],
|
|
||||||
);
|
|
||||||
ranking_rules[cur_ranking_rule_index].start_iteration(
|
|
||||||
ctx,
|
|
||||||
logger,
|
|
||||||
&next_bucket.candidates,
|
|
||||||
&next_bucket.query,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(results)
|
|
||||||
}
|
|
||||||
|
590
milli/src/search/new/tests/distinct.rs
Normal file
590
milli/src/search/new/tests/distinct.rs
Normal file
@ -0,0 +1,590 @@
|
|||||||
|
/*!
|
||||||
|
This module tests the "distinct attribute" feature, and its
|
||||||
|
interaction with other ranking rules.
|
||||||
|
|
||||||
|
1. no duplicate distinct attributes are ever returned
|
||||||
|
2. only the best document (according to the search rules) for each distinct value appears in the result
|
||||||
|
3. if a document does not have a distinct attribute, then the distinct rule does not apply to it
|
||||||
|
|
||||||
|
It doesn't test properly:
|
||||||
|
- combination of distinct + exhaustive_nbr_hits (because we know it's incorrect)
|
||||||
|
- distinct attributes with arrays (because we know it's incorrect as well)
|
||||||
|
*/
|
||||||
|
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
use big_s::S;
|
||||||
|
use heed::RoTxn;
|
||||||
|
use maplit::hashset;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
index::tests::TempIndex, AscDesc, Criterion, Index, Member, Search, SearchResult,
|
||||||
|
TermsMatchingStrategy,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::collect_field_values;
|
||||||
|
|
||||||
|
fn create_index() -> TempIndex {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_primary_key("id".to_owned());
|
||||||
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||||
|
s.set_sortable_fields(hashset! { S("rank1"), S("letter") });
|
||||||
|
s.set_distinct_field("letter".to_owned());
|
||||||
|
s.set_criteria(vec![Criterion::Words]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"letter": "A",
|
||||||
|
"rank1": 0,
|
||||||
|
"text": "the quick brown fox jamps over the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"letter": "A",
|
||||||
|
"rank1": 1,
|
||||||
|
"text": "the quick brown fox jumpes over the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"letter": "B",
|
||||||
|
"rank1": 0,
|
||||||
|
"text": "the quick brown foxjumps over the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"letter": "B",
|
||||||
|
"rank1": 1,
|
||||||
|
"text": "the quick brown fox jumps over the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"letter": "B",
|
||||||
|
"rank1": 2,
|
||||||
|
"text": "the quick brown fox jumps over the lazy",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"letter": "C",
|
||||||
|
"rank1": 0,
|
||||||
|
"text": "the quickbrownfox jumps over the lazy",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"letter": "C",
|
||||||
|
"rank1": 1,
|
||||||
|
"text": "the quick brown fox jumpss over the lazy",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"letter": "C",
|
||||||
|
"rank1": 2,
|
||||||
|
"text": "the quick brown fox jumps over the lazy",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"letter": "D",
|
||||||
|
"rank1": 0,
|
||||||
|
"text": "the quick brown fox jumps over the lazy",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"letter": "E",
|
||||||
|
"rank1": 0,
|
||||||
|
"text": "the quick brown fox jumps over the lazy",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"letter": "E",
|
||||||
|
"rank1": 1,
|
||||||
|
"text": "the quackbrown foxjunps over",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"letter": "E",
|
||||||
|
"rank1": 2,
|
||||||
|
"text": "the quicko browno fox junps over",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"letter": "E",
|
||||||
|
"rank1": 3,
|
||||||
|
"text": "the quicko browno fox jumps over",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"letter": "E",
|
||||||
|
"rank1": 4,
|
||||||
|
"text": "the quick brewn fox jumps over",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"letter": "E",
|
||||||
|
"rank1": 5,
|
||||||
|
"text": "the quick brown fox jumps over",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"letter": "F",
|
||||||
|
"rank1": 0,
|
||||||
|
"text": "the quick brownf fox jumps over",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"letter": "F",
|
||||||
|
"rank1": 1,
|
||||||
|
"text": "the quic brown fox jamps over",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"letter": "F",
|
||||||
|
"rank1": 2,
|
||||||
|
"text": "thequick browns fox jimps",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"letter": "G",
|
||||||
|
"rank1": 0,
|
||||||
|
"text": "the qick brown fox jumps",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 19,
|
||||||
|
"letter": "G",
|
||||||
|
"rank1": 1,
|
||||||
|
"text": "the quick brownfoxjumps",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 20,
|
||||||
|
"letter": "H",
|
||||||
|
"rank1": 0,
|
||||||
|
"text": "the quick brow fox jumps",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 21,
|
||||||
|
"letter": "I",
|
||||||
|
"rank1": 0,
|
||||||
|
"text": "the quick brown fox jpmps",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 22,
|
||||||
|
"letter": "I",
|
||||||
|
"rank1": 1,
|
||||||
|
"text": "the quick brown fox jumps",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 23,
|
||||||
|
"letter": "I",
|
||||||
|
"rank1": 2,
|
||||||
|
"text": "the quick",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 24,
|
||||||
|
"rank1": 0,
|
||||||
|
"text": "the quick",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 25,
|
||||||
|
"rank1": 1,
|
||||||
|
"text": "the quick brown",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 26,
|
||||||
|
"rank1": 2,
|
||||||
|
"text": "the quick brown fox",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 26,
|
||||||
|
"rank1": 3,
|
||||||
|
"text": "the quick brown fox jumps over the lazy dog",
|
||||||
|
},
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec<String> {
|
||||||
|
let vs = collect_field_values(index, txn, index.distinct_field(txn).unwrap().unwrap(), docids);
|
||||||
|
|
||||||
|
let mut unique = HashSet::new();
|
||||||
|
for v in vs.iter() {
|
||||||
|
if v == "__does_not_exist__" {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
assert!(unique.insert(v.clone()));
|
||||||
|
}
|
||||||
|
|
||||||
|
vs
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_distinct_placeholder_no_ranking_rules() {
|
||||||
|
let index = create_index();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let s = Search::new(&txn, &index);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
|
||||||
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
|
[
|
||||||
|
"\"A\"",
|
||||||
|
"\"B\"",
|
||||||
|
"\"C\"",
|
||||||
|
"\"D\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"F\"",
|
||||||
|
"\"G\"",
|
||||||
|
"\"H\"",
|
||||||
|
"\"I\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_distinct_placeholder_sort() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Sort]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank1")))]);
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
|
||||||
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
|
[
|
||||||
|
"\"E\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"\"B\"",
|
||||||
|
"\"C\"",
|
||||||
|
"\"F\"",
|
||||||
|
"\"I\"",
|
||||||
|
"\"A\"",
|
||||||
|
"\"G\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"\"D\"",
|
||||||
|
"\"H\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(rank_values, @r###"
|
||||||
|
[
|
||||||
|
"5",
|
||||||
|
"3",
|
||||||
|
"2",
|
||||||
|
"2",
|
||||||
|
"2",
|
||||||
|
"2",
|
||||||
|
"1",
|
||||||
|
"1",
|
||||||
|
"1",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]);
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]");
|
||||||
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
|
[
|
||||||
|
"\"I\"",
|
||||||
|
"\"H\"",
|
||||||
|
"\"G\"",
|
||||||
|
"\"F\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"D\"",
|
||||||
|
"\"C\"",
|
||||||
|
"\"B\"",
|
||||||
|
"\"A\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(rank_values, @r###"
|
||||||
|
[
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"1",
|
||||||
|
"3",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.sort_criteria(vec![
|
||||||
|
AscDesc::Desc(Member::Field(S("letter"))),
|
||||||
|
AscDesc::Desc(Member::Field(S("rank1"))),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]");
|
||||||
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
|
[
|
||||||
|
"\"I\"",
|
||||||
|
"\"H\"",
|
||||||
|
"\"G\"",
|
||||||
|
"\"F\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"D\"",
|
||||||
|
"\"C\"",
|
||||||
|
"\"B\"",
|
||||||
|
"\"A\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(rank_values, @r###"
|
||||||
|
[
|
||||||
|
"2",
|
||||||
|
"0",
|
||||||
|
"1",
|
||||||
|
"2",
|
||||||
|
"5",
|
||||||
|
"0",
|
||||||
|
"2",
|
||||||
|
"2",
|
||||||
|
"1",
|
||||||
|
"3",
|
||||||
|
"1",
|
||||||
|
"0",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_distinct_words() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Words]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]");
|
||||||
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
|
[
|
||||||
|
"\"A\"",
|
||||||
|
"\"B\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"\"C\"",
|
||||||
|
"\"D\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"F\"",
|
||||||
|
"\"G\"",
|
||||||
|
"\"H\"",
|
||||||
|
"\"I\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
let text_values = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(text_values, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jamps over the lazy dog\"",
|
||||||
|
"\"the quick brown foxjumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quickbrownfox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brownf fox jumps over\"",
|
||||||
|
"\"the qick brown fox jumps\"",
|
||||||
|
"\"the quick brow fox jumps\"",
|
||||||
|
"\"the quick brown fox jpmps\"",
|
||||||
|
"\"the quick brown\"",
|
||||||
|
"\"the quick\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_distinct_sort_words() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Sort, Criterion::Words, Criterion::Desc(S("rank1"))]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]);
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]");
|
||||||
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
|
[
|
||||||
|
"\"I\"",
|
||||||
|
"\"H\"",
|
||||||
|
"\"G\"",
|
||||||
|
"\"F\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"D\"",
|
||||||
|
"\"C\"",
|
||||||
|
"\"B\"",
|
||||||
|
"\"A\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(rank_values, @r###"
|
||||||
|
[
|
||||||
|
"1",
|
||||||
|
"0",
|
||||||
|
"1",
|
||||||
|
"1",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"2",
|
||||||
|
"1",
|
||||||
|
"1",
|
||||||
|
"3",
|
||||||
|
"1",
|
||||||
|
"0",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let text_values = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(text_values, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps\"",
|
||||||
|
"\"the quick brow fox jumps\"",
|
||||||
|
"\"the quick brownfoxjumps\"",
|
||||||
|
"\"the quic brown fox jamps over\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumpes over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown\"",
|
||||||
|
"\"the quick\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_distinct_all_candidates() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Sort]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank1")))]);
|
||||||
|
s.exhaustive_number_hits(true);
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, candidates, .. } = s.execute().unwrap();
|
||||||
|
let candidates = candidates.iter().collect::<Vec<_>>();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
|
||||||
|
// TODO: this is incorrect!
|
||||||
|
insta::assert_snapshot!(format!("{candidates:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_distinct_typo() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Words, Criterion::Typo]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]");
|
||||||
|
|
||||||
|
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(distinct_values, @r###"
|
||||||
|
[
|
||||||
|
"\"B\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"\"A\"",
|
||||||
|
"\"C\"",
|
||||||
|
"\"D\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"F\"",
|
||||||
|
"\"I\"",
|
||||||
|
"\"G\"",
|
||||||
|
"\"H\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let text_values = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(text_values, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jamps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brownf fox jumps over\"",
|
||||||
|
"\"the quick brown fox jumps\"",
|
||||||
|
"\"the qick brown fox jumps\"",
|
||||||
|
"\"the quick brow fox jumps\"",
|
||||||
|
"\"the quick brown\"",
|
||||||
|
"\"the quick\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
22
milli/src/search/new/tests/language.rs
Normal file
22
milli/src/search/new/tests/language.rs
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
use crate::{index::tests::TempIndex, Search, SearchResult};
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_kanji_language_detection() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{ "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
|
||||||
|
{ "id": 1, "title": "東京のお寿司。" },
|
||||||
|
{ "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.write_txn().unwrap();
|
||||||
|
let mut search = Search::new(&txn, &index);
|
||||||
|
|
||||||
|
search.query("東京");
|
||||||
|
let SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]");
|
||||||
|
}
|
30
milli/src/search/new/tests/mod.rs
Normal file
30
milli/src/search/new/tests/mod.rs
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
pub mod distinct;
|
||||||
|
#[cfg(feature = "default")]
|
||||||
|
pub mod language;
|
||||||
|
pub mod ngram_split_words;
|
||||||
|
pub mod proximity;
|
||||||
|
pub mod proximity_typo;
|
||||||
|
pub mod sort;
|
||||||
|
pub mod typo;
|
||||||
|
pub mod typo_proximity;
|
||||||
|
pub mod words_tms;
|
||||||
|
|
||||||
|
fn collect_field_values(
|
||||||
|
index: &crate::Index,
|
||||||
|
txn: &heed::RoTxn,
|
||||||
|
fid: &str,
|
||||||
|
docids: &[u32],
|
||||||
|
) -> Vec<String> {
|
||||||
|
let mut values = vec![];
|
||||||
|
let fid = index.fields_ids_map(txn).unwrap().id(fid).unwrap();
|
||||||
|
for doc in index.documents(txn, docids.iter().copied()).unwrap() {
|
||||||
|
if let Some(v) = doc.1.get(fid) {
|
||||||
|
let v: serde_json::Value = serde_json::from_slice(v).unwrap();
|
||||||
|
let v = v.to_string();
|
||||||
|
values.push(v);
|
||||||
|
} else {
|
||||||
|
values.push("__does_not_exist__".to_owned());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
values
|
||||||
|
}
|
372
milli/src/search/new/tests/ngram_split_words.rs
Normal file
372
milli/src/search/new/tests/ngram_split_words.rs
Normal file
@ -0,0 +1,372 @@
|
|||||||
|
/*!
|
||||||
|
This module tests the following properties:
|
||||||
|
|
||||||
|
1. Two consecutive words from a query can be combined into a "2gram"
|
||||||
|
2. Three consecutive words from a query can be combined into a "3gram"
|
||||||
|
3. A word from the query can be split into two consecutive words (split words)
|
||||||
|
4. A 2gram can be split into two words
|
||||||
|
5. A 3gram cannot be split into two words
|
||||||
|
6. 2grams can contain up to 1 typo
|
||||||
|
7. 3grams cannot have typos
|
||||||
|
8. 2grams and 3grams can be prefix tolerant
|
||||||
|
9. Disabling typo tolerance also disable the split words feature
|
||||||
|
10. Disabling typo tolerance does not disable prefix tolerance
|
||||||
|
11. Disabling typo tolerance does not disable ngram tolerance
|
||||||
|
12. Prefix tolerance is disabled for the last word if a space follows it
|
||||||
|
13. Ngrams cannot be formed by combining a phrase and a word or two phrases
|
||||||
|
*/
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
|
||||||
|
SearchResult, TermsMatchingStrategy,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn create_index() -> TempIndex {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_primary_key("id".to_owned());
|
||||||
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||||
|
s.set_criteria(vec![Criterion::Words]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"text": "the sun flowers are pretty"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"text": "the sun flower is tall"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"text": "the sunflowers are pretty"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"text": "the sunflower is tall"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"text": "the sunflawer is tall"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"text": "sunflowering is not a verb"
|
||||||
|
}
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_2gram_simple() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_autorize_typos(false);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sun flower");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
// will also match documents with "sunflower" + prefix tolerance
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sun flowers are pretty\"",
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
|
"\"the sunflowers are pretty\"",
|
||||||
|
"\"the sunflower is tall\"",
|
||||||
|
"\"sunflowering is not a verb\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn test_3gram_simple() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_autorize_typos(false);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sun flower s are");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sun flowers are pretty\"",
|
||||||
|
"\"the sunflowers are pretty\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_2gram_typo() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sun flawer");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sun flowers are pretty\"",
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
|
"\"the sunflowers are pretty\"",
|
||||||
|
"\"the sunflower is tall\"",
|
||||||
|
"\"the sunflawer is tall\"",
|
||||||
|
"\"sunflowering is not a verb\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_no_disable_ngrams() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_autorize_typos(false);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sun flower ");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
// documents containing `sunflower`
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
|
"\"the sunflower is tall\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_2gram_prefix() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_autorize_typos(false);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sun flow");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
// documents containing words beginning with `sunflow`
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sun flowers are pretty\"",
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
|
"\"the sunflowers are pretty\"",
|
||||||
|
"\"the sunflower is tall\"",
|
||||||
|
"\"sunflowering is not a verb\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_3gram_prefix() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_autorize_typos(false);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("su nf l");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// documents containing a word beginning with sunfl
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 5]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sunflowers are pretty\"",
|
||||||
|
"\"the sunflower is tall\"",
|
||||||
|
"\"the sunflawer is tall\"",
|
||||||
|
"\"sunflowering is not a verb\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_split_words() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sunflower ");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// all the documents with either `sunflower` or `sun flower` + eventual typo
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
|
"\"the sunflowers are pretty\"",
|
||||||
|
"\"the sunflower is tall\"",
|
||||||
|
"\"the sunflawer is tall\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_disable_split_words() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_autorize_typos(false);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sunflower ");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
// no document containing `sun flower`
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sunflower is tall\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_2gram_split_words() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sunf lower");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// all the documents with "sunflower", "sun flower", (sunflower + 1 typo), or (sunflower as prefix)
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4, 5]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
|
"\"the sunflowers are pretty\"",
|
||||||
|
"\"the sunflower is tall\"",
|
||||||
|
"\"the sunflawer is tall\"",
|
||||||
|
"\"sunflowering is not a verb\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_3gram_no_split_words() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sunf lo wer");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// no document with `sun flower`
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sunflowers are pretty\"",
|
||||||
|
"\"the sunflower is tall\"",
|
||||||
|
"\"sunflowering is not a verb\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_3gram_no_typos() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sunf la wer");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sunflawer is tall\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_no_ngram_phrases() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("\"sun\" flower");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sun flowers are pretty\"",
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("\"sun\" \"flower\"");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the sun flower is tall\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
471
milli/src/search/new/tests/proximity.rs
Normal file
471
milli/src/search/new/tests/proximity.rs
Normal file
@ -0,0 +1,471 @@
|
|||||||
|
/*!
|
||||||
|
This module tests the Proximity ranking rule:
|
||||||
|
|
||||||
|
1. A proximity of >7 always has the same cost.
|
||||||
|
|
||||||
|
2. Phrase terms can be in sprximity to other terms via their start and end words,
|
||||||
|
but we need to make sure that the phrase exists in the document that meets this
|
||||||
|
proximity condition. This is especially relevant with split words and synonyms.
|
||||||
|
|
||||||
|
3. An ngram has the same sprximity cost as its component words being consecutive.
|
||||||
|
e.g. `sunflower` equivalent to `sun flower`.
|
||||||
|
|
||||||
|
4. The prefix databases can be used to find the sprximity between two words, but
|
||||||
|
they store fewer sprximities than the regular word sprximity DB.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
|
||||||
|
SearchResult, TermsMatchingStrategy,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn create_simple_index() -> TempIndex {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_primary_key("id".to_owned());
|
||||||
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||||
|
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"text": "the very quick dark brown and smart fox did jump over the terribly lazy and small dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"text": "the. quick brown fox jumps over the lazy. dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"text": "the quick brown fox jumps over the lazy. dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"text": "dog the quick brown fox jumps over the lazy"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"text": "the quickbrown fox jumps over the lazy dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"text": "brown quick fox jumps over the lazy dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"text": "the really quick brown fox jumps over the very lazy dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"text": "the really quick brown fox jumps over the lazy dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"text": "the quick brown fox jumps over the lazy"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"text": "the quack brown fox jumps over the lazy"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"text": "the quack brown fox jumps over the lazy dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"text": "the quick brown fox jumps over the lazy dog"
|
||||||
|
}
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_edge_cases_index() -> TempIndex {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_primary_key("id".to_owned());
|
||||||
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||||
|
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index.add_documents(documents!([
|
||||||
|
{
|
||||||
|
// This document will insert "s" in the prefix database
|
||||||
|
"id": 0,
|
||||||
|
"text": "
|
||||||
|
saa sab sac sae saf sag sah sai saj sak sal sam san sao sap saq sar sasa sat sau sav saw sax say saz
|
||||||
|
sba sbb sbc sbe sbf sbg sbh sbi sbj sbk sbl sbm sbn sbo sbp sbq sbr sbsb sbt sbu sbv sbw sbx sby sbz
|
||||||
|
sca scb scc sce scf scg sch sci scj sck scl scm scn sco scp scq scr scsc sct scu scv scw scx scy scz
|
||||||
|
sda sdb sdc sde sdf sdg sdh sdi sdj sdk sdl sdm sdn sdo sdp sdq sdr sdsd sdt sdu sdv sdw sdx sdy sdz
|
||||||
|
sea seb sec see sef seg seh sei sej sek sel sem sen seo sep seq ser sese set seu sev sew sex sey sez
|
||||||
|
sfa sfb sfc sfe sff sfg sfh sfi sfj sfk sfl sfm sfn sfo sfp sfq sfr sfsf sft sfu sfv sfw sfx sfy sfz
|
||||||
|
sga sgb sgc sge sgf sgg sgh sgi sgj sgk sgl sgm sgn sgo sgp sgq sgr sgsg sgt sgu sgv sgw sgx sgy sgz
|
||||||
|
ska skb skc ske skf skg skh ski skj skk skl skm skn sko skp skq skr sksk skt sku skv skw skx sky skz
|
||||||
|
sla slb slc sle slf slg slh sli slj slk sll slm sln slo slp slq slr slsl slt slu slv slw slx sly slz
|
||||||
|
sma smb smc sme smf smg smh smi smj smk sml smm smn smo smp smq smr smsm smt smu smv smw smx smy smz
|
||||||
|
sna snb snc sne snf sng snh sni snj snk snl snm snn sno snp snq snr snsn snt snu snv snw snx sny snz
|
||||||
|
soa sob soc soe sof sog soh soi soj sok sol som son soo sop soq sor soso sot sou sov sow sox soy soz
|
||||||
|
spa spb spc spe spf spg sph spi spj spk spl spm spn spo spp spq spr spsp spt spu spv spw spx spy spz
|
||||||
|
sqa sqb sqc sqe sqf sqg sqh sqi sqj sqk sql sqm sqn sqo sqp sqq sqr sqsq sqt squ sqv sqw sqx sqy sqz
|
||||||
|
sra srb src sre srf srg srh sri srj srk srl srm srn sro srp srq srr srsr srt sru srv srw srx sry srz
|
||||||
|
ssa ssb ssc sse ssf ssg ssh ssi ssj ssk ssl ssm ssn sso ssp ssq ssr ssss sst ssu ssv ssw ssx ssy ssz
|
||||||
|
sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz
|
||||||
|
"
|
||||||
|
},
|
||||||
|
// The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`.
|
||||||
|
// If the search query is "sunflower", the split word "Sun Flower" will match some documents.
|
||||||
|
// If the query is `sunflower wilting`, then we should make sure that
|
||||||
|
// the sprximity condition `flower wilting: sprx N` also comes with the condition
|
||||||
|
// `sun wilting: sprx N+1`. TODO: this is not the exact condition we use for now.
|
||||||
|
// We only check that the phrase `sun flower` exists and `flower wilting: sprx N`, which
|
||||||
|
// is better than nothing but not the best.
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"text": "Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"text": "Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
// This document matches the query `sunflower wilting`, but the sprximity condition
|
||||||
|
// between `sunflower` and `wilting` cannot be through the split-word `Sun Flower`
|
||||||
|
// which would reduce to only `flower` and `wilting` being in sprximity.
|
||||||
|
"text": "A flower wilting under the sun, unlike a sunflower"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// This should be the best document for `sunflower wilting`
|
||||||
|
"id": 4,
|
||||||
|
"text": "sun flower wilting under the heat"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// This is also the best document for `sunflower wilting`
|
||||||
|
"id": 5,
|
||||||
|
"text": "sunflower wilting under the heat"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Prox MAX between `best` and `s` prefix
|
||||||
|
"id": 6,
|
||||||
|
"text": "this is the best meal I have ever had in such a beautiful summer day"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Prox 5 between `best` and `s` prefix
|
||||||
|
"id": 7,
|
||||||
|
"text": "this is the best cooked meal of the summer"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Prox 4 between `best` and `s` prefix
|
||||||
|
"id": 8,
|
||||||
|
"text": "this is the best meal of the summer"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Prox 3 between `best` and `s` prefix
|
||||||
|
"id": 9,
|
||||||
|
"text": "this is the best meal of summer"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Prox 1 between `best` and `s` prefix
|
||||||
|
"id": 10,
|
||||||
|
"text": "this is the best summer meal"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Reverse Prox 3 between `best` and `s` prefix
|
||||||
|
"id": 11,
|
||||||
|
"text": "summer x y best"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Reverse Prox 2 between `best` and `s` prefix
|
||||||
|
"id": 12,
|
||||||
|
"text": "summer x best"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Reverse Prox 1 between `best` and `s` prefix
|
||||||
|
"id": 13,
|
||||||
|
"text": "summer best"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// This document will insert "win" in the prefix database
|
||||||
|
"id": 14,
|
||||||
|
"text": "
|
||||||
|
winaa winab winac winae winaf winag winah winai winaj winak winal winam winan winao winap winaq winar winasa winat winau winav winaw winax winay winaz
|
||||||
|
winba winbb winbc winbe winbf winbg winbh winbi winbj winbk winbl winbm winbn winbo winbp winbq winbr winbsb winbt winbu winbv winbw winbx winby winbz
|
||||||
|
winca wincb wincc wince wincf wincg winch winci wincj winck wincl wincm wincn winco wincp wincq wincr wincsc winct wincu wincv wincw wincx wincy wincz
|
||||||
|
winda windb windc winde windf windg windh windi windj windk windl windm windn windo windp windq windr windsd windt windu windv windw windx windy windz
|
||||||
|
winea wineb winec winee winef wineg wineh winei winej winek winel winem winen wineo winep wineq winer winese winet wineu winev winew winex winey winez
|
||||||
|
winfa winfb winfc winfe winff winfg winfh winfi winfj winfk winfl winfm winfn winfo winfp winfq winfr winfsf winft winfu winfv winfw winfx winfy winfz
|
||||||
|
winga wingb wingc winge wingf wingg wingh wingi wingj wingk wingl wingm wingn wingo wingp wingq wingr wingsg wingt wingu wingv wingw wingx wingy wingz
|
||||||
|
winka winkb winkc winke winkf winkg winkh winki winkj winkk winkl winkm winkn winko winkp winkq winkr winksk winkt winku winkv winkw winkx winky winkz
|
||||||
|
winla winlb winlc winle winlf winlg winlh winli winlj winlk winll winlm winln winlo winlp winlq winlr winlsl winlt winlu winlv winlw winlx winly winlz
|
||||||
|
winma winmb winmc winme winmf winmg winmh winmi winmj winmk winml winmm winmn winmo winmp winmq winmr winmsm winmt winmu winmv winmw winmx winmy winmz
|
||||||
|
winna winnb winnc winne winnf winng winnh winni winnj winnk winnl winnm winnn winno winnp winnq winnr winnsn winnt winnu winnv winnw winnx winny winnz
|
||||||
|
winoa winob winoc winoe winof winog winoh winoi winoj winok winol winom winon winoo winop winoq winor winoso winot winou winov winow winox winoy winoz
|
||||||
|
winpa winpb winpc winpe winpf winpg winph winpi winpj winpk winpl winpm winpn winpo winpp winpq winpr winpsp winpt winpu winpv winpw winpx winpy winpz
|
||||||
|
winqa winqb winqc winqe winqf winqg winqh winqi winqj winqk winql winqm winqn winqo winqp winqq winqr winqsq winqt winqu winqv winqw winqx winqy winqz
|
||||||
|
winra winrb winrc winre winrf winrg winrh winri winrj winrk winrl winrm winrn winro winrp winrq winrr winrsr winrt winru winrv winrw winrx winry winrz
|
||||||
|
winsa winsb winsc winse winsf winsg winsh winsi winsj winsk winsl winsm winsn winso winsp winsq winsr winsss winst winsu winsv winsw winsx winsy winsz
|
||||||
|
winta wintb wintc winte wintf wintg winth winti wintj wintk wintl wintm wintn winto wintp wintq wintr wintst wintt wintu wintv wintw wintx winty wintz
|
||||||
|
"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Prox MAX between `best` and `win` prefix
|
||||||
|
"id": 15,
|
||||||
|
"text": "this is the best meal I have ever had in such a beautiful winter day"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Prox 5 between `best` and `win` prefix
|
||||||
|
"id": 16,
|
||||||
|
"text": "this is the best cooked meal of the winter"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Prox 4 between `best` and `win` prefix
|
||||||
|
"id": 17,
|
||||||
|
"text": "this is the best meal of the winter"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Prox 3 between `best` and `win` prefix
|
||||||
|
"id": 18,
|
||||||
|
"text": "this is the best meal of winter"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Prox 1 between `best` and `win` prefix
|
||||||
|
"id": 19,
|
||||||
|
"text": "this is the best winter meal"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Reverse Prox 3 between `best` and `win` prefix
|
||||||
|
"id": 20,
|
||||||
|
"text": "winter x y best"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Reverse Prox 2 between `best` and `win` prefix
|
||||||
|
"id": 21,
|
||||||
|
"text": "winter x best"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// Reverse Prox 1 between `best` and `win` prefix
|
||||||
|
"id": 22,
|
||||||
|
"text": "winter best"
|
||||||
|
},
|
||||||
|
])).unwrap();
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_proximity_simple() {
|
||||||
|
let index = create_simple_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 9, 10, 7, 6, 5, 2, 3, 0, 1]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quickbrown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quack brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the really quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the really quick brown fox jumps over the very lazy dog\"",
|
||||||
|
"\"brown quick fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy. dog\"",
|
||||||
|
"\"dog the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
|
||||||
|
"\"the. quick brown fox jumps over the lazy. dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_proximity_split_word() {
|
||||||
|
let index = create_edge_cases_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("sunflower wilting");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
// TODO: "2" and "4" should be swapped ideally
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
|
||||||
|
"\"sun flower wilting under the heat\"",
|
||||||
|
"\"sunflower wilting under the heat\"",
|
||||||
|
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
|
||||||
|
"\"A flower wilting under the sun, unlike a sunflower\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("\"sun flower\" wilting");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
// TODO: "2" and "4" should be swapped ideally
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
|
||||||
|
"\"sun flower wilting under the heat\"",
|
||||||
|
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
drop(txn);
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
let mut syns = HashMap::new();
|
||||||
|
syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]);
|
||||||
|
s.set_synonyms(syns);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("xyz wilting");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
// TODO: "2" and "4" should be swapped ideally
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
|
||||||
|
"\"sun flower wilting under the heat\"",
|
||||||
|
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_proximity_prefix_db() {
|
||||||
|
let index = create_edge_cases_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("best s");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11, 15]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
|
||||||
|
// This test illustrates the loss of precision from using the prefix DB
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"this is the best summer meal\"",
|
||||||
|
"\"summer best\"",
|
||||||
|
"\"this is the best meal of summer\"",
|
||||||
|
"\"summer x best\"",
|
||||||
|
"\"this is the best meal of the summer\"",
|
||||||
|
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
||||||
|
"\"this is the best cooked meal of the summer\"",
|
||||||
|
"\"summer x y best\"",
|
||||||
|
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// Difference when using the `su` prefix, which is not in the prefix DB
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("best su");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 11, 7, 6, 15]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"this is the best summer meal\"",
|
||||||
|
"\"summer best\"",
|
||||||
|
"\"this is the best meal of summer\"",
|
||||||
|
"\"summer x best\"",
|
||||||
|
"\"this is the best meal of the summer\"",
|
||||||
|
"\"summer x y best\"",
|
||||||
|
"\"this is the best cooked meal of the summer\"",
|
||||||
|
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
||||||
|
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// Note that there is a case where a prefix is in the prefix DB but not in the
|
||||||
|
// **proximity** prefix DB. In that case, its sprximity score will always be
|
||||||
|
// the maximum. This happens for prefixes that are larger than 2 bytes.
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("best win");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||||
|
"\"this is the best cooked meal of the winter\"",
|
||||||
|
"\"this is the best meal of the winter\"",
|
||||||
|
"\"this is the best meal of winter\"",
|
||||||
|
"\"this is the best winter meal\"",
|
||||||
|
"\"winter x y best\"",
|
||||||
|
"\"winter x best\"",
|
||||||
|
"\"winter best\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// Now using `wint`, which is not in the prefix DB:
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("best wint");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 20, 16, 15]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"this is the best winter meal\"",
|
||||||
|
"\"winter best\"",
|
||||||
|
"\"this is the best meal of winter\"",
|
||||||
|
"\"winter x best\"",
|
||||||
|
"\"this is the best meal of the winter\"",
|
||||||
|
"\"winter x y best\"",
|
||||||
|
"\"this is the best cooked meal of the winter\"",
|
||||||
|
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// and using `wi` which is in the prefix DB and proximity prefix DB
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("best wi");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 15, 16, 20]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"this is the best winter meal\"",
|
||||||
|
"\"winter best\"",
|
||||||
|
"\"this is the best meal of winter\"",
|
||||||
|
"\"winter x best\"",
|
||||||
|
"\"this is the best meal of the winter\"",
|
||||||
|
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||||
|
"\"this is the best cooked meal of the winter\"",
|
||||||
|
"\"winter x y best\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
75
milli/src/search/new/tests/proximity_typo.rs
Normal file
75
milli/src/search/new/tests/proximity_typo.rs
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
/*!
|
||||||
|
This module tests the interactions between the proximity and typo ranking rules.
|
||||||
|
|
||||||
|
The proximity ranking rule should transform the query graph such that it
|
||||||
|
only contains the word pairs that it used to compute its bucket.
|
||||||
|
|
||||||
|
TODO: This is not currently implemented.
|
||||||
|
*/
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
|
||||||
|
SearchResult, TermsMatchingStrategy,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn create_index() -> TempIndex {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_primary_key("id".to_owned());
|
||||||
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||||
|
s.set_criteria(vec![Criterion::Words, Criterion::Proximity, Criterion::Typo]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
// Basic trap.
|
||||||
|
//
|
||||||
|
// We have one document with the perfect word pair: `sommer - holiday`
|
||||||
|
// and another with the perfect word pair: `sommer holidty`.
|
||||||
|
//
|
||||||
|
// The proximity ranking rule will put them both in the same bucket, and it
|
||||||
|
// should minify the query graph to make it represent:
|
||||||
|
// EITHER:
|
||||||
|
// sommer + holiday
|
||||||
|
// OR:
|
||||||
|
// sommer + holidty
|
||||||
|
//
|
||||||
|
// Such that the child typo ranking rule does not find any match
|
||||||
|
// for its zero-typo bucket `summer + holiday`, even though both documents
|
||||||
|
// contain these two exact words.
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"text": "summer. holiday. sommer holidty"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"text": "summer. holiday. sommer holiday"
|
||||||
|
},
|
||||||
|
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_trap_basic() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("summer holiday");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
// TODO: this is incorrect, 1 should come before 0
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"summer. holiday. sommer holidty\"",
|
||||||
|
"\"summer. holiday. sommer holiday\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
316
milli/src/search/new/tests/sort.rs
Normal file
316
milli/src/search/new/tests/sort.rs
Normal file
@ -0,0 +1,316 @@
|
|||||||
|
/*!
|
||||||
|
This module tests the `sort` ranking rule:
|
||||||
|
|
||||||
|
1. an error is returned if the sort ranking rule exists but no fields-to-sort were given at search time
|
||||||
|
2. an error is returned if the fields-to-sort are not sortable
|
||||||
|
3. it is possible to add multiple fields-to-sort at search time
|
||||||
|
4. custom sort ranking rules can be added to the settings, they interact with the generic `sort` ranking rule as expected
|
||||||
|
5. numbers appear before strings
|
||||||
|
6. documents with either: (1) no value, (2) null, or (3) an object for the field-to-sort appear at the end of the bucket
|
||||||
|
7. boolean values are translated to strings
|
||||||
|
8. if a field contains an array, it is sorted by the best value in the array according to the sort rule
|
||||||
|
*/
|
||||||
|
|
||||||
|
use big_s::S;
|
||||||
|
use maplit::hashset;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
index::tests::TempIndex, search::new::tests::collect_field_values, AscDesc, Criterion, Member,
|
||||||
|
Search, SearchResult, TermsMatchingStrategy,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn create_index() -> TempIndex {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_primary_key("id".to_owned());
|
||||||
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||||
|
s.set_sortable_fields(hashset! { S("rank"), S("vague"), S("letter") });
|
||||||
|
s.set_criteria(vec![Criterion::Sort]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"letter": "A",
|
||||||
|
"rank": 0,
|
||||||
|
"vague": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"letter": "A",
|
||||||
|
"rank": 1,
|
||||||
|
"vague": "0",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"letter": "B",
|
||||||
|
"rank": 0,
|
||||||
|
"vague": 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"letter": "B",
|
||||||
|
"rank": 1,
|
||||||
|
"vague": "1",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"letter": "B",
|
||||||
|
"rank": 2,
|
||||||
|
"vague": [1, 2],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"letter": "C",
|
||||||
|
"rank": 0,
|
||||||
|
"vague": [1, "2"],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"letter": "C",
|
||||||
|
"rank": 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"letter": "C",
|
||||||
|
"rank": 2,
|
||||||
|
"vague": null,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"letter": "D",
|
||||||
|
"rank": 0,
|
||||||
|
"vague": [null, null, ""]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"letter": "E",
|
||||||
|
"rank": 0,
|
||||||
|
"vague": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"letter": "E",
|
||||||
|
"rank": 1,
|
||||||
|
"vague": {
|
||||||
|
"sub": 0,
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"letter": "E",
|
||||||
|
"rank": 2,
|
||||||
|
"vague": true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"letter": "E",
|
||||||
|
"rank": 3,
|
||||||
|
"vague": false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"letter": "E",
|
||||||
|
"rank": 4,
|
||||||
|
"vague": 1.5673,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"letter": "E",
|
||||||
|
"rank": 5,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"letter": "F",
|
||||||
|
"rank": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"letter": "F",
|
||||||
|
"rank": 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"letter": "F",
|
||||||
|
"rank": 2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"letter": "G",
|
||||||
|
"rank": 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 19,
|
||||||
|
"letter": "G",
|
||||||
|
"rank": 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 20,
|
||||||
|
"letter": "H",
|
||||||
|
"rank": 0,
|
||||||
|
"vague": true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 21,
|
||||||
|
"letter": "I",
|
||||||
|
"rank": 0,
|
||||||
|
"vague": false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 22,
|
||||||
|
"letter": "I",
|
||||||
|
"rank": 1,
|
||||||
|
"vague": [1.1367, "help", null]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 23,
|
||||||
|
"letter": "I",
|
||||||
|
"rank": 2,
|
||||||
|
"vague": [1.2367, "hello"]
|
||||||
|
},
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_sort() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]);
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 22, 23, 20, 18, 19, 15, 16, 17, 9, 10, 11, 12, 13, 14, 8, 5, 6, 7, 2]");
|
||||||
|
|
||||||
|
let letter_values = collect_field_values(&index, &txn, "letter", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(letter_values, @r###"
|
||||||
|
[
|
||||||
|
"\"I\"",
|
||||||
|
"\"I\"",
|
||||||
|
"\"I\"",
|
||||||
|
"\"H\"",
|
||||||
|
"\"G\"",
|
||||||
|
"\"G\"",
|
||||||
|
"\"F\"",
|
||||||
|
"\"F\"",
|
||||||
|
"\"F\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"E\"",
|
||||||
|
"\"D\"",
|
||||||
|
"\"C\"",
|
||||||
|
"\"C\"",
|
||||||
|
"\"C\"",
|
||||||
|
"\"B\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank")))]);
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 13, 12, 4, 7, 11, 17, 23, 1, 3, 6, 10, 16, 19, 22, 0, 2, 5, 8, 9]");
|
||||||
|
|
||||||
|
let rank_values = collect_field_values(&index, &txn, "rank", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(rank_values, @r###"
|
||||||
|
[
|
||||||
|
"5",
|
||||||
|
"4",
|
||||||
|
"3",
|
||||||
|
"2",
|
||||||
|
"2",
|
||||||
|
"2",
|
||||||
|
"2",
|
||||||
|
"2",
|
||||||
|
"1",
|
||||||
|
"1",
|
||||||
|
"1",
|
||||||
|
"1",
|
||||||
|
"1",
|
||||||
|
"1",
|
||||||
|
"1",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
"0",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
s.sort_criteria(vec![AscDesc::Asc(Member::Field(S("vague")))]);
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 4, 5, 22, 23, 13, 1, 3, 12, 21, 11, 20, 6, 7, 8, 9, 10, 14, 15]");
|
||||||
|
|
||||||
|
let vague_values = collect_field_values(&index, &txn, "vague", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(vague_values, @r###"
|
||||||
|
[
|
||||||
|
"0",
|
||||||
|
"1",
|
||||||
|
"[1,2]",
|
||||||
|
"[1,\"2\"]",
|
||||||
|
"[1.1367,\"help\",null]",
|
||||||
|
"[1.2367,\"hello\"]",
|
||||||
|
"1.5673",
|
||||||
|
"\"0\"",
|
||||||
|
"\"1\"",
|
||||||
|
"false",
|
||||||
|
"false",
|
||||||
|
"true",
|
||||||
|
"true",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"null",
|
||||||
|
"[null,null,\"\"]",
|
||||||
|
"\"\"",
|
||||||
|
"{\"sub\":0}",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("vague")))]);
|
||||||
|
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 13, 23, 22, 2, 5, 0, 11, 20, 12, 21, 3, 1, 6, 7, 8, 9, 10, 14, 15]");
|
||||||
|
|
||||||
|
let vague_values = collect_field_values(&index, &txn, "vague", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(vague_values, @r###"
|
||||||
|
[
|
||||||
|
"[1,2]",
|
||||||
|
"1.5673",
|
||||||
|
"[1.2367,\"hello\"]",
|
||||||
|
"[1.1367,\"help\",null]",
|
||||||
|
"1",
|
||||||
|
"[1,\"2\"]",
|
||||||
|
"0",
|
||||||
|
"true",
|
||||||
|
"true",
|
||||||
|
"false",
|
||||||
|
"false",
|
||||||
|
"\"1\"",
|
||||||
|
"\"0\"",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"null",
|
||||||
|
"[null,null,\"\"]",
|
||||||
|
"\"\"",
|
||||||
|
"{\"sub\":0}",
|
||||||
|
"__does_not_exist__",
|
||||||
|
"__does_not_exist__",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
497
milli/src/search/new/tests/typo.rs
Normal file
497
milli/src/search/new/tests/typo.rs
Normal file
@ -0,0 +1,497 @@
|
|||||||
|
/*!
|
||||||
|
This module tests the following properties:
|
||||||
|
|
||||||
|
1. The `words` ranking rule is typo-tolerant
|
||||||
|
2. Typo-tolerance handles missing letters, extra letters, replaced letters, and swapped letters (at least)
|
||||||
|
3. Words which are < `min_word_len_one_typo` are not typo tolerant
|
||||||
|
4. Words which are >= `min_word_len_one_typo` but < `min_word_len_two_typos` can have one typo
|
||||||
|
5. Words which are >= `min_word_len_two_typos` can have two typos
|
||||||
|
6. A typo on the first letter of a word counts as two typos
|
||||||
|
7. Phrases are not typo tolerant
|
||||||
|
8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos`
|
||||||
|
9. 3grams are not typo tolerant
|
||||||
|
10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly
|
||||||
|
if `words` doesn't exist before it.
|
||||||
|
11. The `typo` ranking rule places documents with the same number of typos in the same bucket
|
||||||
|
12. Prefix tolerance costs nothing according to the typo ranking rule
|
||||||
|
13. Split words cost 1 typo according to the typo ranking rule
|
||||||
|
14. Synonyms cost nothing according to the typo ranking rule
|
||||||
|
*/
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
|
||||||
|
SearchResult, TermsMatchingStrategy,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn create_index() -> TempIndex {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_primary_key("id".to_owned());
|
||||||
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||||
|
s.set_criteria(vec![Criterion::Words]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"text": "the quick brown fox jumps over the lazy dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"text": "the quick brown foxes jump over the lazy dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"text": "the quick brown fax sends a letter to the dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"text": "the quickest brownest fox jumps over the laziest dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"text": "a fox doesn't quack, that crown goes to the duck."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"text": "the quicker browner fox jumped over the lazier dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"text": "the extravagant fox skyrocketed over the languorous dog" // thanks thesaurus
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"text": "the quick brown fox jumps over the lazy"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"text": "the quick brown fox jumps over the"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"text": "the quick brown fox jumps over"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"text": "the quick brown fox jumps"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"text": "the quick brown fox"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"text": "the quick brown"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"text": "the quick"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"text": "netwolk interconections sunflawar"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"text": "network interconnections sunflawer"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"text": "network interconnection sunflower"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"text": "network interconnection sun flower"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"text": "network interconnection sunflowering"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 19,
|
||||||
|
"text": "network interconnection sun flowering"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 20,
|
||||||
|
"text": "network interconnection sunflowar"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 21,
|
||||||
|
"text": "the fast brownish fox jumps over the lackadaisical dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 22,
|
||||||
|
"text": "the quick brown fox jumps over the lackadaisical dog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 23,
|
||||||
|
"text": "the quivk brown fox jumps over the lazy dog"
|
||||||
|
},
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_no_typo() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_autorize_typos(false);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_default_typo() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let ot = index.min_word_len_one_typo(&txn).unwrap();
|
||||||
|
let tt = index.min_word_len_two_typos(&txn).unwrap();
|
||||||
|
insta::assert_debug_snapshot!(ot, @"5");
|
||||||
|
insta::assert_debug_snapshot!(tt, @"9");
|
||||||
|
|
||||||
|
// 0 typo
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 23]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quivk brown fox jumps over the lazy dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// 1 typo on one word, replaced letter
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the quack brown fox jumps over the lazy dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// 1 typo on one word, missing letter, extra letter
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the quicest brownest fox jummps over the laziest dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quickest brownest fox jumps over the laziest dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// 1 typo on one word, swapped letters
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the quikc borwn fox jupms over the lazy dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// 1 first letter typo on a word <5 bytes, replaced letter
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the nuick brown fox jumps over the lazy dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
|
||||||
|
|
||||||
|
// 1 first letter typo on a word <5 bytes, missing letter
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the uick brown fox jumps over the lazy dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
|
||||||
|
|
||||||
|
// 1 typo on all words >=5 bytes, replaced letters
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the quack brawn fox junps over the lazy dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// 2 typos on words < 9 bytes
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the quckest brawnert fox jumps over the aziest dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
|
||||||
|
|
||||||
|
// 2 typos on words >= 9 bytes: missing letters, missing first letter, replaced letters
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the extravant fox kyrocketed over the lamguorout dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the extravagant fox skyrocketed over the languorous dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// 2 typos on words >= 9 bytes: 2 extra letters in a single word, swapped letters + extra letter, replaced letters
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the extravaganttt fox sktyrocnketed over the lagnuorrous dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the extravagant fox skyrocketed over the languorous dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_phrase_no_typo_allowed() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the \"quick brewn\" fox jumps over the lazy dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @"[]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_ngram_typos() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the extra lagant fox skyrocketed over the languorous dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the extravagant fox skyrocketed over the languorous dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the ex tra lagant fox skyrocketed over the languorous dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @"[]");
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Typo]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
let SearchResult { documents_ids: ids_1, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 23, 7, 8, 9, 22, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &ids_1);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quivk brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the\"",
|
||||||
|
"\"the quick brown fox jumps over\"",
|
||||||
|
"\"the quick brown fox jumps over the lackadaisical dog\"",
|
||||||
|
"\"the quick brown fox jumps\"",
|
||||||
|
"\"the quick brown fox\"",
|
||||||
|
"\"the quick brown foxes jump over the lazy dog\"",
|
||||||
|
"\"the quick brown fax sends a letter to the dog\"",
|
||||||
|
"\"the quick brown\"",
|
||||||
|
"\"the quick\"",
|
||||||
|
"\"a fox doesn't quack, that crown goes to the duck.\"",
|
||||||
|
"\"the quickest brownest fox jumps over the laziest dog\"",
|
||||||
|
"\"the quicker browner fox jumped over the lazier dog\"",
|
||||||
|
"\"the extravagant fox skyrocketed over the languorous dog\"",
|
||||||
|
"\"the fast brownish fox jumps over the lackadaisical dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Words, Criterion::Typo]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
let SearchResult { documents_ids: ids_2, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 23, 7, 8, 9, 22, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]");
|
||||||
|
|
||||||
|
assert_eq!(ids_1, ids_2);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_typo_bucketing() {
|
||||||
|
let index = create_index();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
// First do the search with just the Words ranking rule
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("network interconnection sunflower");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 15, 16, 17, 18, 20]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"netwolk interconections sunflawar\"",
|
||||||
|
"\"network interconnections sunflawer\"",
|
||||||
|
"\"network interconnection sunflower\"",
|
||||||
|
"\"network interconnection sun flower\"",
|
||||||
|
"\"network interconnection sunflowering\"",
|
||||||
|
"\"network interconnection sunflowar\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// Then with the typo ranking rule
|
||||||
|
drop(txn);
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Typo]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("network interconnection sunflower");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18, 17, 20, 15, 14]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"network interconnection sunflower\"",
|
||||||
|
"\"network interconnection sunflowering\"",
|
||||||
|
"\"network interconnection sun flower\"",
|
||||||
|
"\"network interconnection sunflowar\"",
|
||||||
|
"\"network interconnections sunflawer\"",
|
||||||
|
"\"netwolk interconections sunflawar\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("network interconnection sun flower");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[17, 19, 16, 18, 20, 15]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"network interconnection sun flower\"",
|
||||||
|
"\"network interconnection sun flowering\"",
|
||||||
|
"\"network interconnection sunflower\"",
|
||||||
|
"\"network interconnection sunflowering\"",
|
||||||
|
"\"network interconnection sunflowar\"",
|
||||||
|
"\"network interconnections sunflawer\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_typo_synonyms() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Typo]);
|
||||||
|
|
||||||
|
let mut synonyms = HashMap::new();
|
||||||
|
synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]);
|
||||||
|
synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]);
|
||||||
|
|
||||||
|
s.set_synonyms(synonyms);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the quick brown fox jumps over the lackadaisical dog");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 22, 23]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lackadaisical dog\"",
|
||||||
|
"\"the quivk brown fox jumps over the lazy dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("the fast brownish fox jumps over the lackadaisical dog");
|
||||||
|
|
||||||
|
// TODO: is this correct? interaction of ngrams + synonyms means that the
|
||||||
|
// multi-word synonyms end up having a typo cost. This is probably not what we want.
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the fast brownish fox jumps over the lackadaisical dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lackadaisical dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
124
milli/src/search/new/tests/typo_proximity.rs
Normal file
124
milli/src/search/new/tests/typo_proximity.rs
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
/*!
|
||||||
|
This module tests the interactions between the typo and proximity ranking rules.
|
||||||
|
|
||||||
|
The typo ranking rule should transform the query graph such that it only contains
|
||||||
|
the combinations of word derivations that it used to compute its bucket.
|
||||||
|
|
||||||
|
The proximity ranking rule should then look for proximities only between those specific derivations.
|
||||||
|
For example, given the the search query `beautiful summer` and the dataset:
|
||||||
|
```text
|
||||||
|
{ "id": 0, "text": "beautigul summer...... beautiful day in the summer" }
|
||||||
|
{ "id": 1, "text": "beautiful summer" }
|
||||||
|
```
|
||||||
|
Then the document with id `1` should be returned before `0`.
|
||||||
|
The proximity ranking rule is not allowed to look for the proximity between `beautigul` and `summer`
|
||||||
|
because the typo ranking rule before it only used the derivation `beautiful`.
|
||||||
|
*/
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
|
||||||
|
SearchResult, TermsMatchingStrategy,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn create_index() -> TempIndex {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_primary_key("id".to_owned());
|
||||||
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||||
|
s.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
// trap explained in the module documentation
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"text": "beautigul summer. beautiful x y z summer"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"text": "beautiful summer"
|
||||||
|
},
|
||||||
|
// the next 2 documents set up a more complicated trap
|
||||||
|
// with the query `beautiful summer`, we will have:
|
||||||
|
// 1. documents with no typos, id 0 and 1
|
||||||
|
// 2. documents with 1 typos: id 2 and 3, those are interpreted as EITHER
|
||||||
|
// - id 2: "beautigul + summer" ; OR
|
||||||
|
// - id 3: "beautiful + sommer"
|
||||||
|
// To sort these two documents, the proximity ranking rule must use only the
|
||||||
|
// word pairs: `beautigul -- summer` and `beautiful -- sommer` even though
|
||||||
|
// all variations of `beautiful` and `sommer` were used by the typo ranking rule.
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"text": "beautigul sommer. beautigul x summer"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"text": "beautiful sommer"
|
||||||
|
},
|
||||||
|
// The next two documents lay out an even more complex trap.
|
||||||
|
// With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of:
|
||||||
|
// - id 4: delicitous + sweet + dessert
|
||||||
|
// - id 5: beautiful + sweet + desgert
|
||||||
|
// The word pairs that the proximity ranking rules is allowed to use are
|
||||||
|
// EITHER:
|
||||||
|
// delicitous -- sweet AND sweet -- dessert
|
||||||
|
// OR
|
||||||
|
// delicious -- sweet AND sweet -- desgert
|
||||||
|
// So the word pair to use for the terms `summer` and `dessert` depend on the
|
||||||
|
// word pairs explored before them.
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"text": "delicitous. sweet. dessert. delicitous sweet desgert",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"text": "delicious. sweet desgert. delicious sweet desgert",
|
||||||
|
},
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_trap_basic_and_complex1() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("beautiful summer");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"beautiful summer\"",
|
||||||
|
"\"beautigul summer. beautiful x y z summer\"",
|
||||||
|
"\"beautiful sommer\"",
|
||||||
|
"\"beautigul sommer. beautigul x summer\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_trap_complex2() {
|
||||||
|
let index = create_index();
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
s.query("delicious sweet dessert");
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"delicious. sweet desgert. delicious sweet desgert\"",
|
||||||
|
"\"delicitous. sweet. dessert. delicitous sweet desgert\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
437
milli/src/search/new/tests/words_tms.rs
Normal file
437
milli/src/search/new/tests/words_tms.rs
Normal file
@ -0,0 +1,437 @@
|
|||||||
|
/*!
|
||||||
|
This module tests the following properties:
|
||||||
|
|
||||||
|
1. The `last` term matching strategy starts removing terms from the query
|
||||||
|
starting from the end if no more results match it.
|
||||||
|
2. Phrases are never deleted by the `last` term matching strategy
|
||||||
|
3. Duplicate words don't affect the ranking of a document according to the `words` ranking rule
|
||||||
|
4. The proximity of the first and last word of a phrase to its adjacent terms is taken into
|
||||||
|
account by the proximity ranking rule.
|
||||||
|
5. Unclosed double quotes still make a phrase
|
||||||
|
6. The `all` term matching strategy does not remove any term from the query
|
||||||
|
7. The search is capable of returning no results if no documents match the query
|
||||||
|
*/
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
|
||||||
|
SearchResult, TermsMatchingStrategy,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn create_index() -> TempIndex {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_primary_key("id".to_owned());
|
||||||
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||||
|
s.set_criteria(vec![Criterion::Words]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"text": "",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"text": "the",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"text": "the quick",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"text": "the quick brown",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"text": "the quick brown fox",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"text": "the quick brown fox jumps",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"text": "the quick brown fox jumps over",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"text": "the quick brown fox jumps over the",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"text": "the quick brown fox jumps over the lazy",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"text": "the quick brown fox jumps over the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"text": "the brown quick fox jumps over the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"text": "the quick brown fox talks to the lazy and slow dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"text": "the quick brown fox talks to the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 13,
|
||||||
|
"text": "the mighty and quick brown fox jumps over the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 14,
|
||||||
|
"text": "the great quick brown fox jumps over the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 15,
|
||||||
|
"text": "this quick brown and very scary fox jumps over the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 16,
|
||||||
|
"text": "this quick brown and scary fox jumps over the lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 17,
|
||||||
|
"text": "the quick brown fox jumps over the really lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 18,
|
||||||
|
"text": "the brown quick fox jumps over the really lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 19,
|
||||||
|
"text": "the brown quick fox immediately jumps over the really lazy dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 20,
|
||||||
|
"text": "the brown quick fox immediately jumps over the really lazy blue dog",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 21,
|
||||||
|
"text": "the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 22,
|
||||||
|
"text": "the, quick, brown, fox, jumps, over, the, lazy, dog",
|
||||||
|
}
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_words_tms_last_simple() {
|
||||||
|
let index = create_index();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// 6 and 7 have the same score because "the" appears twice
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 8, 6, 7, 5, 4, 11, 12, 3]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the brown quick fox jumps over the lazy dog\"",
|
||||||
|
"\"the mighty and quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the great quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"this quick brown and very scary fox jumps over the lazy dog\"",
|
||||||
|
"\"this quick brown and scary fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the really lazy dog\"",
|
||||||
|
"\"the brown quick fox jumps over the really lazy dog\"",
|
||||||
|
"\"the brown quick fox immediately jumps over the really lazy dog\"",
|
||||||
|
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
|
||||||
|
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
|
||||||
|
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over\"",
|
||||||
|
"\"the quick brown fox jumps over the\"",
|
||||||
|
"\"the quick brown fox jumps\"",
|
||||||
|
"\"the quick brown fox\"",
|
||||||
|
"\"the quick brown fox talks to the lazy and slow dog\"",
|
||||||
|
"\"the quick brown fox talks to the lazy dog\"",
|
||||||
|
"\"the quick brown\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("extravagant the quick brown fox jumps over the lazy dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_words_tms_last_phrase() {
|
||||||
|
let index = create_index();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("\"the quick brown fox\" jumps over the lazy dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// "The quick brown fox" is a phrase, not deleted by this term matching strategy
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 6, 7, 5, 4, 11, 12]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the really lazy dog\"",
|
||||||
|
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over\"",
|
||||||
|
"\"the quick brown fox jumps over the\"",
|
||||||
|
"\"the quick brown fox jumps\"",
|
||||||
|
"\"the quick brown fox\"",
|
||||||
|
"\"the quick brown fox talks to the lazy and slow dog\"",
|
||||||
|
"\"the quick brown fox talks to the lazy dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("\"the quick brown fox\" jumps over the \"lazy\" dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// "lazy" is a phrase, not deleted by this term matching strategy
|
||||||
|
// but words before it can be deleted
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 11, 12]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the really lazy dog\"",
|
||||||
|
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox talks to the lazy and slow dog\"",
|
||||||
|
"\"the quick brown fox talks to the lazy dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("\"the quick brown fox jumps over the lazy dog\"");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// The whole query is a phrase, no terms are removed
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("\"the quick brown fox jumps over the lazy dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// The whole query is still a phrase, even without closing quotes, so no terms are removed
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_words_proximity_tms_last_simple() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// 7 is better than 6 because of the proximity between "the" and its surrounding terms
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
|
||||||
|
"\"the great quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the really lazy dog\"",
|
||||||
|
"\"the mighty and quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the brown quick fox jumps over the lazy dog\"",
|
||||||
|
"\"the brown quick fox jumps over the really lazy dog\"",
|
||||||
|
"\"the brown quick fox immediately jumps over the really lazy dog\"",
|
||||||
|
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
|
||||||
|
"\"this quick brown and scary fox jumps over the lazy dog\"",
|
||||||
|
"\"this quick brown and very scary fox jumps over the lazy dog\"",
|
||||||
|
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the\"",
|
||||||
|
"\"the quick brown fox jumps over\"",
|
||||||
|
"\"the quick brown fox jumps\"",
|
||||||
|
"\"the quick brown fox\"",
|
||||||
|
"\"the quick brown fox talks to the lazy and slow dog\"",
|
||||||
|
"\"the quick brown fox talks to the lazy dog\"",
|
||||||
|
"\"the quick brown\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("the brown quick fox jumps over the lazy dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// 10 is better than 9 because of the proximity between "quick" and "brown"
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the brown quick fox jumps over the lazy dog\"",
|
||||||
|
"\"the brown quick fox jumps over the really lazy dog\"",
|
||||||
|
"\"the brown quick fox immediately jumps over the really lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
|
||||||
|
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
|
||||||
|
"\"the great quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the really lazy dog\"",
|
||||||
|
"\"the mighty and quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"this quick brown and scary fox jumps over the lazy dog\"",
|
||||||
|
"\"this quick brown and very scary fox jumps over the lazy dog\"",
|
||||||
|
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the\"",
|
||||||
|
"\"the quick brown fox jumps over\"",
|
||||||
|
"\"the quick brown fox jumps\"",
|
||||||
|
"\"the quick brown fox\"",
|
||||||
|
"\"the quick brown fox talks to the lazy and slow dog\"",
|
||||||
|
"\"the quick brown fox talks to the lazy dog\"",
|
||||||
|
"\"the quick brown\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_words_proximity_tms_last_phrase() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("the \"quick brown\" fox jumps over the lazy dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// "quick brown" is a phrase. The proximity of its first and last words
|
||||||
|
// to their adjacent query words should be taken into account
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5, 4, 11, 12, 3]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
|
||||||
|
"\"the great quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the really lazy dog\"",
|
||||||
|
"\"the mighty and quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"this quick brown and scary fox jumps over the lazy dog\"",
|
||||||
|
"\"this quick brown and very scary fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the\"",
|
||||||
|
"\"the quick brown fox jumps over\"",
|
||||||
|
"\"the quick brown fox jumps\"",
|
||||||
|
"\"the quick brown fox\"",
|
||||||
|
"\"the quick brown fox talks to the lazy and slow dog\"",
|
||||||
|
"\"the quick brown fox talks to the lazy dog\"",
|
||||||
|
"\"the quick brown\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("the \"quick brown\" \"fox jumps\" over the lazy dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
// "quick brown" is a phrase. The proximity of its first and last words
|
||||||
|
// to their adjacent query words should be taken into account.
|
||||||
|
// The same applies to `fox jumps`.
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
|
||||||
|
"\"the great quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the really lazy dog\"",
|
||||||
|
"\"the mighty and quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"this quick brown and scary fox jumps over the lazy dog\"",
|
||||||
|
"\"this quick brown and very scary fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the lazy\"",
|
||||||
|
"\"the quick brown fox jumps over the\"",
|
||||||
|
"\"the quick brown fox jumps over\"",
|
||||||
|
"\"the quick brown fox jumps\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_words_tms_all() {
|
||||||
|
let index = create_index();
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("the quick brown fox jumps over the lazy dog");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @r###"
|
||||||
|
[
|
||||||
|
"\"the quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
|
||||||
|
"\"the great quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the quick brown fox jumps over the really lazy dog\"",
|
||||||
|
"\"the mighty and quick brown fox jumps over the lazy dog\"",
|
||||||
|
"\"the brown quick fox jumps over the lazy dog\"",
|
||||||
|
"\"the brown quick fox jumps over the really lazy dog\"",
|
||||||
|
"\"the brown quick fox immediately jumps over the really lazy dog\"",
|
||||||
|
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
|
||||||
|
"\"this quick brown and scary fox jumps over the lazy dog\"",
|
||||||
|
"\"this quick brown and very scary fox jumps over the lazy dog\"",
|
||||||
|
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("extravagant");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
|
||||||
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||||
|
insta::assert_debug_snapshot!(texts, @"[]");
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user