Remove old criteria code

Loïc Lecrenier 2023-03-23 09:35:53 +01:00
parent 9b2653427d
commit f5f5f03ec0
18 changed files with 88 additions and 5345 deletions


@@ -1,569 +0,0 @@
use std::mem::take;
use heed::BytesDecode;
use itertools::Itertools;
use log::debug;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use super::{Criterion, CriterionParameters, CriterionResult};
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
use crate::heed_codec::ByteSliceRefCodec;
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates};
use crate::search::facet::{ascending_facet_sort, descending_facet_sort};
use crate::search::query_tree::Operation;
use crate::search::CriterionImplementationStrategy;
use crate::{FieldId, Index, Result};
/// Threshold on the number of candidates that determines which of
/// the two algorithms (iterative or set-based) is chosen.
const CANDIDATES_THRESHOLD: u64 = 1000;
pub struct AscDesc<'t> {
index: &'t Index,
rtxn: &'t heed::RoTxn<'t>,
field_name: String,
field_id: Option<FieldId>,
is_ascending: bool,
query_tree: Option<Operation>,
candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>,
allowed_candidates: RoaringBitmap,
initial_candidates: InitialCandidates,
faceted_candidates: RoaringBitmap,
implementation_strategy: CriterionImplementationStrategy,
parent: Box<dyn Criterion + 't>,
}
impl<'t> AscDesc<'t> {
pub fn asc(
index: &'t Index,
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_name: String,
implementation_strategy: CriterionImplementationStrategy,
) -> Result<Self> {
Self::new(index, rtxn, parent, field_name, true, implementation_strategy)
}
pub fn desc(
index: &'t Index,
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_name: String,
implementation_strategy: CriterionImplementationStrategy,
) -> Result<Self> {
Self::new(index, rtxn, parent, field_name, false, implementation_strategy)
}
fn new(
index: &'t Index,
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_name: String,
is_ascending: bool,
implementation_strategy: CriterionImplementationStrategy,
) -> Result<Self> {
let fields_ids_map = index.fields_ids_map(rtxn)?;
let field_id = fields_ids_map.id(&field_name);
let faceted_candidates = match field_id {
Some(field_id) => {
let number_faceted =
index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?;
let string_faceted =
index.faceted_documents_ids(rtxn, field_id, FacetType::String)?;
number_faceted | string_faceted
}
None => RoaringBitmap::default(),
};
Ok(AscDesc {
index,
rtxn,
field_name,
field_id,
is_ascending,
query_tree: None,
candidates: Box::new(std::iter::empty()),
allowed_candidates: RoaringBitmap::new(),
faceted_candidates,
initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()),
implementation_strategy,
parent,
})
}
}
impl<'t> Criterion for AscDesc<'t> {
#[logging_timer::time("AscDesc::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
// remove excluded candidates when next is called, instead of doing it in the loop.
self.allowed_candidates -= params.excluded_candidates;
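// Each iteration either returns the next facet-ordered bucket, flushes the
// remaining allowed candidates as a single bucket once the bucket iterator
// is exhausted, or asks the parent criterion for a new set of candidates.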
loop {
debug!(
"Facet {}({}) iteration",
if self.is_ascending { "Asc" } else { "Desc" },
self.field_name
);
match self.candidates.next().transpose()? {
None if !self.allowed_candidates.is_empty() => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
candidates: Some(take(&mut self.allowed_candidates)),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree,
candidates,
filtered_candidates,
initial_candidates,
}) => {
self.query_tree = query_tree;
let mut candidates = match (&self.query_tree, candidates) {
(_, Some(candidates)) => candidates,
(Some(qt), None) => {
let context = CriteriaBuilder::new(self.rtxn, self.index)?;
resolve_query_tree(&context, qt, params.wdcache)?
}
(None, None) => self.index.documents_ids(self.rtxn)?,
};
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
match initial_candidates {
Some(initial_candidates) => {
self.initial_candidates |= initial_candidates
}
None => self.initial_candidates.map_inplace(|c| c | &candidates),
}
if candidates.is_empty() {
continue;
}
self.allowed_candidates = &candidates - params.excluded_candidates;
self.candidates = match self.field_id {
Some(field_id) => facet_ordered(
self.index,
self.rtxn,
field_id,
self.is_ascending,
candidates & &self.faceted_candidates,
self.implementation_strategy,
)?,
None => Box::new(std::iter::empty()),
};
}
None => return Ok(None),
},
Some(mut candidates) => {
candidates -= params.excluded_candidates;
self.allowed_candidates -= &candidates;
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
candidates: Some(candidates),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));
}
}
}
}
}
fn facet_ordered_iterative<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
is_ascending: bool,
candidates: RoaringBitmap,
) -> Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
let number_iter = iterative_facet_number_ordered_iter(
index,
rtxn,
field_id,
is_ascending,
candidates.clone(),
)?;
let string_iter =
iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?;
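// number values are chained before string values: documents ranked by a
// numeric facet value always come before those ranked by a string value,
// in both sort directions.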
Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box<dyn Iterator<Item = _>>)
}
fn facet_extreme_value<'t>(
mut extreme_it: impl Iterator<Item = heed::Result<(RoaringBitmap, &'t [u8])>> + 't,
) -> Result<Option<f64>> {
let extreme_value =
if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) };
let (_, extreme_value) = extreme_value?;
Ok(OrderedF64Codec::bytes_decode(extreme_value))
}
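/// Return the lowest facet number value among the given candidates
/// for the given field, if any.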
pub fn facet_min_value<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
candidates: RoaringBitmap,
) -> Result<Option<f64>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let it = ascending_facet_sort(rtxn, db, field_id, candidates)?;
facet_extreme_value(it)
}
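/// Return the highest facet number value among the given candidates
/// for the given field, if any.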
pub fn facet_max_value<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
candidates: RoaringBitmap,
) -> Result<Option<f64>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let it = descending_facet_sort(rtxn, db, field_id, candidates)?;
facet_extreme_value(it)
}
fn facet_ordered_set_based<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
is_ascending: bool,
candidates: RoaringBitmap,
) -> Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
let number_db =
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let string_db =
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let (number_iter, string_iter) = if is_ascending {
let number_iter = ascending_facet_sort(rtxn, number_db, field_id, candidates.clone())?;
let string_iter = ascending_facet_sort(rtxn, string_db, field_id, candidates)?;
(itertools::Either::Left(number_iter), itertools::Either::Left(string_iter))
} else {
let number_iter = descending_facet_sort(rtxn, number_db, field_id, candidates.clone())?;
let string_iter = descending_facet_sort(rtxn, string_db, field_id, candidates)?;
(itertools::Either::Right(number_iter), itertools::Either::Right(string_iter))
};
Ok(Box::new(number_iter.chain(string_iter).map(|res| res.map(|(doc_ids, _)| doc_ids))))
}
/// Returns an iterator over groups of the given candidates in ascending or descending order.
///
/// It will use either an iterative or a set-based method on the whole facet database,
/// depending on the number of candidates to rank.
fn facet_ordered<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
is_ascending: bool,
candidates: RoaringBitmap,
implementation_strategy: CriterionImplementationStrategy,
) -> Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
match implementation_strategy {
CriterionImplementationStrategy::OnlyIterative => {
facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates)
}
CriterionImplementationStrategy::OnlySetBased => {
facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates)
}
CriterionImplementationStrategy::Dynamic => {
if candidates.len() <= CANDIDATES_THRESHOLD {
facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates)
} else {
facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates)
}
}
}
}
/// Fetch the facet number value of each candidate one by one and order the candidates by it.
///
/// This function is fast when the amount of candidates to rank is small.
fn iterative_facet_number_ordered_iter<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
is_ascending: bool,
candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = RoaringBitmap> + 't> {
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
for docid in candidates.iter() {
let left = (field_id, docid, f64::MIN);
let right = (field_id, docid, f64::MAX);
let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?;
let entry = if is_ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, OrderedFloat(value)));
}
}
docids_values.sort_unstable_by_key(|(_, v)| *v);
let iter = docids_values.into_iter();
let iter = if is_ascending {
Box::new(iter) as Box<dyn Iterator<Item = _>>
} else {
Box::new(iter.rev())
};
// The itertools GroupBy iterator doesn't provide an owned version, so we are
// required to collect the result into an owned collection (a Vec).
// https://github.com/rust-itertools/itertools/issues/499
#[allow(clippy::needless_collect)]
let vec: Vec<_> = iter
.group_by(|(_, v)| *v)
.into_iter()
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
.collect();
Ok(vec.into_iter())
}
/// Fetch the facet string value of each candidate one by one and order the candidates by it.
///
/// This function is fast when the amount of candidates to rank is small.
fn iterative_facet_string_ordered_iter<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
is_ascending: bool,
candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = RoaringBitmap> + 't> {
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
for docid in candidates.iter() {
let left = (field_id, docid, "");
let right = (field_id, docid.saturating_add(1), "");
// FIXME Doing this means that it will never be possible to retrieve
// the document with id 2^32, not sure this is a real problem.
let mut iter = index.field_id_docid_facet_strings.range(rtxn, &(left..right))?;
let entry = if is_ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), _)) = entry.transpose()? {
docids_values.push((docid, value));
}
}
docids_values.sort_unstable_by_key(|(_, v)| *v);
let iter = docids_values.into_iter();
let iter = if is_ascending {
Box::new(iter) as Box<dyn Iterator<Item = _>>
} else {
Box::new(iter.rev())
};
// The itertools GroupBy iterator doesn't provide an owned version, so we are
// required to collect the result into an owned collection (a Vec).
// https://github.com/rust-itertools/itertools/issues/499
#[allow(clippy::needless_collect)]
let vec: Vec<_> = iter
.group_by(|(_, v)| *v)
.into_iter()
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
.collect();
Ok(vec.into_iter())
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use big_s::S;
use maplit::hashset;
use crate::index::tests::TempIndex;
use crate::{AscDesc, Criterion, Filter, Search, SearchResult};
// Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THRESHOLD
// constant to 0 to ensure that the other sort algorithms are also correct.
#[test]
fn sort_criterion_placeholder() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_primary_key("id".to_owned());
settings
.set_sortable_fields(maplit::hashset! { S("id"), S("mod_10"), S("mod_20") });
settings.set_criteria(vec![Criterion::Sort]);
})
.unwrap();
let mut docs = vec![];
for i in 0..100 {
docs.push(
serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }),
);
}
index.add_documents(documents!(docs)).unwrap();
let all_ids = (0..100).collect::<Vec<_>>();
let rtxn = index.read_txn().unwrap();
let mut search = Search::new(&rtxn, &index);
search.sort_criteria(vec![AscDesc::from_str("mod_10:desc").unwrap()]);
search.limit(100);
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 19, 29, 39, 49, 59, 69, 79, 89, 99, 8, 18, 28, 38, 48, 58, 68, 78, 88, 98, 7, 17, 27, 37, 47, 57, 67, 77, 87, 97, 6, 16, 26, 36, 46, 56, 66, 76, 86, 96, 5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 4, 14, 24, 34, 44, 54, 64, 74, 84, 94, 3, 13, 23, 33, 43, 53, 63, 73, 83, 93, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90]");
documents_ids.sort();
assert_eq!(all_ids, documents_ids);
let mut search = Search::new(&rtxn, &index);
search.sort_criteria(vec![
AscDesc::from_str("mod_10:desc").unwrap(),
AscDesc::from_str("id:desc").unwrap(),
]);
search.limit(100);
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 89, 79, 69, 59, 49, 39, 29, 19, 9, 98, 88, 78, 68, 58, 48, 38, 28, 18, 8, 97, 87, 77, 67, 57, 47, 37, 27, 17, 7, 96, 86, 76, 66, 56, 46, 36, 26, 16, 6, 95, 85, 75, 65, 55, 45, 35, 25, 15, 5, 94, 84, 74, 64, 54, 44, 34, 24, 14, 4, 93, 83, 73, 63, 53, 43, 33, 23, 13, 3, 92, 82, 72, 62, 52, 42, 32, 22, 12, 2, 91, 81, 71, 61, 51, 41, 31, 21, 11, 1, 90, 80, 70, 60, 50, 40, 30, 20, 10, 0]");
documents_ids.sort();
assert_eq!(all_ids, documents_ids);
let mut search = Search::new(&rtxn, &index);
search.sort_criteria(vec![
AscDesc::from_str("mod_10:desc").unwrap(),
AscDesc::from_str("mod_20:asc").unwrap(),
]);
search.limit(100);
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 29, 49, 69, 89, 19, 39, 59, 79, 99, 8, 28, 48, 68, 88, 18, 38, 58, 78, 98, 7, 27, 47, 67, 87, 17, 37, 57, 77, 97, 6, 26, 46, 66, 86, 16, 36, 56, 76, 96, 5, 25, 45, 65, 85, 15, 35, 55, 75, 95, 4, 24, 44, 64, 84, 14, 34, 54, 74, 94, 3, 23, 43, 63, 83, 13, 33, 53, 73, 93, 2, 22, 42, 62, 82, 12, 32, 52, 72, 92, 1, 21, 41, 61, 81, 11, 31, 51, 71, 91, 0, 20, 40, 60, 80, 10, 30, 50, 70, 90]");
documents_ids.sort();
assert_eq!(all_ids, documents_ids);
let mut search = Search::new(&rtxn, &index);
search.sort_criteria(vec![
AscDesc::from_str("mod_10:desc").unwrap(),
AscDesc::from_str("mod_20:desc").unwrap(),
]);
search.limit(100);
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 39, 59, 79, 99, 9, 29, 49, 69, 89, 18, 38, 58, 78, 98, 8, 28, 48, 68, 88, 17, 37, 57, 77, 97, 7, 27, 47, 67, 87, 16, 36, 56, 76, 96, 6, 26, 46, 66, 86, 15, 35, 55, 75, 95, 5, 25, 45, 65, 85, 14, 34, 54, 74, 94, 4, 24, 44, 64, 84, 13, 33, 53, 73, 93, 3, 23, 43, 63, 83, 12, 32, 52, 72, 92, 2, 22, 42, 62, 82, 11, 31, 51, 71, 91, 1, 21, 41, 61, 81, 10, 30, 50, 70, 90, 0, 20, 40, 60, 80]");
documents_ids.sort();
assert_eq!(all_ids, documents_ids);
let mut search = Search::new(&rtxn, &index);
search.sort_criteria(vec![
AscDesc::from_str("mod_10:desc").unwrap(),
AscDesc::from_str("mod_20:desc").unwrap(),
AscDesc::from_str("id:desc").unwrap(),
]);
search.limit(100);
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 79, 59, 39, 19, 89, 69, 49, 29, 9, 98, 78, 58, 38, 18, 88, 68, 48, 28, 8, 97, 77, 57, 37, 17, 87, 67, 47, 27, 7, 96, 76, 56, 36, 16, 86, 66, 46, 26, 6, 95, 75, 55, 35, 15, 85, 65, 45, 25, 5, 94, 74, 54, 34, 14, 84, 64, 44, 24, 4, 93, 73, 53, 33, 13, 83, 63, 43, 23, 3, 92, 72, 52, 32, 12, 82, 62, 42, 22, 2, 91, 71, 51, 31, 11, 81, 61, 41, 21, 1, 90, 70, 50, 30, 10, 80, 60, 40, 20, 0]");
documents_ids.sort();
assert_eq!(all_ids, documents_ids);
}
// Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THRESHOLD
// constant to 0 to ensure that the other sort algorithms are also correct.
#[test]
fn sort_criterion_non_placeholder() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_primary_key("id".to_owned());
settings.set_filterable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") });
settings.set_sortable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") });
settings.set_criteria(vec![Criterion::Sort]);
})
.unwrap();
let mut docs = vec![];
for i in 0..100 {
docs.push(
serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }),
);
}
index.add_documents(documents!(docs)).unwrap();
let rtxn = index.read_txn().unwrap();
let mut search = Search::new(&rtxn, &index);
search.filter(
Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]")
.unwrap()
.unwrap(),
);
search.sort_criteria(vec![
AscDesc::from_str("mod_10:desc").unwrap(),
AscDesc::from_str("mod_20:asc").unwrap(),
AscDesc::from_str("id:desc").unwrap(),
]);
search.limit(100);
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
// The order should be in decreasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 93, 73, 53, 33, 13, 82, 62, 42, 22, 2, 92, 72, 52, 32, 12, 81, 61, 41, 21, 1, 91, 71, 51, 31, 11, 80, 60, 40, 20, 0, 90, 70, 50, 30, 10]");
let expected_ids = (0..100)
.filter(|id| {
[1, 0, 2].contains(&(id % 10))
|| [10, 13].contains(&(id % 20))
|| [5, 6].contains(id)
})
.collect::<Vec<_>>();
documents_ids.sort();
assert_eq!(expected_ids, documents_ids);
let mut search = Search::new(&rtxn, &index);
search.filter(
Filter::from_str("mod_10 IN [7, 8, 0] OR mod_20 IN [1, 15, 16] OR id IN [0, 4]")
.unwrap()
.unwrap(),
);
search.sort_criteria(vec![
AscDesc::from_str("mod_10:asc").unwrap(),
AscDesc::from_str("mod_20:asc").unwrap(),
AscDesc::from_str("id:desc").unwrap(),
]);
search.limit(100);
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
// The order should be in increasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[80, 60, 40, 20, 0, 90, 70, 50, 30, 10, 81, 61, 41, 21, 1, 4, 95, 75, 55, 35, 15, 96, 76, 56, 36, 16, 87, 67, 47, 27, 7, 97, 77, 57, 37, 17, 88, 68, 48, 28, 8, 98, 78, 58, 38, 18]");
let expected_ids = (0..100)
.filter(|id| {
[7, 8, 0].contains(&(id % 10))
|| [1, 15, 16].contains(&(id % 20))
|| [0, 4].contains(id)
})
.collect::<Vec<_>>();
documents_ids.sort();
assert_eq!(expected_ids, documents_ids);
let mut search = Search::new(&rtxn, &index);
search.filter(
Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]")
.unwrap()
.unwrap(),
);
search.sort_criteria(vec![AscDesc::from_str("id:desc").unwrap()]);
search.limit(100);
let SearchResult { documents_ids, .. } = search.execute().unwrap();
// The order should be in decreasing value of the id
let mut expected_ids = (0..100)
.filter(|id| {
[1, 0, 2].contains(&(id % 10))
|| [10, 13].contains(&(id % 20))
|| [5, 6].contains(id)
})
.collect::<Vec<_>>();
expected_ids.sort();
expected_ids.reverse();
assert_eq!(expected_ids, documents_ids);
}
}


@@ -1,710 +0,0 @@
use std::cmp::{self, Ordering};
use std::collections::binary_heap::PeekMut;
use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap};
use std::iter::Peekable;
use std::mem::take;
use roaring::RoaringBitmap;
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
use crate::search::criteria::{InitialCandidates, Query};
use crate::search::query_tree::{Operation, QueryKind};
use crate::search::{
build_dfa, word_derivations, CriterionImplementationStrategy, WordDerivationsCache,
};
use crate::Result;
/// To be able to divide integers by the number of words in the query,
/// we want a multiplier that allows us to divide by any number between 1 and 10.
/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple).
const LCM_10_FIRST_NUMBERS: u32 = 2520;
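// A quick sanity sketch (illustrative test only, assuming the standard Rust
// test harness): 2520 is divisible by every integer from 1 to 10, so the
// integer divisions by a branch length below are exact.
#[test]
fn lcm_10_first_numbers_divides_evenly() {
for n in 1..=10u32 {
assert_eq!(LCM_10_FIRST_NUMBERS % n, 0);
}
}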
/// Threshold on the number of candidates that determines which of
/// the two algorithms (linear or set-based) is chosen.
const CANDIDATES_THRESHOLD: u64 = 500;
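/// A flattened query tree: the outer `Vec` lists the alternative branches of
/// the query, the middle `Vec` lists the successive words of a branch, and the
/// inner `Vec` lists the queries (e.g. synonyms) that can match a single word.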
type FlattenedQueryTree = Vec<Vec<Vec<Query>>>;
pub struct Attribute<'t> {
ctx: &'t dyn Context<'t>,
state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>,
initial_candidates: InitialCandidates,
parent: Box<dyn Criterion + 't>,
linear_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>,
set_buckets: Option<BinaryHeap<Branch<'t>>>,
implementation_strategy: CriterionImplementationStrategy,
}
impl<'t> Attribute<'t> {
pub fn new(
ctx: &'t dyn Context<'t>,
parent: Box<dyn Criterion + 't>,
implementation_strategy: CriterionImplementationStrategy,
) -> Self {
Attribute {
ctx,
state: None,
initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()),
parent,
linear_buckets: None,
set_buckets: None,
implementation_strategy,
}
}
}
impl<'t> Criterion for Attribute<'t> {
#[logging_timer::time("Attribute::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
// remove excluded candidates when next is called, instead of doing it in the loop.
if let Some((_, _, allowed_candidates)) = self.state.as_mut() {
*allowed_candidates -= params.excluded_candidates;
}
loop {
match self.state.take() {
Some((query_tree, _, allowed_candidates)) if allowed_candidates.is_empty() => {
return Ok(Some(CriterionResult {
query_tree: Some(query_tree),
candidates: Some(RoaringBitmap::new()),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));
}
Some((query_tree, flattened_query_tree, mut allowed_candidates)) => {
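// use the linear (per-document) algorithm when the strategy forces it
// or when the candidate set is small enough; otherwise rank with the
// set-based algorithm over the word position databases.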
let found_candidates = if matches!(
self.implementation_strategy,
CriterionImplementationStrategy::OnlyIterative
) || (matches!(
self.implementation_strategy,
CriterionImplementationStrategy::Dynamic
) && allowed_candidates.len()
< CANDIDATES_THRESHOLD)
{
let linear_buckets = match self.linear_buckets.as_mut() {
Some(linear_buckets) => linear_buckets,
None => {
let new_buckets = initialize_linear_buckets(
self.ctx,
&flattened_query_tree,
&allowed_candidates,
)?;
self.linear_buckets.get_or_insert(new_buckets.into_iter())
}
};
match linear_buckets.next() {
Some((_score, candidates)) => candidates,
None => {
return Ok(Some(CriterionResult {
query_tree: Some(query_tree),
candidates: Some(RoaringBitmap::new()),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));
}
}
} else {
let set_buckets = match self.set_buckets.as_mut() {
Some(set_buckets) => set_buckets,
None => {
let new_buckets = initialize_set_buckets(
self.ctx,
&flattened_query_tree,
&allowed_candidates,
params.wdcache,
)?;
self.set_buckets.get_or_insert(new_buckets)
}
};
match set_compute_candidates(set_buckets, &allowed_candidates)? {
Some((_score, candidates)) => candidates,
None => {
return Ok(Some(CriterionResult {
query_tree: Some(query_tree),
candidates: Some(allowed_candidates),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));
}
}
};
allowed_candidates -= &found_candidates;
self.state =
Some((query_tree.clone(), flattened_query_tree, allowed_candidates));
return Ok(Some(CriterionResult {
query_tree: Some(query_tree),
candidates: Some(found_candidates),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates,
initial_candidates,
}) => {
let mut candidates = match candidates {
Some(candidates) => candidates,
None => {
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
- params.excluded_candidates
}
};
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
let flattened_query_tree = flatten_query_tree(&query_tree);
match initial_candidates {
Some(initial_candidates) => {
self.initial_candidates |= initial_candidates
}
None => self.initial_candidates.map_inplace(|c| c | &candidates),
}
self.state = Some((query_tree, flattened_query_tree, candidates));
self.linear_buckets = None;
}
Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
initial_candidates,
}) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
initial_candidates,
}));
}
None => return Ok(None),
},
}
}
}
}
/// QueryPositionIterator is an iterator over the positions of a Query.
/// It contains iterators over word positions.
struct QueryPositionIterator<'t> {
#[allow(clippy::type_complexity)]
inner:
Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u16), RoaringBitmap)>> + 't>>>,
}
impl<'t> QueryPositionIterator<'t> {
fn new(
ctx: &'t dyn Context<'t>,
queries: &[Query],
wdcache: &mut WordDerivationsCache,
) -> Result<Self> {
let mut inner = Vec::with_capacity(queries.len());
for query in queries {
let in_prefix_cache = query.prefix && ctx.in_prefix_cache(query.kind.word());
match &query.kind {
QueryKind::Exact { word, .. } => {
if !query.prefix || in_prefix_cache {
let word = query.kind.word();
let iter = ctx.word_position_iterator(word, in_prefix_cache)?;
inner.push(iter.peekable());
} else {
for (word, _) in word_derivations(word, true, 0, ctx.words_fst(), wdcache)?
{
let iter = ctx.word_position_iterator(word, in_prefix_cache)?;
inner.push(iter.peekable());
}
}
}
QueryKind::Tolerant { typo, word } => {
for (word, _) in
word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)?
{
let iter = ctx.word_position_iterator(word, in_prefix_cache)?;
inner.push(iter.peekable());
}
}
};
}
Ok(Self { inner })
}
}
impl<'t> Iterator for QueryPositionIterator<'t> {
type Item = heed::Result<(u16, RoaringBitmap)>;
fn next(&mut self) -> Option<Self::Item> {
// find the closest next position among the inner word iterators.
let expected_pos = self
.inner
.iter_mut()
.filter_map(|wli| match wli.peek() {
Some(Ok(((_, pos), _))) => Some(*pos),
_ => None,
})
.min()?;
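// then union the docids of every inner iterator currently at that
// position, advancing those iterators past it.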
let mut candidates = None;
for wli in self.inner.iter_mut() {
if let Some(Ok(((_, pos), _))) = wli.peek() {
if *pos > expected_pos {
continue;
}
}
match wli.next() {
Some(Ok((_, docids))) => {
candidates = match candidates.take() {
Some(candidates) => Some(candidates | docids),
None => Some(docids),
}
}
Some(Err(e)) => return Some(Err(e)),
None => continue,
}
}
candidates.map(|candidates| Ok((expected_pos, candidates)))
}
}
/// A Branch represents a possible alternative of the original query and is built from the Query Tree.
/// It allows us to iterate over meta-intervals of positions.
struct Branch<'t> {
query_level_iterator: Vec<(u16, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
last_result: (u16, RoaringBitmap),
branch_size: u16,
}
impl<'t> Branch<'t> {
fn new(
ctx: &'t dyn Context<'t>,
flatten_branch: &[Vec<Query>],
wdcache: &mut WordDerivationsCache,
allowed_candidates: &RoaringBitmap,
) -> Result<Self> {
let mut query_level_iterator = Vec::new();
for queries in flatten_branch {
let mut qli = QueryPositionIterator::new(ctx, queries, wdcache)?.peekable();
let (pos, docids) = qli.next().transpose()?.unwrap_or((0, RoaringBitmap::new()));
query_level_iterator.push((pos, docids & allowed_candidates, qli));
}
let mut branch = Self {
query_level_iterator,
last_result: (0, RoaringBitmap::new()),
branch_size: flatten_branch.len() as u16,
};
branch.update_last_result();
Ok(branch)
}
/// Return the next meta-interval of the branch,
/// and update the inner interval so that it can be ranked by the BinaryHeap.
fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result<bool> {
// advance the query level iterator that is the cheapest to move to its next position.
let index = self.lowest_iterator_index();
match self.query_level_iterator.get_mut(index) {
Some((cur_pos, cur_docids, qli)) => match qli.next().transpose()? {
Some((next_pos, next_docids)) => {
*cur_pos = next_pos;
*cur_docids |= next_docids & allowed_candidates;
self.update_last_result();
Ok(true)
}
None => Ok(false),
},
None => Ok(false),
}
}
fn lowest_iterator_index(&mut self) -> usize {
let (index, _) = self
.query_level_iterator
.iter_mut()
.map(|(pos, docids, qli)| {
if docids.is_empty() {
0
} else {
match qli.peek() {
Some(result) => {
result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0)
}
None => u16::MAX,
}
}
})
.enumerate()
.min_by_key(|(_, diff)| *diff)
.unwrap_or((0, 0));
index
}
fn update_last_result(&mut self) {
let mut result_pos = 0;
let mut result_docids = None;
for (pos, docids, _qli) in self.query_level_iterator.iter() {
result_pos += pos;
result_docids = result_docids
.take()
.map_or_else(|| Some(docids.clone()), |candidates| Some(candidates & docids));
}
// remove last result docids from inner iterators
if let Some(docids) = result_docids.as_ref() {
for (_, query_docids, _) in self.query_level_iterator.iter_mut() {
*query_docids -= docids;
}
}
self.last_result = (result_pos, result_docids.unwrap_or_default());
}
/// Return the rank of the current inner interval.
fn compute_rank(&self) -> u32 {
// we compute a rank from the position: subtract the minimal possible sum of
// positions (0 + 1 + ... + branch_size - 1), then take the scaled mean over the branch words.
let (pos, _) = self.last_result;
pos.saturating_sub((0..self.branch_size).sum()) as u32 * LCM_10_FIRST_NUMBERS
/ self.branch_size as u32
}
fn cmp(&self, other: &Self) -> Ordering {
let self_rank = self.compute_rank();
let other_rank = other.compute_rank();
// lower rank is better, and because BinaryHeap yields the highest-ranked branch, we reverse it.
self_rank.cmp(&other_rank).reverse()
}
}
impl<'t> Ord for Branch<'t> {
fn cmp(&self, other: &Self) -> Ordering {
self.cmp(other)
}
}
impl<'t> PartialOrd for Branch<'t> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<'t> PartialEq for Branch<'t> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl<'t> Eq for Branch<'t> {}
fn initialize_set_buckets<'t>(
ctx: &'t dyn Context<'t>,
branches: &FlattenedQueryTree,
allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache,
) -> Result<BinaryHeap<Branch<'t>>> {
let mut heap = BinaryHeap::new();
for flatten_branch in branches {
let branch = Branch::new(ctx, flatten_branch, wdcache, allowed_candidates)?;
heap.push(branch);
}
Ok(heap)
}
fn set_compute_candidates(
branches_heap: &mut BinaryHeap<Branch>,
allowed_candidates: &RoaringBitmap,
) -> Result<Option<(u32, RoaringBitmap)>> {
let mut final_candidates: Option<(u32, RoaringBitmap)> = None;
let mut allowed_candidates = allowed_candidates.clone();
while let Some(mut branch) = branches_heap.peek_mut() {
// if the current branch is worse than the best one, we break to return
// the candidates that correspond to the best rank
let branch_rank = branch.compute_rank();
if let Some((best_rank, _)) = final_candidates {
if branch_rank > best_rank {
break;
}
}
let candidates = take(&mut branch.last_result.1);
if candidates.is_empty() {
// we don't have candidates, get next interval.
if !branch.next(&allowed_candidates)? {
PeekMut::pop(branch);
}
} else {
allowed_candidates -= &candidates;
final_candidates = match final_candidates.take() {
// we add current candidates to best candidates
Some((best_rank, mut best_candidates)) => {
best_candidates |= candidates;
branch.next(&allowed_candidates)?;
Some((best_rank, best_candidates))
}
// we take current candidates as best candidates
None => {
branch.next(&allowed_candidates)?;
Some((branch_rank, candidates))
}
};
}
}
Ok(final_candidates)
}
fn initialize_linear_buckets(
ctx: &dyn Context,
branches: &FlattenedQueryTree,
allowed_candidates: &RoaringBitmap,
) -> Result<BTreeMap<u64, RoaringBitmap>> {
fn compute_candidate_rank(
branches: &FlattenedQueryTree,
words_positions: HashMap<String, RoaringBitmap>,
) -> u64 {
let mut min_rank = u64::max_value();
for branch in branches {
let branch_len = branch.len();
let mut branch_rank = Vec::with_capacity(branch_len);
for derivates in branch {
let mut position = None;
for Query { prefix, kind } in derivates {
// find the best position of the current word in the document.
let current_position = match kind {
QueryKind::Exact { word, .. } => {
if *prefix {
word_derivations(word, true, 0, &words_positions)
.flat_map(|positions| positions.iter().next())
.min()
} else {
words_positions
.get(word)
.and_then(|positions| positions.iter().next())
}
}
QueryKind::Tolerant { typo, word } => {
word_derivations(word, *prefix, *typo, &words_positions)
.flat_map(|positions| positions.iter().next())
.min()
}
};
match (position, current_position) {
(Some(p), Some(cp)) => position = Some(cmp::min(p, cp)),
(None, Some(cp)) => position = Some(cp),
_ => (),
}
}
// if a position is found, we add it to the branch score,
// otherwise the branch is considered unfindable in this document and we break.
if let Some(position) = position {
branch_rank.push(position as u64);
} else {
branch_rank.clear();
break;
}
}
if !branch_rank.is_empty() {
branch_rank.sort_unstable();
// because several words in the same query can't all match at position 0,
// we subtract the word index from the position.
let branch_rank: u64 =
branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum();
// here we take the mean over the words of the branch
min_rank =
min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64);
}
}
min_rank
}
fn word_derivations<'a>(
word: &str,
is_prefix: bool,
max_typo: u8,
words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap> {
let dfa = build_dfa(word, max_typo, is_prefix);
words_positions.iter().filter_map(move |(document_word, positions)| {
use levenshtein_automata::Distance;
match dfa.eval(document_word) {
Distance::Exact(_) => Some(positions),
Distance::AtLeast(_) => None,
}
})
}
let mut candidates = BTreeMap::new();
for docid in allowed_candidates {
let words_positions = ctx.docid_words_positions(docid)?;
let rank = compute_candidate_rank(branches, words_positions);
candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid);
}
Ok(candidates)
}
// TODO can we keep refs of Query
fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
use crate::search::criteria::Operation::{And, Or, Phrase};
fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree {
match tail.split_first() {
Some((thead, tail)) => {
let tail = and_recurse(thead, tail);
let mut out = Vec::new();
for array in recurse(head) {
for tail_array in &tail {
let mut array = array.clone();
array.extend(tail_array.iter().cloned());
out.push(array);
}
}
out
}
None => recurse(head),
}
}
fn recurse(op: &Operation) -> FlattenedQueryTree {
match op {
And(ops) => ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)),
Or(_, ops) => {
if ops.iter().all(|op| op.query().is_some()) {
vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]]
} else {
ops.iter().flat_map(recurse).collect()
}
}
Phrase(words) => {
let queries = words
.iter()
.filter_map(|w| w.as_ref())
.map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
.collect();
vec![queries]
}
Operation::Query(query) => vec![vec![vec![query.clone()]]],
}
}
recurse(query_tree)
}
#[cfg(test)]
mod tests {
use big_s::S;
use super::*;
use crate::search::criteria::QueryKind;
#[test]
fn simple_flatten_query_tree() {
let query_tree = Operation::Or(
false,
vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }),
]),
Operation::And(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }),
Operation::Or(
false,
vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact(S("thefish")),
}),
Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact(S("the")),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact(S("fish")),
}),
]),
],
),
]),
],
);
let result = flatten_query_tree(&query_tree);
insta::assert_debug_snapshot!(result, @r###"
[
[
[
Exact {
word: "manythefish",
},
],
],
[
[
Exact {
word: "manythe",
},
],
[
Exact {
word: "fish",
},
],
],
[
[
Exact {
word: "many",
},
],
[
Exact {
word: "thefish",
},
],
],
[
[
Exact {
word: "many",
},
],
[
Exact {
word: "the",
},
],
[
Exact {
word: "fish",
},
],
],
]
"###);
}
}


@@ -1,766 +0,0 @@
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;
use std::convert::TryFrom;
use std::mem::take;
use log::debug;
use roaring::{MultiOps, RoaringBitmap};
use crate::search::criteria::{
resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
InitialCandidates,
};
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
use crate::{absolute_from_relative_position, FieldId, Result};
pub struct Exactness<'t> {
ctx: &'t dyn Context<'t>,
query_tree: Option<Operation>,
state: Option<State>,
initial_candidates: InitialCandidates,
parent: Box<dyn Criterion + 't>,
query: Vec<ExactQueryPart>,
cache: Option<ExactWordsCombinationCache>,
}
impl<'t> Exactness<'t> {
pub fn new(
ctx: &'t dyn Context<'t>,
parent: Box<dyn Criterion + 't>,
primitive_query: &[PrimitiveQueryPart],
) -> heed::Result<Self> {
let mut query: Vec<_> = Vec::with_capacity(primitive_query.len());
for part in primitive_query {
query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?);
}
Ok(Exactness {
ctx,
query_tree: None,
state: None,
initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()),
parent,
query,
cache: None,
})
}
}
impl<'t> Criterion for Exactness<'t> {
#[logging_timer::time("Exactness::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
// remove excluded candidates when next is called, instead of doing it in the loop.
if let Some(state) = self.state.as_mut() {
state.difference_with(params.excluded_candidates);
}
loop {
debug!("Exactness at state {:?}", self.state);
match self.state.as_mut() {
Some(state) if state.is_empty() => {
// reset state
self.state = None;
self.query_tree = None;
// we don't need to reset the combinations cache since it only depends on
// the primitive query, which does not change
}
Some(state) => {
let (candidates, state) =
resolve_state(self.ctx, take(state), &self.query, &mut self.cache)?;
self.state = state;
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
candidates: Some(candidates),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates,
initial_candidates,
}) => {
let mut candidates = match candidates {
Some(candidates) => candidates,
None => {
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
- params.excluded_candidates
}
};
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
match initial_candidates {
Some(initial_candidates) => {
self.initial_candidates |= initial_candidates
}
None => self.initial_candidates.map_inplace(|c| c | &candidates),
}
self.state = Some(State::new(candidates));
self.query_tree = Some(query_tree);
}
Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
initial_candidates,
}) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
initial_candidates,
}));
}
None => return Ok(None),
},
}
}
}
}
#[derive(Debug)]
enum State {
/// Extract the documents that have an attribute that contains exactly the query.
ExactAttribute(RoaringBitmap),
/// Extract the documents that have an attribute that starts with exactly the query.
AttributeStartsWith(RoaringBitmap),
/// Rank the remaining documents by the number of exact words contained.
ExactWords(RoaringBitmap),
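/// Disjoint buckets of candidates ranked by their number of exact words,
/// popped from the back (most exact words first); the front bucket is the
/// catch-all of remaining allowed candidates.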
Remainings(Vec<RoaringBitmap>),
}
impl State {
fn new(candidates: RoaringBitmap) -> Self {
Self::ExactAttribute(candidates)
}
fn difference_with(&mut self, lhs: &RoaringBitmap) {
match self {
Self::ExactAttribute(candidates)
| Self::AttributeStartsWith(candidates)
| Self::ExactWords(candidates) => *candidates -= lhs,
Self::Remainings(candidates_array) => {
candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs);
candidates_array.retain(|candidates| !candidates.is_empty());
}
}
}
fn is_empty(&self) -> bool {
match self {
Self::ExactAttribute(candidates)
| Self::AttributeStartsWith(candidates)
| Self::ExactWords(candidates) => candidates.is_empty(),
Self::Remainings(candidates_array) => {
candidates_array.iter().all(RoaringBitmap::is_empty)
}
}
}
}
impl Default for State {
fn default() -> Self {
Self::Remainings(vec![])
}
}
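/// Advance the exactness state machine by one step: return the bucket of
/// candidates matched by the given state together with the next state to
/// evaluate (`None` once every bucket has been returned).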
#[logging_timer::time("Exactness::{}")]
fn resolve_state(
ctx: &dyn Context,
state: State,
query: &[ExactQueryPart],
cache: &mut Option<ExactWordsCombinationCache>,
) -> Result<(RoaringBitmap, Option<State>)> {
use State::*;
match state {
ExactAttribute(mut allowed_candidates) => {
let mut candidates = RoaringBitmap::new();
if let Ok(query_len) = u8::try_from(query.len()) {
let attributes_ids = ctx.searchable_fields_ids()?;
for id in attributes_ids {
if let Some(attribute_allowed_docids) =
ctx.field_id_word_count_docids(id, query_len)?
{
let mut attribute_candidates_array =
attribute_start_with_docids(ctx, id, query)?;
attribute_candidates_array.push(attribute_allowed_docids);
candidates |= MultiOps::intersection(attribute_candidates_array);
}
}
// only keep allowed candidates
candidates &= &allowed_candidates;
// remove current candidates from allowed candidates
allowed_candidates -= &candidates;
}
Ok((candidates, Some(AttributeStartsWith(allowed_candidates))))
}
AttributeStartsWith(mut allowed_candidates) => {
let mut candidates = RoaringBitmap::new();
let attributes_ids = ctx.searchable_fields_ids()?;
for id in attributes_ids {
let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?;
candidates |= MultiOps::intersection(attribute_candidates_array);
}
// only keep allowed candidates
candidates &= &allowed_candidates;
// remove current candidates from allowed candidates
allowed_candidates -= &candidates;
Ok((candidates, Some(ExactWords(allowed_candidates))))
}
ExactWords(allowed_candidates) => {
// Retrieve the cache if it already exists, otherwise create it.
let owned_cache = if let Some(cache) = cache.take() {
cache
} else {
compute_combinations(ctx, query)?
};
// The cache contains the sets of documents which contain exactly 1,2,3,.. exact words
// from the query. It cannot be empty. All the candidates in it are disjoint.
let mut candidates_array = owned_cache.combinations.clone();
for candidates in candidates_array.iter_mut() {
*candidates &= &allowed_candidates;
}
*cache = Some(owned_cache);
let best_candidates = candidates_array.pop().unwrap();
candidates_array.insert(0, allowed_candidates);
Ok((best_candidates, Some(Remainings(candidates_array))))
}
// pop the remaining candidates until none are left
Remainings(mut candidates_array) => {
let candidates = candidates_array.pop().unwrap_or_default();
if !candidates_array.is_empty() {
Ok((candidates, Some(Remainings(candidates_array))))
} else {
Ok((candidates, None))
}
}
}
}
fn attribute_start_with_docids(
ctx: &dyn Context,
attribute_id: FieldId,
query: &[ExactQueryPart],
) -> heed::Result<Vec<RoaringBitmap>> {
let mut attribute_candidates_array = Vec::new();
// start from the attribute's first position
let mut pos = absolute_from_relative_position(attribute_id, 0);
for part in query {
use ExactQueryPart::*;
match part {
Synonyms(synonyms) => {
let mut synonyms_candidates = RoaringBitmap::new();
for word in synonyms {
let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc {
synonyms_candidates |= word_candidates;
}
}
attribute_candidates_array.push(synonyms_candidates);
pos += 1;
}
Phrase(phrase) => {
for word in phrase {
if let Some(word) = word {
let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc {
attribute_candidates_array.push(word_candidates);
}
}
pos += 1;
}
}
}
}
Ok(attribute_candidates_array)
}
#[derive(Debug, Clone)]
pub enum ExactQueryPart {
Phrase(Vec<Option<String>>),
Synonyms(Vec<String>),
}
impl ExactQueryPart {
fn from_primitive_query_part(
ctx: &dyn Context,
part: &PrimitiveQueryPart,
) -> heed::Result<Self> {
let part = match part {
PrimitiveQueryPart::Word(word, _) => {
match ctx.synonyms(word)? {
Some(synonyms) => {
let mut synonyms: Vec<_> = synonyms
.into_iter()
.filter_map(|mut array| {
// keep one-word synonyms only.
match array.pop() {
Some(word) if array.is_empty() => Some(word),
_ => None,
}
})
.collect();
synonyms.push(word.clone());
ExactQueryPart::Synonyms(synonyms)
}
None => ExactQueryPart::Synonyms(vec![word.clone()]),
}
}
PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()),
};
Ok(part)
}
}
struct ExactWordsCombinationCache {
// index i holds the documents that contain exactly i + 1 exact words
combinations: Vec<RoaringBitmap>,
}
fn compute_combinations(
ctx: &dyn Context,
query: &[ExactQueryPart],
) -> Result<ExactWordsCombinationCache> {
let number_of_part = query.len();
let mut parts_candidates_array = Vec::with_capacity(number_of_part);
for part in query {
let mut candidates = RoaringBitmap::new();
use ExactQueryPart::*;
match part {
Synonyms(synonyms) => {
for synonym in synonyms {
if let Some(synonym_candidates) = ctx.word_docids(synonym)? {
candidates |= synonym_candidates;
}
}
}
// compute the intersection of word pairs with a proximity of 0.
Phrase(phrase) => {
candidates |= resolve_phrase(ctx, phrase)?;
}
}
parts_candidates_array.push(candidates);
}
let combinations = create_disjoint_combinations(parts_candidates_array);
Ok(ExactWordsCombinationCache { combinations })
}
/// Given a list of bitmaps `b0,b1,...,bn` , compute the list of bitmaps `X0,X1,...,Xn`
/// such that `Xi` contains all the elements that are contained in **at least** `i+1` bitmaps among `b0,b1,...,bn`.
///
/// The returned vector has the same length as the input list; it is equal to `vec![X0, X1, ..., Xn]`.
///
/// ## Implementation
///
/// We do so by iteratively building a map containing the union of all the different ways to intersect `J` bitmaps among `b0,b1,...,bn`.
/// - The key of the map is the index `i` of the last bitmap in the intersections
/// - The value is the union of all the possible intersections of J bitmaps such that the last bitmap in the intersection is `bi`
///
/// For example, with the bitmaps `b0,b1,b2,b3`, this map should look like this
/// ```text
/// Map 0: (first iteration, contains all the combinations of 1 bitmap)
/// // What follows are unions of intersections of bitmaps, associated with the index of their last component
/// 0: [b0]
/// 1: [b1]
/// 2: [b2]
/// 3: [b3]
/// Map 1: (second iteration, combinations of 2 bitmaps)
/// 1: [b0&b1]
/// 2: [b0&b2 | b1&b2]
/// 3: [b0&b3 | b1&b3 | b2&b3]
/// Map 2: (third iteration, combinations of 3 bitmaps)
/// 2: [b0&b1&b2]
/// 3: [b0&b2&b3 | b1&b2&b3]
/// Map 3: (fourth iteration, combinations of 4 bitmaps)
/// 3: [b0&b1&b2&b3]
/// ```
///
/// These maps are built one by one from the content of the preceding map.
/// For example, to create Map 2, we look at each line of Map 1, for example:
/// ```text
/// 2: [b0&b2 | b1&b2]
/// ```
/// And then for each i > 2, we compute `(b0&b2 | b1&b2) & bi = b0&b2&bi | b1&b2&bi`
/// and then add it to the new map (Map 2) under the key `i` (if it is not empty):
/// ```text
/// 3: [b0&b2&b3 | b1&b2&b3]
/// 4: [b0&b2&b4 | b1&b2&b4]
/// 5: [b0&b2&b5 | b1&b2&b5]
/// etc.
/// ```
/// We only keep two maps in memory at any one point. As soon as Map J is built, we flatten Map J-1 into
/// a single bitmap by taking the union of all of its values. This union gives us Xj-1.
///
/// ## Memory Usage
/// This function is expected to be called on a maximum of 10 bitmaps. The worst case thus happens when
/// 10 identical large bitmaps are given.
///
/// In the context of Meilisearch, let's imagine that we are given 10 bitmaps containing all
/// the document ids. If the dataset contains 16 million documents, then each bitmap will take
/// around 2MB of memory.
///
/// When creating Map 3, we will have, in memory:
/// 1. The 10 original bitmaps (20MB)
/// 2. X0 : 2MB
/// 3. Map 1, containing 9 bitmaps: 18MB
/// 4. Map 2, containing 8 bitmaps: 16MB
/// 5. X1: 2MB
/// for a total of around 60MB of memory. This roughly represents the maximum memory usage of this function.
///
/// ## Time complexity
/// Let N be the size of the given list of bitmaps and M the length of each individual bitmap.
///
/// We need to create N new bitmaps. The most expensive one to create is the second one, where we need to
/// iterate over the N keys of Map 1, and for each of those keys `k_i`, we perform `N-k_i` bitmap unions.
/// Unioning two bitmaps is O(M), and we need to do it O(N^2) times.
///
/// Therefore the time complexity is O(N^3 * M).
fn create_non_disjoint_combinations(bitmaps: Vec<RoaringBitmap>) -> Vec<RoaringBitmap> {
let nbr_parts = bitmaps.len();
if nbr_parts == 1 {
return bitmaps;
}
let mut flattened_levels = vec![];
let mut last_level: BTreeMap<usize, RoaringBitmap> =
bitmaps.clone().into_iter().enumerate().collect();
for _ in 2..=nbr_parts {
let mut new_level = BTreeMap::new();
for (last_part_index, base_combination) in last_level.iter() {
#[allow(clippy::needless_range_loop)]
for new_last_part_index in last_part_index + 1..nbr_parts {
let new_combination = base_combination & &bitmaps[new_last_part_index];
if !new_combination.is_empty() {
match new_level.entry(new_last_part_index) {
Entry::Occupied(mut b) => {
*b.get_mut() |= new_combination;
}
Entry::Vacant(entry) => {
entry.insert(new_combination);
}
}
}
}
}
// Now flatten the last level to save memory
let flattened_last_level = MultiOps::union(last_level.into_values());
flattened_levels.push(flattened_last_level);
last_level = new_level;
}
// Flatten the last level
let flattened_last_level = MultiOps::union(last_level.into_values());
flattened_levels.push(flattened_last_level);
flattened_levels
}
/// Given a list of bitmaps `b0,b1,...,bn` , compute the list of bitmaps `X0,X1,...,Xn`
/// such that `Xi` contains all the elements that are contained in **exactly** `i+1` bitmaps among `b0,b1,...,bn`.
///
/// The returned vector has the same length as the input list; it is equal to `vec![X0, X1, ..., Xn]`.
fn create_disjoint_combinations(parts_candidates_array: Vec<RoaringBitmap>) -> Vec<RoaringBitmap> {
let non_disjoint_combinations = create_non_disjoint_combinations(parts_candidates_array);
let mut disjoint_combinations = vec![];
let mut combinations = non_disjoint_combinations.into_iter().peekable();
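// Each non-disjoint bitmap is a superset of the next one (being in at least
// i + 2 bitmaps implies being in at least i + 1), so subtracting the next
// bitmap leaves exactly the elements contained in exactly i + 1 input bitmaps.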
while let Some(mut combination) = combinations.next() {
if let Some(forbidden) = combinations.peek() {
combination -= forbidden;
}
disjoint_combinations.push(combination)
}
disjoint_combinations
}
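// Worked micro-example (illustrative): with b0 = {1, 2} and b1 = {2, 3}, the
// non-disjoint combinations are X0 = {1, 2, 3} and X1 = {2}; subtracting each
// bitmap from the previous one yields the disjoint buckets {1, 3} (elements in
// exactly one bitmap) and {2} (elements in both).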
#[cfg(test)]
mod tests {
use big_s::S;
use roaring::RoaringBitmap;
use crate::index::tests::TempIndex;
use crate::search::criteria::exactness::{
create_disjoint_combinations, create_non_disjoint_combinations,
};
use crate::snapshot_tests::display_bitmap;
use crate::{Criterion, SearchResult};
#[test]
fn test_exact_words_subcriterion() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_primary_key(S("id"));
settings.set_criteria(vec![Criterion::Exactness]);
})
.unwrap();
index
.add_documents(documents!([
// not relevant
{ "id": "0", "text": "cat good dog bad" },
// 1 exact word
{ "id": "1", "text": "they said: cats arebetter thandogs" },
// 3 exact words
{ "id": "2", "text": "they said: cats arebetter than dogs" },
// 5 exact words
{ "id": "3", "text": "they said: cats are better than dogs" },
// attribute starts with the exact words
{ "id": "4", "text": "cats are better than dogs except on Saturday" },
// attribute equal to the exact words
{ "id": "5", "text": "cats are better than dogs" },
]))
.unwrap();
let rtxn = index.read_txn().unwrap();
let SearchResult { matching_words: _, candidates: _, documents_ids } =
index.search(&rtxn).query("cats are better than dogs").execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4, 3, 2, 1]");
}
fn print_combinations(rbs: &[RoaringBitmap]) -> String {
let mut s = String::new();
for rb in rbs {
s.push_str(&format!("{}\n", &display_bitmap(rb)));
}
s
}
// In these unit tests, the test bitmaps always contain all the multiples of a certain number.
// This makes it easy to check the validity of the results of `create_disjoint_combinations` by
// counting the number of dividers of elements in the returned bitmaps.
fn assert_correct_combinations(combinations: &[RoaringBitmap], dividers: &[u32]) {
for (i, set) in combinations.iter().enumerate() {
let expected_nbr_dividers = i + 1;
for el in set {
let nbr_dividers = dividers.iter().map(|d| usize::from(el % d == 0)).sum::<usize>();
assert_eq!(
nbr_dividers, expected_nbr_dividers,
"{el} is divisible by {nbr_dividers} elements, not {expected_nbr_dividers}."
);
}
}
}
#[test]
fn compute_combinations_1() {
let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
let parts_candidates = vec![b0];
let combinations = create_disjoint_combinations(parts_candidates);
insta::assert_snapshot!(print_combinations(&combinations), @r###"
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, ]
"###);
assert_correct_combinations(&combinations, &[2]);
}
#[test]
fn compute_combinations_2() {
let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
let parts_candidates = vec![b0, b1];
let combinations = create_disjoint_combinations(parts_candidates);
insta::assert_snapshot!(print_combinations(&combinations), @r###"
[2, 3, 4, 8, 9, 10, 14, 15, 16, 20, 21, 22, 26, 27, 28, 32, 33, 34, 38, 39, 40, 44, 45, 46, 50, 51, 52, 56, 57, 58, 62, 63, 64, 68, 69, 70, 74, 75, 76, 80, 81, 82, 86, 87, 88, 92, 93, 94, 98, 99, 100, 104, 105, 106, 110, 111, 112, 116, 117, 118, 122, 123, 124, 128, 129, 130, 134, 135, 136, 140, 141, 142, 146, 147, 148, ]
[0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108, 114, 120, 126, 132, 138, 144, ]
"###);
}
#[test]
fn compute_combinations_4() {
let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect();
let b3: RoaringBitmap = (0..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect();
let parts_candidates = vec![b0, b1, b2, b3];
let combinations = create_disjoint_combinations(parts_candidates);
insta::assert_snapshot!(print_combinations(&combinations), @r###"
[2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ]
[6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ]
[30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ]
[0, ]
"###);
// But we also check it programmatically
assert_correct_combinations(&combinations, &[2, 3, 5, 7]);
}
#[test]
fn compute_combinations_4_with_empty_results_at_end() {
let b0: RoaringBitmap = (1..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
let b1: RoaringBitmap = (1..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
let b2: RoaringBitmap = (1..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect();
let b3: RoaringBitmap = (1..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect();
let parts_candidates = vec![b0, b1, b2, b3];
let combinations = create_disjoint_combinations(parts_candidates);
insta::assert_snapshot!(print_combinations(&combinations), @r###"
[2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ]
[6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ]
[30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ]
[]
"###);
// But we also check it programmatically
assert_correct_combinations(&combinations, &[2, 3, 5, 7]);
}
#[test]
fn compute_combinations_4_with_some_equal_bitmaps() {
let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect();
// b3 == b1
let b3: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
let parts_candidates = vec![b0, b1, b2, b3];
let combinations = create_disjoint_combinations(parts_candidates);
insta::assert_snapshot!(print_combinations(&combinations), @r###"
[2, 4, 5, 8, 14, 16, 22, 25, 26, 28, 32, 34, 35, 38, 44, 46, 52, 55, 56, 58, 62, 64, 65, 68, 74, 76, 82, 85, 86, 88, 92, 94, 95, 98, 104, 106, 112, 115, 116, 118, 122, 124, 125, 128, 134, 136, 142, 145, 146, 148, ]
[3, 9, 10, 20, 21, 27, 33, 39, 40, 50, 51, 57, 63, 69, 70, 80, 81, 87, 93, 99, 100, 110, 111, 117, 123, 129, 130, 140, 141, 147, ]
[6, 12, 15, 18, 24, 36, 42, 45, 48, 54, 66, 72, 75, 78, 84, 96, 102, 105, 108, 114, 126, 132, 135, 138, 144, ]
[0, 30, 60, 90, 120, ]
"###);
// But we also check it programmatically
assert_correct_combinations(&combinations, &[2, 3, 5, 3]);
}
#[test]
fn compute_combinations_10() {
let dividers = [2, 3, 5, 7, 11, 6, 15, 35, 18, 14];
let parts_candidates: Vec<RoaringBitmap> = dividers
.iter()
.map(|&divider| {
(0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 210).collect()
})
.collect();
let combinations = create_disjoint_combinations(parts_candidates);
insta::assert_snapshot!(print_combinations(&combinations), @r###"
[2, 3, 4, 5, 7, 8, 9, 11, 16, 25, 26, 27, 32, 34, 38, 39, 46, 49, 51, 52, 57, 58, 62, 64, 65, 68, 69, 74, 76, 81, 82, 85, 86, 87, 91, 92, 93, 94, 95, 104, 106, 111, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 143, 145, 146, 148, 152, 153, 155, 158, 159, 161, 164, 166, 171, 172, 177, 178, 183, 184, 185, 187, 188, 194, 201, 202, 203, 205, 206, 207, 208, 209, ]
[10, 20, 21, 22, 33, 40, 44, 50, 55, 63, 77, 80, 88, 99, 100, 130, 147, 160, 170, 176, 189, 190, 200, ]
[6, 12, 14, 15, 24, 28, 35, 45, 48, 56, 75, 78, 96, 98, 102, 110, 112, 114, 135, 138, 156, 174, 175, 182, 186, 192, 195, 196, 204, ]
[18, 36, 54, 66, 72, 108, 132, 144, 154, 162, 165, ]
[30, 42, 60, 70, 84, 105, 120, 140, 150, 168, 198, ]
[90, 126, 180, ]
[]
[210, ]
[]
[0, ]
"###);
assert_correct_combinations(&combinations, &dividers);
}
#[test]
fn compute_combinations_30() {
let dividers: [u32; 30] = [
1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4,
5,
];
let parts_candidates: Vec<RoaringBitmap> = dividers
.iter()
.map(|divider| {
(0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 100).collect()
})
.collect();
let combinations = create_non_disjoint_combinations(parts_candidates.clone());
insta::assert_snapshot!(print_combinations(&combinations), @r###"
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
[0, 60, ]
[0, 60, ]
[0, 60, ]
[0, 60, ]
[0, 60, ]
[0, 60, ]
"###);
let combinations = create_disjoint_combinations(parts_candidates);
insta::assert_snapshot!(print_combinations(&combinations), @r###"
[]
[]
[]
[]
[]
[1, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 49, 53, 59, 61, 67, 71, 73, 77, 79, 83, 89, 91, 97, ]
[]
[]
[]
[]
[]
[2, 3, 5, 9, 14, 21, 22, 25, 26, 27, 33, 34, 35, 38, 39, 46, 51, 55, 57, 58, 62, 63, 65, 69, 74, 81, 82, 85, 86, 87, 93, 94, 95, 98, 99, ]
[]
[]
[]
[]
[]
[4, 6, 8, 10, 15, 16, 18, 28, 32, 42, 44, 45, 50, 52, 54, 56, 64, 66, 68, 70, 75, 76, 78, 88, 92, ]
[]
[]
[]
[]
[]
[12, 20, 24, 30, 36, 40, 48, 72, 80, 84, 90, 96, 100, ]
[]
[]
[]
[]
[]
[0, 60, ]
"###);
assert_correct_combinations(&combinations, &dividers);
}
}
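// A hedged reference sketch (added for illustration, not part of the original
// file): bucket `k` of the disjoint combinations holds the documents present
// in exactly `k + 1` of the input bitmaps, which is what the snapshots above
// encode (e.g. for parts [2, 3], the multiples of 6 land in the second bucket).
#[cfg(test)]
#[allow(dead_code)]
fn disjoint_combinations_reference(
    parts: &[roaring::RoaringBitmap],
) -> Vec<roaring::RoaringBitmap> {
    use roaring::RoaringBitmap;
    let mut buckets = vec![RoaringBitmap::new(); parts.len()];
    let universe: RoaringBitmap = parts.iter().flatten().collect();
    for docid in universe {
        // Count in how many parts this document appears, then bucket it.
        let count = parts.iter().filter(|part| part.contains(docid)).count();
        buckets[count - 1].insert(docid);
    }
    buckets
}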

View File

@ -1,77 +0,0 @@
use log::debug;
use roaring::RoaringBitmap;
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
use crate::search::criteria::InitialCandidates;
use crate::search::query_tree::Operation;
use crate::search::WordDerivationsCache;
use crate::Result;
/// The result of a call to the fetcher.
#[derive(Debug, Clone, PartialEq)]
pub struct FinalResult {
/// The query tree corresponding to the current bucket of the last criterion.
pub query_tree: Option<Operation>,
/// The candidates of the current bucket of the last criterion.
pub candidates: RoaringBitmap,
/// Candidates that come from the current bucket of the initial criterion.
pub initial_candidates: InitialCandidates,
}
pub struct Final<'t> {
ctx: &'t dyn Context<'t>,
parent: Box<dyn Criterion + 't>,
wdcache: WordDerivationsCache,
returned_candidates: RoaringBitmap,
}
impl<'t> Final<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> {
Final {
ctx,
parent,
wdcache: WordDerivationsCache::new(),
returned_candidates: RoaringBitmap::new(),
}
}
#[logging_timer::time("Final::{}")]
pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> Result<Option<FinalResult>> {
debug!("Final iteration");
let excluded_candidates = &self.returned_candidates | excluded_candidates;
let mut criterion_parameters = CriterionParameters {
wdcache: &mut self.wdcache,
// returned_candidates is merged with excluded_candidates to avoid duplicates
excluded_candidates: &excluded_candidates,
};
match self.parent.next(&mut criterion_parameters)? {
Some(CriterionResult {
query_tree,
candidates,
filtered_candidates,
initial_candidates,
}) => {
let mut candidates = match (candidates, query_tree.as_ref()) {
(Some(candidates), _) => candidates,
(None, Some(qt)) => {
resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates
}
(None, None) => self.ctx.documents_ids()? - excluded_candidates,
};
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
let initial_candidates = initial_candidates
.unwrap_or_else(|| InitialCandidates::Estimated(candidates.clone()));
self.returned_candidates |= &candidates;
Ok(Some(FinalResult { query_tree, candidates, initial_candidates }))
}
None => Ok(None),
}
}
}
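// Hedged usage sketch (added, not part of the original file): a search loop is
// expected to drain `Final` bucket by bucket until it returns `None`. The
// `limit` parameter is hypothetical.
#[allow(dead_code)]
fn drain_final(criterion: &mut Final, limit: u64) -> Result<Vec<RoaringBitmap>> {
    let mut buckets = Vec::new();
    let mut excluded = RoaringBitmap::new();
    let mut seen = 0u64;
    while seen < limit {
        match criterion.next(&excluded)? {
            Some(FinalResult { candidates, .. }) => {
                seen += candidates.len();
                excluded |= &candidates;
                buckets.push(candidates);
            }
            // every bucket of every criterion has been exhausted
            None => break,
        }
    }
    Ok(buckets)
}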

View File

@ -1,154 +0,0 @@
use std::iter;
use roaring::RoaringBitmap;
use rstar::RTree;
use super::{Criterion, CriterionParameters, CriterionResult};
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates};
use crate::{lat_lng_to_xyz, GeoPoint, Index, Result};
pub struct Geo<'t> {
index: &'t Index,
rtxn: &'t heed::RoTxn<'t>,
ascending: bool,
parent: Box<dyn Criterion + 't>,
candidates: Box<dyn Iterator<Item = RoaringBitmap>>,
allowed_candidates: RoaringBitmap,
initial_candidates: InitialCandidates,
rtree: Option<RTree<GeoPoint>>,
point: [f64; 2],
}
impl<'t> Geo<'t> {
pub fn asc(
index: &'t Index,
rtxn: &'t heed::RoTxn<'t>,
parent: Box<dyn Criterion + 't>,
point: [f64; 2],
) -> Result<Self> {
Self::new(index, rtxn, parent, point, true)
}
pub fn desc(
index: &'t Index,
rtxn: &'t heed::RoTxn<'t>,
parent: Box<dyn Criterion + 't>,
point: [f64; 2],
) -> Result<Self> {
Self::new(index, rtxn, parent, point, false)
}
fn new(
index: &'t Index,
rtxn: &'t heed::RoTxn<'t>,
parent: Box<dyn Criterion + 't>,
point: [f64; 2],
ascending: bool,
) -> Result<Self> {
let candidates = Box::new(iter::empty());
let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?;
let initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new());
let rtree = index.geo_rtree(rtxn)?;
Ok(Self {
index,
rtxn,
ascending,
parent,
candidates,
allowed_candidates,
initial_candidates,
rtree,
point,
})
}
}
impl Criterion for Geo<'_> {
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
let rtree = self.rtree.as_ref();
loop {
match self.candidates.next() {
Some(mut candidates) => {
candidates -= params.excluded_candidates;
self.allowed_candidates -= &candidates;
return Ok(Some(CriterionResult {
query_tree: None,
candidates: Some(candidates),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.clone()),
}));
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree,
candidates,
filtered_candidates,
initial_candidates,
}) => {
let mut candidates = match (&query_tree, candidates) {
(_, Some(candidates)) => candidates,
(Some(qt), None) => {
let context = CriteriaBuilder::new(self.rtxn, self.index)?;
resolve_query_tree(&context, qt, params.wdcache)?
}
(None, None) => self.index.documents_ids(self.rtxn)?,
};
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
match initial_candidates {
Some(initial_candidates) => {
self.initial_candidates |= initial_candidates
}
None => self.initial_candidates.map_inplace(|c| c | &candidates),
}
if candidates.is_empty() {
continue;
}
self.allowed_candidates = &candidates - params.excluded_candidates;
self.candidates = match rtree {
Some(rtree) => geo_point(
rtree,
self.allowed_candidates.clone(),
self.point,
self.ascending,
),
None => Box::new(std::iter::empty()),
};
}
None => return Ok(None),
},
}
}
}
}
fn geo_point(
rtree: &RTree<GeoPoint>,
mut candidates: RoaringBitmap,
point: [f64; 2],
ascending: bool,
) -> Box<dyn Iterator<Item = RoaringBitmap>> {
let point = lat_lng_to_xyz(&point);
let mut results = Vec::new();
for point in rtree.nearest_neighbor_iter(&point) {
if candidates.remove(point.data.0) {
results.push(std::iter::once(point.data.0).collect());
if candidates.is_empty() {
break;
}
}
}
if ascending {
Box::new(results.into_iter())
} else {
Box::new(results.into_iter().rev())
}
}
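// Hedged usage sketch (added, not part of the original file): `geo_point`
// yields one single-document bitmap per candidate, already ordered by distance
// to `point`, so ranking amounts to draining the iterator in order.
#[allow(dead_code)]
fn rank_by_distance(
    rtree: &RTree<GeoPoint>,
    candidates: RoaringBitmap,
    point: [f64; 2],
) -> Vec<u32> {
    geo_point(rtree, candidates, point, true /* ascending */)
        // each bitmap produced by `geo_point` holds exactly one document id
        .filter_map(|bitmap| bitmap.iter().next())
        .collect()
}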

View File

@ -1,82 +0,0 @@
use roaring::RoaringBitmap;
use super::{Criterion, CriterionParameters, CriterionResult};
use crate::search::criteria::{resolve_query_tree, Context, InitialCandidates};
use crate::search::query_tree::Operation;
use crate::search::Distinct;
use crate::Result;
/// Initial is a mandatory criterion: it is always the first
/// and is meant to initialize the CriterionResult used by the other criteria.
/// It behaves like a [Once iterator](https://doc.rust-lang.org/std/iter/struct.Once.html) and will return Some(CriterionResult) only once.
pub struct Initial<'t, D> {
ctx: &'t dyn Context<'t>,
answer: Option<CriterionResult>,
exhaustive_number_hits: bool,
distinct: Option<D>,
}
impl<'t, D> Initial<'t, D> {
pub fn new(
ctx: &'t dyn Context<'t>,
query_tree: Option<Operation>,
filtered_candidates: Option<RoaringBitmap>,
exhaustive_number_hits: bool,
distinct: Option<D>,
) -> Initial<D> {
let answer = CriterionResult {
query_tree,
candidates: None,
filtered_candidates,
initial_candidates: None,
};
Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct }
}
}
impl<D: Distinct> Criterion for Initial<'_, D> {
#[logging_timer::time("Initial::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
self.answer
.take()
.map(|mut answer| {
if self.exhaustive_number_hits {
// resolve the whole query tree to retrieve an exhaustive list of documents matching the query.
let candidates = answer
.query_tree
.as_ref()
.map(|query_tree| resolve_query_tree(self.ctx, query_tree, params.wdcache))
.transpose()?;
// then intersect the candidates with the potential filtered candidates.
let mut candidates = match (candidates, answer.filtered_candidates.take()) {
(Some(candidates), Some(filtered)) => candidates & filtered,
(Some(candidates), None) => candidates,
(None, Some(filtered)) => filtered,
(None, None) => self.ctx.documents_ids()?,
};
// then remove the potential soft deleted documents.
candidates -= params.excluded_candidates;
// because the initial_candidates should be an exhaustive count of the matching documents,
// we precompute the distinct attributes.
let initial_candidates = match &mut self.distinct {
Some(distinct) => {
let mut initial_candidates = RoaringBitmap::new();
for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) {
initial_candidates.insert(c?);
}
initial_candidates
}
None => candidates.clone(),
};
answer.candidates = Some(candidates);
answer.initial_candidates =
Some(InitialCandidates::Exhaustive(initial_candidates));
}
Ok(answer)
})
.transpose()
}
}
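// Hedged sketch (added, not part of the original file): `Initial` sits at the
// root of the criteria chain, so building a minimal pipeline looks roughly
// like the nesting below; the `Words` import path is assumed.
#[allow(dead_code)]
fn minimal_chain<'t>(ctx: &'t dyn Context<'t>) -> Box<dyn Criterion + 't> {
    use super::words::Words; // assumed sibling module
    use crate::search::NoopDistinct;
    // A placeholder search: no query tree, no filters, non-exhaustive count.
    let root = Initial::<NoopDistinct>::new(ctx, None, None, false, None);
    Box::new(Words::new(ctx, Box::new(root)))
}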

File diff suppressed because it is too large

View File

@ -1,712 +0,0 @@
use std::collections::btree_map::{self, BTreeMap};
use std::collections::hash_map::HashMap;
use log::debug;
use roaring::RoaringBitmap;
use slice_group_by::GroupBy;
use super::{
query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context,
Criterion, CriterionParameters, CriterionResult,
};
use crate::search::criteria::InitialCandidates;
use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind};
use crate::search::{build_dfa, CriterionImplementationStrategy, WordDerivationsCache};
use crate::{Position, Result};
type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>;
/// Threshold on the number of candidates that will make
/// the system choose between one algorithm or another.
const CANDIDATES_THRESHOLD: u64 = 1000;
/// Threshold on the proximity value that makes
/// the system choose between one algorithm or another.
const PROXIMITY_THRESHOLD: u8 = 0;
pub struct Proximity<'t> {
ctx: &'t dyn Context<'t>,
/// (max_proximity, query_tree, allowed_candidates)
state: Option<(u8, Operation, RoaringBitmap)>,
proximity: u8,
initial_candidates: InitialCandidates,
parent: Box<dyn Criterion + 't>,
candidates_cache: Cache,
plane_sweep_cache: Option<btree_map::IntoIter<u8, RoaringBitmap>>,
implementation_strategy: CriterionImplementationStrategy,
}
impl<'t> Proximity<'t> {
pub fn new(
ctx: &'t dyn Context<'t>,
parent: Box<dyn Criterion + 't>,
implementation_strategy: CriterionImplementationStrategy,
) -> Self {
Proximity {
ctx,
state: None,
proximity: 0,
initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()),
parent,
candidates_cache: Cache::new(),
plane_sweep_cache: None,
implementation_strategy,
}
}
}
impl<'t> Criterion for Proximity<'t> {
#[logging_timer::time("Proximity::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
// remove excluded candidates when next is called, instead of doing it in the loop.
if let Some((_, _, allowed_candidates)) = self.state.as_mut() {
*allowed_candidates -= params.excluded_candidates;
}
loop {
debug!(
"Proximity at iteration {} (max prox {:?}) ({:?})",
self.proximity,
self.state.as_ref().map(|(mp, _, _)| mp),
self.state.as_ref().map(|(_, _, cd)| cd),
);
match &mut self.state {
Some((max_prox, _, allowed_candidates))
if allowed_candidates.is_empty() || self.proximity > *max_prox =>
{
self.state = None; // reset state
}
Some((_, query_tree, allowed_candidates)) => {
let mut new_candidates = if matches!(
self.implementation_strategy,
CriterionImplementationStrategy::OnlyIterative
) || (matches!(
self.implementation_strategy,
CriterionImplementationStrategy::Dynamic
) && allowed_candidates.len()
<= CANDIDATES_THRESHOLD
&& self.proximity > PROXIMITY_THRESHOLD)
{
if let Some(cache) = self.plane_sweep_cache.as_mut() {
match cache.next() {
Some((p, candidates)) => {
self.proximity = p;
candidates
}
None => {
self.state = None; // reset state
continue;
}
}
} else {
let cache = resolve_plane_sweep_candidates(
self.ctx,
query_tree,
allowed_candidates,
)?;
self.plane_sweep_cache = Some(cache.into_iter());
continue;
}
} else {
// use set theory based algorithm
resolve_candidates(
self.ctx,
query_tree,
self.proximity,
&mut self.candidates_cache,
params.wdcache,
)?
};
new_candidates &= &*allowed_candidates;
*allowed_candidates -= &new_candidates;
self.proximity += 1;
return Ok(Some(CriterionResult {
query_tree: Some(query_tree.clone()),
candidates: Some(new_candidates),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates,
initial_candidates,
}) => {
let mut candidates = match candidates {
Some(candidates) => candidates,
None => {
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
- params.excluded_candidates
}
};
if let Some(filtered_candidates) = filtered_candidates {
candidates &= filtered_candidates;
}
match initial_candidates {
Some(initial_candidates) => {
self.initial_candidates |= initial_candidates
}
None => self.initial_candidates.map_inplace(|c| c | &candidates),
}
let maximum_proximity = maximum_proximity(&query_tree);
self.state = Some((maximum_proximity as u8, query_tree, candidates));
self.proximity = 0;
self.plane_sweep_cache = None;
}
Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
initial_candidates,
}) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
initial_candidates,
}));
}
None => return Ok(None),
},
}
}
}
}
fn resolve_candidates(
ctx: &dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
fn resolve_operation(
ctx: &dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
use Operation::{And, Or, Phrase};
let result = match query_tree {
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
Phrase(words) => {
if proximity == 0 {
let most_left = words
.iter()
.filter_map(|o| o.as_ref())
.next()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let most_right = words
.iter()
.rev()
.filter_map(|o| o.as_ref())
.next()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
match (most_left, most_right) {
(Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, words)?)],
_otherwise => Default::default(),
}
} else {
Default::default()
}
}
Or(_, ops) => {
let mut output = Vec::new();
for op in ops {
let result = resolve_operation(ctx, op, proximity, cache, wdcache)?;
output.extend(result);
}
output
}
Operation::Query(q) => {
if proximity == 0 {
let candidates = query_docids(ctx, q, wdcache)?;
vec![(q.clone(), q.clone(), candidates)]
} else {
Default::default()
}
}
};
Ok(result)
}
fn mdfs_pair(
ctx: &dyn Context,
left: &Operation,
right: &Operation,
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
(0..=mana.min(left_max)).map(move |m| (m, mana - m))
}
let pair_max_proximity = 7;
let mut output = Vec::new();
for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) {
for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) {
let left_key = (left.clone(), left_p);
if !cache.contains_key(&left_key) {
let candidates = resolve_operation(ctx, left, left_p, cache, wdcache)?;
cache.insert(left_key.clone(), candidates);
}
let right_key = (right.clone(), right_p);
if !cache.contains_key(&right_key) {
let candidates = resolve_operation(ctx, right, right_p, cache, wdcache)?;
cache.insert(right_key.clone(), candidates);
}
let lefts = cache.get(&left_key).unwrap();
let rights = cache.get(&right_key).unwrap();
for (ll, lr, lcandidates) in lefts {
for (rl, rr, rcandidates) in rights {
let mut candidates =
query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?;
if lcandidates.len() < rcandidates.len() {
candidates &= lcandidates;
candidates &= rcandidates;
} else {
candidates &= rcandidates;
candidates &= lcandidates;
}
if !candidates.is_empty() {
output.push((ll.clone(), rr.clone(), candidates));
}
}
}
}
}
Ok(output)
}
fn mdfs(
ctx: &dyn Context,
branches: &[Operation],
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
// Extract the first two elements but give the tail
// that is just after the first element.
let next =
branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t))));
match next {
Some((head1, Some((head2, [_])))) => {
mdfs_pair(ctx, head1, head2, proximity, cache, wdcache)
}
Some((head1, Some((head2, tail)))) => {
let mut output = Vec::new();
for p in 0..=proximity {
for (lhead, _, head_candidates) in
mdfs_pair(ctx, head1, head2, p, cache, wdcache)?
{
if !head_candidates.is_empty() {
for (_, rtail, mut candidates) in
mdfs(ctx, tail, proximity - p, cache, wdcache)?
{
candidates &= &head_candidates;
if !candidates.is_empty() {
output.push((lhead.clone(), rtail, candidates));
}
}
}
}
}
Ok(output)
}
Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache),
None => Ok(Default::default()),
}
}
let mut candidates = RoaringBitmap::new();
for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? {
candidates |= cds;
}
Ok(candidates)
}
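// Worked sketch (added, not part of the original file): `mdfs_pair` splits the
// proximity budget ("mana") between the pair itself and its two operands
// through `pair_combinations`, so every way of spending the budget is visited
// exactly once.
#[cfg(test)]
#[test]
fn pair_combinations_budget_sketch() {
    // Local copy of the helper nested inside `resolve_candidates` above.
    fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
        (0..=mana.min(left_max)).map(move |m| (m, mana - m))
    }
    let splits: Vec<(u8, u8)> = pair_combinations(2, 7).collect();
    assert_eq!(splits, vec![(0, 2), (1, 1), (2, 0)]);
}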
fn resolve_plane_sweep_candidates(
ctx: &dyn Context,
query_tree: &Operation,
allowed_candidates: &RoaringBitmap,
) -> Result<BTreeMap<u8, RoaringBitmap>> {
/// FIXME may be buggy with query like "new new york"
fn plane_sweep(
groups_positions: Vec<Vec<(Position, u8, Position)>>,
consecutive: bool,
) -> Result<Vec<(Position, u8, Position)>> {
fn compute_groups_proximity(
groups: &[(usize, (Position, u8, Position))],
consecutive: bool,
) -> Option<(Position, u8, Position)> {
// take the inner proximity of the first group as initial
let (_, (_, mut proximity, _)) = groups.first()?;
let (_, (left_most_pos, _, _)) = groups.first()?;
let (_, (_, _, right_most_pos)) =
groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?;
for pair in groups.windows(2) {
if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair {
// if two positions are equal, meaning that they share at least a word, we return None
if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 {
return None;
}
let pair_proximity = {
// if intervals are disjoint [..].(..)
if lpos2 > rpos1 {
lpos2 - rpos1
}
// if the second interval is a subset of the first [.(..).]
else if rpos2 < rpos1 {
(lpos2 - lpos1).min(rpos1 - rpos2)
}
// if intervals overlaps [.(..].)
else {
(lpos2 - lpos1).min(rpos2 - rpos1)
}
};
// if the groups are in the right order (query order) we subtract 1 from the proximity;
// the proximity is clamped to 7
let pair_proximity =
if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) };
proximity += pair_proximity as u8 + prox2;
}
}
// if the groups must be consecutive, we only accept groups with a proximity of 0
if !consecutive || proximity == 0 {
Some((*left_most_pos, proximity, *right_most_pos))
} else {
None
}
}
let groups_len = groups_positions.len();
let mut groups_positions: Vec<_> =
groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
// Pop top elements of each list.
let mut current = Vec::with_capacity(groups_len);
for (i, positions) in groups_positions.iter_mut().enumerate() {
match positions.next() {
Some(p) => current.push((i, p)),
// if a group returns None, it means that the document does not contain all the words,
// so we return an empty result.
None => return Ok(Vec::new()),
}
}
// Sort k elements by their positions.
current.sort_unstable_by_key(|(_, p)| *p);
// Find leftmost and rightmost group and their positions.
let mut leftmost = *current.first().unwrap();
let mut rightmost = *current.last().unwrap();
let mut output = Vec::new();
loop {
// Find the position p of the next element of the leftmost group's list.
// If the list is empty, break the loop.
let p = groups_positions[leftmost.0].next().map(|p| (leftmost.0, p));
// Let q be the position of the second group of the interval.
let q = current[1];
// If p > r, then the interval [l, r] is minimal and
// we record the current group in the output.
if p.map_or(true, |p| p.1 > rightmost.1) {
if let Some(group) = compute_groups_proximity(&current, consecutive) {
output.push(group);
}
}
let p = match p {
Some(p) => p,
None => break,
};
// Replace the leftmost group P in the interval.
current[0] = p;
if p.1 > rightmost.1 {
// if [l, r] is minimal, let r = p and l = q.
rightmost = p;
leftmost = q;
} else {
// Otherwise, let l = min{p, q}.
leftmost = if p.1 < q.1 { p } else { q };
}
// Then update the interval and order of groups_positions in the interval.
current.sort_unstable_by_key(|(_, p)| *p);
}
// Sort the list according to the size and the positions.
output.sort_unstable();
Ok(output)
}
fn resolve_operation<'a>(
query_tree: &'a Operation,
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
words_positions: &HashMap<String, RoaringBitmap>,
) -> Result<Vec<(Position, u8, Position)>> {
use Operation::{And, Or, Phrase};
if let Some(result) = rocache.get(query_tree) {
return Ok(result.clone());
}
let result = match query_tree {
And(ops) => {
let mut groups_positions = Vec::with_capacity(ops.len());
for operation in ops {
let positions = resolve_operation(operation, rocache, words_positions)?;
groups_positions.push(positions);
}
plane_sweep(groups_positions, false)?
}
Phrase(words) => {
let mut groups_positions = Vec::with_capacity(words.len());
// group stop_words together.
for words in words.linear_group_by_key(Option::is_none) {
// skip if it's a group of stop words.
if matches!(words.first(), None | Some(None)) {
continue;
}
// make a consecutive plane-sweep on the subgroup of words.
let mut subgroup = Vec::with_capacity(words.len());
for word in words.iter().map(|w| w.as_deref().unwrap()) {
match words_positions.get(word) {
Some(positions) => {
subgroup.push(positions.iter().map(|p| (p, 0, p)).collect())
}
None => return Ok(vec![]),
}
}
match subgroup.len() {
0 => {}
1 => groups_positions.push(subgroup.pop().unwrap()),
_ => groups_positions.push(plane_sweep(subgroup, true)?),
}
}
match groups_positions.len() {
0 => vec![],
1 => groups_positions.pop().unwrap(),
_ => plane_sweep(groups_positions, false)?,
}
}
Or(_, ops) => {
let mut result = Vec::new();
for op in ops {
result.extend(resolve_operation(op, rocache, words_positions)?)
}
result.sort_unstable();
result
}
Operation::Query(Query { prefix, kind }) => {
let mut result = Vec::new();
match kind {
QueryKind::Exact { word, .. } => {
if *prefix {
let iter = word_derivations(word, true, 0, words_positions)
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
result.extend(iter);
} else if let Some(positions) = words_positions.get(word) {
result.extend(positions.iter().map(|p| (p, 0, p)));
}
}
QueryKind::Tolerant { typo, word } => {
let iter = word_derivations(word, *prefix, *typo, words_positions)
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
result.extend(iter);
}
}
result.sort_unstable();
result
}
};
rocache.insert(query_tree, result.clone());
Ok(result)
}
fn word_derivations<'a>(
word: &str,
is_prefix: bool,
max_typo: u8,
words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap> {
let dfa = build_dfa(word, max_typo, is_prefix);
words_positions.iter().filter_map(move |(document_word, positions)| {
use levenshtein_automata::Distance;
match dfa.eval(document_word) {
Distance::Exact(_) => Some(positions),
Distance::AtLeast(_) => None,
}
})
}
let mut resolve_operation_cache = HashMap::new();
let mut candidates = BTreeMap::new();
for docid in allowed_candidates {
let words_positions = ctx.docid_words_positions(docid)?;
resolve_operation_cache.clear();
let positions =
resolve_operation(query_tree, &mut resolve_operation_cache, &words_positions)?;
let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);
let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7);
candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid);
}
Ok(candidates)
}
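// Worked sketch (added, not part of the original file): the disjoint-interval
// branch of `compute_groups_proximity` above. For the query `"new york" city`,
// with "new york" spanning positions [3, 4] and "city" at position 6, the
// intervals are disjoint, so the raw gap is lpos2 - rpos1 = 6 - 4 = 2; the
// groups appear in query order, so 1 is subtracted, giving a pair proximity of 1.
#[cfg(test)]
#[test]
fn disjoint_interval_proximity_sketch() {
    let (rpos1, lpos2): (u32, u32) = (4, 6);
    let pair_proximity = (lpos2 - rpos1 - 1).min(7);
    assert_eq!(pair_proximity, 1);
}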
#[cfg(test)]
mod tests {
use std::io::Cursor;
use big_s::S;
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use crate::index::tests::TempIndex;
use crate::{Criterion, CriterionImplementationStrategy, SearchResult};
fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
let mut documents = Vec::new();
for prefix in prefixes {
for i in 0..500 {
documents.push(
serde_json::json!({
"text": format!("{prefix}{i:x}"),
})
.as_object()
.unwrap()
.clone(),
)
}
}
documents
}
#[test]
fn test_proximity_criterion_prefix_handling() {
let mut index = TempIndex::new();
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| {
settings.set_primary_key(S("id"));
settings.set_criteria(vec![
Criterion::Words,
Criterion::Typo,
Criterion::Proximity,
]);
})
.unwrap();
let mut documents = DocumentsBatchBuilder::new(Vec::new());
for doc in [
// 0
serde_json::json!({ "text": "zero is exactly the amount of configuration I want" }),
// 1
serde_json::json!({ "text": "zero bad configuration" }),
// 2
serde_json::json!({ "text": "zero configuration" }),
// 3
serde_json::json!({ "text": "zero config" }),
// 4
serde_json::json!({ "text": "zero conf" }),
// 5
serde_json::json!({ "text": "zero bad conf" }),
] {
documents.append_json_object(doc.as_object().unwrap()).unwrap();
}
for doc in documents_with_enough_different_words_for_prefixes(&["conf"]) {
documents.append_json_object(&doc).unwrap();
}
let documents =
DocumentsBatchReader::from_reader(Cursor::new(documents.into_inner().unwrap()))
.unwrap();
index.add_documents(documents).unwrap();
let rtxn = index.read_txn().unwrap();
let SearchResult { matching_words: _, candidates: _, documents_ids } = index
.search(&rtxn)
.query("zero c")
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
.execute()
.unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]");
let SearchResult { matching_words: _, candidates: _, documents_ids } = index
.search(&rtxn)
.query("zero co")
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
.execute()
.unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]");
let SearchResult { matching_words: _, candidates: _, documents_ids } = index
.search(&rtxn)
.query("zero con")
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
.execute()
.unwrap();
// Here search results are degraded because `con` is in the prefix cache but it is too
// long to be stored in the prefix proximity databases, and we don't want to iterate over
// all of its word derivations
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]");
let SearchResult { matching_words: _, candidates: _, documents_ids } = index
.search(&rtxn)
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
.query("zero conf")
.execute()
.unwrap();
// Here search results are degraded as well, but we can still correctly rank
// documents that contain `conf` exactly, and not as a prefix.
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 2, 3]");
let SearchResult { matching_words: _, candidates: _, documents_ids } = index
.search(&rtxn)
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
.query("zero config")
.execute()
.unwrap();
// `config` is not a common prefix, so the normal methods are used
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 4, 5]");
}
}

View File

@ -1,493 +0,0 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::mem::take;
use log::debug;
use roaring::RoaringBitmap;
use super::{
query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters,
CriterionResult,
};
use crate::search::criteria::{resolve_phrase, InitialCandidates};
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
use crate::search::{word_derivations, WordDerivationsCache};
use crate::Result;
/// Maximum number of typos for a word of any length.
const MAX_TYPOS_PER_WORD: u8 = 2;
pub struct Typo<'t> {
ctx: &'t dyn Context<'t>,
/// (max_typos, query_tree, candidates)
state: Option<(u8, Operation, Candidates)>,
typos: u8,
initial_candidates: Option<InitialCandidates>,
parent: Box<dyn Criterion + 't>,
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
}
impl<'t> Typo<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
Typo {
ctx,
state: None,
typos: 0,
initial_candidates: None,
parent,
candidates_cache: HashMap::new(),
}
}
}
impl<'t> Criterion for Typo<'t> {
#[logging_timer::time("Typo::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
use Candidates::{Allowed, Forbidden};
// remove excluded candidates when next is called, instead of doing it in the loop.
match self.state.as_mut() {
Some((_, _, Allowed(candidates))) => *candidates -= params.excluded_candidates,
Some((_, _, Forbidden(candidates))) => *candidates |= params.excluded_candidates,
None => (),
}
loop {
debug!(
"Typo at iteration {} (max typos {:?}) ({:?})",
self.typos,
self.state.as_ref().map(|(mt, _, _)| mt),
self.state.as_ref().map(|(_, _, cd)| cd),
);
match self.state.as_mut() {
Some((max_typos, _, _)) if self.typos > *max_typos => {
self.state = None; // reset state
}
Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => {
self.state = None; // reset state
}
Some((_, query_tree, candidates_authorization)) => {
let fst = self.ctx.words_fst();
let new_query_tree = match self.typos {
typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree(
fst,
query_tree.clone(),
self.typos,
params.wdcache,
)?,
MAX_TYPOS_PER_WORD => {
// When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible,
// so we keep the altered query tree
*query_tree = alterate_query_tree(
fst,
query_tree.clone(),
self.typos,
params.wdcache,
)?;
// we compute the allowed candidates
let query_tree_allowed_candidates =
resolve_query_tree(self.ctx, query_tree, params.wdcache)?;
// we assign the allowed candidates to the candidates authorization.
*candidates_authorization = match take(candidates_authorization) {
Allowed(allowed_candidates) => {
Allowed(query_tree_allowed_candidates & allowed_candidates)
}
Forbidden(forbidden_candidates) => {
Allowed(query_tree_allowed_candidates - forbidden_candidates)
}
};
query_tree.clone()
}
_otherwise => query_tree.clone(),
};
let mut candidates = resolve_candidates(
self.ctx,
&new_query_tree,
self.typos,
&mut self.candidates_cache,
params.wdcache,
)?;
match candidates_authorization {
Allowed(allowed_candidates) => {
candidates &= &*allowed_candidates;
*allowed_candidates -= &candidates;
}
Forbidden(forbidden_candidates) => {
candidates -= &*forbidden_candidates;
*forbidden_candidates |= &candidates;
}
}
let initial_candidates = match self.initial_candidates.as_mut() {
Some(initial_candidates) => initial_candidates.take(),
None => InitialCandidates::Estimated(candidates.clone()),
};
self.typos += 1;
return Ok(Some(CriterionResult {
query_tree: Some(new_query_tree),
candidates: Some(candidates),
filtered_candidates: None,
initial_candidates: Some(initial_candidates),
}));
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates,
initial_candidates,
}) => {
self.initial_candidates =
match (self.initial_candidates.take(), initial_candidates) {
(Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic),
(self_ic, parent_ic) => self_ic.or(parent_ic),
};
let candidates = match candidates.or(filtered_candidates) {
Some(candidates) => {
Candidates::Allowed(candidates - params.excluded_candidates)
}
None => Candidates::Forbidden(params.excluded_candidates.clone()),
};
let maximum_typos = maximum_typo(&query_tree) as u8;
self.state = Some((maximum_typos, query_tree, candidates));
self.typos = 0;
}
Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
initial_candidates,
}) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
initial_candidates,
}));
}
None => return Ok(None),
},
}
}
}
}
/// Modify the query tree by replacing every tolerant query by an Or operation
/// containing all of the corresponding exact words in the words FST. Each tolerant
/// query will only be replaced by exact queries with up to `number_typos` maximum typos.
fn alterate_query_tree(
words_fst: &fst::Set<Cow<[u8]>>,
mut query_tree: Operation,
number_typos: u8,
wdcache: &mut WordDerivationsCache,
) -> Result<Operation> {
fn recurse(
words_fst: &fst::Set<Cow<[u8]>>,
operation: &mut Operation,
number_typos: u8,
wdcache: &mut WordDerivationsCache,
) -> Result<()> {
use Operation::{And, Or, Phrase};
match operation {
And(ops) | Or(_, ops) => {
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
}
// Because Phrases don't allow typos, no alteration can be done.
Phrase(_words) => Ok(()),
Operation::Query(q) => {
if let QueryKind::Tolerant { typo, word } = &q.kind {
// if no typo is allowed we don't call the word_derivations function,
// and directly create an Exact query
if number_typos == 0 {
*operation = Operation::Query(Query {
prefix: q.prefix,
kind: QueryKind::Exact { original_typo: 0, word: word.clone() },
});
} else {
let typo = *typo.min(&number_typos);
let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?;
let queries = words
.iter()
.map(|(word, typo)| {
Operation::Query(Query {
prefix: false,
kind: QueryKind::Exact {
original_typo: *typo,
word: word.to_string(),
},
})
})
.collect();
*operation = Operation::or(false, queries);
}
}
Ok(())
}
}
}
recurse(words_fst, &mut query_tree, number_typos, wdcache)?;
Ok(query_tree)
}
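// Hedged sketch (added, not part of the original file): the effect of
// `alterate_query_tree` on a single tolerant leaf, reusing the `TestContext`
// from the tests at the bottom of this file (the import path is assumed).
#[cfg(test)]
#[test]
fn alterate_single_tolerant_leaf_sketch() {
    use super::test::TestContext;
    let context = TestContext::default();
    let query = Operation::Query(Query {
        prefix: false,
        kind: QueryKind::tolerant(1, "world".to_string()),
    });
    let mut wdcache = WordDerivationsCache::new();
    let altered =
        alterate_query_tree(context.words_fst(), query, 1, &mut wdcache).unwrap();
    // Expected shape, matching the snapshot tests below: an Or over the exact
    // words within one typo of "world", e.g. Or([Exact("world"), Exact("word")]).
    assert!(matches!(altered, Operation::Or(..)));
}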
fn resolve_candidates(
ctx: &dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
fn resolve_operation(
ctx: &dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
use Operation::{And, Or, Phrase, Query};
match query_tree {
And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache),
Phrase(words) => resolve_phrase(ctx, words),
Or(_, ops) => {
let mut candidates = RoaringBitmap::new();
for op in ops {
let docids = resolve_operation(ctx, op, number_typos, cache, wdcache)?;
candidates |= docids;
}
Ok(candidates)
}
Query(q) => {
if q.kind.typo() == number_typos {
Ok(query_docids(ctx, q, wdcache)?)
} else {
Ok(RoaringBitmap::new())
}
}
}
}
fn mdfs(
ctx: &dyn Context,
branches: &[Operation],
mana: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
match branches.split_first() {
Some((head, [])) => {
let cache_key = (head.clone(), mana);
if let Some(candidates) = cache.get(&cache_key) {
Ok(candidates.clone())
} else {
let candidates = resolve_operation(ctx, head, mana, cache, wdcache)?;
cache.insert(cache_key, candidates.clone());
Ok(candidates)
}
}
Some((head, tail)) => {
let mut candidates = RoaringBitmap::new();
for m in 0..=mana {
let mut head_candidates = {
let cache_key = (head.clone(), m);
if let Some(candidates) = cache.get(&cache_key) {
candidates.clone()
} else {
let candidates = resolve_operation(ctx, head, m, cache, wdcache)?;
cache.insert(cache_key, candidates.clone());
candidates
}
};
if !head_candidates.is_empty() {
let tail_candidates = mdfs(ctx, tail, mana - m, cache, wdcache)?;
head_candidates &= tail_candidates;
candidates |= head_candidates;
}
}
Ok(candidates)
}
None => Ok(RoaringBitmap::new()),
}
}
resolve_operation(ctx, query_tree, number_typos, cache, wdcache)
}
#[cfg(test)]
mod test {
use super::super::initial::Initial;
use super::super::test::TestContext;
use super::*;
use crate::search::NoopDistinct;
fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String {
let mut result = String::new();
while let Some(criterion) = criteria.next(&mut parameters).unwrap() {
result.push_str(&format!("{criterion:?}\n\n"));
}
result
}
#[test]
fn initial_placeholder_no_facets() {
let context = TestContext::default();
let query_tree = None;
let facet_candidates = None;
let criterion_parameters = CriterionParameters {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent =
Initial::<NoopDistinct>::new(&context, query_tree, facet_candidates, false, None);
let criteria = Typo::new(&context, Box::new(parent));
let result = display_criteria(criteria, criterion_parameters);
insta::assert_snapshot!(result, @r###"
CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, initial_candidates: None }
"###);
}
#[test]
fn initial_query_tree_no_facets() {
let context = TestContext::default();
let query_tree = Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
);
let facet_candidates = None;
let criterion_parameters = CriterionParameters {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent =
Initial::<NoopDistinct>::new(&context, Some(query_tree), facet_candidates, false, None);
let criteria = Typo::new(&context, Box::new(parent));
let result = display_criteria(criteria, criterion_parameters);
insta::assert_snapshot!(result, @r###"
CriterionResult { query_tree: Some(OR
AND
Exact { word: "split" }
Exact { word: "this" }
Exact { word: "world" }
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) }
CriterionResult { query_tree: Some(OR
AND
Exact { word: "split" }
Exact { word: "this" }
OR
Exact { word: "word" }
Exact { word: "world" }
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) }
"###);
}
#[test]
fn initial_placeholder_with_facets() {
let context = TestContext::default();
let query_tree = None;
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
let criterion_parameters = CriterionParameters {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent =
Initial::<NoopDistinct>::new(&context, query_tree, Some(facet_candidates), false, None);
let criteria = Typo::new(&context, Box::new(parent));
let result = display_criteria(criteria, criterion_parameters);
insta::assert_snapshot!(result, @r###"
CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), initial_candidates: None }
"###);
}
#[test]
fn initial_query_tree_with_facets() {
let context = TestContext::default();
let query_tree = Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
);
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
let criterion_parameters = CriterionParameters {
wdcache: &mut WordDerivationsCache::new(),
excluded_candidates: &RoaringBitmap::new(),
};
let parent = Initial::<NoopDistinct>::new(
&context,
Some(query_tree),
Some(facet_candidates),
false,
None,
);
let criteria = Typo::new(&context, Box::new(parent));
let result = display_criteria(criteria, criterion_parameters);
insta::assert_snapshot!(result, @r###"
CriterionResult { query_tree: Some(OR
AND
Exact { word: "split" }
Exact { word: "this" }
Exact { word: "world" }
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) }
CriterionResult { query_tree: Some(OR
AND
Exact { word: "split" }
Exact { word: "this" }
OR
Exact { word: "word" }
Exact { word: "world" }
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) }
"###);
}
}

View File

@ -1,106 +0,0 @@
use log::debug;
use roaring::RoaringBitmap;
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
use crate::search::criteria::InitialCandidates;
use crate::search::query_tree::Operation;
use crate::Result;
pub struct Words<'t> {
ctx: &'t dyn Context<'t>,
query_trees: Vec<Operation>,
candidates: Option<RoaringBitmap>,
initial_candidates: Option<InitialCandidates>,
filtered_candidates: Option<RoaringBitmap>,
parent: Box<dyn Criterion + 't>,
}
impl<'t> Words<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
Words {
ctx,
query_trees: Vec::default(),
candidates: None,
initial_candidates: None,
parent,
filtered_candidates: None,
}
}
}
impl<'t> Criterion for Words<'t> {
#[logging_timer::time("Words::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
// remove excluded candidates when next is called, instead of doing it in the loop.
if let Some(candidates) = self.candidates.as_mut() {
*candidates -= params.excluded_candidates;
}
loop {
debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates);
match self.query_trees.pop() {
Some(query_tree) => {
let candidates = match self.candidates.as_mut() {
Some(allowed_candidates) => {
let mut candidates =
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
candidates &= &*allowed_candidates;
*allowed_candidates -= &candidates;
Some(candidates)
}
None => None,
};
let initial_candidates = self.initial_candidates.clone();
return Ok(Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates: self.filtered_candidates.clone(),
initial_candidates,
}));
}
None => match self.parent.next(params)? {
Some(CriterionResult {
query_tree: Some(query_tree),
candidates,
filtered_candidates,
initial_candidates,
}) => {
self.query_trees = explode_query_tree(query_tree);
self.candidates = candidates;
self.filtered_candidates = filtered_candidates;
self.initial_candidates =
match (self.initial_candidates.take(), initial_candidates) {
(Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic),
(self_ic, parent_ic) => self_ic.or(parent_ic),
};
}
Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
initial_candidates,
}) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
filtered_candidates,
initial_candidates,
}));
}
None => return Ok(None),
},
}
}
}
}
fn explode_query_tree(query_tree: Operation) -> Vec<Operation> {
match query_tree {
Operation::Or(true, ops) => ops,
otherwise => vec![otherwise],
}
}
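// Hedged sketch (added, not part of the original file): `explode_query_tree`
// only splits a root union whose `true` flag marks word-count alternatives;
// anything else is returned as a single bucket.
#[cfg(test)]
#[test]
fn explode_query_tree_sketch() {
    use crate::search::query_tree::{Query, QueryKind};
    let leaf = |word: &str| {
        Operation::Query(Query { prefix: false, kind: QueryKind::exact(word.to_string()) })
    };
    let tree = Operation::Or(
        true,
        vec![
            Operation::And(vec![leaf("the"), leaf("quick"), leaf("fox")]),
            Operation::And(vec![leaf("the"), leaf("quick")]),
        ],
    );
    // One bucket per word-count alternative; `Words::next` pops them from the
    // back of this vector.
    assert_eq!(explode_query_tree(tree).len(), 2);
}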

View File

@ -1,218 +0,0 @@
use std::mem::size_of;
use concat_arrays::concat_arrays;
use heed::types::{ByteSlice, Str, Unit};
use roaring::RoaringBitmap;
use super::{Distinct, DocIter};
use crate::error::InternalError;
use crate::heed_codec::facet::{FacetGroupKey, *};
use crate::index::db_name;
use crate::{DocumentId, FieldId, Index, Result};
const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>();
/// A distinct implementer that is backed by facets.
///
/// On each iteration, the facet values for the
/// distinct attribute of the first document are retrieved. The document ids for these facet values
/// are then retrieved and taken out of the candidates and added to the excluded set. We take
/// care to keep the document we are currently on, and remove it from the excluded list. The next
/// iterations will never contain any occurrence of a document with the same distinct value as a
/// document from previous iterations.
#[derive(Clone)]
pub struct FacetDistinct<'a> {
distinct: FieldId,
index: &'a Index,
txn: &'a heed::RoTxn<'a>,
}
impl<'a> FacetDistinct<'a> {
pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
Self { distinct, index, txn }
}
}
pub struct FacetDistinctIter<'a> {
candidates: RoaringBitmap,
distinct: FieldId,
excluded: RoaringBitmap,
index: &'a Index,
iter_offset: usize,
txn: &'a heed::RoTxn<'a>,
}
impl<'a> FacetDistinctIter<'a> {
fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
self.index
.facet_id_string_docids
.get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key })
.map(|opt| opt.map(|v| v.bitmap))
}
fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
// get facet docids on level 0
self.index
.facet_id_f64_docids
.get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key })
.map(|opt| opt.map(|v| v.bitmap))
}
fn distinct_string(&mut self, id: DocumentId) -> Result<()> {
let iter = facet_string_values(id, self.distinct, self.index, self.txn)?;
for item in iter {
let ((_, _, value), _) = item?;
let facet_docids =
self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::FACET_ID_STRING_DOCIDS,
key: None,
})?;
self.excluded |= facet_docids;
}
self.excluded.remove(id);
Ok(())
}
fn distinct_number(&mut self, id: DocumentId) -> Result<()> {
let iter = facet_number_values(id, self.distinct, self.index, self.txn)?;
for item in iter {
let ((_, _, value), _) = item?;
let facet_docids =
self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::FACET_ID_F64_DOCIDS,
key: None,
})?;
self.excluded |= facet_docids;
}
self.excluded.remove(id);
Ok(())
}
/// Performs the next iteration of the facet distinct. This is a convenience method that is
/// called by the Iterator::next implementation that transposes the result. It makes error
/// handling easier.
fn next_inner(&mut self) -> Result<Option<DocumentId>> {
// The first step is to remove all the excluded documents from our candidates
self.candidates -= &self.excluded;
let mut candidates_iter = self.candidates.iter().skip(self.iter_offset);
match candidates_iter.next() {
Some(id) => {
// We distinct the document id on its facet strings and facet numbers.
self.distinct_string(id)?;
self.distinct_number(id)?;
// The first document of each iteration is kept, since the next call to
// `difference_with` will filter out all the documents for that facet value. By
// increasing the offset we make sure to get the first valid value for the next
// distinct document to keep.
self.iter_offset += 1;
Ok(Some(id))
}
// no more candidates at this offset, return.
None => Ok(None),
}
}
}
#[allow(clippy::drop_non_drop)]
fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] {
concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
}
fn facet_number_values<'a>(
id: DocumentId,
distinct: FieldId,
index: &Index,
txn: &'a heed::RoTxn,
) -> Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, Unit>> {
let key = facet_values_prefix_key(distinct, id);
let iter = index
.field_id_docid_facet_f64s
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_key_type::<FieldDocIdFacetF64Codec>();
Ok(iter)
}
fn facet_string_values<'a>(
id: DocumentId,
distinct: FieldId,
index: &Index,
txn: &'a heed::RoTxn,
) -> Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, Str>> {
let key = facet_values_prefix_key(distinct, id);
let iter = index
.field_id_docid_facet_strings
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_types::<FieldDocIdFacetStringCodec, Str>();
Ok(iter)
}
impl Iterator for FacetDistinctIter<'_> {
type Item = Result<DocumentId>;
fn next(&mut self) -> Option<Self::Item> {
self.next_inner().transpose()
}
}
impl DocIter for FacetDistinctIter<'_> {
fn into_excluded(self) -> RoaringBitmap {
self.excluded
}
}
impl<'a> Distinct for FacetDistinct<'a> {
type Iter = FacetDistinctIter<'a>;
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
FacetDistinctIter {
candidates,
distinct: self.distinct,
excluded,
index: self.index,
iter_offset: 0,
txn: self.txn,
}
}
}
#[cfg(test)]
mod test {
use super::super::test::{generate_index, validate_distinct_candidates};
use super::*;
macro_rules! test_facet_distinct {
($name:ident, $distinct:literal) => {
#[test]
fn $name() {
let (index, fid, candidates) = generate_index($distinct);
let txn = index.read_txn().unwrap();
let mut map_distinct = FacetDistinct::new(fid, &index, &txn);
let excluded = RoaringBitmap::new();
let mut iter = map_distinct.distinct(candidates.clone(), excluded);
let count = validate_distinct_candidates(iter.by_ref(), fid, &index);
let excluded = iter.into_excluded();
assert_eq!(count as u64 + excluded.len(), candidates.len());
}
};
}
test_facet_distinct!(test_string, "txt");
test_facet_distinct!(test_strings, "txts");
test_facet_distinct!(test_number, "cat-int");
}

View File

@ -1,155 +0,0 @@
mod facet_distinct;
mod noop_distinct;
pub use facet_distinct::FacetDistinct;
pub use noop_distinct::NoopDistinct;
use roaring::RoaringBitmap;
use crate::{DocumentId, Result};
/// A trait implemented by the document iterators that are returned by calls to
/// `Distinct::distinct`. It provides a way to get back ownership of the excluded set.
pub trait DocIter: Iterator<Item = Result<DocumentId>> {
/// Returns ownership of the internal excluded set.
fn into_excluded(self) -> RoaringBitmap;
}
/// A trait implemented by structs that perform a distinct on `candidates`. Calling `distinct`
/// must return an iterator containing only distinct documents, and add the discarded documents
/// to the excluded set. The excluded set can later be retrieved by calling
/// `DocIter::into_excluded` on the returned iterator. A usage sketch follows the trait
/// definition below.
pub trait Distinct {
type Iter: DocIter;
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter;
}
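// A minimal usage sketch (assumed, not part of the original file) of the
// contract above: drain the iterator to get the distinct documents, then
// consume it to recover the excluded set.
fn drain_distinct<D: Distinct>(
distinct: &mut D,
candidates: RoaringBitmap,
) -> Result<(Vec<DocumentId>, RoaringBitmap)> {
let mut iter = distinct.distinct(candidates, RoaringBitmap::new());
// collecting through `by_ref` keeps `iter` usable afterwards
let ids = iter.by_ref().collect::<Result<Vec<_>>>()?;
Ok((ids, iter.into_excluded()))
}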
#[cfg(test)]
mod test {
use std::collections::HashSet;
use std::io::Cursor;
use once_cell::sync::Lazy;
use rand::seq::SliceRandom;
use rand::Rng;
use roaring::RoaringBitmap;
use serde_json::{json, Value};
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use crate::index::tests::TempIndex;
use crate::index::Index;
use crate::update::{
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
};
use crate::{DocumentId, FieldId, BEU32};
static JSON: Lazy<Vec<u8>> = Lazy::new(|| {
let mut rng = rand::thread_rng();
let num_docs = rng.gen_range(10..30);
let mut builder = DocumentsBatchBuilder::new(Vec::new());
let txts = ["Toto", "Titi", "Tata"];
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
let cat_ints = (1..10).collect::<Vec<_>>();
for i in 0..num_docs {
let txt = txts.choose(&mut rng).unwrap();
let mut sample_txts = cats.clone();
sample_txts.shuffle(&mut rng);
let mut sample_ints = cat_ints.clone();
sample_ints.shuffle(&mut rng);
let json = json!({
"id": i,
"txt": txt,
"cat-int": rng.gen_range(0..3),
"txts": sample_txts[..(rng.gen_range(0..3))],
"cat-ints": sample_ints[..(rng.gen_range(0..3))],
});
let object = match json {
Value::Object(object) => object,
_ => panic!(),
};
builder.append_json_object(&object).unwrap();
}
builder.into_inner().unwrap()
});
/// Returns a temporary index populated with random test documents, the FieldId for the
/// distinct attribute, and the RoaringBitmap with the document ids.
pub(crate) fn generate_index(distinct: &str) -> (TempIndex, FieldId, RoaringBitmap) {
let index = TempIndex::new();
let mut txn = index.write_txn().unwrap();
// set distinct and faceted attributes for the index.
let config = IndexerConfig::default();
let mut update = Settings::new(&mut txn, &index, &config);
update.set_distinct_field(distinct.to_string());
update.execute(|_| (), || false).unwrap();
// add documents to the index
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig {
update_method: IndexDocumentsMethod::ReplaceDocuments,
..Default::default()
};
let addition =
IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| (), || false)
.unwrap();
let reader =
crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice()))
.unwrap();
let (addition, user_error) = addition.add_documents(reader).unwrap();
user_error.unwrap();
addition.execute().unwrap();
let fields_map = index.fields_ids_map(&txn).unwrap();
let fid = fields_map.id(distinct).unwrap();
let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap();
let map = (0..documents.documents_count()).collect();
txn.commit().unwrap();
(index, fid, map)
}
/// Checks that all the candidates are distinct, and returns the number of candidates.
pub(crate) fn validate_distinct_candidates(
candidates: impl Iterator<Item = crate::Result<DocumentId>>,
distinct: FieldId,
index: &Index,
) -> usize {
fn test(seen: &mut HashSet<String>, value: &Value) {
match value {
Value::Null | Value::Object(_) | Value::Bool(_) => (),
Value::Number(_) | Value::String(_) => {
let s = value.to_string();
assert!(seen.insert(s));
}
Value::Array(values) => values.iter().for_each(|value| test(seen, value)),
}
}
let mut seen = HashSet::<String>::new();
let txn = index.read_txn().unwrap();
let mut count = 0;
for candidate in candidates {
count += 1;
let candidate = candidate.unwrap();
let id = BEU32::new(candidate);
let document = index.documents.get(&txn, &id).unwrap().unwrap();
let value = document.get(distinct).unwrap();
let value = serde_json::from_slice(value).unwrap();
test(&mut seen, &value);
}
count
}
}

View File

@ -1,55 +0,0 @@
use roaring::bitmap::IntoIter;
use roaring::RoaringBitmap;
use super::{Distinct, DocIter};
use crate::{DocumentId, Result};
/// A distinct implementer that does not perform any deduplication
/// and simply returns an iterator over the candidates.
pub struct NoopDistinct;
pub struct NoopDistinctIter {
candidates: IntoIter,
excluded: RoaringBitmap,
}
impl Iterator for NoopDistinctIter {
type Item = Result<DocumentId>;
fn next(&mut self) -> Option<Self::Item> {
self.candidates.next().map(Ok)
}
}
impl DocIter for NoopDistinctIter {
fn into_excluded(self) -> RoaringBitmap {
self.excluded
}
}
impl Distinct for NoopDistinct {
type Iter = NoopDistinctIter;
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
NoopDistinctIter { candidates: candidates.into_iter(), excluded }
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_noop() {
let candidates = (1..10).collect();
let excluded = RoaringBitmap::new();
let mut iter = NoopDistinct.distinct(candidates, excluded);
assert_eq!(
iter.by_ref().map(Result::unwrap).collect::<Vec<_>>(),
(1..10).collect::<Vec<_>>()
);
let excluded = iter.into_excluded();
assert!(excluded.is_empty());
}
}

View File

@ -309,7 +309,7 @@ impl<'a> FacetDistribution<'a> {
let mut distribution = BTreeMap::new();
for (fid, name) in fields_ids_map.iter() {
if crate::is_faceted(name, &fields) {
let min_value = if let Some(min_value) = crate::search::criteria::facet_min_value(
let min_value = if let Some(min_value) = crate::search::facet::facet_min_value(
self.index,
self.rtxn,
fid,
@ -319,7 +319,7 @@ impl<'a> FacetDistribution<'a> {
} else {
continue;
};
let max_value = if let Some(max_value) = crate::search::criteria::facet_max_value(
let max_value = if let Some(max_value) = crate::search::facet::facet_max_value(
self.index,
self.rtxn,
fid,

View File

@ -2,11 +2,13 @@ pub use facet_sort_ascending::ascending_facet_sort;
pub use facet_sort_descending::descending_facet_sort;
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesDecode, RoTxn};
use roaring::RoaringBitmap;
pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
pub use self::filter::{BadGeoError, Filter};
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec};
use crate::heed_codec::ByteSliceRefCodec;
use crate::{Index, Result};
mod facet_distribution;
mod facet_distribution_iter;
mod facet_range_search;
@ -14,6 +16,38 @@ mod facet_sort_ascending;
mod facet_sort_descending;
mod filter;
fn facet_extreme_value<'t>(
mut extreme_it: impl Iterator<Item = heed::Result<(RoaringBitmap, &'t [u8])>> + 't,
) -> Result<Option<f64>> {
let extreme_value =
if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) };
let (_, extreme_value) = extreme_value?;
Ok(OrderedF64Codec::bytes_decode(extreme_value))
}
pub fn facet_min_value<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: u16,
candidates: RoaringBitmap,
) -> Result<Option<f64>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let it = ascending_facet_sort(rtxn, db, field_id, candidates)?;
facet_extreme_value(it)
}
pub fn facet_max_value<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: u16,
candidates: RoaringBitmap,
) -> Result<Option<f64>> {
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let it = descending_facet_sort(rtxn, db, field_id, candidates)?;
facet_extreme_value(it)
}
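// A minimal usage sketch (hypothetical, not part of the original change)
// combining the two helpers above to get the numeric range of a faceted
// field for a set of candidate documents.
pub fn facet_number_range(
index: &Index,
rtxn: &RoTxn,
field_id: u16,
candidates: RoaringBitmap,
) -> Result<Option<(f64, f64)>> {
let min = facet_min_value(index, rtxn, field_id, candidates.clone())?;
let max = facet_max_value(index, rtxn, field_id, candidates)?;
Ok(min.zip(max))
}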
/// Get the first facet value in the facet database
pub(crate) fn get_first_facet_value<'t, BoundCodec>(
txn: &'t RoTxn,

View File

@ -1,38 +1,27 @@
use std::borrow::Cow;
use std::collections::hash_map::{Entry, HashMap};
use std::fmt;
use std::mem::take;
use std::result::Result as StdResult;
use std::str::Utf8Error;
use std::time::Instant;
use charabia::TokenizerBuilder;
use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use log::debug;
use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
use self::fst_utils::{Complement, Intersection, StartsWith, Union};
pub use self::matches::{
FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
};
use self::query_tree::QueryTreeBuilder;
use crate::error::UserError;
use crate::search::criteria::r#final::{Final, FinalResult};
use crate::search::criteria::InitialCandidates;
use crate::{AscDesc, Criterion, DocumentId, Index, Member, Result};
use crate::{
execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
};
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
use std::borrow::Cow;
use std::collections::hash_map::{Entry, HashMap};
use std::fmt;
use std::result::Result as StdResult;
use std::str::Utf8Error;
// Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
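// A hypothetical usage sketch (assuming the `levenshtein_automata` API): a
// builder is reused to cheaply derive one DFA per query word, e.g.
// `LEVDIST1.build_dfa("hello")` accepts every word within one typo of "hello".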
mod criteria;
mod distinct;
pub mod facet;
mod fst_utils;
mod matches;
@ -135,162 +124,18 @@ impl<'a> Search<'a> {
}
pub fn execute(&self) -> Result<SearchResult> {
// We create the query tree by splitting the query into tokens.
let before = Instant::now();
let (query_tree, primitive_query, matching_words) = match self.query.as_ref() {
Some(query) => {
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index)?;
builder.terms_matching_strategy(self.terms_matching_strategy);
builder.authorize_typos(self.is_typo_authorized()?);
builder.words_limit(self.words_limit);
// We make sure that the analyzer is aware of the stop words
// this ensures that the query builder is able to properly remove them.
let mut tokbuilder = TokenizerBuilder::new();
let stop_words = self.index.stop_words(self.rtxn)?;
if let Some(ref stop_words) = stop_words {
tokbuilder.stop_words(stop_words);
}
let script_lang_map = self.index.script_language(self.rtxn)?;
if !script_lang_map.is_empty() {
tokbuilder.allow_list(&script_lang_map);
}
let tokenizer = tokbuilder.build();
let tokens = tokenizer.tokenize(query);
builder
.build(tokens)?
.map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw)))
}
None => (None, None, None),
};
debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed());
// We create the original candidates from the results of the facet conditions.
let before = Instant::now();
let filtered_candidates = match &self.filter {
Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?),
None => None,
};
debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed());
// We check that we are allowed to use the sort criteria, i.e. that
// they are declared in the sortable fields.
if let Some(sort_criteria) = &self.sort_criteria {
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
for asc_desc in sort_criteria {
match asc_desc.member() {
Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => {
return Err(UserError::InvalidSortableAttribute {
field: field.to_string(),
valid_fields: sortable_fields.into_iter().collect(),
})?
}
Member::Geo(_) if !sortable_fields.contains("_geo") => {
return Err(UserError::InvalidSortableAttribute {
field: "_geo".to_string(),
valid_fields: sortable_fields.into_iter().collect(),
})?
}
_ => (),
}
}
}
// We check that the sort ranking rule exists in the criteria list and
// return an error if sort criteria are used while the rule is missing.
let sort_ranking_rule_missing = !self.index.criteria(self.rtxn)?.contains(&Criterion::Sort);
let empty_sort_criteria = self.sort_criteria.as_ref().map_or(true, |s| s.is_empty());
if sort_ranking_rule_missing && !empty_sort_criteria {
return Err(UserError::SortRankingRuleMissing.into());
}
let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
match self.index.distinct_field(self.rtxn)? {
None => {
let criteria = criteria_builder.build::<NoopDistinct>(
query_tree,
primitive_query,
filtered_candidates,
self.sort_criteria.clone(),
self.exhaustive_number_hits,
None,
self.criterion_implementation_strategy,
)?;
self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria)
}
Some(name) => {
let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
match field_ids_map.id(name) {
Some(fid) => {
let distinct = FacetDistinct::new(fid, self.index, self.rtxn);
let criteria = criteria_builder.build(
query_tree,
primitive_query,
filtered_candidates,
self.sort_criteria.clone(),
self.exhaustive_number_hits,
Some(distinct.clone()),
self.criterion_implementation_strategy,
)?;
self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria)
}
None => Ok(SearchResult::default()),
}
}
}
}
fn perform_sort<D: Distinct>(
&self,
mut distinct: D,
matching_words: MatchingWords,
mut criteria: Final,
) -> Result<SearchResult> {
let mut offset = self.offset;
let mut initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new());
let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?;
let mut documents_ids = Vec::new();
while let Some(FinalResult { candidates, initial_candidates: ic, .. }) =
criteria.next(&excluded_candidates)?
{
debug!("Number of candidates found {}", candidates.len());
let excluded = take(&mut excluded_candidates);
let mut candidates = distinct.distinct(candidates, excluded);
initial_candidates |= ic;
if offset != 0 {
let discarded = candidates.by_ref().take(offset).count();
offset = offset.saturating_sub(discarded);
}
for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) {
documents_ids.push(candidate?);
}
excluded_candidates |= candidates.into_excluded();
if documents_ids.len() == self.limit {
break;
}
}
initial_candidates.map_inplace(|c| c - excluded_candidates);
Ok(SearchResult {
matching_words,
candidates: initial_candidates.into_inner(),
documents_ids,
})
let mut ctx = SearchContext::new(self.index, self.rtxn);
execute_search(
&mut ctx,
&self.query,
self.terms_matching_strategy,
&self.filter,
self.offset,
self.limit,
Some(self.words_limit),
&mut DefaultSearchLogger,
&mut DefaultSearchLogger,
)
}
}

View File

@ -18,7 +18,7 @@ mod words;
// #[cfg(test)]
use std::collections::{BTreeSet, HashSet};
use charabia::Tokenize;
use charabia::{Tokenize, TokenizerBuilder};
use db_cache::DatabaseCache;
use graph_based_ranking_rule::{Proximity, Typo};
use heed::RoTxn;
@ -224,32 +224,41 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
#[allow(clippy::too_many_arguments)]
pub fn execute_search(
ctx: &mut SearchContext,
query: &str,
query: &Option<String>,
terms_matching_strategy: TermsMatchingStrategy,
filters: Option<Filter>,
filters: &Option<Filter>,
from: usize,
length: usize,
words_limit: Option<usize>,
placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<SearchResult> {
assert!(!query.is_empty());
let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None)?;
let graph = QueryGraph::from_query(ctx, query_terms)?;
let mut universe = if let Some(filters) = filters {
filters.evaluate(ctx.txn, ctx.index)?
} else {
ctx.index.documents_ids(ctx.txn)?
};
// TODO: find another way to tell whether this is a placeholder search.
// The current approach is not correct: if someone searches for a word that
// does not appear in any document, the word is removed from the graph and
// the number of nodes becomes == 2, but in that case we should return no
// results rather than fall back to a placeholder search.
//
// Should the search be a placeholder search only if there are no tokens?
let documents_ids = if graph.nodes.len() > 2 {
let documents_ids = if let Some(query) = query {
// We make sure that the analyzer is aware of the stop words
// this ensures that the query builder is able to properly remove them.
let mut tokbuilder = TokenizerBuilder::new();
let stop_words = ctx.index.stop_words(ctx.txn)?;
if let Some(ref stop_words) = stop_words {
tokbuilder.stop_words(stop_words);
}
let script_lang_map = ctx.index.script_language(ctx.txn)?;
if !script_lang_map.is_empty() {
tokbuilder.allow_list(&script_lang_map);
}
let tokenizer = tokbuilder.build();
let tokens = tokenizer.tokenize(&query);
let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
let graph = QueryGraph::from_query(ctx, query_terms)?;
universe = resolve_maximally_reduced_query_graph(
ctx,
&universe,
@ -259,6 +268,7 @@ pub fn execute_search(
)?;
let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, terms_matching_strategy)?;
bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
} else {
let ranking_rules = get_ranking_rules_for_placeholder_search(ctx)?;

View File

@ -427,7 +427,7 @@ impl LocatedQueryTerm {
/// Convert the tokenised search query into a list of located query terms.
pub fn located_query_terms_from_string(
ctx: &mut SearchContext,
query: NormalizedTokenIter<Vec<u8>>,
query: NormalizedTokenIter<&[u8]>,
words_limit: Option<usize>,
) -> Result<Vec<LocatedQueryTerm>> {
let nbr_typos = number_of_typos_allowed(ctx)?;