Revert "Sort at query time"

This commit is contained in:
Clémentine Urquizar 2021-08-20 18:09:17 +02:00 committed by GitHub
parent 41fc0dcb62
commit 922f9fd4d5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 148 additions and 701 deletions

View file

@ -7,7 +7,7 @@ use roaring::RoaringBitmap;
use super::{Criterion, CriterionParameters, CriterionResult};
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
use crate::search::facet::{FacetNumberIter, FacetStringIter};
use crate::search::facet::FacetNumberIter;
use crate::search::query_tree::Operation;
use crate::{FieldId, Index, Result};
@ -20,7 +20,7 @@ pub struct AscDesc<'t> {
rtxn: &'t heed::RoTxn<'t>,
field_name: String,
field_id: Option<FieldId>,
is_ascending: bool,
ascending: bool,
query_tree: Option<Operation>,
candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>,
allowed_candidates: RoaringBitmap,
@ -53,16 +53,12 @@ impl<'t> AscDesc<'t> {
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_name: String,
is_ascending: bool,
ascending: bool,
) -> Result<Self> {
let fields_ids_map = index.fields_ids_map(rtxn)?;
let field_id = fields_ids_map.id(&field_name);
let faceted_candidates = match field_id {
Some(field_id) => {
let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?;
let string_faceted = index.string_faceted_documents_ids(rtxn, field_id)?;
number_faceted | string_faceted
}
Some(field_id) => index.number_faceted_documents_ids(rtxn, field_id)?,
None => RoaringBitmap::default(),
};
@ -71,7 +67,7 @@ impl<'t> AscDesc<'t> {
rtxn,
field_name,
field_id,
is_ascending,
ascending,
query_tree: None,
candidates: Box::new(std::iter::empty()),
allowed_candidates: RoaringBitmap::new(),
@ -91,7 +87,7 @@ impl<'t> Criterion for AscDesc<'t> {
loop {
debug!(
"Facet {}({}) iteration",
if self.is_ascending { "Asc" } else { "Desc" },
if self.ascending { "Asc" } else { "Desc" },
self.field_name
);
@ -140,7 +136,7 @@ impl<'t> Criterion for AscDesc<'t> {
self.index,
self.rtxn,
field_id,
self.is_ascending,
self.ascending,
candidates & &self.faceted_candidates,
)?,
None => Box::new(std::iter::empty()),
@ -171,49 +167,31 @@ fn facet_ordered<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
is_ascending: bool,
ascending: bool,
candidates: RoaringBitmap,
) -> Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
if candidates.len() <= CANDIDATES_THRESHOLD {
let number_iter = iterative_facet_number_ordered_iter(
index,
rtxn,
field_id,
is_ascending,
candidates.clone(),
)?;
let string_iter =
iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?;
Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box<dyn Iterator<Item = _>>)
let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
} else {
let facet_number_fn = if is_ascending {
let facet_fn = if ascending {
FacetNumberIter::new_reducing
} else {
FacetNumberIter::new_reverse_reducing
};
let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())?
.map(|res| res.map(|(_, docids)| docids));
let facet_string_fn = if is_ascending {
FacetStringIter::new_reducing
} else {
FacetStringIter::new_reverse_reducing
};
let string_iter = facet_string_fn(rtxn, index, field_id, candidates)?
.map(|res| res.map(|(_, _, docids)| docids));
Ok(Box::new(number_iter.chain(string_iter)))
let iter = facet_fn(rtxn, index, field_id, candidates)?;
Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
}
}
/// Fetch the whole list of candidates facet number values one by one and order them by it.
/// Fetch the whole list of candidates facet values one by one and order them by it.
///
/// This function is fast when the amount of candidates to rank is small.
fn iterative_facet_number_ordered_iter<'t>(
fn iterative_facet_ordered_iter<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
is_ascending: bool,
ascending: bool,
candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = RoaringBitmap> + 't> {
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
@ -221,14 +199,14 @@ fn iterative_facet_number_ordered_iter<'t>(
let left = (field_id, docid, f64::MIN);
let right = (field_id, docid, f64::MAX);
let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?;
let entry = if is_ascending { iter.next() } else { iter.last() };
let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, OrderedFloat(value)));
}
}
docids_values.sort_unstable_by_key(|(_, v)| *v);
let iter = docids_values.into_iter();
let iter = if is_ascending {
let iter = if ascending {
Box::new(iter) as Box<dyn Iterator<Item = _>>
} else {
Box::new(iter.rev())
@ -238,49 +216,7 @@ fn iterative_facet_number_ordered_iter<'t>(
// required to collect the result into an owned collection (a Vec).
// https://github.com/rust-itertools/itertools/issues/499
let vec: Vec<_> = iter
.group_by(|(_, v)| *v)
.into_iter()
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
.collect();
Ok(vec.into_iter())
}
/// Fetch the whole list of candidates facet string values one by one and order them by it.
///
/// This function is fast when the amount of candidates to rank is small.
fn iterative_facet_string_ordered_iter<'t>(
index: &'t Index,
rtxn: &'t heed::RoTxn,
field_id: FieldId,
is_ascending: bool,
candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = RoaringBitmap> + 't> {
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
for docid in candidates.iter() {
let left = (field_id, docid, "");
let right = (field_id, docid.saturating_add(1), "");
// FIXME Doing this means that it will never be possible to retrieve
// the document with id 2^32 - 1 (u32::MAX), not sure this is a real problem.
let mut iter = index.field_id_docid_facet_strings.range(rtxn, &(left..right))?;
let entry = if is_ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), _)) = entry.transpose()? {
docids_values.push((docid, value));
}
}
docids_values.sort_unstable_by_key(|(_, v)| *v);
let iter = docids_values.into_iter();
let iter = if is_ascending {
Box::new(iter) as Box<dyn Iterator<Item = _>>
} else {
Box::new(iter.rev())
};
// The itertools GroupBy iterator doesn't provide an owned version, we are therefore
// required to collect the result into an owned collection (a Vec).
// https://github.com/rust-itertools/itertools/issues/499
let vec: Vec<_> = iter
.group_by(|(_, v)| *v)
.group_by(|(_, v)| v.clone())
.into_iter()
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
.collect();

View file

@ -12,7 +12,6 @@ use self::r#final::Final;
use self::typo::Typo;
use self::words::Words;
use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
use crate::criterion::AscDesc as AscDescName;
use crate::search::{word_derivations, WordDerivationsCache};
use crate::{DocumentId, FieldId, Index, Result, TreeLevel};
@ -274,7 +273,6 @@ impl<'t> CriteriaBuilder<'t> {
query_tree: Option<Operation>,
primitive_query: Option<Vec<PrimitiveQueryPart>>,
filtered_candidates: Option<RoaringBitmap>,
sort_criteria: Option<Vec<AscDescName>>,
) -> Result<Final<'t>> {
use crate::criterion::Criterion as Name;
@ -284,30 +282,8 @@ impl<'t> CriteriaBuilder<'t> {
Box::new(Initial::new(query_tree, filtered_candidates)) as Box<dyn Criterion>;
for name in self.index.criteria(&self.rtxn)? {
criterion = match name {
Name::Words => Box::new(Words::new(self, criterion)),
Name::Typo => Box::new(Typo::new(self, criterion)),
Name::Sort => match sort_criteria {
Some(ref sort_criteria) => {
for asc_desc in sort_criteria {
criterion = match asc_desc {
AscDescName::Asc(field) => Box::new(AscDesc::asc(
&self.index,
&self.rtxn,
criterion,
field.to_string(),
)?),
AscDescName::Desc(field) => Box::new(AscDesc::desc(
&self.index,
&self.rtxn,
criterion,
field.to_string(),
)?),
};
}
criterion
}
None => criterion,
},
Name::Words => Box::new(Words::new(self, criterion)),
Name::Proximity => Box::new(Proximity::new(self, criterion)),
Name::Attribute => Box::new(Attribute::new(self, criterion)),
Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?),

View file

@ -131,7 +131,7 @@ use std::ops::Bound::{Excluded, Included, Unbounded};
use either::{Either, Left, Right};
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{Database, LazyDecode, RoRange, RoRevRange};
use heed::{Database, LazyDecode, RoRange};
use roaring::RoaringBitmap;
use crate::heed_codec::facet::{
@ -206,65 +206,6 @@ impl<'t> Iterator for FacetStringGroupRange<'t> {
}
}
/// An iterator that explores one group level (a non-zero level) of the facet
/// strings database for a given field id, walking the keys in reverse order.
///
/// It yields the `(level, left, right)` triple of each group along with the
/// optional pair of strings and the roaring bitmap decoded from its value.
pub struct FacetStringGroupRevRange<'t> {
    iter: RoRevRange<
        't,
        FacetLevelValueU32Codec,
        LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>,
    >,
    /// Upper bound applied to the `right` component of every key in `next`.
    end: Bound<u32>,
}

impl<'t> FacetStringGroupRevRange<'t> {
    /// Opens a reverse range over the groups of `level` for `field_id`.
    ///
    /// `left` bounds the `left` component of the keys directly in the database
    /// range. `right` cannot be expressed as a key bound (it constrains the
    /// fourth component of the key), so it is stored and checked while iterating.
    pub fn new<X, Y>(
        rtxn: &'t heed::RoTxn,
        db: Database<X, Y>,
        field_id: FieldId,
        level: NonZeroU8,
        left: Bound<u32>,
        right: Bound<u32>,
    ) -> heed::Result<FacetStringGroupRevRange<'t>> {
        let db = db.remap_types::<
            FacetLevelValueU32Codec,
            FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>,
        >();
        let left_bound = match left {
            Included(left) => Included((field_id, level, left, u32::MIN)),
            Excluded(left) => Excluded((field_id, level, left, u32::MIN)),
            Unbounded => Included((field_id, level, u32::MIN, u32::MIN)),
        };
        // The range always extends up to the greatest possible key of this level.
        let right_bound = Included((field_id, level, u32::MAX, u32::MAX));
        let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?;
        Ok(FacetStringGroupRevRange { iter, end: right })
    }
}

impl<'t> Iterator for FacetStringGroupRevRange<'t> {
    type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>;

    /// Returns the next group, in reverse key order, whose `right` value
    /// satisfies the `end` bound.
    fn next(&mut self) -> Option<Self::Item> {
        // The underlying range starts at the greatest key of the level, so the
        // entries that exceed `end` are the FIRST ones encountered. They must be
        // skipped: stopping on them would end the iteration before any group
        // that is inside the bound has been yielded.
        loop {
            let ((_fid, level, left, right), docids) = match self.iter.next()? {
                Ok(entry) => entry,
                Err(e) => return Some(Err(e)),
            };

            let must_be_returned = match self.end {
                Included(end) => right <= end,
                Excluded(end) => right < end,
                Unbounded => true,
            };

            if must_be_returned {
                // The value is only decoded for the entries that are returned.
                return match docids.decode() {
                    Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))),
                    Err(e) => Some(Err(e)),
                };
            }
        }
    }
}
/// An iterator that is used to explore the level 0 of the facets string database.
///
/// It yields the facet string and the roaring bitmap associated with it.
@ -339,81 +280,6 @@ impl<'t> Iterator for FacetStringLevelZeroRange<'t> {
}
}
/// An iterator that explores the level 0 of the facets string database
/// through a reverse range.
///
/// It yields the normalized facet string, the original facet string and the
/// roaring bitmap associated with them.
pub struct FacetStringLevelZeroRevRange<'t> {
    iter: RoRevRange<
        't,
        FacetStringLevelZeroCodec,
        FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
    >,
}

impl<'t> FacetStringLevelZeroRevRange<'t> {
    /// Opens a reverse range over the level 0 entries of `field_id` whose
    /// string lies between `left` and `right`.
    pub fn new<X, Y>(
        rtxn: &'t heed::RoTxn,
        db: Database<X, Y>,
        field_id: FieldId,
        left: Bound<&str>,
        right: Bound<&str>,
    ) -> heed::Result<FacetStringLevelZeroRevRange<'t>> {
        /// Builds a raw key: the big-endian field id, the level byte, then the string bytes.
        fn raw_key(field_id: FieldId, level: u8, value: &str) -> Vec<u8> {
            let mut key = Vec::new();
            key.extend_from_slice(&field_id.to_be_bytes());
            key.push(level);
            key.extend_from_slice(value.as_bytes());
            key
        }

        // An unbounded left side starts at the very first key of the level 0.
        let lower_value = match left {
            Included(value) | Excluded(value) => value,
            Unbounded => "",
        };
        let lower_key = raw_key(field_id, 0, lower_value);
        let lower = match left {
            Excluded(_) => Excluded(&lower_key[..]),
            Included(_) | Unbounded => Included(&lower_key[..]),
        };

        let (upper_level, upper_value) = match right {
            Included(value) | Excluded(value) => (0, value),
            // we must only get the level 0
            Unbounded => (1, ""),
        };
        let upper_key = raw_key(field_id, upper_level, upper_value);
        let upper = match right {
            Included(_) => Included(&upper_key[..]),
            Excluded(_) | Unbounded => Excluded(&upper_key[..]),
        };

        // The range is opened on raw bytes, the typed codecs are restored afterwards.
        let raw_db = db.remap_key_type::<ByteSlice>();
        let iter = raw_db.rev_range(rtxn, &(lower, upper))?.remap_types::<
            FacetStringLevelZeroCodec,
            FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
        >();

        Ok(FacetStringLevelZeroRevRange { iter })
    }
}

impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> {
    type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;

    /// Returns the next entry of the reverse range, dropping the field id from the key.
    fn next(&mut self) -> Option<Self::Item> {
        let entry = self.iter.next()?;
        Some(entry.map(|((_fid, normalized), (original, docids))| (normalized, original, docids)))
    }
}
type EitherStringRange<'t> = Either<FacetStringGroupRange<'t>, FacetStringLevelZeroRange<'t>>;
type EitherStringRevRange<'t> =
Either<FacetStringGroupRevRange<'t>, FacetStringLevelZeroRevRange<'t>>;
/// An iterator that is used to explore the facet strings level by level,
/// it will only return facets strings that are associated with the
/// candidates documents ids given.
@ -421,45 +287,12 @@ pub struct FacetStringIter<'t> {
rtxn: &'t heed::RoTxn<'t>,
db: Database<ByteSlice, ByteSlice>,
field_id: FieldId,
level_iters: Vec<(RoaringBitmap, Either<EitherStringRange<'t>, EitherStringRevRange<'t>>)>,
level_iters:
Vec<(RoaringBitmap, Either<FacetStringGroupRange<'t>, FacetStringLevelZeroRange<'t>>)>,
must_reduce: bool,
}
impl<'t> FacetStringIter<'t> {
pub fn new_reducing(
rtxn: &'t heed::RoTxn,
index: &'t Index,
field_id: FieldId,
documents_ids: RoaringBitmap,
) -> heed::Result<FacetStringIter<'t>> {
let db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?;
Ok(FacetStringIter {
rtxn,
db,
field_id,
level_iters: vec![(documents_ids, Left(highest_iter))],
must_reduce: true,
})
}
pub fn new_reverse_reducing(
rtxn: &'t heed::RoTxn,
index: &'t Index,
field_id: FieldId,
documents_ids: RoaringBitmap,
) -> heed::Result<FacetStringIter<'t>> {
let db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?;
Ok(FacetStringIter {
rtxn,
db,
field_id,
level_iters: vec![(documents_ids, Right(highest_reverse_iter))],
must_reduce: true,
})
}
pub fn new_non_reducing(
rtxn: &'t heed::RoTxn,
index: &'t Index,
@ -467,12 +300,30 @@ impl<'t> FacetStringIter<'t> {
documents_ids: RoaringBitmap,
) -> heed::Result<FacetStringIter<'t>> {
let db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?;
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
let highest_iter = match NonZeroU8::new(highest_level) {
Some(highest_level) => Left(FacetStringGroupRange::new(
rtxn,
index.facet_id_string_docids,
field_id,
highest_level,
Unbounded,
Unbounded,
)?),
None => Right(FacetStringLevelZeroRange::new(
rtxn,
index.facet_id_string_docids,
field_id,
Unbounded,
Unbounded,
)?),
};
Ok(FacetStringIter {
rtxn,
db,
field_id,
level_iters: vec![(documents_ids, Left(highest_iter))],
level_iters: vec![(documents_ids, highest_iter)],
must_reduce: false,
})
}
@ -489,62 +340,6 @@ impl<'t> FacetStringIter<'t> {
.transpose()?
.map(|(key_bytes, _)| key_bytes[2])) // the level is the third byte
}
fn highest_iter<X, Y>(
rtxn: &'t heed::RoTxn,
index: &'t Index,
db: Database<X, Y>,
field_id: FieldId,
) -> heed::Result<Either<FacetStringGroupRange<'t>, FacetStringLevelZeroRange<'t>>> {
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
match NonZeroU8::new(highest_level) {
Some(highest_level) => FacetStringGroupRange::new(
rtxn,
index.facet_id_string_docids,
field_id,
highest_level,
Unbounded,
Unbounded,
)
.map(Left),
None => FacetStringLevelZeroRange::new(
rtxn,
index.facet_id_string_docids,
field_id,
Unbounded,
Unbounded,
)
.map(Right),
}
}
fn highest_reverse_iter<X, Y>(
rtxn: &'t heed::RoTxn,
index: &'t Index,
db: Database<X, Y>,
field_id: FieldId,
) -> heed::Result<Either<FacetStringGroupRevRange<'t>, FacetStringLevelZeroRevRange<'t>>> {
let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
match NonZeroU8::new(highest_level) {
Some(highest_level) => FacetStringGroupRevRange::new(
rtxn,
index.facet_id_string_docids,
field_id,
highest_level,
Unbounded,
Unbounded,
)
.map(Left),
None => FacetStringLevelZeroRevRange::new(
rtxn,
index.facet_id_string_docids,
field_id,
Unbounded,
Unbounded,
)
.map(Right),
}
}
}
impl<'t> Iterator for FacetStringIter<'t> {
@ -553,21 +348,6 @@ impl<'t> Iterator for FacetStringIter<'t> {
fn next(&mut self) -> Option<Self::Item> {
'outer: loop {
let (documents_ids, last) = self.level_iters.last_mut()?;
let is_ascending = last.is_left();
// We remap the different iterator types to make
// the algorithm less complex to understand.
let last = match last {
Left(ascending) => match ascending {
Left(last) => Left(Left(last)),
Right(last) => Right(Left(last)),
},
Right(descending) => match descending {
Left(last) => Left(Right(last)),
Right(last) => Right(Right(last)),
},
};
match last {
Left(last) => {
for result in last {
@ -579,50 +359,24 @@ impl<'t> Iterator for FacetStringIter<'t> {
*documents_ids -= &docids;
}
let result = if is_ascending {
match string_bounds {
Some((left, right)) => {
FacetStringLevelZeroRevRange::new(
self.rtxn,
self.db,
self.field_id,
Included(left),
Included(right),
)
.map(Right)
}
None => FacetStringGroupRevRange::new(
self.rtxn,
self.db,
self.field_id,
NonZeroU8::new(level.get() - 1).unwrap(),
Included(left),
Included(right),
)
.map(Left),
}
.map(Right)
} else {
match string_bounds {
Some((left, right)) => FacetStringLevelZeroRange::new(
self.rtxn,
self.db,
self.field_id,
Included(left),
Included(right),
)
.map(Right),
None => FacetStringGroupRange::new(
self.rtxn,
self.db,
self.field_id,
NonZeroU8::new(level.get() - 1).unwrap(),
Included(left),
Included(right),
)
.map(Left),
}
.map(Left)
let result = match string_bounds {
Some((left, right)) => FacetStringLevelZeroRange::new(
self.rtxn,
self.db,
self.field_id,
Included(left),
Included(right),
)
.map(Right),
None => FacetStringGroupRange::new(
self.rtxn,
self.db,
self.field_id,
NonZeroU8::new(level.get() - 1).unwrap(),
Included(left),
Included(right),
)
.map(Left),
};
match result {

View file

@ -18,8 +18,6 @@ pub(crate) use self::facet::ParserRule;
pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator};
pub use self::matching_words::MatchingWords;
use self::query_tree::QueryTreeBuilder;
use crate::criterion::AscDesc;
use crate::error::UserError;
use crate::search::criteria::r#final::{Final, FinalResult};
use crate::{DocumentId, Index, Result};
@ -39,7 +37,6 @@ pub struct Search<'a> {
filter: Option<FilterCondition>,
offset: usize,
limit: usize,
sort_criteria: Option<Vec<AscDesc>>,
optional_words: bool,
authorize_typos: bool,
words_limit: usize,
@ -54,7 +51,6 @@ impl<'a> Search<'a> {
filter: None,
offset: 0,
limit: 20,
sort_criteria: None,
optional_words: true,
authorize_typos: true,
words_limit: 10,
@ -78,11 +74,6 @@ impl<'a> Search<'a> {
self
}
pub fn sort_criteria(&mut self, criteria: Vec<AscDesc>) -> &mut Search<'a> {
self.sort_criteria = Some(criteria);
self
}
pub fn optional_words(&mut self, value: bool) -> &mut Search<'a> {
self.optional_words = value;
self
@ -143,29 +134,8 @@ impl<'a> Search<'a> {
None => MatchingWords::default(),
};
// We check that we are allowed to use the sort criteria, we check
// that they are declared in the sortable fields.
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
if let Some(sort_criteria) = &self.sort_criteria {
for asc_desc in sort_criteria {
let field = asc_desc.field();
if !sortable_fields.contains(field) {
return Err(UserError::InvalidSortableAttribute {
field: field.to_string(),
valid_fields: sortable_fields,
}
.into());
}
}
}
let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
let criteria = criteria_builder.build(
query_tree,
primitive_query,
filtered_candidates,
self.sort_criteria.clone(),
)?;
let criteria = criteria_builder.build(query_tree, primitive_query, filtered_candidates)?;
match self.index.distinct_field(self.rtxn)? {
None => self.perform_sort(NoopDistinct, matching_words, criteria),
@ -229,7 +199,6 @@ impl fmt::Debug for Search<'_> {
filter,
offset,
limit,
sort_criteria,
optional_words,
authorize_typos,
words_limit,
@ -241,7 +210,6 @@ impl fmt::Debug for Search<'_> {
.field("filter", filter)
.field("offset", offset)
.field("limit", limit)
.field("sort_criteria", sort_criteria)
.field("optional_words", optional_words)
.field("authorize_typos", authorize_typos)
.field("words_limit", words_limit)