From 531bd6ddc7d72e415c298cabb880b8198e3459cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Nov 2020 19:28:20 +0100 Subject: [PATCH] Make the facet operator evaluation code generic --- src/search.rs | 326 +++++++++++++++++++++++++------------------------- 1 file changed, 160 insertions(+), 166 deletions(-) diff --git a/src/search.rs b/src/search.rs index 3a781847f..5cd998ffe 100644 --- a/src/search.rs +++ b/src/search.rs @@ -7,7 +7,7 @@ use std::str::FromStr; use anyhow::{bail, ensure, Context}; use fst::{IntoStreamer, Streamer}; -use heed::types::DecodeIgnore; +use heed::types::{ByteSlice, DecodeIgnore}; use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use log::debug; @@ -17,6 +17,7 @@ use roaring::bitmap::RoaringBitmap; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; +use crate::heed_codec::CboRoaringBitmapCodec; use crate::mdfs::Mdfs; use crate::query_tokens::{QueryTokens, QueryToken}; use crate::{Index, DocumentId}; @@ -80,6 +81,7 @@ impl FacetCondition { where T::Err: Send + Sync + StdError + 'static, { use FacetOperator::*; + match iter.next() { Some(">") => { let param = iter.next().context("missing parameter")?; @@ -117,6 +119,161 @@ impl FacetCondition { None => bail!("missing facet filter first parameter"), } } + + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. + fn explore_facet_levels<'t, T: 't, KC>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + field_id: u8, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> anyhow::Result<()> + where + T: Copy + PartialEq + PartialOrd + Bounded + Debug, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. + (Included(l), Included(r)) if l == r && level > 0 => { + return Self::explore_facet_levels::(rtxn, db, field_id, 0, left, right, output); + }, + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + + let mut left_found = None; + let mut right_found = None; + + // We must create a custom iterator to be able to iterate over the + // requested range as the range iterator cannot express some conditions. + let left_bound = match left { + Included(left) => Included((field_id, level, left, T::min_value())), + Excluded(left) => Excluded((field_id, level, left, T::min_value())), + Unbounded => Unbounded, + }; + let right_bound = Included((field_id, level, T::max_value(), T::max_value())); + // We also make sure that we don't decode the data before we are sure we must return it. + let iter = db + .remap_key_type::() + .lazily_decode_data() + .range(rtxn, &(left_bound, right_bound))? + .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { + match right { + Included(right) => *r <= right, + Excluded(right) => *r < right, + Unbounded => true, + } + })) + .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); + + debug!("Iterating between {:?} and {:?} (level {})", left, right, level); + + for (i, result) in iter.enumerate() { + let ((_fid, level, l, r), docids) = result?; + debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); + output.union_with(&docids); + // We save the leftest and rightest bounds we actually found at this level. + if i == 0 { left_found = Some(l); } + right_found = Some(r); + } + + // Can we go deeper? + let deeper_level = match level.checked_sub(1) { + Some(level) => level, + None => return Ok(()), + }; + + // We must refine the left and right bounds of this range by retrieving the + // missing part in a deeper level. + match left_found.zip(right_found) { + Some((left_found, right_found)) => { + // If the bound is satisfied we avoid calling this function again. + if !matches!(left, Included(l) if l == left_found) { + let sub_right = Excluded(left_found); + debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, sub_right, output)?; + } + if !matches!(right, Included(r) if r == right_found) { + let sub_left = Excluded(right_found); + debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, sub_left, right, output)?; + } + }, + None => { + // If we found nothing at this level it means that we must find + // the same bounds but at a deeper, more precise level. + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, right, output)?; + }, + } + + Ok(()) + } + + fn evaluate_operator<'t, T: 't, KC>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + field_id: u8, + operator: FacetOperator, + ) -> anyhow::Result + where + T: Copy + PartialEq + PartialOrd + Bounded + Debug, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + { + use FacetOperator::*; + + // Make sure we always bound the ranges with the field id and the level, + // as the facets values are all in the same database and prefixed by the + // field id and the level. + let (left, right) = match operator { + GreaterThan(val) => (Excluded(val), Included(T::max_value())), + GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())), + LowerThan(val) => (Included(T::min_value()), Excluded(val)), + LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)), + Equal(val) => (Included(val), Included(val)), + Between(left, right) => (Included(left), Included(right)), + }; + + // Ask for the biggest value that can exist for this specific field, if it exists + // that's fine if it don't, the value just before will be returned instead. + let biggest_level = db + .remap_types::() + .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))? + .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); + + match biggest_level { + Some(level) => { + let mut output = RoaringBitmap::new(); + Self::explore_facet_levels::(rtxn, db, field_id, level, left, right, &mut output)?; + Ok(output) + }, + None => Ok(RoaringBitmap::new()), + } + } + + fn evaluate( + &self, + rtxn: &heed::RoTxn, + db: heed::Database, + ) -> anyhow::Result + { + match *self { + FacetCondition::OperatorI64(fid, operator) => { + Self::evaluate_operator::(rtxn, db, fid, operator) + }, + FacetCondition::OperatorF64(fid, operator) => { + Self::evaluate_operator::(rtxn, db, fid, operator) + } + } + } } pub struct Search<'a> { @@ -241,103 +398,6 @@ impl<'a> Search<'a> { candidates } - /// Aggregates the documents ids that are part of the specified range automatically - /// going deeper through the levels. - fn explore_facet_levels( - &self, - field_id: u8, - level: u8, - left: Bound, - right: Bound, - output: &mut RoaringBitmap, - ) -> anyhow::Result<()> - where - T: Copy + PartialEq + PartialOrd + Bounded + Debug, - KC: heed::BytesDecode<'a, DItem = (u8, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, - { - match (left, right) { - // If the request is an exact value we must go directly to the deepest level. - (Included(l), Included(r)) if l == r && level > 0 => { - return self.explore_facet_levels::(field_id, 0, left, right, output); - }, - // lower TO upper when lower > upper must return no result - (Included(l), Included(r)) if l > r => return Ok(()), - (Included(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Included(r)) if l >= r => return Ok(()), - (_, _) => (), - } - - let mut left_found = None; - let mut right_found = None; - - // We must create a custom iterator to be able to iterate over the - // requested range as the range iterator cannot express some conditions. - let left_bound = match left { - Included(left) => Included((field_id, level, left, T::min_value())), - Excluded(left) => Excluded((field_id, level, left, T::min_value())), - Unbounded => Unbounded, - }; - let right_bound = Included((field_id, level, T::max_value(), T::max_value())); - // We also make sure that we don't decode the data before we are sure we must return it. - let iter = self.index - .facet_field_id_value_docids - .remap_key_type::() - .lazily_decode_data() - .range(self.rtxn, &(left_bound, right_bound))? - .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { - match right { - Included(right) => *r <= right, - Excluded(right) => *r < right, - Unbounded => true, - } - })) - .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); - - debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - for (i, result) in iter.enumerate() { - let ((_fid, level, l, r), docids) = result?; - debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - output.union_with(&docids); - // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { left_found = Some(l); } - right_found = Some(r); - } - - // Can we go deeper? - let deeper_level = match level.checked_sub(1) { - Some(level) => level, - None => return Ok(()), - }; - - // We must refine the left and right bounds of this range by retrieving the - // missing part in a deeper level. - match left_found.zip(right_found) { - Some((left_found, right_found)) => { - // If the bound is satisfied we avoid calling this function again. - if !matches!(left, Included(l) if l == left_found) { - let sub_right = Excluded(left_found); - debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); - self.explore_facet_levels::(field_id, deeper_level, left, sub_right, output)?; - } - if !matches!(right, Included(r) if r == right_found) { - let sub_left = Excluded(right_found); - debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); - self.explore_facet_levels::(field_id, deeper_level, sub_left, right, output)?; - } - }, - None => { - // If we found nothing at this level it means that we must find - // the same bounds but at a deeper, more precise level. - self.explore_facet_levels::(field_id, deeper_level, left, right, output)?; - }, - } - - Ok(()) - } - pub fn execute(&self) -> anyhow::Result { let limit = self.limit; let fst = self.index.words_fst(self.rtxn)?; @@ -349,75 +409,9 @@ impl<'a> Search<'a> { }; // We create the original candidates with the facet conditions results. - use FacetOperator::*; + let facet_db = self.index.facet_field_id_value_docids; let facet_candidates = match self.facet_condition { - // TODO make that generic over floats and integers. - Some(FacetCondition::OperatorI64(fid, operator)) => { - // Make sure we always bound the ranges with the field id and the level, - // as the facets values are all in the same database and prefixed by the - // field id and the level. - let (left, right) = match operator { - GreaterThan(val) => (Excluded(val), Included(i64::MAX)), - GreaterThanOrEqual(val) => (Included(val), Included(i64::MAX)), - LowerThan(val) => (Included(i64::MIN), Excluded(val)), - LowerThanOrEqual(val) => (Included(i64::MIN), Included(val)), - Equal(val) => (Included(val), Included(val)), - Between(left, right) => (Included(left), Included(right)), - }; - - let db = self.index - .facet_field_id_value_docids - .remap_key_type::(); - - // Ask for the biggest value that can exist for this specific field, if it exists - // that's fine if it don't, the value just before will be returned instead. - let biggest_level = db - .remap_data_type::() - .get_lower_than_or_equal_to(self.rtxn, &(fid, u8::MAX, i64::MAX, i64::MAX))? - .and_then(|((id, level, _, _), _)| if id == fid { Some(level) } else { None }); - - match biggest_level { - Some(level) => { - let mut output = RoaringBitmap::new(); - self.explore_facet_levels::(fid, level, left, right, &mut output)?; - Some(output) - }, - None => None, - } - }, - Some(FacetCondition::OperatorF64(fid, operator)) => { - // Make sure we always bound the ranges with the field id and the level, - // as the facets values are all in the same database and prefixed by the - // field id and the level. - let (left, right) = match operator { - GreaterThan(val) => (Excluded(val), Included(f64::MAX)), - GreaterThanOrEqual(val) => (Included(val), Included(f64::MAX)), - LowerThan(val) => (Included(f64::MIN), Excluded(val)), - LowerThanOrEqual(val) => (Included(f64::MIN), Included(val)), - Equal(val) => (Included(val), Included(val)), - Between(left, right) => (Included(left), Included(right)), - }; - - let db = self.index - .facet_field_id_value_docids - .remap_key_type::(); - - // Ask for the biggest value that can exist for this specific field, if it exists - // that's fine if it don't, the value just before will be returned instead. - let biggest_level = db - .remap_data_type::() - .get_lower_than_or_equal_to(self.rtxn, &(fid, u8::MAX, f64::MAX, f64::MAX))? - .and_then(|((id, level, _, _), _)| if id == fid { Some(level) } else { None }); - - match biggest_level { - Some(level) => { - let mut output = RoaringBitmap::new(); - self.explore_facet_levels::(fid, level, left, right, &mut output)?; - Some(output) - }, - None => None, - } - }, + Some(condition) => Some(condition.evaluate(self.rtxn, facet_db)?), None => None, };