From 7d67c9e2e73db41eb614a61583904c6998a38619 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Nov 2020 14:50:32 +0100 Subject: [PATCH] Improve the facet search algorithm performances --- src/search.rs | 163 +++++++++++++++++++++++++++++++------------------- 1 file changed, 101 insertions(+), 62 deletions(-) diff --git a/src/search.rs b/src/search.rs index 3338d5222..cedca1085 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::fmt; +use std::ops::Bound::{self, Unbounded, Included, Excluded}; use anyhow::{bail, ensure, Context}; use fst::{IntoStreamer, Streamer}; @@ -12,7 +13,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::{facet::FacetLevelValueI64Codec, CboRoaringBitmapCodec}; +use crate::heed_codec::facet::FacetLevelValueI64Codec; use crate::mdfs::Mdfs; use crate::query_tokens::{QueryTokens, QueryToken}; use crate::{Index, DocumentId}; @@ -226,6 +227,95 @@ impl<'a> Search<'a> { candidates } + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. + fn explore_facet_levels( + &self, + field_id: u8, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> anyhow::Result<()> + { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. + (Included(l), Included(r)) if l == r && level > 0 => { + return self.explore_facet_levels(field_id, 0, left, right, output); + }, + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + + let mut left_found = None; + let mut right_found = None; + + // We must create a custom iterator to be able to iterate over the + // requested range as the range iterator cannot express some conditions. + let left_bound = match left { + Included(left) => Included((field_id, level, left, i64::MIN)), + Excluded(left) => Excluded((field_id, level, left, i64::MIN)), + Unbounded => Unbounded, + }; + let right_bound = Included((field_id, level, i64::MAX, i64::MAX)); + let db = self.index.facet_field_id_value_docids.remap_key_type::(); + let iter = db + .range(self.rtxn, &(left_bound, right_bound))? + .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { + match right { + Included(right) => *r <= right, + Excluded(right) => *r < right, + Unbounded => true, + } + })); + + debug!("Iterating between {:?} and {:?} (level {})", left, right, level); + + for (i, result) in iter.enumerate() { + let ((_fid, _level, l, r), docids) = result?; + debug!("{} to {} (level {}) found {} documents", l, r, _level, docids.len()); + output.union_with(&docids); + // We save the leftest and rightest bounds we actually found at this level. + if i == 0 { left_found = Some(l); } + right_found = Some(r); + } + + // Can we go deeper? + let deeper_level = match level.checked_sub(1) { + Some(level) => level, + None => return Ok(()), + }; + + // We must refine the left and right bounds of this range by retrieving the + // missing part in a deeper level. + match left_found.zip(right_found) { + Some((left_found, right_found)) => { + // If the bound is satisfied we avoid calling this function again. + if !matches!(left, Included(l) if l == left_found) { + let sub_right = Excluded(left_found); + debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); + self.explore_facet_levels(field_id, deeper_level, left, sub_right, output)?; + } + if !matches!(right, Included(r) if r == right_found) { + let sub_left = Excluded(right_found); + debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); + self.explore_facet_levels(field_id, deeper_level, sub_left, right, output)?; + } + }, + None => { + // If we found nothing at this level it means that we must find + // the same bounds but at a deeper, more precise level. + self.explore_facet_levels(field_id, deeper_level, left, right, output)?; + }, + } + + Ok(()) + } + pub fn execute(&self) -> anyhow::Result { let limit = self.limit; let fst = self.index.words_fst(self.rtxn)?; @@ -239,64 +329,11 @@ impl<'a> Search<'a> { // We create the original candidates with the facet conditions results. let facet_candidates = match self.facet_condition { Some(FacetCondition::Operator(fid, operator)) => { - use std::ops::Bound::{self, Included, Excluded}; use FacetOperator::*; - fn explore_facet_levels( - rtxn: &heed::RoTxn, - db: &heed::Database, - field_id: u8, - level: u8, - left: Bound, - right: Bound, - candidates: &mut RoaringBitmap, - ) -> anyhow::Result<()> - { - let mut left_found = left; - let mut right_found = right; - - let range = { - let left = match left { - Included(left) => Included((field_id, level, left, i64::MIN)), - Excluded(left) => Excluded((field_id, level, left, i64::MIN)), - Bound::Unbounded => Bound::Unbounded, - }; - let right = Included((field_id, level, i64::MAX, i64::MAX)); - (left, right) - }; - - for (i, result) in db.range(rtxn, &range)?.enumerate() { - let ((_fid, _level, l, r), docids) = result?; - match right { - Included(right) if r > right => break, - Excluded(right) if r >= right => break, - _ => (), - } - - eprintln!("{} to {} (level {})", l, r, _level); - candidates.union_with(&docids); - // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { left_found = Excluded(l); } - right_found = Excluded(r); - } - - // Can we go deeper? - let deeper_level = match level.checked_sub(1) { - Some(level) => level, - None => return Ok(()), - }; - - // We must refine the left and right bounds of this range by retrieving the - // missing part in a deeper level. - // TODO we must avoid going at deeper when the bounds are already satisfied. - explore_facet_levels(rtxn, db, field_id, deeper_level, left, left_found, candidates)?; - explore_facet_levels(rtxn, db, field_id, deeper_level, right_found, right, candidates)?; - - Ok(()) - } - - // Make sure we always bound the ranges with the field id, as the facets - // values are all in the same database and prefixed by the field id. + // Make sure we always bound the ranges with the field id and the level, + // as the facets values are all in the same database and prefixed by the + // field id and the level. let (left, right) = match operator { GreaterThan(val) => (Excluded(val), Included(i64::MAX)), GreaterThanOrEqual(val) => (Included(val), Included(i64::MAX)), @@ -316,7 +353,7 @@ impl<'a> Search<'a> { // the first entry of it which corresponds to the last key of our field id. let db = db.remap_data_type::(); match db.get_lower_than(self.rtxn, &(next_fid, 0, i64::MIN, i64::MIN))? { - Some(((id, level, _left, _right), _docids)) if fid == id => Some(level), + Some(((id, level, ..), _)) if fid == id => Some(level), _ => None, } }, @@ -324,7 +361,7 @@ impl<'a> Search<'a> { // If we can't generate a bigger field id, it must be equal to 255 and // therefore the last key of the database must be the one we want. match db.remap_data_type::().last(self.rtxn)? { - Some(((id, level, _left, _right), _docids)) if fid == id => Some(level), + Some(((id, level, ..), _)) if fid == id => Some(level), _ => None, } }, @@ -332,9 +369,9 @@ impl<'a> Search<'a> { match biggest_level { Some(level) => { - let mut candidates = RoaringBitmap::new(); - explore_facet_levels(self.rtxn, &db, fid, level, left, right, &mut candidates)?; - Some(candidates) + let mut output = RoaringBitmap::new(); + self.explore_facet_levels(fid, level, left, right, &mut output)?; + Some(output) }, None => None, } @@ -342,6 +379,8 @@ impl<'a> Search<'a> { None => None, }; + debug!("facet candidates: {:?}", facet_candidates); + let (candidates, derived_words) = match (facet_candidates, derived_words) { (Some(mut facet_candidates), Some(derived_words)) => { let words_candidates = Self::compute_candidates(&derived_words);