From d893e83622018d15633e32190a93ad6fe6152a32 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 6 Jan 2021 15:10:30 +0100 Subject: [PATCH] Speed-up facet aggregation by using a FacetIter --- src/search/facet/facet_distribution.rs | 137 ++++++++++++++++--------- src/search/facet/mod.rs | 39 ++++++- src/search/mod.rs | 8 +- 3 files changed, 124 insertions(+), 60 deletions(-) diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 2ee297fa2..8256d1234 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -9,7 +9,7 @@ use serde_json::Value; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; -use crate::search::facet::FacetRange; +use crate::search::facet::{FacetIter, FacetRange}; use crate::{Index, FieldId}; pub struct FacetDistribution<'a> { @@ -41,61 +41,99 @@ impl<'a> FacetDistribution<'a> { } fn facet_values(&self, field_id: FieldId, facet_type: FacetType) -> heed::Result> { - if let Some(candidates) = self.candidates.as_ref().filter(|c| c.len() <= 1000) { - let mut key_buffer = vec![field_id]; - match facet_type { - FacetType::Float => { - let mut facet_values = HashSet::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? 
- .remap_key_type::(); - for result in iter { - let ((_, _, value), ()) = result?; - facet_values.insert(OrderedFloat(value)); + if let Some(candidates) = self.candidates.as_ref() { + if candidates.len() <= 1000 { + let mut key_buffer = vec![field_id]; + match facet_type { + FacetType::Float => { + let mut facet_values = HashSet::new(); + for docid in candidates { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = self.index.field_id_docid_facet_values + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + for result in iter { + let ((_, _, value), ()) = result?; + facet_values.insert(OrderedFloat(value)); + } } - } - Ok(facet_values.into_iter().map(|f| Value::from(*f)).collect()) - }, - FacetType::Integer => { - let mut facet_values = HashSet::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? - .remap_key_type::(); - for result in iter { - let ((_, _, value), ()) = result?; - facet_values.insert(value); + Ok(facet_values.into_iter().map(|f| Value::from(*f)).collect()) + }, + FacetType::Integer => { + let mut facet_values = HashSet::new(); + for docid in candidates { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = self.index.field_id_docid_facet_values + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + for result in iter { + let ((_, _, value), ()) = result?; + facet_values.insert(value); + } } - } - Ok(facet_values.into_iter().map(Value::from).collect()) - }, - FacetType::String => { - let mut facet_values = HashSet::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? 
- .remap_key_type::(); - for result in iter { - let ((_, _, value), ()) = result?; - facet_values.insert(value); + Ok(facet_values.into_iter().map(Value::from).collect()) + }, + FacetType::String => { + let mut facet_values = HashSet::new(); + for docid in candidates { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = self.index.field_id_docid_facet_values + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + for result in iter { + let ((_, _, value), ()) = result?; + facet_values.insert(value); + } } + Ok(facet_values.into_iter().map(Value::from).collect()) + }, + } + } else { + let iter = match facet_type { + FacetType::String => { + let db = self.index.facet_field_id_value_docids; + let iter = db + .prefix_iter(self.rtxn, &[field_id])? + .remap_key_type::() + .map(|r| r.map(|((_, v), docids)| (Value::from(v), docids))); + Box::new(iter) as Box::> + }, + FacetType::Integer => { + let iter = FacetIter::::new_non_reducing( + self.rtxn, self.index, field_id, candidates.clone(), + )?; + Box::new(iter.map(|r| r.map(|(v, docids)| (Value::from(v), docids)))) + }, + FacetType::Float => { + let iter = FacetIter::::new_non_reducing( + self.rtxn, self.index, field_id, candidates.clone(), + )?; + Box::new(iter.map(|r| r.map(|(v, docids)| (Value::from(v), docids)))) + }, + }; + + let mut facet_values = Vec::new(); + for result in iter { + let (value, docids) = result?; + if self.candidates.as_ref().map_or(true, |c| !docids.is_disjoint(c)) { + facet_values.push(value); } - Ok(facet_values.into_iter().map(Value::from).collect()) - }, + if facet_values.len() == self.max_values_by_facet { + break; + } + } + + Ok(facet_values) } } else { let db = self.index.facet_field_id_value_docids; let iter = match facet_type { FacetType::String => { let iter = db - .prefix_iter(&self.rtxn, &[field_id])? + .prefix_iter(self.rtxn, &[field_id])? 
.remap_key_type::() .map(|r| r.map(|((_, v), docids)| (Value::from(v), docids))); Box::new(iter) as Box::> @@ -119,11 +157,8 @@ impl<'a> FacetDistribution<'a> { let mut facet_values = Vec::new(); for result in iter { let (value, docids) = result?; - match &self.candidates { - Some(candidates) => if !docids.is_disjoint(candidates) { - facet_values.push(value); - }, - None => facet_values.push(value), + if self.candidates.as_ref().map_or(true, |c| !docids.is_disjoint(c)) { + facet_values.push(value); } if facet_values.len() == self.max_values_by_facet { break; diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs index 70b5b4658..e5b06185f 100644 --- a/src/search/facet/mod.rs +++ b/src/search/facet/mod.rs @@ -147,6 +147,7 @@ pub struct FacetIter<'t, T: 't, KC> { db: Database, field_id: FieldId, level_iters: Vec<(RoaringBitmap, Either, FacetRevRange<'t, T, KC>>)>, + must_reduce: bool, } impl<'t, T, KC> FacetIter<'t, T, KC> @@ -155,7 +156,10 @@ where KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, T: PartialOrd + Copy + Bounded, { - pub fn new( + /// Create a `FacetIter` that will iterate on the different facet entries + /// (facet value + documents ids) and that will reduce the given documents ids + /// while iterating on the different facet levels. 
+ pub fn new_reducing( rtxn: &'t heed::RoTxn, index: &'t Index, field_id: FieldId, @@ -165,10 +169,14 @@ where let db = index.facet_field_id_value_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Left(highest_iter))] }) + let level_iters = vec![(documents_ids, Left(highest_iter))]; + Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) } - pub fn new_reverse( + /// Create a `FacetIter` that will iterate on the different facet entries in reverse + /// (facet value + documents ids) and that will reduce the given documents ids + /// while iterating on the different facet levels. + pub fn new_reverse_reducing( rtxn: &'t heed::RoTxn, index: &'t Index, field_id: FieldId, @@ -178,7 +186,26 @@ where let db = index.facet_field_id_value_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Right(highest_iter))] }) + let level_iters = vec![(documents_ids, Right(highest_iter))]; + Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) + } + + /// Create a `FacetIter` that will iterate on the different facet entries + /// (facet value + documents ids) and that will not reduce the given documents ids + /// while iterating on the different facet levels, possibly returning multiple times + /// a document id associated with multiple facet values. 
+ pub fn new_non_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> + { + let db = index.facet_field_id_value_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Left(highest_iter))]; + Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) } fn highest_level(rtxn: &'t heed::RoTxn, db: Database, fid: FieldId) -> heed::Result> { @@ -216,7 +243,9 @@ where docids.intersect_with(&documents_ids); if !docids.is_empty() { - documents_ids.difference_with(&docids); + if self.must_reduce { + documents_ids.difference_with(&docids); + } if level == 0 { debug!("found {:?} at {:?}", docids, left); diff --git a/src/search/mod.rs b/src/search/mod.rs index 05999caed..459b301a6 100644 --- a/src/search/mod.rs +++ b/src/search/mod.rs @@ -189,9 +189,9 @@ impl<'a> Search<'a> { } } else { let facet_fn = if ascending { - FacetIter::::new + FacetIter::::new_reducing } else { - FacetIter::::new_reverse + FacetIter::::new_reverse_reducing }; let mut limit_tmp = limit; let mut output = Vec::new(); @@ -226,9 +226,9 @@ impl<'a> Search<'a> { } } else { let facet_fn = if ascending { - FacetIter::::new + FacetIter::::new_reducing } else { - FacetIter::::new_reverse + FacetIter::::new_reverse_reducing }; let mut limit_tmp = limit; let mut output = Vec::new();