From fc686aaca781c36df7d856d10535151b96277a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Nov 2020 13:08:57 +0100 Subject: [PATCH] Use the De Morgan law to simplify the NOT operation --- http-ui/src/main.rs | 12 +-- src/index.rs | 22 ++++++ src/search/facet/mod.rs | 96 +++++++++++++++++------ src/update/clear_documents.rs | 6 ++ src/update/delete_documents.rs | 8 ++ src/update/{facet_levels.rs => facets.rs} | 80 ++++++++++++++----- src/update/index_documents/mod.rs | 4 +- src/update/mod.rs | 4 +- src/update/update_builder.rs | 8 +- 9 files changed, 182 insertions(+), 58 deletions(-) rename src/update/{facet_levels.rs => facets.rs} (80%) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index ca1ddcd45..1c5385b14 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -197,7 +197,7 @@ enum UpdateMeta { DocumentsAddition { method: String, format: String }, ClearDocuments, Settings(Settings), - FacetLevels(FacetLevels), + Facets(Facets), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -236,7 +236,7 @@ struct Settings { #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] -struct FacetLevels { +struct Facets { last_level_size: Option, number_of_levels: Option, easing_function: Option, @@ -411,10 +411,10 @@ async fn main() -> anyhow::Result<()> { Err(e) => Err(e.into()) } }, - UpdateMeta::FacetLevels(levels) => { + UpdateMeta::Facets(levels) => { // We must use the write transaction of the update here. let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.facet_levels(&mut wtxn, &index_cloned); + let mut builder = update_builder.facets(&mut wtxn, &index_cloned); if let Some(value) = levels.last_level_size { builder.last_level_size(value); } @@ -806,8 +806,8 @@ async fn main() -> anyhow::Result<()> { let change_facet_levels_route = warp::filters::method::post() .and(warp::path!("facet-levels")) .and(warp::body::json()) - .map(move |levels: FacetLevels| { - let meta = UpdateMeta::FacetLevels(levels); + .map(move |levels: Facets| { + let meta = UpdateMeta::Facets(levels); let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); eprintln!("update {} registered", update_id); diff --git a/src/index.rs b/src/index.rs index ccaba4ca6..b21c7d39b 100644 --- a/src/index.rs +++ b/src/index.rs @@ -18,6 +18,7 @@ use crate::{ pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; +pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -224,6 +225,27 @@ impl Index { Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) } + /* faceted documents ids */ + + /// Writes the documents ids that are faceted under this field id. + pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: u8, docids: &RoaringBitmap) -> heed::Result<()> { + let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + *buffer.last_mut().unwrap() = field_id; + self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) + } + + /// Retrieve all the documents ids that faceted under this field id. + pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: u8) -> heed::Result { + let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + *buffer.last_mut().unwrap() = field_id; + match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /* words fst */ /// Writes the FST which is the words dictionnary of the engine. diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs index c39de878b..1d20ca1bd 100644 --- a/src/search/facet/mod.rs +++ b/src/search/facet/mod.rs @@ -26,15 +26,42 @@ mod parser; pub enum FacetNumberOperator { GreaterThan(T), GreaterThanOrEqual(T), + Equal(T), + NotEqual(T), LowerThan(T), LowerThanOrEqual(T), - Equal(T), Between(T, T), } +impl FacetNumberOperator { + /// This method can return two operations in case it must express + /// an OR operation for the between case (i.e. `TO`). + fn negate(self) -> (Self, Option) { + match self { + GreaterThan(x) => (LowerThanOrEqual(x), None), + GreaterThanOrEqual(x) => (LowerThan(x), None), + Equal(x) => (NotEqual(x), None), + NotEqual(x) => (Equal(x), None), + LowerThan(x) => (GreaterThanOrEqual(x), None), + LowerThanOrEqual(x) => (GreaterThan(x), None), + Between(x, y) => (LowerThan(x), Some(GreaterThan(y))), + } + } +} + #[derive(Debug, Clone, PartialEq)] pub enum FacetStringOperator { Equal(String), + NotEqual(String), +} + +impl FacetStringOperator { + fn negate(self) -> Self { + match self { + FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x), + FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x), + } + } } #[derive(Debug, Clone, PartialEq)] @@ -44,7 +71,6 @@ pub enum FacetCondition { OperatorString(u8, FacetStringOperator), Or(Box, Box), And(Box, Box), - Not(Box), } fn get_field_id_facet_type<'a>( @@ -106,24 +132,24 @@ impl FacetCondition { fim: &FieldsIdsMap, ff: &HashMap, expression: Pairs, - ) -> anyhow::Result + ) -> anyhow::Result { PREC_CLIMBER.climb( expression, |pair: Pair| match pair.as_rule() { - Rule::between => Ok(FacetCondition::between(fim, ff, pair)?), - Rule::eq => Ok(FacetCondition::equal(fim, ff, pair)?), - Rule::neq => Ok(Not(Box::new(FacetCondition::equal(fim, ff, pair)?))), - Rule::greater => Ok(FacetCondition::greater_than(fim, ff, pair)?), - Rule::geq => Ok(FacetCondition::greater_than_or_equal(fim, ff, pair)?), - Rule::less => Ok(FacetCondition::lower_than(fim, ff, pair)?), - Rule::leq => Ok(FacetCondition::lower_than_or_equal(fim, ff, pair)?), + Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), + Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), + Rule::eq => Ok(Self::equal(fim, ff, pair)?), + Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), + Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), + Rule::less => Ok(Self::lower_than(fim, ff, pair)?), + Rule::between => Ok(Self::between(fim, ff, pair)?), + Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), - Rule::not => Ok(Not(Box::new(Self::from_pairs(fim, ff, pair.into_inner())?))), _ => unreachable!(), }, - |lhs: anyhow::Result, op: Pair, rhs: anyhow::Result| { + |lhs: anyhow::Result, op: Pair, rhs: anyhow::Result| { match op.as_rule() { Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), @@ -133,6 +159,22 @@ impl FacetCondition { ) } + fn negate(self) -> FacetCondition { + match self { + OperatorI64(fid, op) => match op.negate() { + (op, None) => OperatorI64(fid, op), + (a, Some(b)) => Or(Box::new(OperatorI64(fid, a)), Box::new(OperatorI64(fid, b))), + }, + OperatorF64(fid, op) => match op.negate() { + (op, None) => OperatorF64(fid, op), + (a, Some(b)) => Or(Box::new(OperatorF64(fid, a)), Box::new(OperatorF64(fid, b))), + }, + OperatorString(fid, op) => OperatorString(fid, op.negate()), + Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), + And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), + } + } + fn between( fields_ids_map: &FieldsIdsMap, faceted_fields: &HashMap, @@ -381,6 +423,7 @@ impl FacetCondition { fn evaluate_number_operator<'t, T: 't, KC>( rtxn: &'t heed::RoTxn, + index: &Index, db: heed::Database, field_id: u8, operator: FacetNumberOperator, @@ -396,9 +439,14 @@ impl FacetCondition { let (left, right) = match operator { GreaterThan(val) => (Excluded(val), Included(T::max_value())), GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())), + Equal(val) => (Included(val), Included(val)), + NotEqual(val) => { + let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; + let docids = Self::evaluate_number_operator::(rtxn, index, db, field_id, Equal(val))?; + return Ok(all_documents_ids - docids); + }, LowerThan(val) => (Included(T::min_value()), Excluded(val)), LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)), - Equal(val) => (Included(val), Included(val)), Between(left, right) => (Included(left), Included(right)), }; @@ -421,6 +469,7 @@ impl FacetCondition { fn evaluate_string_operator( rtxn: &heed::RoTxn, + index: &Index, db: heed::Database, field_id: u8, operator: &FacetStringOperator, @@ -432,7 +481,13 @@ impl FacetCondition { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()) } - } + }, + FacetStringOperator::NotEqual(string) => { + let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; + let op = FacetStringOperator::Equal(string.clone()); + let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?; + return Ok(all_documents_ids - docids); + }, } } @@ -445,14 +500,14 @@ impl FacetCondition { let db = index.facet_field_id_value_docids; match self { OperatorI64(fid, op) => { - Self::evaluate_number_operator::(rtxn, db, *fid, *op) + Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) }, OperatorF64(fid, op) => { - Self::evaluate_number_operator::(rtxn, db, *fid, *op) + Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) }, OperatorString(fid, op) => { let db = db.remap_key_type::(); - Self::evaluate_string_operator(rtxn, db, *fid, op) + Self::evaluate_string_operator(rtxn, index, db, *fid, op) }, Or(lhs, rhs) => { let lhs = lhs.evaluate(rtxn, index)?; @@ -464,13 +519,6 @@ impl FacetCondition { let rhs = rhs.evaluate(rtxn, index)?; Ok(lhs & rhs) }, - Not(op) => { - // TODO is this right or is this wrong? because all documents ids are not faceted - // so doing that can return documents that are not faceted at all. - let all_documents_ids = index.documents_ids(rtxn)?; - let documents_ids = op.evaluate(rtxn, index)?; - Ok(all_documents_ids - documents_ids) - }, } } } diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs index 447dca8b4..5dc14f97d 100644 --- a/src/update/clear_documents.rs +++ b/src/update/clear_documents.rs @@ -24,12 +24,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We retrieve the number of documents ids that we are deleting. let number_of_documents = self.index.number_of_documents(self.wtxn)?; + let faceted_fields = self.index.faceted_fields(self.wtxn)?; // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; + // We clean all the faceted documents ids. + for (field_id, _) in faceted_fields { + self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?; + } + // Clear the other databases. word_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index 9924080f2..b1db4f94c 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -184,6 +184,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // Remove the documents ids from the faceted documents ids. + let faceted_fields = self.index.faceted_fields(self.wtxn)?; + for (field_id, _) in faceted_fields { + let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; + docids.difference_with(&self.documents_ids); + self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; + } + // We delete the documents ids that are under the facet field id values. let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?; while let Some(result) = iter.next() { diff --git a/src/update/facet_levels.rs b/src/update/facets.rs similarity index 80% rename from src/update/facet_levels.rs rename to src/update/facets.rs index 4a7769b7a..96a7e825e 100644 --- a/src/update/facet_levels.rs +++ b/src/update/facets.rs @@ -24,7 +24,7 @@ pub enum EasingName { Linear, } -pub struct FacetLevels<'t, 'u, 'i> { +pub struct Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, pub(crate) chunk_compression_type: CompressionType, @@ -35,9 +35,9 @@ pub struct FacetLevels<'t, 'u, 'i> { easing_function: EasingName, } -impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> FacetLevels<'t, 'u, 'i> { - FacetLevels { +impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> { + Facets { wtxn, index, chunk_compression_type: CompressionType::None, @@ -70,7 +70,7 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); for (field_id, facet_type) in faceted_fields { - let content = match facet_type { + let (content, documents_ids) = match facet_type { FacetType::Integer => { clear_field_levels::( self.wtxn, @@ -78,7 +78,13 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { field_id, )?; - compute_facet_levels::( + let documents_ids = compute_faceted_documents_ids( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; + + let content = compute_facet_levels::( self.wtxn, self.index.facet_field_id_value_docids, self.chunk_compression_type, @@ -88,7 +94,9 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { self.number_of_levels, self.easing_function, field_id, - )? + )?; + + (Some(content), documents_ids) }, FacetType::Float => { clear_field_levels::( @@ -97,7 +105,13 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { field_id, )?; - compute_facet_levels::( + let documents_ids = compute_faceted_documents_ids( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; + + let content = compute_facet_levels::( self.wtxn, self.index.facet_field_id_value_docids, self.chunk_compression_type, @@ -107,18 +121,32 @@ impl<'t, 'u, 'i> FacetLevels<'t, 'u, 'i> { self.number_of_levels, self.easing_function, field_id, - )? + )?; + + (Some(content), documents_ids) + }, + FacetType::String => { + let documents_ids = compute_faceted_documents_ids( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; + + (None, documents_ids) }, - FacetType::String => continue, }; - write_into_lmdb_database( - self.wtxn, - *self.index.facet_field_id_value_docids.as_polymorph(), - content, - |_, _| anyhow::bail!("invalid facet level merging"), - WriteMethod::GetMergePut, - )?; + if let Some(content) = content { + write_into_lmdb_database( + self.wtxn, + *self.index.facet_field_id_value_docids.as_polymorph(), + content, + |_, _| anyhow::bail!("invalid facet level merging"), + WriteMethod::GetMergePut, + )?; + } + + self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?; } Ok(()) @@ -138,9 +166,7 @@ where let left = (field_id, 1, T::min_value(), T::min_value()); let right = (field_id, u8::MAX, T::max_value(), T::max_value()); let range = left..=right; - db.remap_key_type::() - .delete_range(wtxn, &range) - .map(drop) + db.remap_key_type::().delete_range(wtxn, &range).map(drop) } fn compute_facet_levels<'t, T: 't, KC>( @@ -217,6 +243,20 @@ where writer_into_reader(writer, shrink_size) } +fn compute_faceted_documents_ids( + rtxn: &heed::RoTxn, + db: heed::Database, + field_id: u8, +) -> anyhow::Result +{ + let mut documents_ids = RoaringBitmap::new(); + for result in db.prefix_iter(rtxn, &[field_id])? { + let (_key, docids) = result?; + documents_ids.union_with(&docids); + } + Ok(documents_ids) +} + fn write_entry( writer: &mut Writer, field_id: u8, diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index d48696e94..362175ce5 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -16,7 +16,7 @@ use rayon::prelude::*; use rayon::ThreadPool; use crate::index::Index; -use crate::update::{FacetLevels, UpdateIndexingStep}; +use crate::update::{Facets, UpdateIndexingStep}; use self::store::{Store, Readers}; use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, @@ -584,7 +584,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }); } - let mut builder = FacetLevels::new(self.wtxn, self.index); + let mut builder = Facets::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; diff --git a/src/update/mod.rs b/src/update/mod.rs index 87035065c..416e88464 100644 --- a/src/update/mod.rs +++ b/src/update/mod.rs @@ -1,7 +1,7 @@ mod available_documents_ids; mod clear_documents; mod delete_documents; -mod facet_levels; +mod facets; mod index_documents; mod settings; mod update_builder; @@ -12,7 +12,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::DeleteDocuments; pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat}; -pub use self::facet_levels::{FacetLevels, EasingName}; +pub use self::facets::{Facets, EasingName}; pub use self::settings::Settings; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; diff --git a/src/update/update_builder.rs b/src/update/update_builder.rs index 8f7f1a0a8..b973bd535 100644 --- a/src/update/update_builder.rs +++ b/src/update/update_builder.rs @@ -2,7 +2,7 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, FacetLevels}; +use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -119,13 +119,13 @@ impl<'a> UpdateBuilder<'a> { builder } - pub fn facet_levels<'t, 'u, 'i>( + pub fn facets<'t, 'u, 'i>( self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> FacetLevels<'t, 'u, 'i> + ) -> Facets<'t, 'u, 'i> { - let mut builder = FacetLevels::new(wtxn, index); + let mut builder = Facets::new(wtxn, index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level;