From 39a4a0a362f4803016072b54a0cbcf88ccb3a55f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 08:27:16 +0200 Subject: [PATCH] Reintroduce filter range search and facet extractors --- milli/src/search/facet/facet_range_search.rs | 12 +- milli/src/search/facet/filter.rs | 248 +++++------------- milli/src/update/delete_documents.rs | 10 +- .../extract/extract_facet_number_docids.rs | 13 +- .../extract/extract_facet_string_docids.rs | 40 +-- 5 files changed, 92 insertions(+), 231 deletions(-) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index c01346b25..75db9fda2 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -15,7 +15,7 @@ use super::get_last_facet_value; pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, @@ -48,13 +48,13 @@ where } Bound::Unbounded => Bound::Unbounded, }; - + let db = db.remap_key_type::>(); let mut docids = RoaringBitmap::new(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; - let highest_level = get_highest_level(rtxn, db, field_id)?; + let mut f = FacetRangeSearch { rtxn, db: &db, field_id, left, right, docids: &mut docids }; + let highest_level = get_highest_level(rtxn, &db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + if let Some(first_bound) = get_first_facet_value::(rtxn, &db, field_id)? { + let last_bound = get_last_facet_value::(rtxn, &db, field_id)?.unwrap(); f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; Ok(docids) } else { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index dd34abe6d..79d7f5e0f 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,22 +1,17 @@ -use std::collections::HashSet; -use std::fmt::{Debug, Display}; -use std::ops::Bound::{self, Excluded, Included}; -use std::ops::RangeBounds; - use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; -use heed::LazyDecode; use roaring::RoaringBitmap; +use std::collections::HashSet; +use std::fmt::{Debug, Display}; +use std::ops::Bound::{self, Excluded, Included}; -// use super::FacetNumberRange; use crate::error::{Error, UserError}; use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; -// use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::{ - distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, -}; +use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; + +use super::facet_range_search; /// The maximum number of filters the filter AST can process. const MAX_FILTER_DEPTH: usize = 2000; @@ -147,158 +142,15 @@ impl<'a> Filter<'a> { } } -fn explore_facet_number_levels( - rtxn: &heed::RoTxn, - db: heed::Database, FacetGroupValueCodec>, - field_id: FieldId, -) { -} - impl<'a> Filter<'a> { - /// Aggregates the documents ids that are part of the specified range automatically - /// going deeper through the levels. - fn explore_facet_number_levels( - rtxn: &heed::RoTxn, - db: heed::Database, CboRoaringBitmapCodec>, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - output: &mut RoaringBitmap, - ) -> Result<()> { - // level must be > 0, I'll create a separate function for level 0 - // if level == 0 { - // call that function - //} - match (left, right) { - // If the request is an exact value we must go directly to the deepest level. - (Included(l), Included(r)) if l == r && level > 0 => { - return Self::explore_facet_number_levels( - rtxn, db, field_id, 0, left, right, output, - ); - } - // lower TO upper when lower > upper must return no result - (Included(l), Included(r)) if l > r => return Ok(()), - (Included(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Included(r)) if l >= r => return Ok(()), - (_, _) => (), - } - let range_start_key = FacetKey { - field_id, - level, - left_bound: match left { - Included(l) => l, - Excluded(l) => l, - Bound::Unbounded => f64::MIN, - }, - }; - let mut range_iter = db - .remap_data_type::>() - .range(rtxn, &(range_start_key..))?; + pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { + // to avoid doing this for each recursive call we're going to do it ONCE ahead of time + let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; + let filterable_fields = index.filterable_fields(rtxn)?; - let (mut previous_facet_key, mut previous_value) = range_iter.next().unwrap()?; - while let Some(el) = range_iter.next() { - let (facet_key, value) = el?; - let range = (Included(previous_facet_key.left_bound), Excluded(facet_key.left_bound)); - // if the current range intersects with the query range, then go deeper - // what does it mean for two ranges to intersect? - let gte_left = match left { - Included(l) => previous_facet_key.left_bound >= l, - Excluded(l) => previous_facet_key.left_bound > l, // TODO: not true? - Bound::Unbounded => true, - }; - let lte_right = match right { - Included(r) => facet_key.left_bound <= r, - Excluded(r) => facet_key.left_bound < r, - Bound::Unbounded => true, - }; - } - // at this point, previous_facet_key and previous_value are the last groups in the level - // we must also check whether we should visit this group - - todo!(); - - // let mut left_found = None; - // let mut right_found = None; - - // // We must create a custom iterator to be able to iterate over the - // // requested range as the range iterator cannot express some conditions. - // let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; - - // debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - // for (i, result) in iter.enumerate() { - // let ((_fid, level, l, r), docids) = result?; - // debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - // *output |= docids; - // // We save the leftest and rightest bounds we actually found at this level. - // if i == 0 { - // left_found = Some(l); - // } - // right_found = Some(r); - // } - - // // Can we go deeper? - // let deeper_level = match level.checked_sub(1) { - // Some(level) => level, - // None => return Ok(()), - // }; - - // // We must refine the left and right bounds of this range by retrieving the - // // missing part in a deeper level. - // match left_found.zip(right_found) { - // Some((left_found, right_found)) => { - // // If the bound is satisfied we avoid calling this function again. - // if !matches!(left, Included(l) if l == left_found) { - // let sub_right = Excluded(left_found); - // debug!( - // "calling left with {:?} to {:?} (level {})", - // left, sub_right, deeper_level - // ); - // Self::explore_facet_number_levels( - // rtxn, - // db, - // field_id, - // deeper_level, - // left, - // sub_right, - // output, - // )?; - // } - // if !matches!(right, Included(r) if r == right_found) { - // let sub_left = Excluded(right_found); - // debug!( - // "calling right with {:?} to {:?} (level {})", - // sub_left, right, deeper_level - // ); - // Self::explore_facet_number_levels( - // rtxn, - // db, - // field_id, - // deeper_level, - // sub_left, - // right, - // output, - // )?; - // } - // } - // None => { - // // If we found nothing at this level it means that we must find - // // the same bounds but at a deeper, more precise level. - // Self::explore_facet_number_levels( - // rtxn, - // db, - // field_id, - // deeper_level, - // left, - // right, - // output, - // )?; - // } - // } - - // Ok(()) + // and finally we delete all the soft_deleted_documents, again, only once at the very end + self.inner_evaluate(rtxn, index, &filterable_fields) + .map(|result| result - soft_deleted_documents) } fn evaluate_operator( @@ -337,15 +189,15 @@ impl<'a> Filter<'a> { Some(n) => { let n = Included(n); let mut output = RoaringBitmap::new(); - // Self::explore_facet_number_levels( - // rtxn, - // numbers_db, - // field_id, - // 0, - // n, - // n, - // &mut output, - // )?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + 0, + n, + n, + &mut output, + )?; output } None => RoaringBitmap::new(), @@ -381,29 +233,53 @@ impl<'a> Filter<'a> { match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - // Self::explore_facet_number_levels( - // rtxn, - // numbers_db, - // field_id, - // level, - // left, - // right, - // &mut output, - // )?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + level, + left, + right, + &mut output, + )?; Ok(output) } None => Ok(RoaringBitmap::new()), } } - pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { - // to avoid doing this for each recursive call we're going to do it ONCE ahead of time - let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; - let filterable_fields = index.filterable_fields(rtxn)?; + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. + fn explore_facet_number_levels( + rtxn: &heed::RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> Result<()> { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. + (Included(l), Included(r)) if l == r && level > 0 => { + return Self::explore_facet_number_levels( + rtxn, db, field_id, 0, left, right, output, + ); + } + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + let x = facet_range_search::find_docids_of_facet_within_bounds::( + rtxn, &db, field_id, &left, &right, + )?; + // TODO: the facet range search should take a mutable roaring bitmap as argument + *output = x; - // and finally we delete all the soft_deleted_documents, again, only once at the very end - self.inner_evaluate(rtxn, index, &filterable_fields) - .map(|result| result - soft_deleted_documents) + Ok(()) } fn inner_evaluate( diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 32b2ac986..e16d98e74 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,22 +2,20 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::{BytesDecode, BytesEncode, Database}; -use obkv::Key; +use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; use super::{ClearDocuments, Facets}; -use crate::error::{InternalError, SerializationError, UserError}; -// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; +use crate::error::{InternalError, UserError}; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - fields_ids_map, DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, - FieldsIdsMap, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, + DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, + RoaringBitmapCodec, SmallString32, BEU32, }; pub struct DeleteDocuments<'t, 'u, 'i> { diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index c5424a346..eece08ee3 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,6 +6,8 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::Result; @@ -31,14 +33,13 @@ pub fn extract_facet_number_docids( let mut cursor = docid_fid_facet_number.into_cursor()?; while let Some((key_bytes, _)) = cursor.move_on_next()? { - todo!() - // let (field_id, document_id, number) = - // FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); + let (field_id, document_id, number) = + FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); - // let key = (field_id, 0, number, number); - // // let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); + let key = FacetKey { field_id, level: 0, left_bound: number }; + let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); - // facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } sorter_into_reader(facet_number_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 4e655329e..51d2df923 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -1,13 +1,11 @@ -use std::fs::File; -use std::iter::FromIterator; -use std::{io, str}; - -use roaring::RoaringBitmap; - use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; use crate::update::index_documents::merge_cbo_roaring_bitmaps; -// use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; use crate::{FieldId, Result}; +use heed::BytesEncode; +use std::fs::File; +use std::io; /// Extracts the facet string and the documents ids where this facet string appear. /// @@ -22,38 +20,26 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_cbo_roaring_bitmaps, // TODO: check + merge_cbo_roaring_bitmaps, // TODO: check that it is correct indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); - let mut key_buffer = Vec::new(); - let mut value_buffer = Vec::new(); let mut cursor = docid_fid_facet_string.into_cursor()?; - while let Some((key, original_value_bytes)) = cursor.move_on_next()? { + while let Some((key, _original_value_bytes)) = cursor.move_on_next()? { let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); - let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap(); - let document_id = u32::from_be_bytes(document_id_bytes); - let original_value = str::from_utf8(original_value_bytes)?; - key_buffer.clear(); - // TODO - // FacetStringLevelZeroCodec::serialize_into( - // field_id, - // str::from_utf8(normalized_value_bytes)?, - // &mut key_buffer, - // ); + let (document_id_bytes, normalized_value_bytes) = + try_split_array_at::<_, 4>(bytes).unwrap(); - value_buffer.clear(); - // TODO - // encode_prefix_string(original_value, &mut value_buffer)?; - let bitmap = RoaringBitmap::from_iter(Some(document_id)); - bitmap.serialize_into(&mut value_buffer)?; + let normalised_value = std::str::from_utf8(normalized_value_bytes)?; + let key = FacetKey { field_id, level: 0, left_bound: normalised_value }; + let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); - facet_string_docids_sorter.insert(&key_buffer, &value_buffer)?; + facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?; } sorter_into_reader(facet_string_docids_sorter, indexer)