diff --git a/infos/src/main.rs b/infos/src/main.rs index d5d1ad0af..da15251b0 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -627,14 +627,14 @@ fn facet_values_docids( FacetType::String => { wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { - let ((_fid, value), docids) = result?; + let ((_fid, normalized), (_original, docids)) = result?; let count = docids.len(); let docids = if debug { format!("{:?}", docids) } else { format!("{:?}", docids.iter().collect::>()) }; - wtr.write_record(&[value.to_string(), count.to_string(), docids])?; + wtr.write_record(&[normalized.to_string(), count.to_string(), docids])?; } } } diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs new file mode 100644 index 000000000..597335b6e --- /dev/null +++ b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs @@ -0,0 +1,53 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::num::NonZeroU8; + +use crate::{try_split_array_at, FieldId}; + +/// A codec that stores the field id, level 1 and higher and the groups ids. +/// +/// It can only be used to encode the facet string of the level 1 or higher. +pub struct FacetLevelValueU32Codec; + +impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec { + type DItem = (FieldId, NonZeroU8, u32, u32); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + let (level, bytes) = bytes.split_first()?; + let level = NonZeroU8::new(*level)?; + let left = bytes[8..12].try_into().ok().map(u32::from_be_bytes)?; + let right = bytes[12..].try_into().ok().map(u32::from_be_bytes)?; + Some((field_id, level, left, right)) + } +} + +impl heed::BytesEncode<'_> for FacetLevelValueU32Codec { + type EItem = (FieldId, NonZeroU8, u32, u32); + + fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { + let mut buffer = [0u8; 16]; + + // Write the big-endian integers. + let bytes = left.to_be_bytes(); + buffer[..4].copy_from_slice(&bytes[..]); + + let bytes = right.to_be_bytes(); + buffer[4..8].copy_from_slice(&bytes[..]); + + // Then the u32 values just to be able to read them back. + let bytes = left.to_be_bytes(); + buffer[8..12].copy_from_slice(&bytes[..]); + + let bytes = right.to_be_bytes(); + buffer[12..].copy_from_slice(&bytes[..]); + + let mut bytes = Vec::with_capacity(buffer.len() + 2 + 1); + bytes.extend_from_slice(&field_id.to_be_bytes()); + bytes.push(level.get()); + bytes.extend_from_slice(&buffer); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs new file mode 100644 index 000000000..009c6454a --- /dev/null +++ b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs @@ -0,0 +1,50 @@ +use std::borrow::Cow; +use std::str; + +use crate::{try_split_array_at, FieldId}; + +/// A codec that stores the field id, level 0, and facet string. +/// +/// It can only be used to encode the facet string of the level 0, +/// as it hardcodes the level. +/// +/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys, +/// and make sure that the levels are not mixed-up. The level 0 is special, the key +/// are strings, other levels represent groups and keys are simply two integers. +pub struct FacetStringLevelZeroCodec; + +impl FacetStringLevelZeroCodec { + pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { + out.reserve(value.len() + 2); + out.extend_from_slice(&field_id.to_be_bytes()); + out.push(0); // the level zero (for LMDB ordering only) + out.extend_from_slice(value.as_bytes()); + } +} + +impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { + type DItem = (FieldId, &'a str); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + let (level, bytes) = bytes.split_first()?; + + if *level != 0 { + return None; + } + + let value = str::from_utf8(bytes).ok()?; + Some((field_id, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec { + type EItem = (FieldId, &'a str); + + fn bytes_encode((field_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::new(); + FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes); + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs new file mode 100644 index 000000000..b2434d453 --- /dev/null +++ b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -0,0 +1,80 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::{marker, str}; + +use super::try_split_at; + +/// A codec that encodes a string in front of the value. +/// +/// The usecase is for the facet string levels algorithm where we must know the +/// original string of a normalized facet value, the original values are stored +/// in the value to not break the lexicographical ordering of the LMDB keys. +pub struct FacetStringLevelZeroValueCodec(marker::PhantomData); + +impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec +where + C: heed::BytesDecode<'a>, +{ + type DItem = (&'a str, C::DItem); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (string_len, bytes) = try_split_at(bytes, 2)?; + let string_len = string_len.try_into().ok().map(u16::from_be_bytes)?; + + let (string, bytes) = try_split_at(bytes, string_len as usize)?; + let string = str::from_utf8(string).ok()?; + + C::bytes_decode(bytes).map(|item| (string, item)) + } +} + +impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec +where + C: heed::BytesEncode<'a>, +{ + type EItem = (&'a str, C::EItem); + + fn bytes_encode((string, value): &'a Self::EItem) -> Option> { + let string_len: u16 = string.len().try_into().ok()?; + let value_bytes = C::bytes_encode(&value)?; + + let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); + bytes.extend_from_slice(&string_len.to_be_bytes()); + bytes.extend_from_slice(string.as_bytes()); + bytes.extend_from_slice(&value_bytes[..]); + + Some(Cow::Owned(bytes)) + } +} + +#[cfg(test)] +mod tests { + use heed::types::Unit; + use heed::{BytesDecode, BytesEncode}; + use roaring::RoaringBitmap; + + use super::*; + use crate::CboRoaringBitmapCodec; + + #[test] + fn deserialize_roaring_bitmaps() { + let string = "abc"; + let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); + let key = (string, docids.clone()); + let bytes = + FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); + let (out_string, out_docids) = + FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_string, out_docids), (string, docids)); + } + + #[test] + fn deserialize_unit() { + let string = "def"; + let key = (string, ()); + let bytes = FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); + let (out_string, out_unit) = + FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_string, out_unit), (string, ())); + } +} diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs new file mode 100644 index 000000000..337433c2b --- /dev/null +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -0,0 +1,114 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::{marker, str}; + +use super::try_split_at; + +/// A codec that optionally encodes two strings in front of the value. +/// +/// The usecase is for the facet string levels algorithm where we must +/// know the origin of a group, the group left and right bounds are stored +/// in the value to not break the lexicographical ordering of the LMDB keys. +pub struct FacetStringZeroBoundsValueCodec(marker::PhantomData); + +impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec +where + C: heed::BytesDecode<'a>, +{ + type DItem = (Option<(&'a str, &'a str)>, C::DItem); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (contains_bounds, bytes) = bytes.split_first()?; + + if *contains_bounds != 0 { + let (left_len, bytes) = try_split_at(bytes, 2)?; + let (right_len, bytes) = try_split_at(bytes, 2)?; + + let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?; + let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?; + + let (left, bytes) = try_split_at(bytes, left_len as usize)?; + let (right, bytes) = try_split_at(bytes, right_len as usize)?; + + let left = str::from_utf8(left).ok()?; + let right = str::from_utf8(right).ok()?; + + C::bytes_decode(bytes).map(|item| (Some((left, right)), item)) + } else { + C::bytes_decode(bytes).map(|item| (None, item)) + } + } +} + +impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec +where + C: heed::BytesEncode<'a>, +{ + type EItem = (Option<(&'a str, &'a str)>, C::EItem); + + fn bytes_encode((bounds, value): &'a Self::EItem) -> Option> { + let mut bytes = Vec::new(); + + match bounds { + Some((left, right)) => { + bytes.push(u8::max_value()); + + if left.is_empty() || right.is_empty() { + return None; + } + + let left_len: u16 = left.len().try_into().ok()?; + let right_len: u16 = right.len().try_into().ok()?; + + bytes.extend_from_slice(&left_len.to_be_bytes()); + bytes.extend_from_slice(&right_len.to_be_bytes()); + + bytes.extend_from_slice(left.as_bytes()); + bytes.extend_from_slice(right.as_bytes()); + + let value_bytes = C::bytes_encode(&value)?; + bytes.extend_from_slice(&value_bytes[..]); + + Some(Cow::Owned(bytes)) + } + None => { + bytes.push(0); + let value_bytes = C::bytes_encode(&value)?; + bytes.extend_from_slice(&value_bytes[..]); + Some(Cow::Owned(bytes)) + } + } + } +} + +#[cfg(test)] +mod tests { + use heed::types::Unit; + use heed::{BytesDecode, BytesEncode}; + use roaring::RoaringBitmap; + + use super::*; + use crate::CboRoaringBitmapCodec; + + #[test] + fn deserialize_roaring_bitmaps() { + let bounds = Some(("abc", "def")); + let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); + let key = (bounds, docids.clone()); + let bytes = + FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); + let (out_bounds, out_docids) = + FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_bounds, out_docids), (bounds, docids)); + } + + #[test] + fn deserialize_unit() { + let bounds = Some(("abc", "def")); + let key = (bounds, ()); + let bytes = FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); + let (out_bounds, out_unit) = + FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_bounds, out_unit), (bounds, ())); + } +} diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs index 36408f578..178bb21c1 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -9,13 +9,13 @@ impl FieldDocIdFacetStringCodec { pub fn serialize_into( field_id: FieldId, document_id: DocumentId, - value: &str, + normalized_value: &str, out: &mut Vec, ) { - out.reserve(2 + 4 + value.len()); + out.reserve(2 + 4 + normalized_value.len()); out.extend_from_slice(&field_id.to_be_bytes()); out.extend_from_slice(&document_id.to_be_bytes()); - out.extend_from_slice(value.as_bytes()); + out.extend_from_slice(normalized_value.as_bytes()); } } @@ -29,17 +29,22 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { let (document_id_bytes, bytes) = try_split_array_at(bytes)?; let document_id = u32::from_be_bytes(document_id_bytes); - let value = str::from_utf8(bytes).ok()?; - Some((field_id, document_id, value)) + let normalized_value = str::from_utf8(bytes).ok()?; + Some((field_id, document_id, normalized_value)) } } impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { type EItem = (FieldId, DocumentId, &'a str); - fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option> { let mut bytes = Vec::new(); - FieldDocIdFacetStringCodec::serialize_into(*field_id, *document_id, value, &mut bytes); + FieldDocIdFacetStringCodec::serialize_into( + *field_id, + *document_id, + normalized_value, + &mut bytes, + ); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 532da12fa..a6a805bf7 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,9 +1,25 @@ mod facet_level_value_f64_codec; -mod facet_value_string_codec; +mod facet_level_value_u32_codec; +mod facet_string_level_zero_codec; +mod facet_string_level_zero_value_codec; +mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; -pub use self::facet_value_string_codec::FacetValueStringCodec; +pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; +pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; +pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec; +pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; + +/// Tries to split a slice in half at the given middle point, +/// `None` if the slice is too short. +pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if slice.len() >= mid { + Some(slice.split_at(mid)) + } else { + None + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 099a5891d..f26643de7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -11,8 +11,8 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, @@ -90,13 +90,14 @@ pub struct Index { /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, - /// Maps the facet field id and the string with the docids that corresponds to it. - pub facet_id_string_docids: Database, + /// Maps the facet field id and the string with the original string and docids that corresponds to it. + pub facet_id_string_docids: + Database>, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, /// Maps the document id, the facet field id and the strings. - pub field_id_docid_facet_strings: Database, + pub field_id_docid_facet_strings: Database, /// Maps the document id to the document as an obkv store. pub documents: Database, ObkvCodec>, diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index ccee2c393..99d63c90d 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -8,7 +8,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::FacetIter; +use crate::search::facet::FacetNumberIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -172,8 +172,11 @@ fn facet_ordered<'t>( let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; Ok(Box::new(iter.map(Ok)) as Box>) } else { - let facet_fn = - if ascending { FacetIter::new_reducing } else { FacetIter::new_reverse_reducing }; + let facet_fn = if ascending { + FacetNumberIter::new_reducing + } else { + FacetNumberIter::new_reverse_reducing + }; let iter = facet_fn(rtxn, index, field_id, candidates)?; Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 91620da2a..4436d4cda 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -1,6 +1,7 @@ use std::mem::size_of; -use heed::types::ByteSlice; +use concat_arrays::concat_arrays; +use heed::types::{ByteSlice, Str, Unit}; use roaring::RoaringBitmap; use super::{Distinct, DocIter}; @@ -43,7 +44,10 @@ pub struct FacetDistinctIter<'a> { impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result> { - self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key)) + self.index + .facet_id_string_docids + .get(self.txn, &(self.distinct, key)) + .map(|result| result.map(|(_original, docids)| docids)) } fn facet_number_docids(&self, key: f64) -> heed::Result> { @@ -116,10 +120,7 @@ impl<'a> FacetDistinctIter<'a> { } fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] { - let mut key = [0; FID_SIZE + DOCID_SIZE]; - key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); - key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); - key + concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) } fn facet_number_values<'a>( @@ -127,7 +128,7 @@ fn facet_number_values<'a>( distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> Result> { +) -> Result> { let key = facet_values_prefix_key(distinct, id); let iter = index @@ -144,14 +145,14 @@ fn facet_string_values<'a>( distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> Result> { +) -> Result> { let key = facet_values_prefix_key(distinct, id); let iter = index .field_id_docid_facet_strings .remap_key_type::() .prefix_iter(txn, &key)? - .remap_key_type::(); + .remap_types::(); Ok(iter) } diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index ae3fdb91e..e7dc52a82 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -47,7 +47,7 @@ mod test { let mut documents = Vec::new(); - let txts = ["toto", "titi", "tata"]; + let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); @@ -90,7 +90,6 @@ mod test { addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); addition.update_format(UpdateFormat::Json); - addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index b0b22ac49..94f875dfc 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,16 +1,17 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; -use std::{cmp, fmt}; +use std::{cmp, fmt, mem}; -use heed::types::{ByteSlice, Unit}; -use heed::{BytesDecode, Database}; +use heed::types::ByteSlice; use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::FacetValueStringCodec; -use crate::search::facet::{FacetIter, FacetRange}; -use crate::{DocumentId, FieldId, Index, Result}; +use crate::heed_codec::facet::{ + FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, +}; +use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; +use crate::{FieldId, Index, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -22,7 +23,7 @@ const MAX_VALUES_BY_FACET: usize = 1000; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. -const CANDIDATES_THRESHOLD: u64 = 1000; +const CANDIDATES_THRESHOLD: u64 = 3000; pub struct FacetDistribution<'a> { facets: Option>, @@ -67,46 +68,63 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - fn fetch_facet_values<'t, KC, K: 't>( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - candidates: &RoaringBitmap, - distribution: &mut BTreeMap, - ) -> heed::Result<()> - where - K: fmt::Display, - KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, - { - let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); - - for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = db - .remap_key_type::() - .prefix_iter(rtxn, &key_buffer)? - .remap_key_type::(); - - for result in iter { - let ((_, _, value), ()) = result?; - *distribution.entry(value.to_string()).or_insert(0) += 1; - } - } - - Ok(()) - } - match facet_type { FacetType::Number => { + let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + + let distribution_prelength = distribution.len(); let db = self.index.field_id_docid_facet_f64s; - fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) + for docid in candidates.into_iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = db + .remap_key_type::() + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + + for result in iter { + let ((_, _, value), ()) = result?; + *distribution.entry(value.to_string()).or_insert(0) += 1; + if distribution.len() - distribution_prelength == self.max_values_by_facet { + break; + } + } + } } FacetType::String => { + let mut normalized_distribution = BTreeMap::new(); + let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let db = self.index.field_id_docid_facet_strings; - fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) + for docid in candidates.into_iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = db + .remap_key_type::() + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + + for result in iter { + let ((_, _, normalized_value), original_value) = result?; + let (_, count) = normalized_distribution + .entry(normalized_value) + .or_insert_with(|| (original_value, 0)); + *count += 1; + + if normalized_distribution.len() == self.max_values_by_facet { + break; + } + } + } + + let iter = normalized_distribution + .into_iter() + .map(|(_normalized, (original, count))| (original.to_string(), count)); + distribution.extend(iter); } } + + Ok(()) } /// There is too much documents, we use the facet levels to move throught @@ -118,7 +136,7 @@ impl<'a> FacetDistribution<'a> { distribution: &mut BTreeMap, ) -> heed::Result<()> { let iter = - FacetIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; for result in iter { let (value, mut docids) = result?; @@ -134,6 +152,29 @@ impl<'a> FacetDistribution<'a> { Ok(()) } + fn facet_strings_distribution_from_facet_levels( + &self, + field_id: FieldId, + candidates: &RoaringBitmap, + distribution: &mut BTreeMap, + ) -> heed::Result<()> { + let iter = + FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + + for result in iter { + let (_normalized, original, mut docids) = result?; + docids &= candidates; + if !docids.is_empty() { + distribution.insert(original.to_string(), docids.len()); + } + if distribution.len() == self.max_values_by_facet { + break; + } + } + + Ok(()) + } + /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the /// facet values one by one and iterate on the facet level 0 for numbers. fn facet_values_from_raw_facet_database( @@ -143,7 +184,7 @@ impl<'a> FacetDistribution<'a> { let mut distribution = BTreeMap::new(); let db = self.index.facet_id_f64_docids; - let range = FacetRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; for result in range { let ((_, _, value, _), docids) = result?; @@ -158,16 +199,22 @@ impl<'a> FacetDistribution<'a> { .facet_id_string_docids .remap_key_type::() .prefix_iter(self.rtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + .remap_key_type::(); + let mut normalized_distribution = BTreeMap::new(); for result in iter { - let ((_, value), docids) = result?; - distribution.insert(value.to_string(), docids.len()); + let ((_, normalized_value), (original_value, docids)) = result?; + normalized_distribution.insert(normalized_value, (original_value, docids.len())); if distribution.len() == self.max_values_by_facet { break; } } + let iter = normalized_distribution + .into_iter() + .map(|(_normalized, (original, count))| (original.to_string(), count)); + distribution.extend(iter); + Ok(distribution) } @@ -198,14 +245,12 @@ impl<'a> FacetDistribution<'a> { candidates, &mut distribution, )?; - self.facet_distribution_from_documents( + self.facet_strings_distribution_from_facet_levels( field_id, - String, candidates, &mut distribution, )?; } - Ok(distribution) } None => self.facet_values_from_raw_facet_database(field_id), diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs new file mode 100644 index 000000000..02390aac1 --- /dev/null +++ b/milli/src/search/facet/facet_number.rs @@ -0,0 +1,248 @@ +use std::ops::Bound::{self, Excluded, Included, Unbounded}; + +use either::Either::{self, Left, Right}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{Database, LazyDecode, RoRange, RoRevRange}; +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::{FieldId, Index}; + +pub struct FacetNumberRange<'t> { + iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, + end: Bound, +} + +impl<'t> FacetNumberRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + ) -> heed::Result> { + let left_bound = match left { + Included(left) => Included((field_id, level, left, f64::MIN)), + Excluded(left) => Excluded((field_id, level, left, f64::MIN)), + Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), + }; + let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); + let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; + Ok(FacetNumberRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetNumberRange<'t> { + type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => Some(Ok(((fid, level, left, right), docids))), + Err(e) => Some(Err(e)), + } + } else { + None + } + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + +pub struct FacetNumberRevRange<'t> { + iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, + end: Bound, +} + +impl<'t> FacetNumberRevRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + ) -> heed::Result> { + let left_bound = match left { + Included(left) => Included((field_id, level, left, f64::MIN)), + Excluded(left) => Excluded((field_id, level, left, f64::MIN)), + Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), + }; + let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); + let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; + Ok(FacetNumberRevRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetNumberRevRange<'t> { + type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; + + fn next(&mut self) -> Option { + loop { + match self.iter.next() { + Some(Ok(((fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), + Err(e) => return Some(Err(e)), + } + } + continue; + } + Some(Err(e)) => return Some(Err(e)), + None => return None, + } + } + } +} + +pub struct FacetNumberIter<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: Database, + field_id: FieldId, + level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, + must_reduce: bool, +} + +impl<'t> FacetNumberIter<'t> { + /// Create a `FacetNumberIter` that will iterate on the different facet entries + /// (facet value + documents ids) and that will reduce the given documents ids + /// while iterating on the different facet levels. + pub fn new_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_f64_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = + FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Left(highest_iter))]; + Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) + } + + /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse + /// (facet value + documents ids) and that will reduce the given documents ids + /// while iterating on the different facet levels. + pub fn new_reverse_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_f64_docids; + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = + FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Right(highest_iter))]; + Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) + } + + /// Create a `FacetNumberIter` that will iterate on the different facet entries + /// (facet value + documents ids) and that will not reduce the given documents ids + /// while iterating on the different facet levels, possibly returning multiple times + /// a document id associated with multiple facet values. + pub fn new_non_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_f64_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = + FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Left(highest_iter))]; + Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) + } + + fn highest_level( + rtxn: &'t heed::RoTxn, + db: Database, + fid: FieldId, + ) -> heed::Result> { + let level = db + .remap_types::() + .prefix_iter(rtxn, &fid.to_be_bytes())? + .remap_key_type::() + .last() + .transpose()? + .map(|((_, level, _, _), _)| level); + Ok(level) + } +} + +impl<'t> Iterator for FacetNumberIter<'t> { + type Item = heed::Result<(f64, RoaringBitmap)>; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, last) = self.level_iters.last_mut()?; + let is_ascending = last.is_left(); + for result in last { + // If the last iterator must find an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. + if documents_ids.is_empty() { + break; + } + + match result { + Ok(((_fid, level, left, right), mut docids)) => { + docids &= &*documents_ids; + if !docids.is_empty() { + if self.must_reduce { + *documents_ids -= &docids; + } + + if level == 0 { + return Some(Ok((left, docids))); + } + + let rtxn = self.rtxn; + let db = self.db; + let fid = self.field_id; + let left = Included(left); + let right = Included(right); + + let result = if is_ascending { + FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) + .map(Left) + } else { + FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) + .map(Right) + }; + + match result { + Ok(iter) => { + self.level_iters.push((docids, iter)); + continue 'outer; + } + Err(e) => return Some(Err(e)), + } + } + } + Err(e) => return Some(Err(e)), + } + } + self.level_iters.pop(); + } + } +} diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs new file mode 100644 index 000000000..40ea8c04a --- /dev/null +++ b/milli/src/search/facet/facet_string.rs @@ -0,0 +1,406 @@ +//! This module contains helpers iterators for facet strings. +//! +//! The purpose is to help iterate over the quite complex system of facets strings. A simple +//! description of the system would be that every facet string value is stored into an LMDB database +//! and that every value is associated with the document ids which are associated with this facet +//! string value. +//! +//! In reality it is a little bit more complex as we have to create aggregations of runs of facet +//! string values, those aggregations helps in choosing the right groups of facets to follow. +//! +//! ## A typical algorithm run +//! +//! If a group of aggregated facets values contains one of the documents ids, we must continue +//! iterating over the sub-groups. +//! +//! If this group is the lowest level and contain at least one document id we yield the associated +//! facet documents ids. +//! +//! If the group doesn't contain one of our documents ids, we continue to the next group at this +//! same level. +//! +//! ## The complexity comes from the strings +//! +//! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create +//! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the +//! two numbers bounds, the left and the right bound of the group, both inclusive. +//! +//! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and +//! puting two numbers big-endian encoded one after the other gives us ordered groups. The values +//! are simple unions of the documents ids coming from the groups below. +//! +//! ### Example of what a facet number LMDB database contain +//! +//! | level | left-bound | right-bound | documents ids | +//! |-------|------------|-------------|------------------| +//! | 0 | 0 | _skipped_ | 1, 2 | +//! | 0 | 1 | _skipped_ | 6, 7 | +//! | 0 | 3 | _skipped_ | 4, 7 | +//! | 0 | 5 | _skipped_ | 2, 3, 4 | +//! | 1 | 0 | 1 | 1, 2, 6, 7 | +//! | 1 | 3 | 5 | 2, 3, 4, 7 | +//! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | +//! +//! As you can see the level 0 have two equal bounds, therefore we skip serializing the second +//! bound, that's the base level where you can directly fetch the documents ids associated with an +//! exact number. +//! +//! The next levels have two different bounds and the associated documents ids are simply the result +//! of an union of all the documents ids associated with the aggregated groups above. +//! +//! ## The complexity of defining groups for facet strings +//! +//! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in +//! lexicographical order, it means that whatever the key represent the bytes are read in their raw +//! form and a simple `strcmp` will define the order in which keys will be read from the store. +//! +//! That's easy for types with a known size, like floats or integers, they are 64 bytes long and +//! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the +//! first number then by the second if the the first number is equal on two keys. +//! +//! For strings it is a lot more complex as those types are unsized, it means that the size of facet +//! strings is different for each facet value. +//! +//! ### Basic approach: padding the keys +//! +//! A first approach would be to simply define the maximum size of a facet string and pad the keys +//! with zeroes. The big problem of this approach is that it: +//! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the +//! other. +//! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB +//! performances. +//! +//! ### Better approach: number the facet groups +//! +//! A better approach would be to number the groups, this way we don't have the downsides of the +//! previously described approach but we need to be able to describe the groups by using a number. +//! +//! #### Example of facet strings with numbered groups +//! +//! | level | left-bound | right-bound | left-string | right-string | documents ids | +//! |-------|------------|-------------|-------------|--------------|------------------| +//! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | +//! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | +//! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | +//! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | +//! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | +//! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | +//! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | +//! +//! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not +//! need to store the facet string value two times. +//! +//! The number in the left-bound and right-bound columns are incremental numbers representing the +//! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering +//! of the LMDB keys. +//! +//! In the value, not in the key, you can see that we added two new values: the left-string and the +//! right-string, which defines the original facet strings associated with the given group. +//! +//! We put those two strings inside of the value, this way we do not limit the maximum size of the +//! facet string values, and the impact on performances is not important as, IIRC, LMDB put big +//! values on another page, this helps in iterating over keys fast enough and only fetch the page +//! with the values when required. +//! +//! The other little advantage with this solution is that there is no a big overhead, compared with +//! the facet number levels, we only duplicate the facet strings once for the level 1. +//! +//! #### A typical algorithm run +//! +//! Note that the algorithm is always moving from the highest level to the lowest one, one level +//! by one level, this is why it is ok to only store the facets string on the level 1. +//! +//! If a group of aggregated facets values, a group with numbers contains one of the documents ids, +//! we must continue iterating over the sub-groups. To do so: +//! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds +//! and iterate over the facet groups defined by these numbers over the current level - 1. +//! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the +//! value and just do the same as with the facet numbers but with strings: iterate over the +//! current level - 1 with both keys. +//! +//! If this group is the lowest level (level 0) and contain at least one document id we yield the +//! associated facet documents ids. +//! +//! If the group doesn't contain one of our documents ids, we continue to the next group at this +//! same level. +//! + +use std::num::NonZeroU8; +use std::ops::Bound; +use std::ops::Bound::{Excluded, Included, Unbounded}; + +use either::{Either, Left, Right}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{Database, LazyDecode, RoRange}; +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::{ + FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FacetStringZeroBoundsValueCodec, +}; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::{FieldId, Index}; + +/// An iterator that is used to explore the facets level strings +/// from the level 1 to infinity. +/// +/// It yields the level, group id that an entry covers, the optional group strings +/// that it covers of the level 0 only if it is an entry from the level 1 and +/// the roaring bitmap associated. +pub struct FacetStringGroupRange<'t> { + iter: RoRange< + 't, + FacetLevelValueU32Codec, + LazyDecode>, + >, + end: Bound, +} + +impl<'t> FacetStringGroupRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + level: NonZeroU8, + left: Bound, + right: Bound, + ) -> heed::Result> { + let db = db.remap_types::< + FacetLevelValueU32Codec, + FacetStringZeroBoundsValueCodec, + >(); + let left_bound = match left { + Included(left) => Included((field_id, level, left, u32::MIN)), + Excluded(left) => Excluded((field_id, level, left, u32::MIN)), + Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), + }; + let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); + let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; + Ok(FacetStringGroupRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetStringGroupRange<'t> { + type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((_fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), + Err(e) => Some(Err(e)), + } + } else { + None + } + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + +/// An iterator that is used to explore the level 0 of the facets string database. +/// +/// It yields the facet string and the roaring bitmap associated with it. +pub struct FacetStringLevelZeroRange<'t> { + iter: RoRange< + 't, + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, +} + +impl<'t> FacetStringLevelZeroRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + left: Bound<&str>, + right: Bound<&str>, + ) -> heed::Result> { + fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { + buffer.extend_from_slice(&field_id.to_be_bytes()); + buffer.push(0); + buffer.extend_from_slice(value.as_bytes()); + &buffer[..] + } + + let mut left_buffer = Vec::new(); + let left_bound = match left { + Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), + Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), + Unbounded => { + left_buffer.extend_from_slice(&field_id.to_be_bytes()); + left_buffer.push(0); + Included(&left_buffer[..]) + } + }; + + let mut right_buffer = Vec::new(); + let right_bound = match right { + Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), + Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), + Unbounded => { + right_buffer.extend_from_slice(&field_id.to_be_bytes()); + right_buffer.push(1); // we must only get the level 0 + Excluded(&right_buffer[..]) + } + }; + + let iter = db + .remap_key_type::() + .range(rtxn, &(left_bound, right_bound))? + .remap_types::< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec + >(); + + Ok(FacetStringLevelZeroRange { iter }) + } +} + +impl<'t> Iterator for FacetStringLevelZeroRange<'t> { + type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((_fid, normalized), (original, docids)))) => { + Some(Ok((normalized, original, docids))) + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + +/// An iterator that is used to explore the facet strings level by level, +/// it will only return facets strings that are associated with the +/// candidates documents ids given. +pub struct FacetStringIter<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: Database, + field_id: FieldId, + level_iters: + Vec<(RoaringBitmap, Either, FacetStringLevelZeroRange<'t>>)>, +} + +impl<'t> FacetStringIter<'t> { + pub fn new_non_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_string_docids.remap_types::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = match NonZeroU8::new(highest_level) { + Some(highest_level) => Left(FacetStringGroupRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + highest_level, + Unbounded, + Unbounded, + )?), + None => Right(FacetStringLevelZeroRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + Unbounded, + Unbounded, + )?), + }; + + Ok(FacetStringIter { rtxn, db, field_id, level_iters: vec![(documents_ids, highest_iter)] }) + } + + fn highest_level( + rtxn: &'t heed::RoTxn, + db: Database, + fid: FieldId, + ) -> heed::Result> { + Ok(db + .remap_types::() + .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits + .last() + .transpose()? + .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit + } +} + +impl<'t> Iterator for FacetStringIter<'t> { + type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, last) = self.level_iters.last_mut()?; + match last { + Left(last) => { + for result in last { + match result { + Ok(((level, left, right), (string_bounds, mut docids))) => { + docids &= &*documents_ids; + if !docids.is_empty() { + *documents_ids -= &docids; + + let result = match string_bounds { + Some((left, right)) => FacetStringLevelZeroRange::new( + self.rtxn, + self.db, + self.field_id, + Included(left), + Included(right), + ) + .map(Right), + None => FacetStringGroupRange::new( + self.rtxn, + self.db, + self.field_id, + NonZeroU8::new(level.get() - 1).unwrap(), + Included(left), + Included(right), + ) + .map(Left), + }; + + match result { + Ok(iter) => { + self.level_iters.push((docids, iter)); + continue 'outer; + } + Err(e) => return Some(Err(e)), + } + } + } + Err(e) => return Some(Err(e)), + } + } + } + Right(last) => { + // level zero only + for result in last { + match result { + Ok((normalized, original, mut docids)) => { + docids &= &*documents_ids; + if !docids.is_empty() { + *documents_ids -= &docids; + return Some(Ok((normalized, original, docids))); + } + } + Err(e) => return Some(Err(e)), + } + } + } + } + + self.level_iters.pop(); + } + } +} diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 1b1eafcab..cc108f855 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -15,9 +15,11 @@ use roaring::RoaringBitmap; use self::FilterCondition::*; use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; -use super::FacetRange; +use super::FacetNumberRange; use crate::error::UserError; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec}; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, +}; use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq)] @@ -282,7 +284,7 @@ impl FilterCondition { // We must create a custom iterator to be able to iterate over the // requested range as the range iterator cannot express some conditions. - let iter = FacetRange::new(rtxn, db, field_id, level, left, right)?; + let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; debug!("Iterating between {:?} and {:?} (level {})", left, right, level); @@ -363,7 +365,10 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, numbers_db: heed::Database, - strings_db: heed::Database, + strings_db: heed::Database< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, field_id: FieldId, operator: &Operator, ) -> Result { @@ -374,7 +379,8 @@ impl FilterCondition { GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), Equal(number, string) => { - let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); + let (_original_value, string_docids) = + strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); let number_docids = match number { Some(n) => { let n = Included(*n); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 9774bdd52..ddf710e32 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,253 +1,11 @@ -use std::ops::Bound::{self, Excluded, Included, Unbounded}; - -use either::Either::{self, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; -use roaring::RoaringBitmap; - pub use self::facet_distribution::FacetDistribution; +pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; +pub use self::facet_string::FacetStringIter; pub use self::filter_condition::{FilterCondition, Operator}; pub(crate) use self::parser::Rule as ParserRule; -use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{FieldId, Index}; mod facet_distribution; +mod facet_number; +mod facet_string; mod filter_condition; mod parser; - -pub struct FacetRange<'t> { - iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} - -impl<'t> FacetRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; - - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => Some(Ok(((fid, level, left, right), docids))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} - -pub struct FacetRevRange<'t> { - iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} - -impl<'t> FacetRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetRevRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetRevRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; - - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} - -pub struct FacetIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, FacetRevRange<'t>>)>, - must_reduce: bool, -} - -impl<'t> FacetIter<'t> { - /// Create a `FacetIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } - - /// Create a `FacetIter` that will iterate on the different facet entries in reverse - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. - pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Right(highest_iter))]; - Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } - - /// Create a `FacetIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will not reduce the given documents ids - /// while iterating on the different facet levels, possibly returning multiple times - /// a document id associated with multiple facet values. - pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) - } - - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - let level = db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? - .remap_key_type::() - .last() - .transpose()? - .map(|((_, level, _, _), _)| level); - Ok(level) - } -} - -impl<'t> Iterator for FacetIter<'t> { - type Item = heed::Result<(f64, RoaringBitmap)>; - - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); - for result in last { - // If the last iterator must find an empty set of documents it means - // that we found all the documents in the sub level iterations already, - // we can pop this level iterator. - if documents_ids.is_empty() { - break; - } - - match result { - Ok(((_fid, level, left, right), mut docids)) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } - - if level == 0 { - return Some(Ok((left, docids))); - } - - let rtxn = self.rtxn; - let db = self.db; - let fid = self.field_id; - let left = Included(left); - let right = Included(right); - - let result = if is_ascending { - FacetRange::new(rtxn, db, fid, level - 1, left, right).map(Left) - } else { - FacetRevRange::new(rtxn, db, fid, level - 1, left, right).map(Right) - }; - - match result { - Ok(iter) => { - self.level_iters.push((docids, iter)); - continue 'outer; - } - Err(e) => return Some(Err(e)), - } - } - } - Err(e) => return Some(Err(e)), - } - } - self.level_iters.pop(); - } - } -} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f40a6aed6..574459547 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -15,7 +15,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; pub(crate) use self::facet::ParserRule; -pub use self::facet::{FacetDistribution, FacetIter, FilterCondition, Operator}; +pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::FieldIdMapMissingEntry; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 222f3b2d3..bcb7d7580 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -3,12 +3,13 @@ use std::collections::HashMap; use chrono::Utc; use fst::IntoStreamer; -use heed::types::{ByteSlice, Unit}; +use heed::types::ByteSlice; use roaring::RoaringBitmap; use serde_json::Value; use super::ClearDocuments; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; +use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; @@ -374,13 +375,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_value_docids( + remove_docids_from_facet_field_id_number_docids( self.wtxn, facet_id_f64_docids, &self.documents_ids, )?; - remove_docids_from_facet_field_id_value_docids( + remove_docids_from_facet_field_id_string_docids( self.wtxn, facet_id_string_docids, &self.documents_ids, @@ -419,15 +420,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } -fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>( +fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( wtxn: &'a mut heed::RwTxn, - db: &heed::Database, + db: &heed::Database, field_id: FieldId, to_remove: &RoaringBitmap, convert: F, ) -> heed::Result<()> where - C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, + C: heed::BytesDecode<'a, DItem = K>, + DC: heed::BytesDecode<'a, DItem = V>, F: Fn(K) -> DocumentId, { let mut iter = db @@ -436,7 +438,7 @@ where .remap_key_type::(); while let Some(result) = iter.next() { - let (key, ()) = result?; + let (key, _) = result?; if to_remove.contains(convert(key)) { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -446,7 +448,33 @@ where Ok(()) } -fn remove_docids_from_facet_field_id_value_docids<'a, C>( +fn remove_docids_from_facet_field_id_string_docids<'a, C>( + wtxn: &'a mut heed::RwTxn, + db: &heed::Database>, + to_remove: &RoaringBitmap, +) -> heed::Result<()> +where + C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, +{ + let mut iter = db.remap_key_type::().iter_mut(wtxn)?; + while let Some(result) = iter.next() { + let (bytes, (original_value, mut docids)) = result?; + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &(original_value, docids))? }; + } + } + + Ok(()) +} + +fn remove_docids_from_facet_field_id_number_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, to_remove: &RoaringBitmap, diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 5fabbc504..cb9a90f7e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,6 +1,6 @@ -use std::cmp; use std::fs::File; -use std::num::NonZeroUsize; +use std::num::{NonZeroU8, NonZeroUsize}; +use std::{cmp, mem}; use chrono::Utc; use grenad::{CompressionType, FileFuse, Reader, Writer}; @@ -10,7 +10,10 @@ use log::debug; use roaring::RoaringBitmap; use crate::error::InternalError; -use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, +}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{ create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, @@ -64,24 +67,42 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); for field_id in faceted_fields { + // Clear the facet string levels. + clear_field_string_levels( + self.wtxn, + self.index.facet_id_string_docids.remap_types::(), + field_id, + )?; + // Compute and store the faceted strings documents ids. - let string_documents_ids = compute_faceted_documents_ids( + let string_documents_ids = compute_faceted_strings_documents_ids( self.wtxn, self.index.facet_id_string_docids.remap_key_type::(), field_id, )?; + let facet_string_levels = compute_facet_string_levels( + self.wtxn, + self.index.facet_id_string_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + field_id, + )?; + // Clear the facet number levels. clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; // Compute and store the faceted numbers documents ids. - let number_documents_ids = compute_faceted_documents_ids( + let number_documents_ids = compute_faceted_numbers_documents_ids( self.wtxn, self.index.facet_id_f64_docids.remap_key_type::(), field_id, )?; - let content = compute_facet_number_levels( + let facet_number_levels = compute_facet_number_levels( self.wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, @@ -106,8 +127,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { write_into_lmdb_database( self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), - content, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number level" }), + facet_number_levels, + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" }), + WriteMethod::GetMergePut, + )?; + + write_into_lmdb_database( + self.wtxn, + *self.index.facet_id_string_docids.as_polymorph(), + facet_string_levels, + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" }), WriteMethod::GetMergePut, )?; } @@ -193,21 +222,6 @@ fn compute_facet_number_levels<'t>( writer_into_reader(writer, shrink_size) } -fn compute_faceted_documents_ids( - rtxn: &heed::RoTxn, - db: heed::Database, - field_id: FieldId, -) -> Result { - let mut documents_ids = RoaringBitmap::new(); - - for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { - let (_key, docids) = result?; - documents_ids |= docids; - } - - Ok(documents_ids) -} - fn write_number_entry( writer: &mut Writer, field_id: FieldId, @@ -222,3 +236,129 @@ fn write_number_entry( writer.insert(&key, &data)?; Ok(()) } + +fn compute_faceted_strings_documents_ids( + rtxn: &heed::RoTxn, + db: heed::Database>, + field_id: FieldId, +) -> Result { + let mut documents_ids = RoaringBitmap::new(); + for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { + let (_key, (_original_value, docids)) = result?; + documents_ids |= docids; + } + + Ok(documents_ids) +} + +fn compute_faceted_numbers_documents_ids( + rtxn: &heed::RoTxn, + db: heed::Database, + field_id: FieldId, +) -> Result { + let mut documents_ids = RoaringBitmap::new(); + for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { + let (_key, docids) = result?; + documents_ids |= docids; + } + + Ok(documents_ids) +} + +fn clear_field_string_levels<'t>( + wtxn: &'t mut heed::RwTxn, + db: heed::Database, + field_id: FieldId, +) -> heed::Result<()> { + let left = (field_id, NonZeroU8::new(1).unwrap(), u32::MIN, u32::MIN); + let right = (field_id, NonZeroU8::new(u8::MAX).unwrap(), u32::MAX, u32::MAX); + let range = left..=right; + db.remap_key_type::().delete_range(wtxn, &range).map(drop) +} + +fn compute_facet_string_levels<'t>( + rtxn: &'t heed::RoTxn, + db: heed::Database< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, + compression_type: CompressionType, + compression_level: Option, + shrink_size: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + field_id: FieldId, +) -> Result> { + let first_level_size = db + .remap_key_type::() + .prefix_iter(rtxn, &field_id.to_be_bytes())? + .remap_types::() + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // therefore we write the facet levels entries into a grenad file before transfering them. + let mut writer = tempfile::tempfile() + .and_then(|file| create_writer(compression_type, compression_level, file))?; + + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) + .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + + for (level, group_size) in group_size_iter { + let level = NonZeroU8::new(level).unwrap(); + let mut left = (0, ""); + let mut right = (0, ""); + let mut group_docids = RoaringBitmap::new(); + + // Because we know the size of the level 0 we can use a range iterator that starts + // at the first value of the level and goes to the last by simply counting. + for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() { + let ((_field_id, value), (_original_value, docids)) = result?; + + if i == 0 { + left = (i as u32, value); + } else if i % group_size == 0 { + // we found the first bound of the next group, we must store the left + // and right bounds associated with the docids. We also reset the docids. + let docids = mem::take(&mut group_docids); + write_string_entry(&mut writer, field_id, level, left, right, docids)?; + + // We save the left bound for the new group. + left = (i as u32, value); + } + + // The right bound is always the bound we run through. + group_docids |= docids; + right = (i as u32, value); + } + + if !group_docids.is_empty() { + let docids = mem::take(&mut group_docids); + write_string_entry(&mut writer, field_id, level, left, right, docids)?; + } + } + + writer_into_reader(writer, shrink_size) +} + +fn write_string_entry( + writer: &mut Writer, + field_id: FieldId, + level: NonZeroU8, + (left_id, left_value): (u32, &str), + (right_id, right_value): (u32, &str), + docids: RoaringBitmap, +) -> Result<()> { + let key = (field_id, level, left_id, right_id); + let key = FacetLevelValueU32Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = match level.get() { + 1 => (Some((left_value, right_value)), docids), + _ => (None, docids), + }; + let data = FacetStringZeroBoundsValueCodec::::bytes_encode(&data) + .ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) +} diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 8613a8824..7e5d0b581 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -2,8 +2,11 @@ use std::borrow::Cow; use std::result::Result as StdResult; use fst::IntoStreamer; +use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; +use crate::error::SerializationError; +use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::Result; @@ -69,6 +72,26 @@ pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result Ok(vec) } +/// Uses the FacetStringLevelZeroValueCodec to merge the values. +pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { + let (head, tail) = values.split_first().unwrap(); + let (head_string, mut head_rb) = + FacetStringLevelZeroValueCodec::::bytes_decode(&head[..]) + .ok_or(SerializationError::Decoding { db_name: None })?; + + for value in tail { + let (_string, rb) = + FacetStringLevelZeroValueCodec::::bytes_decode(&value[..]) + .ok_or(SerializationError::Decoding { db_name: None })?; + head_rb |= rb; + } + + FacetStringLevelZeroValueCodec::::bytes_encode(&(head_string, head_rb)) + .map(|cow| cow.into_owned()) + .ok_or(SerializationError::Encoding { db_name: None }) + .map_err(Into::into) +} + pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { let (head, tail) = values.split_first().unwrap(); let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 9ac05fe1a..efe16def7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize}; pub use self::merge_function::{ cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, }; use self::store::{Readers, Store}; pub use self::transform::{Transform, TransformOutput}; @@ -655,7 +656,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_field_strings_docids_readers, - cbo_roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, write_method, )?; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index ebf365f44..444b11e31 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -22,12 +22,13 @@ use tempfile::tempfile; use super::merge_function::{ cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, }; use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; use crate::error::{Error, InternalError, SerializationError}; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; @@ -65,7 +66,7 @@ pub struct Store<'s, A> { LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, - facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, + facet_field_string_docids: LinkedHashMap<(FieldId, String), (String, RoaringBitmap)>, facet_field_value_docids_limit: usize, // MTBL parameters chunk_compression_type: CompressionType, @@ -153,7 +154,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let facet_field_strings_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -283,21 +284,33 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn insert_facet_string_values_docid( &mut self, field_id: FieldId, - value: String, + normalized_value: String, + original_value: String, id: DocumentId, ) -> Result<()> { - let sorter = &mut self.field_id_docid_facet_strings_sorter; - Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; + if normalized_value.is_empty() { + return Ok(()); + } - let key = (field_id, value); + let sorter = &mut self.field_id_docid_facet_strings_sorter; + Self::write_field_id_docid_facet_string_value( + sorter, + field_id, + id, + &normalized_value, + &original_value, + )?; + + let key = (field_id, normalized_value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.facet_field_string_docids.get_refresh(&key) { - Some(old) => { + Some((_original_value, old)) => { old.insert(id); } None => { // A newly inserted element is append at the end of the linked hash map. - self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); + self.facet_field_string_docids + .insert(key, (original_value, RoaringBitmap::from_iter(Some(id)))); // If the word docids just reached it's capacity we must make sure to remove // one element, this way next time we insert we doesn't grow the capacity. if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit { @@ -359,7 +372,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { document_id: DocumentId, words_positions: &mut HashMap>, facet_numbers_values: &mut HashMap>, - facet_strings_values: &mut HashMap>, + facet_strings_values: &mut HashMap>, record: &[u8], ) -> Result<()> { // We compute the list of words pairs proximities (self-join) and write it directly to disk. @@ -395,8 +408,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // We store document_id associated with all the facet strings fields ids and values. for (field, values) in facet_strings_values.drain() { - for value in values { - self.insert_facet_string_values_docid(field, value, document_id)?; + for (normalized, original) in values { + self.insert_facet_string_values_docid(field, normalized, original, document_id)?; } } @@ -512,23 +525,24 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn write_facet_field_string_docids(sorter: &mut Sorter>, iter: I) -> Result<()> where - I: IntoIterator, + I: IntoIterator, Error: From, { let mut key_buffer = Vec::new(); - let mut data_buffer = Vec::new(); - for ((field_id, value), docids) in iter { + for ((field_id, normalized_value), (original_value, docids)) in iter { key_buffer.clear(); - data_buffer.clear(); - FacetValueStringCodec::serialize_into(field_id, &value, &mut key_buffer); - CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); + FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer); + + let data = (original_value.as_str(), docids); + let data = FacetStringLevelZeroValueCodec::::bytes_encode(&data) + .ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?; if lmdb_key_valid_size(&key_buffer) { - sorter.insert(&key_buffer, &data_buffer)?; + sorter.insert(&key_buffer, &data)?; } else { - warn!("facet value {:?} is too large to be saved", value); + warn!("facet value {:?} is too large to be saved", original_value); } } @@ -583,19 +597,24 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { sorter: &mut Sorter>, field_id: FieldId, document_id: DocumentId, - value: &str, + normalized_value: &str, + original_value: &str, ) -> Result<()> where Error: From, { let mut buffer = Vec::new(); - - FieldDocIdFacetStringCodec::serialize_into(field_id, document_id, value, &mut buffer); + FieldDocIdFacetStringCodec::serialize_into( + field_id, + document_id, + normalized_value, + &mut buffer, + ); if lmdb_key_valid_size(&buffer) { - sorter.insert(&buffer, &[])?; + sorter.insert(&buffer, original_value.as_bytes())?; } else { - warn!("facet value {:?} is too large to be saved", value); + warn!("facet value {:?} is too large to be saved", original_value); } Ok(()) @@ -925,24 +944,24 @@ fn process_tokens<'a>( .filter(|(_, t)| t.is_word()) } -fn extract_facet_values(value: &Value) -> (Vec, Vec) { +fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { fn inner_extract_facet_values( value: &Value, can_recurse: bool, output_numbers: &mut Vec, - output_strings: &mut Vec, + output_strings: &mut Vec<(String, String)>, ) { match value { Value::Null => (), - Value::Bool(b) => output_strings.push(b.to_string()), + Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())), Value::Number(number) => { if let Some(float) = number.as_f64() { output_numbers.push(float); } } - Value::String(string) => { - let string = string.trim().to_lowercase(); - output_strings.push(string); + Value::String(original) => { + let normalized = original.trim().to_lowercase(); + output_strings.push((normalized, original.clone())); } Value::Array(values) => { if can_recurse { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c6540b33a..e4adbccb9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -276,8 +276,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { match self.searchable_fields { Setting::Set(ref fields) => { // every time the searchable attributes are updated, we need to update the - // ids for any settings that uses the facets. (displayed_fields, - // filterable_fields) + // ids for any settings that uses the facets. (distinct_fields, filterable_fields). let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_fields_ids_map = FieldsIdsMap::new();