From c3f49f766d0a3a181e3c515438d413620e7b36b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 29 Aug 2022 16:01:54 +0200 Subject: [PATCH 01/58] Prepare refactor of facets database Prepare refactor of facets database --- infos/src/main.rs | 1 + .../facet/facet_level_value_f64_codec.rs | 89 -- .../facet/facet_level_value_u32_codec.rs | 53 - .../facet/facet_string_level_zero_codec.rs | 50 - .../facet_string_level_zero_value_codec.rs | 90 -- milli/src/heed_codec/facet/mod.rs | 22 +- milli/src/heed_codec/facet/new/mod.rs | 148 ++ .../heed_codec/facet/new/ordered_f64_codec.rs | 36 + milli/src/heed_codec/facet/new/str_ref.rs | 20 + milli/src/index.rs | 17 +- milli/src/search/criteria/asc_desc.rs | 33 +- milli/src/search/distinct/facet_distinct.rs | 10 +- milli/src/search/facet/facet_distribution.rs | 120 +- milli/src/search/facet/facet_number.rs | 539 +++++--- milli/src/search/facet/facet_string.rs | 1217 ++++++++--------- milli/src/search/facet/filter.rs | 268 ++-- milli/src/search/facet/mod.rs | 4 +- milli/src/search/mod.rs | 2 +- milli/src/snapshot_tests.rs | 81 +- milli/src/update/delete_documents.rs | 108 +- milli/src/update/facets.rs | 261 ++-- .../extract/extract_facet_number_docids.rs | 13 +- .../extract/extract_facet_string_docids.rs | 24 +- .../src/update/index_documents/extract/mod.rs | 6 +- .../helpers/merge_functions.rs | 52 +- .../src/update/index_documents/helpers/mod.rs | 6 +- .../src/update/index_documents/typed_chunk.rs | 16 +- 27 files changed, 1662 insertions(+), 1624 deletions(-) create mode 100644 infos/src/main.rs delete mode 100644 milli/src/heed_codec/facet/facet_level_value_f64_codec.rs delete mode 100644 milli/src/heed_codec/facet/facet_level_value_u32_codec.rs delete mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_codec.rs delete mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs create mode 100644 milli/src/heed_codec/facet/new/mod.rs create mode 100644 milli/src/heed_codec/facet/new/ordered_f64_codec.rs create mode 100644 milli/src/heed_codec/facet/new/str_ref.rs diff --git a/infos/src/main.rs b/infos/src/main.rs new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/infos/src/main.rs @@ -0,0 +1 @@ + diff --git a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs deleted file mode 100644 index 1e66427ca..000000000 --- a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ /dev/null @@ -1,89 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::f64_into_bytes; -use crate::{try_split_array_at, FieldId}; - -// TODO do not de/serialize right bound when level = 0 -pub struct FacetLevelValueF64Codec; - -impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { - type DItem = (FieldId, u8, f64, f64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - - let (left, right) = if *level != 0 { - let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?; - let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?; - (left, right) - } else { - let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; - (left, left) - }; - - Some((field_id, *level, left, right)) - } -} - -impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { - type EItem = (FieldId, u8, f64, f64); - - fn bytes_encode((field_id, level, 
left, right): &Self::EItem) -> Option> { - let mut buffer = [0u8; 32]; - - let len = if *level != 0 { - // Write the globally ordered floats. - let bytes = f64_into_bytes(*left)?; - buffer[..8].copy_from_slice(&bytes[..]); - - let bytes = f64_into_bytes(*right)?; - buffer[8..16].copy_from_slice(&bytes[..]); - - // Then the f64 values just to be able to read them back. - let bytes = left.to_be_bytes(); - buffer[16..24].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[24..].copy_from_slice(&bytes[..]); - - 32 // length - } else { - // Write the globally ordered floats. - let bytes = f64_into_bytes(*left)?; - buffer[..8].copy_from_slice(&bytes[..]); - - // Then the f64 values just to be able to read them back. - let bytes = left.to_be_bytes(); - buffer[8..16].copy_from_slice(&bytes[..]); - - 16 // length - }; - - let mut bytes = Vec::with_capacity(len + 3); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.push(*level); - bytes.extend_from_slice(&buffer[..len]); - Some(Cow::Owned(bytes)) - } -} - -#[cfg(test)] -mod tests { - use heed::{BytesDecode, BytesEncode}; - - use super::*; - - #[test] - fn globally_ordered_f64() { - let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap(); - let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, level, left, right), (3, 0, 32.0, 32.0)); - - let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap(); - let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, level, left, right), (3, 1, -32.0, 32.0)); - } -} diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs deleted file mode 100644 index 597335b6e..000000000 --- a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs +++ /dev/null @@ -1,53 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::num::NonZeroU8; - -use crate::{try_split_array_at, FieldId}; - -/// A codec that stores the field id, level 1 and higher and the groups ids. -/// -/// It can only be used to encode the facet string of the level 1 or higher. -pub struct FacetLevelValueU32Codec; - -impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec { - type DItem = (FieldId, NonZeroU8, u32, u32); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - let level = NonZeroU8::new(*level)?; - let left = bytes[8..12].try_into().ok().map(u32::from_be_bytes)?; - let right = bytes[12..].try_into().ok().map(u32::from_be_bytes)?; - Some((field_id, level, left, right)) - } -} - -impl heed::BytesEncode<'_> for FacetLevelValueU32Codec { - type EItem = (FieldId, NonZeroU8, u32, u32); - - fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { - let mut buffer = [0u8; 16]; - - // Write the big-endian integers. - let bytes = left.to_be_bytes(); - buffer[..4].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[4..8].copy_from_slice(&bytes[..]); - - // Then the u32 values just to be able to read them back. 
- let bytes = left.to_be_bytes(); - buffer[8..12].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[12..].copy_from_slice(&bytes[..]); - - let mut bytes = Vec::with_capacity(buffer.len() + 2 + 1); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.push(level.get()); - bytes.extend_from_slice(&buffer); - - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs deleted file mode 100644 index 009c6454a..000000000 --- a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, FieldId}; - -/// A codec that stores the field id, level 0, and facet string. -/// -/// It can only be used to encode the facet string of the level 0, -/// as it hardcodes the level. -/// -/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys, -/// and make sure that the levels are not mixed-up. The level 0 is special, the key -/// are strings, other levels represent groups and keys are simply two integers. -pub struct FacetStringLevelZeroCodec; - -impl FacetStringLevelZeroCodec { - pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { - out.reserve(value.len() + 2); - out.extend_from_slice(&field_id.to_be_bytes()); - out.push(0); // the level zero (for LMDB ordering only) - out.extend_from_slice(value.as_bytes()); - } -} - -impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { - type DItem = (FieldId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - - if *level != 0 { - return None; - } - - let value = str::from_utf8(bytes).ok()?; - Some((field_id, value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec { - type EItem = (FieldId, &'a str); - - fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::new(); - FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs deleted file mode 100644 index 22031c474..000000000 --- a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs +++ /dev/null @@ -1,90 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::{marker, str}; - -use crate::error::SerializationError; -use crate::heed_codec::RoaringBitmapCodec; -use crate::{try_split_array_at, try_split_at, Result}; - -pub type FacetStringLevelZeroValueCodec = StringValueCodec; - -/// A codec that encodes a string in front of a value. -/// -/// The usecase is for the facet string levels algorithm where we must know the -/// original string of a normalized facet value, the original values are stored -/// in the value to not break the lexicographical ordering of the LMDB keys. 
-pub struct StringValueCodec(marker::PhantomData); - -impl<'a, C> heed::BytesDecode<'a> for StringValueCodec -where - C: heed::BytesDecode<'a>, -{ - type DItem = (&'a str, C::DItem); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (string, bytes) = decode_prefix_string(bytes)?; - C::bytes_decode(bytes).map(|item| (string, item)) - } -} - -impl<'a, C> heed::BytesEncode<'a> for StringValueCodec -where - C: heed::BytesEncode<'a>, -{ - type EItem = (&'a str, C::EItem); - - fn bytes_encode((string, value): &'a Self::EItem) -> Option> { - let value_bytes = C::bytes_encode(&value)?; - - let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); - encode_prefix_string(string, &mut bytes).ok()?; - bytes.extend_from_slice(&value_bytes[..]); - - Some(Cow::Owned(bytes)) - } -} - -pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> { - let (original_length_bytes, bytes) = try_split_array_at(value)?; - let original_length = u16::from_be_bytes(original_length_bytes) as usize; - let (string, bytes) = try_split_at(bytes, original_length)?; - let string = str::from_utf8(string).ok()?; - Some((string, bytes)) -} - -pub fn encode_prefix_string(string: &str, buffer: &mut Vec) -> Result<()> { - let string_len: u16 = - string.len().try_into().map_err(|_| SerializationError::InvalidNumberSerialization)?; - buffer.extend_from_slice(&string_len.to_be_bytes()); - buffer.extend_from_slice(string.as_bytes()); - Ok(()) -} - -#[cfg(test)] -mod tests { - use heed::types::Unit; - use heed::{BytesDecode, BytesEncode}; - use roaring::RoaringBitmap; - - use super::*; - - #[test] - fn deserialize_roaring_bitmaps() { - let string = "abc"; - let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); - let key = (string, docids.clone()); - let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); - let (out_string, out_docids) = - StringValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_string, out_docids), (string, docids)); - } - - #[test] - fn deserialize_unit() { - let string = "def"; - let key = (string, ()); - let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); - let (out_string, out_unit) = StringValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_string, out_unit), (string, ())); - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 0b2d9186f..d23ab391e 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,19 +1,21 @@ -mod facet_level_value_f64_codec; -mod facet_level_value_u32_codec; -mod facet_string_level_zero_codec; -mod facet_string_level_zero_value_codec; +// mod facet_level_value_f64_codec; +// mod facet_level_value_u32_codec; +// mod facet_string_level_zero_codec; +// mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; +pub mod new; + use heed::types::OwnedType; -pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; -pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; -pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; -pub use self::facet_string_level_zero_value_codec::{ - decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, -}; +// pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; +// pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; +// pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; +// pub use 
self::facet_string_level_zero_value_codec::{ +// decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, +// }; pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/milli/src/heed_codec/facet/new/mod.rs b/milli/src/heed_codec/facet/new/mod.rs new file mode 100644 index 000000000..5ed6a61f6 --- /dev/null +++ b/milli/src/heed_codec/facet/new/mod.rs @@ -0,0 +1,148 @@ +use heed::{BytesDecode, BytesEncode}; +use roaring::RoaringBitmap; +use std::{borrow::Cow, convert::TryFrom, marker::PhantomData}; + +pub mod ordered_f64_codec; +pub mod str_ref; +// TODO: these codecs were quickly written and not fast/resilient enough + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct FacetKey { + pub field_id: u16, + pub level: u8, + pub left_bound: T, +} +impl<'a> FacetKey<&'a [u8]> { + pub fn into_owned(self) -> FacetKey> { + FacetKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.to_vec(), + } + } +} + +impl<'a> FacetKey> { + pub fn as_ref(&self) -> FacetKey<&[u8]> { + FacetKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.as_slice(), + } + } +} + +pub struct FacetGroupValue { + pub size: u8, + pub bitmap: RoaringBitmap, +} + +pub struct FacetKeyCodec { + _phantom: PhantomData, +} + +impl<'a, T> heed::BytesEncode<'a> for FacetKeyCodec +where + T: BytesEncode<'a>, + T::EItem: Sized, +{ + type EItem = FacetKey; + + fn bytes_encode(value: &'a Self::EItem) -> Option> { + let mut v = vec![]; + v.extend_from_slice(&value.field_id.to_be_bytes()); + v.extend_from_slice(&[value.level]); + + let bound = T::bytes_encode(&value.left_bound).unwrap(); + v.extend_from_slice(&bound); + + Some(Cow::Owned(v)) + } +} +impl<'a, T> heed::BytesDecode<'a> for FacetKeyCodec +where + T: BytesDecode<'a>, +{ + type DItem = FacetKey; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).unwrap()); + let level = bytes[2]; + let bound = T::bytes_decode(&bytes[3..]).unwrap(); + Some(FacetKey { field_id: fid, level, left_bound: bound }) + } +} + +pub struct FacetGroupValueCodec; +impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { + type EItem = FacetGroupValue; + + fn bytes_encode(value: &'a Self::EItem) -> Option> { + let mut v = vec![]; + v.push(value.size); + value.bitmap.serialize_into(&mut v).unwrap(); + Some(Cow::Owned(v)) + } +} +impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { + type DItem = FacetGroupValue; + fn bytes_decode(bytes: &'a [u8]) -> Option { + let size = bytes[0]; + let bitmap = RoaringBitmap::deserialize_from(&bytes[1..]).unwrap(); + Some(FacetGroupValue { size, bitmap }) + } +} + +// TODO: get rid of this codec as it is named confusingly + should really be part of heed +// or even replace the current ByteSlice codec +pub struct MyByteSlice; + +impl<'a> BytesEncode<'a> for MyByteSlice { + type EItem = &'a [u8]; + + fn bytes_encode(item: &'a Self::EItem) -> Option> { + Some(Cow::Borrowed(item)) + } +} + +impl<'a> BytesDecode<'a> for MyByteSlice { + type DItem = &'a [u8]; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + Some(bytes) + } +} + +// I won't need these ones anymore +// pub struct U16Codec; +// impl<'a> BytesEncode<'a> for U16Codec { +// type EItem = u16; + +// fn bytes_encode(item: &'a Self::EItem) -> Option> { +// 
Some(Cow::Owned(item.to_be_bytes().to_vec())) +// } +// } +// impl<'a> BytesDecode<'a> for U16Codec { +// type DItem = u16; + +// fn bytes_decode(bytes: &'a [u8]) -> Option { +// Some(u16::from_be_bytes(bytes[0..=1].try_into().unwrap())) +// } +// } + +// pub struct StrCodec; +// impl<'a> BytesEncode<'a> for StrCodec { +// type EItem = &'a str; + +// fn bytes_encode(item: &'a &'a str) -> Option> { +// Some(Cow::Borrowed(item.as_bytes())) +// } +// } +// impl<'a> BytesDecode<'a> for StrCodec { +// type DItem = &'a str; + +// fn bytes_decode(bytes: &'a [u8]) -> Option { +// let s = std::str::from_utf8(bytes).unwrap(); +// Some(s) +// } +// } diff --git a/milli/src/heed_codec/facet/new/ordered_f64_codec.rs b/milli/src/heed_codec/facet/new/ordered_f64_codec.rs new file mode 100644 index 000000000..856a9c0d1 --- /dev/null +++ b/milli/src/heed_codec/facet/new/ordered_f64_codec.rs @@ -0,0 +1,36 @@ +use std::{borrow::Cow, convert::TryInto}; + +use heed::BytesDecode; + +use crate::facet::value_encoding::f64_into_bytes; + +pub struct OrderedF64Codec; + +impl<'a> BytesDecode<'a> for OrderedF64Codec { + type DItem = f64; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + if bytes.len() < 16 { + return None; + } + let f = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; + Some(f) + } +} + +impl heed::BytesEncode<'_> for OrderedF64Codec { + type EItem = f64; + + fn bytes_encode(f: &Self::EItem) -> Option> { + let mut buffer = [0u8; 16]; + + // write the globally ordered float + let bytes = f64_into_bytes(*f)?; + buffer[..8].copy_from_slice(&bytes[..]); + // Then the f64 value just to be able to read it back + let bytes = f.to_be_bytes(); + buffer[8..16].copy_from_slice(&bytes[..]); + + Some(Cow::Owned(buffer.to_vec())) + } +} diff --git a/milli/src/heed_codec/facet/new/str_ref.rs b/milli/src/heed_codec/facet/new/str_ref.rs new file mode 100644 index 000000000..80a51c803 --- /dev/null +++ b/milli/src/heed_codec/facet/new/str_ref.rs @@ -0,0 +1,20 @@ +use std::borrow::Cow; + +use heed::{BytesDecode, BytesEncode}; + +pub struct StrRefCodec; +impl<'a> BytesEncode<'a> for StrRefCodec { + type EItem = &'a str; + + fn bytes_encode(item: &'a &'a str) -> Option> { + Some(Cow::Borrowed(item.as_bytes())) + } +} +impl<'a> BytesDecode<'a> for StrRefCodec { + type DItem = &'a str; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let s = std::str::from_utf8(bytes).unwrap(); + Some(s) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 94e2f538d..0561a77ac 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -13,9 +13,14 @@ use time::OffsetDateTime; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec}; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, + // FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FieldDocIdFacetF64Codec, + FieldDocIdFacetStringCodec, + FieldIdCodec, }; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, @@ -123,10 +128,10 @@ pub struct Index { /// Maps the facet field id and the docids for which this field exists pub facet_id_exists_docids: Database, - /// Maps the facet field id, level and the number with the 
docids that corresponds to it. - pub facet_id_f64_docids: Database, - /// Maps the facet field id and the string with the original string and docids that corresponds to it. - pub facet_id_string_docids: Database, + /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. + pub facet_id_f64_docids: Database, FacetGroupValueCodec>, + /// Maps the facet field id and ranges of strings with the docids that corresponds to them. + pub facet_id_string_docids: Database, FacetGroupValueCodec>, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 6d50c1bb5..bd08c54a5 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::{FacetNumberIter, FacetStringIter}; +// use crate::search::facet::FacetStringIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -186,23 +186,24 @@ fn facet_ordered<'t>( iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) } else { - let facet_number_fn = if is_ascending { - FacetNumberIter::new_reducing - } else { - FacetNumberIter::new_reverse_reducing - }; - let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())? - .map(|res| res.map(|(_, docids)| docids)); + todo!() + // let facet_number_fn = if is_ascending { + // FacetNumberIter::new_reducing + // } else { + // FacetNumberIter::new_reverse_reducing + // }; + // let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())? + // .map(|res| res.map(|(_, docids)| docids)); - let facet_string_fn = if is_ascending { - FacetStringIter::new_reducing - } else { - FacetStringIter::new_reverse_reducing - }; - let string_iter = facet_string_fn(rtxn, index, field_id, candidates)? - .map(|res| res.map(|(_, _, docids)| docids)); + // let facet_string_fn = if is_ascending { + // FacetStringIter::new_reducing + // } else { + // FacetStringIter::new_reverse_reducing + // }; + // let string_iter = facet_string_fn(rtxn, index, field_id, candidates)? 
+ // .map(|res| res.map(|(_, _, docids)| docids)); - Ok(Box::new(number_iter.chain(string_iter))) + // Ok(Box::new(number_iter.chain(string_iter))) } } diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 33e7b4975..4a4815775 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,6 +6,7 @@ use roaring::RoaringBitmap; use super::{Distinct, DocIter}; use crate::error::InternalError; +use crate::heed_codec::facet::new::FacetKey; use crate::heed_codec::facet::*; use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; @@ -47,13 +48,16 @@ impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result> { self.index .facet_id_string_docids - .get(self.txn, &(self.distinct, key)) - .map(|result| result.map(|(_original, docids)| docids)) + .get(self.txn, &FacetKey { field_id: self.distinct, level: 0, left_bound: key }) + .map(|opt| opt.map(|v| v.bitmap)) } fn facet_number_docids(&self, key: f64) -> heed::Result> { // get facet docids on level 0 - self.index.facet_id_f64_docids.get(self.txn, &(self.distinct, 0, key, key)) + self.index + .facet_id_f64_docids + .get(self.txn, &FacetKey { field_id: self.distinct, level: 0, left_bound: key }) + .map(|opt| opt.map(|v| v.bitmap)) } fn distinct_string(&mut self, id: DocumentId) -> Result<()> { diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index b2718a490..fddf93d4b 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -7,10 +7,8 @@ use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; -use crate::heed_codec::facet::{ - FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, -}; -use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; +use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +// use crate::search::facet::FacetStringIter; use crate::{FieldId, Index, Result}; /// The default number of values by facets that will @@ -133,21 +131,22 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - let iter = - FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + todo!() + // let iter = + // FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - for result in iter { - let (value, mut docids) = result?; - docids &= candidates; - if !docids.is_empty() { - distribution.insert(value.to_string(), docids.len()); - } - if distribution.len() == self.max_values_per_facet { - break; - } - } + // for result in iter { + // let (value, mut docids) = result?; + // docids &= candidates; + // if !docids.is_empty() { + // distribution.insert(value.to_string(), docids.len()); + // } + // if distribution.len() == self.max_values_per_facet { + // break; + // } + // } - Ok(()) + // Ok(()) } fn facet_strings_distribution_from_facet_levels( @@ -156,21 +155,22 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - let iter = - FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + todo!() + // let iter = + // FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - for result in iter { - let (_normalized, original, mut docids) = 
result?; - docids &= candidates; - if !docids.is_empty() { - distribution.insert(original.to_string(), docids.len()); - } - if distribution.len() == self.max_values_per_facet { - break; - } - } + // for result in iter { + // let (_normalized, original, mut docids) = result?; + // docids &= candidates; + // if !docids.is_empty() { + // distribution.insert(original.to_string(), docids.len()); + // } + // if distribution.len() == self.max_values_per_facet { + // break; + // } + // } - Ok(()) + // Ok(()) } /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the @@ -179,41 +179,43 @@ impl<'a> FacetDistribution<'a> { &self, field_id: FieldId, ) -> heed::Result> { - let mut distribution = BTreeMap::new(); + todo!() + // let mut distribution = BTreeMap::new(); - let db = self.index.facet_id_f64_docids; - let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + // let db = self.index.facet_id_f64_docids; + // let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; - for result in range { - let ((_, _, value, _), docids) = result?; - distribution.insert(value.to_string(), docids.len()); - if distribution.len() == self.max_values_per_facet { - break; - } - } + // for result in range { + // let ((_, _, value, _), docids) = result?; + // distribution.insert(value.to_string(), docids.len()); + // if distribution.len() == self.max_values_per_facet { + // break; + // } + // } - let iter = self - .index - .facet_id_string_docids - .remap_key_type::() - .prefix_iter(self.rtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + // let iter = self + // .index + // .facet_id_string_docids + // .remap_key_type::() + // .prefix_iter(self.rtxn, &field_id.to_be_bytes())? + // .remap_key_type::(); - let mut normalized_distribution = BTreeMap::new(); - for result in iter { - let ((_, normalized_value), (original_value, docids)) = result?; - normalized_distribution.insert(normalized_value, (original_value, docids.len())); - if normalized_distribution.len() == self.max_values_per_facet { - break; - } - } + // let mut normalized_distribution = BTreeMap::new(); + // for result in iter { + // let ((_, normalized_value), group_value) = result?; + // normalized_distribution + // .insert(normalized_value, (normalized_value, group_value.bitmap.len())); + // if normalized_distribution.len() == self.max_values_per_facet { + // break; + // } + // } - let iter = normalized_distribution - .into_iter() - .map(|(_normalized, (original, count))| (original.to_string(), count)); - distribution.extend(iter); + // let iter = normalized_distribution + // .into_iter() + // .map(|(_normalized, (original, count))| (original.to_string(), count)); + // distribution.extend(iter); - Ok(distribution) + // Ok(distribution) } fn facet_values(&self, field_id: FieldId) -> heed::Result> { diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs index 02390aac1..5f7bd5325 100644 --- a/milli/src/search/facet/facet_number.rs +++ b/milli/src/search/facet/facet_number.rs @@ -1,248 +1,335 @@ -use std::ops::Bound::{self, Excluded, Included, Unbounded}; +// use std::ops::Bound::{self, Excluded, Included, Unbounded}; -use either::Either::{self, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; -use roaring::RoaringBitmap; +// use either::Either::{self, Left, Right}; +// use heed::types::{ByteSlice, DecodeIgnore}; +// use heed::{BytesDecode, BytesEncode, Database, Lazy, 
LazyDecode, RoRange, RoRevRange}; +// use obkv::Key; +// use roaring::RoaringBitmap; -use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{FieldId, Index}; +// use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +// use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; +// use crate::heed_codec::CboRoaringBitmapCodec; +// use crate::{FieldId, Index}; -pub struct FacetNumberRange<'t> { - iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} +// pub struct FacetNumberRange<'t, 'e> { +// rtxn: &'t heed::RoTxn<'e>, +// db: Database, FacetGroupValueCodec>, +// iter: RoRange<'t, FacetKeyCodec, LazyDecode>, +// max_bound: f64, +// previous: Option<(FacetKey, Lazy<'t, FacetGroupValueCodec>)>, +// field_id: FieldId, +// end: Bound, +// } -impl<'t> FacetNumberRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetNumberRange { iter, end: right }) - } -} +// impl<'t, 'e> FacetNumberRange<'t, 'e> { +// pub fn new( +// rtxn: &'t heed::RoTxn<'e>, +// db: Database, FacetGroupValueCodec>, +// field_id: FieldId, +// level: u8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let left_bound = match left { +// Included(left_bound) => Included(FacetKey { field_id, level, left_bound }), +// Excluded(left_bound) => Excluded(FacetKey { field_id, level, left_bound }), +// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), +// }; -impl<'t> Iterator for FacetNumberRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; +// let mut iter = db.lazily_decode_data().range(rtxn, &(left_bound, Unbounded))?; +// let mut previous = iter.next().transpose()?; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => Some(Ok(((fid, level, left, right), docids))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// // Compute the maximum end bound by looking at the key of the last element in level 0 +// let mut prefix_level_0 = vec![]; +// prefix_level_0.extend_from_slice(&field_id.to_be_bytes()); +// prefix_level_0.push(level); -pub struct FacetNumberRevRange<'t> { - iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} +// let mut rev_iter = +// db.as_polymorph().rev_prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, &prefix_level_0)?; -impl<'t> FacetNumberRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, 
f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetNumberRevRange { iter, end: right }) - } -} +// let rev_iter_first = rev_iter.next().transpose()?; +// let max_bound = if let Some((max_bound_key, _)) = rev_iter_first { +// let max_bound_key = +// FacetKeyCodec::::bytes_decode(max_bound_key).unwrap(); +// max_bound_key.left_bound +// } else { +// // I can't imagine when that would happen, but let's handle it correctly anyway +// // by making the iterator empty +// previous = None; +// 0.0 // doesn't matter since previous = None so the iterator will always early exit +// // and return None itself +// }; -impl<'t> Iterator for FacetNumberRevRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; +// Ok(FacetNumberRange { rtxn, db, iter, field_id, previous, max_bound, end: right }) +// } +// } - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} +// impl<'t, 'e> Iterator for FacetNumberRange<'t, 'e> { +// type Item = heed::Result<(FacetKey, RoaringBitmap)>; -pub struct FacetNumberIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, - must_reduce: bool, -} +// fn next(&mut self) -> Option { +// // The idea here is to return the **previous** element only if the left +// // bound of the current key fits within the range given to the iter +// // if it doesn't, then there is still a chance that it must be returned, +// // but we need to check the actual right bound of the group by looking for +// // the key preceding the first key of the next group in level 0 -impl<'t> FacetNumberIter<'t> { - /// Create a `FacetNumberIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } +// let (prev_key, prev_value) = self.previous?; - /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. 
- pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids; - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Right(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } +// let (next_left_bound, next_previous) = if let Some(next) = self.iter.next() { +// let (key, group_value) = match next { +// Ok(n) => n, +// Err(e) => return Some(Err(e)), +// }; +// (key.left_bound, Some((key, group_value))) +// } else { +// // we're at the end of the level iter, so we need to fetch the max bound instead +// (self.max_bound, None) +// }; +// let must_be_returned = match self.end { +// Included(end) => next_left_bound <= end, +// Excluded(end) => next_left_bound < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match prev_value.decode() { +// Ok(group_value) => { +// self.previous = next_previous; +// Some(Ok((prev_key, group_value.bitmap))) +// } +// Err(e) => Some(Err(e)), +// } +// } else { +// // it still possible that we want to return the value (one last time) +// // but to do so, we need to fetch the right bound of the current group +// // this is done by getting the first element at level 0 of the next group +// // then iterating in reverse from it +// // once we have the right bound, we can compare it, and then return or not +// // then we still set self.previous to None so that no other element can return +// // from it? +// let mut level_0_key_prefix = vec![]; +// level_0_key_prefix.extend_from_slice(&self.field_id.to_be_bytes()); +// level_0_key_prefix.push(0); +// let key = +// FacetKey:: { field_id: self.field_id, level: 0, left_bound: next_left_bound }; +// let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); +// level_0_key_prefix.extend_from_slice(&key_bytes); - /// Create a `FacetNumberIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will not reduce the given documents ids - /// while iterating on the different facet levels, possibly returning multiple times - /// a document id associated with multiple facet values. 
- pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) - } +// let mut rev_iter_next_group_level_0 = self +// .db +// .as_polymorph() +// .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&self.rtxn, &level_0_key_prefix) +// .unwrap(); +// let (key_for_right_bound, _) = rev_iter_next_group_level_0.next().unwrap().unwrap(); +// let key_for_right_bound = +// FacetKeyCodec::::bytes_decode(key_for_right_bound).unwrap(); +// let right_bound = key_for_right_bound.left_bound; +// let must_be_returned = match self.end { +// Included(end) => right_bound <= end, +// Excluded(end) => right_bound < end, +// Unbounded => unreachable!(), +// }; +// self.previous = None; +// if must_be_returned { +// match prev_value.decode() { +// Ok(group_value) => Some(Ok((prev_key, group_value.bitmap))), +// Err(e) => Some(Err(e)), +// } +// } else { +// None +// } +// } +// } +// } - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - let level = db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? - .remap_key_type::() - .last() - .transpose()? - .map(|((_, level, _, _), _)| level); - Ok(level) - } -} +// pub struct FacetNumberRevRange<'t> { +// iter: RoRevRange<'t, FacetKeyCodec, LazyDecode>, +// end: Bound, +// } -impl<'t> Iterator for FacetNumberIter<'t> { - type Item = heed::Result<(f64, RoaringBitmap)>; +// impl<'t> FacetNumberRevRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, FacetGroupValueCodec>, +// field_id: FieldId, +// level: u8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let left_bound = match left { +// Included(left) => Included(FacetKey { field_id, level, left_bound: left }), +// Excluded(left) => Excluded(FacetKey { field_id, level, left_bound: left }), +// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), +// }; +// let right_bound = Included(FacetKey { field_id, level, left_bound: f64::MAX }); +// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; +// Ok(FacetNumberRevRange { iter, end: right }) +// } +// } - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); - for result in last { - // If the last iterator must find an empty set of documents it means - // that we found all the documents in the sub level iterations already, - // we can pop this level iterator. 
- if documents_ids.is_empty() { - break; - } +// impl<'t> Iterator for FacetNumberRevRange<'t> { +// type Item = heed::Result<(FacetKey, RoaringBitmap)>; - match result { - Ok(((_fid, level, left, right), mut docids)) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } +// fn next(&mut self) -> Option { +// loop { +// match self.iter.next() { +// Some(Ok((FacetKey { field_id, level, left_bound }, docids))) => { +// let must_be_returned = match self.end { +// Included(end) => todo!(), //right <= end, +// Excluded(end) => todo!(), //right < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match docids.decode() { +// Ok(docids) => { +// return Some(Ok(( +// FacetKey { field_id, level, left_bound }, +// docids.bitmap, +// ))) +// } +// Err(e) => return Some(Err(e)), +// } +// } +// continue; +// } +// Some(Err(e)) => return Some(Err(e)), +// None => return None, +// } +// } +// } +// } - if level == 0 { - return Some(Ok((left, docids))); - } +// pub struct FacetNumberIter<'t, 'e> { +// rtxn: &'t heed::RoTxn<'t>, +// db: Database, FacetGroupValueCodec>, +// field_id: FieldId, +// level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, +// must_reduce: bool, +// } - let rtxn = self.rtxn; - let db = self.db; - let fid = self.field_id; - let left = Included(left); - let right = Included(right); +// impl<'t, 'e> FacetNumberIter<'t, 'e> { +// /// Create a `FacetNumberIter` that will iterate on the different facet entries +// /// (facet value + documents ids) and that will reduce the given documents ids +// /// while iterating on the different facet levels. +// pub fn new_reducing( +// rtxn: &'t heed::RoTxn<'e>, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_f64_docids; +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// let highest_iter = +// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; +// let level_iters = vec![(documents_ids, Left(highest_iter))]; +// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) +// } - let result = if is_ascending { - FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) - .map(Left) - } else { - FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) - .map(Right) - }; +// /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse +// /// (facet value + documents ids) and that will reduce the given documents ids +// /// while iterating on the different facet levels. 
+// pub fn new_reverse_reducing( +// rtxn: &'t heed::RoTxn<'e>, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_f64_docids; +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// let highest_iter = +// FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; +// let level_iters = vec![(documents_ids, Right(highest_iter))]; +// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) +// } - match result { - Ok(iter) => { - self.level_iters.push((docids, iter)); - continue 'outer; - } - Err(e) => return Some(Err(e)), - } - } - } - Err(e) => return Some(Err(e)), - } - } - self.level_iters.pop(); - } - } -} +// /// Create a `FacetNumberIter` that will iterate on the different facet entries +// /// (facet value + documents ids) and that will not reduce the given documents ids +// /// while iterating on the different facet levels, possibly returning multiple times +// /// a document id associated with multiple facet values. +// pub fn new_non_reducing( +// rtxn: &'t heed::RoTxn<'e>, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_f64_docids; +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// let highest_iter = +// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; +// let level_iters = vec![(documents_ids, Left(highest_iter))]; +// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) +// } + +// fn highest_level( +// rtxn: &'t heed::RoTxn, +// db: Database, X>, +// fid: FieldId, +// ) -> heed::Result> { +// let level = db +// .remap_types::() +// .prefix_iter(rtxn, &fid.to_be_bytes())? +// .remap_key_type::>() +// .last() +// .transpose()? +// .map(|(key, _)| key.level); +// Ok(level) +// } +// } + +// impl<'t, 'e> Iterator for FacetNumberIter<'t, 'e> { +// type Item = heed::Result<(f64, RoaringBitmap)>; + +// fn next(&mut self) -> Option { +// 'outer: loop { +// let (documents_ids, last) = self.level_iters.last_mut()?; +// let is_ascending = last.is_left(); +// for result in last { +// // If the last iterator must find an empty set of documents it means +// // that we found all the documents in the sub level iterations already, +// // we can pop this level iterator. 
+// if documents_ids.is_empty() { +// break; +// } + +// match result { +// Ok((key, mut docids)) => { +// docids &= &*documents_ids; +// if !docids.is_empty() { +// if self.must_reduce { +// *documents_ids -= &docids; +// } + +// if level == 0 { +// return Some(Ok((left, docids))); +// } + +// let rtxn = self.rtxn; +// let db = self.db; +// let fid = self.field_id; +// let left = Included(left); +// let right = Included(right); + +// let result = if is_ascending { +// FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) +// .map(Left) +// } else { +// FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) +// .map(Right) +// }; + +// match result { +// Ok(iter) => { +// self.level_iters.push((docids, iter)); +// continue 'outer; +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// Err(e) => return Some(Err(e)), +// } +// } +// self.level_iters.pop(); +// } +// } +// } diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index c55430cf1..b01359503 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -1,652 +1,649 @@ -//! This module contains helpers iterators for facet strings. -//! -//! The purpose is to help iterate over the quite complex system of facets strings. A simple -//! description of the system would be that every facet string value is stored into an LMDB database -//! and that every value is associated with the document ids which are associated with this facet -//! string value. -//! -//! In reality it is a little bit more complex as we have to create aggregations of runs of facet -//! string values, those aggregations helps in choosing the right groups of facets to follow. -//! -//! ## A typical algorithm run -//! -//! If a group of aggregated facets values contains one of the documents ids, we must continue -//! iterating over the sub-groups. -//! -//! If this group is the lowest level and contain at least one document id we yield the associated -//! facet documents ids. -//! -//! If the group doesn't contain one of our documents ids, we continue to the next group at this -//! same level. -//! -//! ## The complexity comes from the strings -//! -//! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create -//! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the -//! two numbers bounds, the left and the right bound of the group, both inclusive. -//! -//! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and -//! puting two numbers big-endian encoded one after the other gives us ordered groups. The values -//! are simple unions of the documents ids coming from the groups below. -//! -//! ### Example of what a facet number LMDB database contain -//! -//! | level | left-bound | right-bound | documents ids | -//! |-------|------------|-------------|------------------| -//! | 0 | 0 | _skipped_ | 1, 2 | -//! | 0 | 1 | _skipped_ | 6, 7 | -//! | 0 | 3 | _skipped_ | 4, 7 | -//! | 0 | 5 | _skipped_ | 2, 3, 4 | -//! | 1 | 0 | 1 | 1, 2, 6, 7 | -//! | 1 | 3 | 5 | 2, 3, 4, 7 | -//! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | -//! -//! As you can see the level 0 have two equal bounds, therefore we skip serializing the second -//! bound, that's the base level where you can directly fetch the documents ids associated with an -//! exact number. -//! -//! The next levels have two different bounds and the associated documents ids are simply the result -//! 
of an union of all the documents ids associated with the aggregated groups above. -//! -//! ## The complexity of defining groups for facet strings -//! -//! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in -//! lexicographical order, it means that whatever the key represent the bytes are read in their raw -//! form and a simple `strcmp` will define the order in which keys will be read from the store. -//! -//! That's easy for types with a known size, like floats or integers, they are 64 bytes long and -//! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the -//! first number then by the second if the the first number is equal on two keys. -//! -//! For strings it is a lot more complex as those types are unsized, it means that the size of facet -//! strings is different for each facet value. -//! -//! ### Basic approach: padding the keys -//! -//! A first approach would be to simply define the maximum size of a facet string and pad the keys -//! with zeroes. The big problem of this approach is that it: -//! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the -//! other. -//! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB -//! performances. -//! -//! ### Better approach: number the facet groups -//! -//! A better approach would be to number the groups, this way we don't have the downsides of the -//! previously described approach but we need to be able to describe the groups by using a number. -//! -//! #### Example of facet strings with numbered groups -//! -//! | level | left-bound | right-bound | left-string | right-string | documents ids | -//! |-------|------------|-------------|-------------|--------------|------------------| -//! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | -//! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | -//! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | -//! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | -//! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | -//! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | -//! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | -//! -//! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not -//! need to store the facet string value two times. -//! -//! The number in the left-bound and right-bound columns are incremental numbers representing the -//! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering -//! of the LMDB keys. -//! -//! In the value, not in the key, you can see that we added two new values: the left-string and the -//! right-string, which defines the original facet strings associated with the given group. -//! -//! We put those two strings inside of the value, this way we do not limit the maximum size of the -//! facet string values, and the impact on performances is not important as, IIRC, LMDB put big -//! values on another page, this helps in iterating over keys fast enough and only fetch the page -//! with the values when required. -//! -//! The other little advantage with this solution is that there is no a big overhead, compared with -//! the facet number levels, we only duplicate the facet strings once for the level 1. -//! -//! #### A typical algorithm run -//! -//! Note that the algorithm is always moving from the highest level to the lowest one, one level -//! 
by one level, this is why it is ok to only store the facets string on the level 1. -//! -//! If a group of aggregated facets values, a group with numbers contains one of the documents ids, -//! we must continue iterating over the sub-groups. To do so: -//! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds -//! and iterate over the facet groups defined by these numbers over the current level - 1. -//! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the -//! value and just do the same as with the facet numbers but with strings: iterate over the -//! current level - 1 with both keys. -//! -//! If this group is the lowest level (level 0) and contain at least one document id we yield the -//! associated facet documents ids. -//! -//! If the group doesn't contain one of our documents ids, we continue to the next group at this -//! same level. -//! +// //! This module contains helpers iterators for facet strings. +// //! +// //! The purpose is to help iterate over the quite complex system of facets strings. A simple +// //! description of the system would be that every facet string value is stored into an LMDB database +// //! and that every value is associated with the document ids which are associated with this facet +// //! string value. +// //! +// //! In reality it is a little bit more complex as we have to create aggregations of runs of facet +// //! string values, those aggregations helps in choosing the right groups of facets to follow. +// //! +// //! ## A typical algorithm run +// //! +// //! If a group of aggregated facets values contains one of the documents ids, we must continue +// //! iterating over the sub-groups. +// //! +// //! If this group is the lowest level and contain at least one document id we yield the associated +// //! facet documents ids. +// //! +// //! If the group doesn't contain one of our documents ids, we continue to the next group at this +// //! same level. +// //! +// //! ## The complexity comes from the strings +// //! +// //! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create +// //! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the +// //! two numbers bounds, the left and the right bound of the group, both inclusive. +// //! +// //! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and +// //! puting two numbers big-endian encoded one after the other gives us ordered groups. The values +// //! are simple unions of the documents ids coming from the groups below. +// //! +// //! ### Example of what a facet number LMDB database contain +// //! +// //! | level | left-bound | right-bound | documents ids | +// //! |-------|------------|-------------|------------------| +// //! | 0 | 0 | _skipped_ | 1, 2 | +// //! | 0 | 1 | _skipped_ | 6, 7 | +// //! | 0 | 3 | _skipped_ | 4, 7 | +// //! | 0 | 5 | _skipped_ | 2, 3, 4 | +// //! | 1 | 0 | 1 | 1, 2, 6, 7 | +// //! | 1 | 3 | 5 | 2, 3, 4, 7 | +// //! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | +// //! +// //! As you can see the level 0 have two equal bounds, therefore we skip serializing the second +// //! bound, that's the base level where you can directly fetch the documents ids associated with an +// //! exact number. +// //! +// //! The next levels have two different bounds and the associated documents ids are simply the result +// //! of an union of all the documents ids associated with the aggregated groups above. +// //! 
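+// //! As an illustration only (the helper below is hypothetical and not part of this crate),
+// //! the bitmap of a group on a higher level can be computed as the union of the bitmaps of
+// //! the entries it covers on the level below:
+// //!
+// //! ```ignore
+// //! use roaring::RoaringBitmap;
+// //!
+// //! /// Unions the bitmaps of the child entries covered by a group.
+// //! fn group_bitmap(children: &[RoaringBitmap]) -> RoaringBitmap {
+// //!     let mut docids = RoaringBitmap::new();
+// //!     for child in children {
+// //!         docids |= child; // e.g. {1, 2} | {6, 7} = {1, 2, 6, 7} for the level 1 group 0..1
+// //!     }
+// //!     docids
+// //! }
+// //! ```
+// //!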
+// //! ## The complexity of defining groups for facet strings
+// //!
+// //! As explained above, defining groups of facet numbers is easy: LMDB stores the keys in
+// //! lexicographical order, which means that whatever the key represents, the bytes are read in
+// //! their raw form and a simple `strcmp` will define the order in which keys will be read from the store.
+// //!
+// //! That's easy for types with a known size, like floats or integers: they are 64 bits long and
+// //! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the
+// //! first number then by the second if the first number is equal on two keys.
+// //!
+// //! For strings it is a lot more complex as those types are unsized, which means that the size of
+// //! facet strings is different for each facet value.
+// //!
+// //! ### Basic approach: padding the keys
+// //!
+// //! A first approach would be to simply define the maximum size of a facet string and pad the keys
+// //! with zeroes. The big problem with this approach is that it:
+// //! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the
+// //! other.
+// //! 2. makes the keys of facet strings very big (approximately 250 bytes), which hurts LMDB
+// //! performance a lot.
+// //!
+// //! ### Better approach: number the facet groups
+// //!
+// //! A better approach would be to number the groups; this way we don't have the downsides of the
+// //! previously described approach but we need to be able to describe the groups by using a number.
+// //!
+// //! #### Example of facet strings with numbered groups
+// //!
+// //! | level | left-bound | right-bound | left-string | right-string | documents ids    |
+// //! |-------|------------|-------------|-------------|--------------|------------------|
+// //! | 0     | alpha      | _skipped_   | _skipped_   | _skipped_    | 1, 2             |
+// //! | 0     | beta       | _skipped_   | _skipped_   | _skipped_    | 6, 7             |
+// //! | 0     | gamma      | _skipped_   | _skipped_   | _skipped_    | 4, 7             |
+// //! | 0     | omega      | _skipped_   | _skipped_   | _skipped_    | 2, 3, 4          |
+// //! | 1     | 0          | 1           | alpha       | beta         | 1, 2, 6, 7       |
+// //! | 1     | 2          | 3           | gamma       | omega        | 2, 3, 4, 7       |
+// //! | 2     | 0          | 3           | _skipped_   | _skipped_    | 1, 2, 3, 4, 6, 7 |
+// //!
+// //! As you can see, level 0 doesn't actually change much: we skip nearly everything and we do not
+// //! need to store the facet string value twice.
+// //!
+// //! The numbers in the left-bound and right-bound columns are incremental numbers representing the
+// //! level 0 strings, i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering
+// //! of the LMDB keys.
+// //!
+// //! In the value, not in the key, you can see that we added two new values: the left-string and the
+// //! right-string, which define the original facet strings associated with the given group.
+// //!
+// //! We put those two strings inside the value; this way we do not limit the maximum size of the
+// //! facet string values, and the impact on performance is not significant as, IIRC, LMDB puts big
+// //! values on another page, which helps in iterating over keys fast enough and only fetching the
+// //! page with the values when required.
+// //!
+// //! The other little advantage of this solution is that there is not a big overhead compared with
+// //! the facet number levels: we only duplicate the facet strings once, for the level 1.
+// //!
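To make the table above a little more concrete, here is a small model of the two key shapes and of the level >= 1 value described in this section. The type and field names are invented for the example and do not match the actual milli codecs; a plain `Vec<u32>` stands in for the roaring bitmap of documents ids.

```rust
// Hypothetical model of the facet string keys described above, not the actual milli types.
#[allow(dead_code)]
enum FacetStringKey<'a> {
    // Level 0: keyed by the normalized string itself.
    LevelZero { field_id: u16, normalized: &'a str },
    // Levels >= 1: keyed by incremental group numbers so the keys stay small and ordered.
    Group { field_id: u16, level: u8, left_group: u32, right_group: u32 },
}

// Hypothetical value for a level >= 1 entry: the original bound strings live in the value,
// next to the documents ids, so they do not blow up the key size.
#[allow(dead_code)]
struct FacetStringGroupValue<'a> {
    bounds: Option<(&'a str, &'a str)>, // `Some` on level 1 only, `None` above it
    docids: Vec<u32>,                   // a RoaringBitmap in the real database
}

fn main() {
    // The level 1 group covering the level 0 strings "alpha" (group 0) to "beta" (group 1).
    let key = FacetStringKey::Group { field_id: 3, level: 1, left_group: 0, right_group: 1 };
    let value = FacetStringGroupValue { bounds: Some(("alpha", "beta")), docids: vec![1, 2, 6, 7] };
    drop((key, value));
}
```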
+// //! #### A typical algorithm run
+// //!
+// //! Note that the algorithm is always moving from the highest level to the lowest one, one level
+// //! by one level, this is why it is ok to only store the facet strings on level 1.
+// //!
+// //! If a group of aggregated facet values (a group with numbers) contains one of the documents ids,
+// //! we must continue iterating over the sub-groups. To do so:
+// //! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds
+// //! and iterate over the facet groups defined by these numbers over the current level - 1.
+// //! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the
+// //! value and just do the same as with the facet numbers but with strings: iterate over the
+// //! current level - 1 with both keys.
+// //!
+// //! If this group is the lowest level (level 0) and contains at least one document id, we yield the
+// //! associated facet documents ids.
+// //!
+// //! If the group doesn't contain one of our documents ids, we continue to the next group at this
+// //! same level.
+// //!
-use std::num::NonZeroU8;
-use std::ops::Bound;
-use std::ops::Bound::{Excluded, Included, Unbounded};
+// use std::num::NonZeroU8;
+// use std::ops::Bound;
+// use std::ops::Bound::{Excluded, Included, Unbounded};
-use either::{Either, Left, Right};
-use heed::types::{ByteSlice, DecodeIgnore};
-use heed::{Database, LazyDecode, RoRange, RoRevRange};
-use roaring::RoaringBitmap;
+// use either::{Either, Left, Right};
+// use heed::types::{ByteSlice, DecodeIgnore};
+// use heed::{Database, LazyDecode, RoRange, RoRevRange};
+// use roaring::RoaringBitmap;
-use crate::heed_codec::facet::{
-    FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
-    FacetStringZeroBoundsValueCodec,
-};
-use crate::heed_codec::CboRoaringBitmapCodec;
-use crate::{FieldId, Index};
+// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec;
+// use crate::heed_codec::CboRoaringBitmapCodec;
+// use crate::{FieldId, Index};
-/// An iterator that is used to explore the facets level strings
-/// from the level 1 to infinity.
-///
-/// It yields the level, group id that an entry covers, the optional group strings
-/// that it covers of the level 0 only if it is an entry from the level 1 and
-/// the roaring bitmap associated.
-pub struct FacetStringGroupRange<'t> {
-    iter: RoRange<
-        't,
-        FacetLevelValueU32Codec,
-        LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>,
-    >,
-    end: Bound<u32>,
-}
+// /// An iterator that is used to explore the facets level strings
+// /// from the level 1 to infinity.
+// ///
+// /// It yields the level, group id that an entry covers, the optional group strings
+// /// that it covers of the level 0 only if it is an entry from the level 1 and
+// /// the roaring bitmap associated.
+// pub struct FacetStringGroupRange<'t> { +// iter: RoRange< +// 't, +// FacetLevelValueU32Codec, +// LazyDecode>, +// >, +// end: Bound, +// } -impl<'t> FacetStringGroupRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: NonZeroU8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let db = db.remap_types::< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >(); - let left_bound = match left { - Included(left) => Included((field_id, level, left, u32::MIN)), - Excluded(left) => Excluded((field_id, level, left, u32::MIN)), - Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), - }; - let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetStringGroupRange { iter, end: right }) - } -} +// impl<'t> FacetStringGroupRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// level: NonZeroU8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let db = db.remap_types::< +// FacetLevelValueU32Codec, +// FacetStringZeroBoundsValueCodec, +// >(); +// let left_bound = match left { +// Included(left) => Included((field_id, level, left, u32::MIN)), +// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), +// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), +// }; +// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); +// let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; +// Ok(FacetStringGroupRange { iter, end: right }) +// } +// } -impl<'t> Iterator for FacetStringGroupRange<'t> { - type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; +// impl<'t> Iterator for FacetStringGroupRange<'t> { +// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// fn next(&mut self) -> Option { +// match self.iter.next() { +// Some(Ok(((_fid, level, left, right), docids))) => { +// let must_be_returned = match self.end { +// Included(end) => right <= end, +// Excluded(end) => right < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match docids.decode() { +// Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), +// Err(e) => Some(Err(e)), +// } +// } else { +// None +// } +// } +// Some(Err(e)) => Some(Err(e)), +// None => None, +// } +// } +// } -pub struct FacetStringGroupRevRange<'t> { - iter: RoRevRange< - 't, - FacetLevelValueU32Codec, - LazyDecode>, - >, - end: Bound, -} +// pub struct FacetStringGroupRevRange<'t> { +// iter: RoRevRange< +// 't, +// FacetLevelValueU32Codec, +// LazyDecode>, +// >, +// end: Bound, +// } -impl<'t> FacetStringGroupRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: NonZeroU8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let db = db.remap_types::< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >(); - let left_bound = 
match left { - Included(left) => Included((field_id, level, left, u32::MIN)), - Excluded(left) => Excluded((field_id, level, left, u32::MIN)), - Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), - }; - let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetStringGroupRevRange { iter, end: right }) - } -} +// impl<'t> FacetStringGroupRevRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// level: NonZeroU8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let db = db.remap_types::< +// FacetLevelValueU32Codec, +// FacetStringZeroBoundsValueCodec, +// >(); +// let left_bound = match left { +// Included(left) => Included((field_id, level, left, u32::MIN)), +// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), +// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), +// }; +// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); +// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; +// Ok(FacetStringGroupRevRange { iter, end: right }) +// } +// } -impl<'t> Iterator for FacetStringGroupRevRange<'t> { - type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; +// impl<'t> Iterator for FacetStringGroupRevRange<'t> { +// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => { - return Some(Ok(((level, left, right), (bounds, docids)))) - } - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} +// fn next(&mut self) -> Option { +// loop { +// match self.iter.next() { +// Some(Ok(((_fid, level, left, right), docids))) => { +// let must_be_returned = match self.end { +// Included(end) => right <= end, +// Excluded(end) => right < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match docids.decode() { +// Ok((bounds, docids)) => { +// return Some(Ok(((level, left, right), (bounds, docids)))) +// } +// Err(e) => return Some(Err(e)), +// } +// } +// continue; +// } +// Some(Err(e)) => return Some(Err(e)), +// None => return None, +// } +// } +// } +// } -/// An iterator that is used to explore the level 0 of the facets string database. -/// -/// It yields the facet string and the roaring bitmap associated with it. -pub struct FacetStringLevelZeroRange<'t> { - iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -} +// /// An iterator that is used to explore the level 0 of the facets string database. +// /// +// /// It yields the facet string and the roaring bitmap associated with it. 
+// pub struct FacetStringLevelZeroRange<'t> { +// iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, +// } -impl<'t> FacetStringLevelZeroRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - left: Bound<&str>, - right: Bound<&str>, - ) -> heed::Result> { - fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - &buffer[..] - } +// impl<'t> FacetStringLevelZeroRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// left: Bound<&str>, +// right: Bound<&str>, +// ) -> heed::Result> { +// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { +// buffer.extend_from_slice(&field_id.to_be_bytes()); +// buffer.push(0); +// buffer.extend_from_slice(value.as_bytes()); +// &buffer[..] +// } - let mut left_buffer = Vec::new(); - let left_bound = match left { - Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), - Unbounded => { - left_buffer.extend_from_slice(&field_id.to_be_bytes()); - left_buffer.push(0); - Included(&left_buffer[..]) - } - }; +// let mut left_buffer = Vec::new(); +// let left_bound = match left { +// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), +// Unbounded => { +// left_buffer.extend_from_slice(&field_id.to_be_bytes()); +// left_buffer.push(0); +// Included(&left_buffer[..]) +// } +// }; - let mut right_buffer = Vec::new(); - let right_bound = match right { - Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), - Unbounded => { - right_buffer.extend_from_slice(&field_id.to_be_bytes()); - right_buffer.push(1); // we must only get the level 0 - Excluded(&right_buffer[..]) - } - }; +// let mut right_buffer = Vec::new(); +// let right_bound = match right { +// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), +// Unbounded => { +// right_buffer.extend_from_slice(&field_id.to_be_bytes()); +// right_buffer.push(1); // we must only get the level 0 +// Excluded(&right_buffer[..]) +// } +// }; - let iter = db - .remap_key_type::() - .range(rtxn, &(left_bound, right_bound))? - .remap_types::(); +// let iter = db +// .remap_key_type::() +// .range(rtxn, &(left_bound, right_bound))? 
+// .remap_types::(); - Ok(FacetStringLevelZeroRange { iter }) - } -} +// Ok(FacetStringLevelZeroRange { iter }) +// } +// } -impl<'t> Iterator for FacetStringLevelZeroRange<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; +// impl<'t> Iterator for FacetStringLevelZeroRange<'t> { +// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, normalized), (original, docids)))) => { - Some(Ok((normalized, original, docids))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// fn next(&mut self) -> Option { +// match self.iter.next() { +// Some(Ok(((_fid, normalized), (original, docids)))) => { +// Some(Ok((normalized, original, docids))) +// } +// Some(Err(e)) => Some(Err(e)), +// None => None, +// } +// } +// } -pub struct FacetStringLevelZeroRevRange<'t> { - iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -} +// pub struct FacetStringLevelZeroRevRange<'t> { +// iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, +// } -impl<'t> FacetStringLevelZeroRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - left: Bound<&str>, - right: Bound<&str>, - ) -> heed::Result> { - fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - &buffer[..] - } +// impl<'t> FacetStringLevelZeroRevRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// left: Bound<&str>, +// right: Bound<&str>, +// ) -> heed::Result> { +// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { +// buffer.extend_from_slice(&field_id.to_be_bytes()); +// buffer.push(0); +// buffer.extend_from_slice(value.as_bytes()); +// &buffer[..] 
+// } - let mut left_buffer = Vec::new(); - let left_bound = match left { - Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), - Unbounded => { - left_buffer.extend_from_slice(&field_id.to_be_bytes()); - left_buffer.push(0); - Included(&left_buffer[..]) - } - }; +// let mut left_buffer = Vec::new(); +// let left_bound = match left { +// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), +// Unbounded => { +// left_buffer.extend_from_slice(&field_id.to_be_bytes()); +// left_buffer.push(0); +// Included(&left_buffer[..]) +// } +// }; - let mut right_buffer = Vec::new(); - let right_bound = match right { - Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), - Unbounded => { - right_buffer.extend_from_slice(&field_id.to_be_bytes()); - right_buffer.push(1); // we must only get the level 0 - Excluded(&right_buffer[..]) - } - }; +// let mut right_buffer = Vec::new(); +// let right_bound = match right { +// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), +// Unbounded => { +// right_buffer.extend_from_slice(&field_id.to_be_bytes()); +// right_buffer.push(1); // we must only get the level 0 +// Excluded(&right_buffer[..]) +// } +// }; - let iter = db - .remap_key_type::() - .rev_range(rtxn, &(left_bound, right_bound))? - .remap_types::(); +// let iter = db +// .remap_key_type::() +// .rev_range(rtxn, &(left_bound, right_bound))? +// .remap_types::(); - Ok(FacetStringLevelZeroRevRange { iter }) - } -} +// Ok(FacetStringLevelZeroRevRange { iter }) +// } +// } -impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; +// impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { +// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, normalized), (original, docids)))) => { - Some(Ok((normalized, original, docids))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// fn next(&mut self) -> Option { +// match self.iter.next() { +// Some(Ok(((_fid, normalized), (original, docids)))) => { +// Some(Ok((normalized, original, docids))) +// } +// Some(Err(e)) => Some(Err(e)), +// None => None, +// } +// } +// } -type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; -type EitherStringRevRange<'t> = - Either, FacetStringLevelZeroRevRange<'t>>; +// type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; +// type EitherStringRevRange<'t> = +// Either, FacetStringLevelZeroRevRange<'t>>; -/// An iterator that is used to explore the facet strings level by level, -/// it will only return facets strings that are associated with the -/// candidates documents ids given. -pub struct FacetStringIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, - must_reduce: bool, -} +// /// An iterator that is used to explore the facet strings level by level, +// /// it will only return facets strings that are associated with the +// /// candidates documents ids given. 
+// pub struct FacetStringIter<'t> { +// rtxn: &'t heed::RoTxn<'t>, +// db: Database, +// field_id: FieldId, +// level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, +// must_reduce: bool, +// } -impl<'t> FacetStringIter<'t> { - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Left(highest_iter))], - must_reduce: true, - }) - } +// impl<'t> FacetStringIter<'t> { +// pub fn new_reducing( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_string_docids.remap_types::(); +// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; +// Ok(FacetStringIter { +// rtxn, +// db, +// field_id, +// level_iters: vec![(documents_ids, Left(highest_iter))], +// must_reduce: true, +// }) +// } - pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Right(highest_reverse_iter))], - must_reduce: true, - }) - } +// pub fn new_reverse_reducing( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_string_docids.remap_types::(); +// let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; +// Ok(FacetStringIter { +// rtxn, +// db, +// field_id, +// level_iters: vec![(documents_ids, Right(highest_reverse_iter))], +// must_reduce: true, +// }) +// } - pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Left(highest_iter))], - must_reduce: false, - }) - } +// pub fn new_non_reducing( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_string_docids.remap_types::(); +// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; +// Ok(FacetStringIter { +// rtxn, +// db, +// field_id, +// level_iters: vec![(documents_ids, Left(highest_iter))], +// must_reduce: false, +// }) +// } - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - Ok(db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits - .last() - .transpose()? - .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit - } +// fn highest_level( +// rtxn: &'t heed::RoTxn, +// db: Database, +// fid: FieldId, +// ) -> heed::Result> { +// Ok(db +// .remap_types::() +// .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits +// .last() +// .transpose()? 
+// .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit +// } - fn highest_iter( - rtxn: &'t heed::RoTxn, - index: &'t Index, - db: Database, - field_id: FieldId, - ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - match NonZeroU8::new(highest_level) { - Some(highest_level) => FacetStringGroupRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - ) - .map(Left), - None => FacetStringLevelZeroRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - ) - .map(Right), - } - } +// fn highest_iter( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// db: Database, +// field_id: FieldId, +// ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// match NonZeroU8::new(highest_level) { +// Some(highest_level) => FacetStringGroupRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// highest_level, +// Unbounded, +// Unbounded, +// ) +// .map(Left), +// None => FacetStringLevelZeroRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// Unbounded, +// Unbounded, +// ) +// .map(Right), +// } +// } - fn highest_reverse_iter( - rtxn: &'t heed::RoTxn, - index: &'t Index, - db: Database, - field_id: FieldId, - ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - match NonZeroU8::new(highest_level) { - Some(highest_level) => FacetStringGroupRevRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - ) - .map(Left), - None => FacetStringLevelZeroRevRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - ) - .map(Right), - } - } -} +// fn highest_reverse_iter( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// db: Database, +// field_id: FieldId, +// ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// match NonZeroU8::new(highest_level) { +// Some(highest_level) => FacetStringGroupRevRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// highest_level, +// Unbounded, +// Unbounded, +// ) +// .map(Left), +// None => FacetStringLevelZeroRevRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// Unbounded, +// Unbounded, +// ) +// .map(Right), +// } +// } +// } -impl<'t> Iterator for FacetStringIter<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; +// impl<'t> Iterator for FacetStringIter<'t> { +// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); +// fn next(&mut self) -> Option { +// 'outer: loop { +// let (documents_ids, last) = self.level_iters.last_mut()?; +// let is_ascending = last.is_left(); - // We remap the different iterator types to make - // the algorithm less complex to understand. 
- let last = match last { - Left(ascending) => match ascending { - Left(group) => Left(Left(group)), - Right(zero_level) => Right(Left(zero_level)), - }, - Right(descending) => match descending { - Left(group) => Left(Right(group)), - Right(zero_level) => Right(Right(zero_level)), - }, - }; +// // We remap the different iterator types to make +// // the algorithm less complex to understand. +// let last = match last { +// Left(ascending) => match ascending { +// Left(group) => Left(Left(group)), +// Right(zero_level) => Right(Left(zero_level)), +// }, +// Right(descending) => match descending { +// Left(group) => Left(Right(group)), +// Right(zero_level) => Right(Right(zero_level)), +// }, +// }; - match last { - Left(group) => { - for result in group { - match result { - Ok(((level, left, right), (string_bounds, mut docids))) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } +// match last { +// Left(group) => { +// for result in group { +// match result { +// Ok(((level, left, right), (string_bounds, mut docids))) => { +// docids &= &*documents_ids; +// if !docids.is_empty() { +// if self.must_reduce { +// *documents_ids -= &docids; +// } - let result = if is_ascending { - match string_bounds { - Some((left, right)) => FacetStringLevelZeroRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right), - None => FacetStringGroupRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Left) - } else { - match string_bounds { - Some((left, right)) => { - FacetStringLevelZeroRevRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right) - } - None => FacetStringGroupRevRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Right) - }; +// let result = if is_ascending { +// match string_bounds { +// Some((left, right)) => FacetStringLevelZeroRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// Included(left), +// Included(right), +// ) +// .map(Right), +// None => FacetStringGroupRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// NonZeroU8::new(level.get() - 1).unwrap(), +// Included(left), +// Included(right), +// ) +// .map(Left), +// } +// .map(Left) +// } else { +// match string_bounds { +// Some((left, right)) => { +// FacetStringLevelZeroRevRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// Included(left), +// Included(right), +// ) +// .map(Right) +// } +// None => FacetStringGroupRevRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// NonZeroU8::new(level.get() - 1).unwrap(), +// Included(left), +// Included(right), +// ) +// .map(Left), +// } +// .map(Right) +// }; - match result { - Ok(iter) => { - self.level_iters.push((docids, iter)); - continue 'outer; - } - Err(e) => return Some(Err(e)), - } - } - } - Err(e) => return Some(Err(e)), - } - } - } - Right(zero_level) => { - // level zero only - for result in zero_level { - match result { - Ok((normalized, original, mut docids)) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } - return Some(Ok((normalized, original, docids))); - } - } - Err(e) => return Some(Err(e)), - } - } - } - } +// match result { +// Ok(iter) => { +// self.level_iters.push((docids, 
iter)); +// continue 'outer; +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// Right(zero_level) => { +// // level zero only +// for result in zero_level { +// match result { +// Ok((normalized, original, mut docids)) => { +// docids &= &*documents_ids; +// if !docids.is_empty() { +// if self.must_reduce { +// *documents_ids -= &docids; +// } +// return Some(Ok((normalized, original, docids))); +// } +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// } - self.level_iters.pop(); - } - } -} +// self.level_iters.pop(); +// } +// } +// } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 7241dab2b..e911dfb15 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,16 +1,20 @@ use std::collections::HashSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; +use std::ops::RangeBounds; use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; +use heed::LazyDecode; use log::debug; use roaring::RoaringBitmap; -use super::FacetNumberRange; +// use super::FacetNumberRange; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; +// use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{ distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, }; @@ -144,18 +148,29 @@ impl<'a> Filter<'a> { } } +fn explore_facet_number_levels( + rtxn: &heed::RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: FieldId, +) { +} + impl<'a> Filter<'a> { /// Aggregates the documents ids that are part of the specified range automatically /// going deeper through the levels. fn explore_facet_number_levels( rtxn: &heed::RoTxn, - db: heed::Database, + db: heed::Database, CboRoaringBitmapCodec>, field_id: FieldId, level: u8, left: Bound, right: Bound, output: &mut RoaringBitmap, ) -> Result<()> { + // level must be > 0, I'll create a separate function for level 0 + // if level == 0 { + // call that function + //} match (left, right) { // If the request is an exact value we must go directly to the deepest level. (Included(l), Included(r)) if l == r && level > 0 => { @@ -170,87 +185,121 @@ impl<'a> Filter<'a> { (Excluded(l), Included(r)) if l >= r => return Ok(()), (_, _) => (), } - - let mut left_found = None; - let mut right_found = None; - - // We must create a custom iterator to be able to iterate over the - // requested range as the range iterator cannot express some conditions. - let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; - - debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - for (i, result) in iter.enumerate() { - let ((_fid, level, l, r), docids) = result?; - debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - *output |= docids; - // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { - left_found = Some(l); - } - right_found = Some(r); - } - - // Can we go deeper? 
- let deeper_level = match level.checked_sub(1) { - Some(level) => level, - None => return Ok(()), + let range_start_key = FacetKey { + field_id, + level, + left_bound: match left { + Included(l) => l, + Excluded(l) => l, + Bound::Unbounded => f64::MIN, + }, }; + let mut range_iter = db + .remap_data_type::>() + .range(rtxn, &(range_start_key..))?; - // We must refine the left and right bounds of this range by retrieving the - // missing part in a deeper level. - match left_found.zip(right_found) { - Some((left_found, right_found)) => { - // If the bound is satisfied we avoid calling this function again. - if !matches!(left, Included(l) if l == left_found) { - let sub_right = Excluded(left_found); - debug!( - "calling left with {:?} to {:?} (level {})", - left, sub_right, deeper_level - ); - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - left, - sub_right, - output, - )?; - } - if !matches!(right, Included(r) if r == right_found) { - let sub_left = Excluded(right_found); - debug!( - "calling right with {:?} to {:?} (level {})", - sub_left, right, deeper_level - ); - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - sub_left, - right, - output, - )?; - } - } - None => { - // If we found nothing at this level it means that we must find - // the same bounds but at a deeper, more precise level. - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - left, - right, - output, - )?; - } + let (mut previous_facet_key, mut previous_value) = range_iter.next().unwrap()?; + while let Some(el) = range_iter.next() { + let (facet_key, value) = el?; + let range = (Included(previous_facet_key.left_bound), Excluded(facet_key.left_bound)); + // if the current range intersects with the query range, then go deeper + // what does it mean for two ranges to intersect? + let gte_left = match left { + Included(l) => previous_facet_key.left_bound >= l, + Excluded(l) => previous_facet_key.left_bound > l, // TODO: not true? + Bound::Unbounded => true, + }; + let lte_right = match right { + Included(r) => facet_key.left_bound <= r, + Excluded(r) => facet_key.left_bound < r, + Bound::Unbounded => true, + }; } + // at this point, previous_facet_key and previous_value are the last groups in the level + // we must also check whether we should visit this group - Ok(()) + todo!(); + + // let mut left_found = None; + // let mut right_found = None; + + // // We must create a custom iterator to be able to iterate over the + // // requested range as the range iterator cannot express some conditions. + // let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; + + // debug!("Iterating between {:?} and {:?} (level {})", left, right, level); + + // for (i, result) in iter.enumerate() { + // let ((_fid, level, l, r), docids) = result?; + // debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); + // *output |= docids; + // // We save the leftest and rightest bounds we actually found at this level. + // if i == 0 { + // left_found = Some(l); + // } + // right_found = Some(r); + // } + + // // Can we go deeper? + // let deeper_level = match level.checked_sub(1) { + // Some(level) => level, + // None => return Ok(()), + // }; + + // // We must refine the left and right bounds of this range by retrieving the + // // missing part in a deeper level. + // match left_found.zip(right_found) { + // Some((left_found, right_found)) => { + // // If the bound is satisfied we avoid calling this function again. 
+ // if !matches!(left, Included(l) if l == left_found) { + // let sub_right = Excluded(left_found); + // debug!( + // "calling left with {:?} to {:?} (level {})", + // left, sub_right, deeper_level + // ); + // Self::explore_facet_number_levels( + // rtxn, + // db, + // field_id, + // deeper_level, + // left, + // sub_right, + // output, + // )?; + // } + // if !matches!(right, Included(r) if r == right_found) { + // let sub_left = Excluded(right_found); + // debug!( + // "calling right with {:?} to {:?} (level {})", + // sub_left, right, deeper_level + // ); + // Self::explore_facet_number_levels( + // rtxn, + // db, + // field_id, + // deeper_level, + // sub_left, + // right, + // output, + // )?; + // } + // } + // None => { + // // If we found nothing at this level it means that we must find + // // the same bounds but at a deeper, more precise level. + // Self::explore_facet_number_levels( + // rtxn, + // db, + // field_id, + // deeper_level, + // left, + // right, + // output, + // )?; + // } + // } + + // Ok(()) } fn evaluate_operator( @@ -277,23 +326,27 @@ impl<'a> Filter<'a> { return Ok(exist); } Condition::Equal(val) => { - let (_original_value, string_docids) = strings_db - .get(rtxn, &(field_id, &val.value().to_lowercase()))? + let string_docids = strings_db + .get( + rtxn, + &FacetKey { field_id, level: 0, left_bound: &val.value().to_lowercase() }, + )? + .map(|v| v.bitmap) .unwrap_or_default(); let number = val.parse::().ok(); let number_docids = match number { Some(n) => { let n = Included(n); let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels( - rtxn, - numbers_db, - field_id, - 0, - n, - n, - &mut output, - )?; + // Self::explore_facet_number_levels( + // rtxn, + // numbers_db, + // field_id, + // 0, + // n, + // n, + // &mut output, + // )?; output } None => RoaringBitmap::new(), @@ -312,21 +365,32 @@ impl<'a> Filter<'a> { // that's fine if it don't, the value just before will be returned instead. let biggest_level = numbers_db .remap_data_type::() - .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))? - .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); + .get_lower_than_or_equal_to( + rtxn, + &FacetKey { field_id, level: u8::MAX, left_bound: f64::MAX }, + )? + .and_then( + |(FacetKey { field_id: id, level, .. 
}, _)| { + if id == field_id { + Some(level) + } else { + None + } + }, + ); match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels( - rtxn, - numbers_db, - field_id, - level, - left, - right, - &mut output, - )?; + // Self::explore_facet_number_levels( + // rtxn, + // numbers_db, + // field_id, + // level, + // left, + // right, + // &mut output, + // )?; Ok(output) } None => Ok(RoaringBitmap::new()), diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index e3ac95882..13b00d2de 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,6 +1,6 @@ pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; -pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; -pub use self::facet_string::FacetStringIter; +// pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; +// pub use self::facet_string::FacetStringIter; pub use self::filter::Filter; mod facet_distribution; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 1b62a67c7..d05e807df 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -15,7 +15,7 @@ use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, FacetNumberIter, Filter, DEFAULT_VALUES_PER_FACET}; +pub use self::facet::{FacetDistribution, /* FacetNumberIter,*/ Filter, DEFAULT_VALUES_PER_FACET,}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index e9c92a949..4031c9b06 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -6,10 +6,7 @@ use heed::types::ByteSlice; use heed::BytesDecode; use roaring::RoaringBitmap; -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FacetStringZeroBoundsValueCodec, -}; +use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index}; #[track_caller] @@ -232,46 +229,48 @@ pub fn snap_word_prefix_position_docids(index: &Index) -> String { snap } pub fn snap_facet_id_f64_docids(index: &Index) -> String { - let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( - (facet_id, level, left, right), - b, - )| { - &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) - }); - snap + todo!() + // let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( + // (facet_id, level, left, right), + // b, + // )| { + // &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) + // }); + // snap } pub fn snap_facet_id_string_docids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let bytes_db = index.facet_id_string_docids.remap_types::(); - let iter = bytes_db.iter(&rtxn).unwrap(); - let mut snap = String::new(); + todo!() + // let rtxn = index.read_txn().unwrap(); + // let bytes_db = index.facet_id_string_docids.remap_types::(); + // let iter = bytes_db.iter(&rtxn).unwrap(); + // let mut snap = String::new(); - for x in iter { - let (key, value) = x.unwrap(); - if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { - let (orig_string, docids) = - 
FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); - snap.push_str(&format!( - "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", - display_bitmap(&docids) - )); - } else if let Some((field_id, level, left, right)) = - FacetLevelValueU32Codec::bytes_decode(key) - { - snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); - let (bounds, docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(value) - .unwrap(); - if let Some((left, right)) = bounds { - snap.push_str(&format!("{left:<8} {right:<8} ")); - } - snap.push_str(&display_bitmap(&docids)); - snap.push('\n'); - } else { - panic!(); - } - } - snap + // for x in iter { + // let (key, value) = x.unwrap(); + // if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { + // let (orig_string, docids) = + // FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); + // snap.push_str(&format!( + // "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", + // display_bitmap(&docids) + // )); + // } else if let Some((field_id, level, left, right)) = + // FacetLevelValueU32Codec::bytes_decode(key) + // { + // snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); + // let (bounds, docids) = + // FacetStringZeroBoundsValueCodec::::bytes_decode(value) + // .unwrap(); + // if let Some((left, right)) = bounds { + // snap.push_str(&format!("{left:<8} {right:<8} ")); + // } + // snap.push_str(&display_bitmap(&docids)); + // snap.push('\n'); + // } else { + // panic!(); + // } + // } + // snap } pub fn snap_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 54328b50d..bb30f24c9 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -10,9 +10,7 @@ use time::OffsetDateTime; use super::ClearDocuments; use crate::error::{InternalError, SerializationError, UserError}; -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, -}; +use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ @@ -442,11 +440,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_docids( - self.wtxn, - facet_id_f64_docids, - &self.to_delete_docids, - )?; + // TODO: remove_docids_from_facet_field_id_docids( + // self.wtxn, + // facet_id_f64_docids, + // &self.to_delete_docids, + // )?; // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_field_id_docids( self.wtxn, @@ -587,57 +585,57 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( db: &heed::Database, to_remove: &RoaringBitmap, ) -> crate::Result<()> { - let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); - let mut iter = db.remap_types::().iter_mut(wtxn)?; - while let Some(result) = iter.next() { - let (key, val) = result?; - match FacetLevelValueU32Codec::bytes_decode(key) { - Some(_) => { - // If we are able to parse this key it means it is a facet string group - // level key. We must then parse the value using the appropriate codec. 
- let (group, mut docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(val) - .ok_or_else(|| SerializationError::Decoding { db_name })?; + // let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); + // let mut iter = db.remap_types::().iter_mut(wtxn)?; + // while let Some(result) = iter.next() { + // let (key, val) = result?; + // match FacetLevelValueU32Codec::bytes_decode(key) { + // Some(_) => { + // // If we are able to parse this key it means it is a facet string group + // // level key. We must then parse the value using the appropriate codec. + // let (group, mut docids) = + // FacetStringZeroBoundsValueCodec::::bytes_decode(val) + // .ok_or_else(|| SerializationError::Decoding { db_name })?; - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - let val = &(group, docids); - let value_bytes = - FacetStringZeroBoundsValueCodec::::bytes_encode(val) - .ok_or_else(|| SerializationError::Encoding { db_name })?; + // let previous_len = docids.len(); + // docids -= to_remove; + // if docids.is_empty() { + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.del_current()? }; + // } else if docids.len() != previous_len { + // let key = key.to_owned(); + // let val = &(group, docids); + // let value_bytes = + // FacetStringZeroBoundsValueCodec::::bytes_encode(val) + // .ok_or_else(|| SerializationError::Encoding { db_name })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &value_bytes)? }; - } - } - None => { - // The key corresponds to a level zero facet string. - let (original_value, mut docids) = - FacetStringLevelZeroValueCodec::bytes_decode(val) - .ok_or_else(|| SerializationError::Decoding { db_name })?; + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.put_current(&key, &value_bytes)? }; + // } + // } + // None => { + // // The key corresponds to a level zero facet string. + // let (original_value, mut docids) = + // FacetStringLevelZeroValueCodec::bytes_decode(val) + // .ok_or_else(|| SerializationError::Decoding { db_name })?; - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - let val = &(original_value, docids); - let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) - .ok_or_else(|| SerializationError::Encoding { db_name })?; + // let previous_len = docids.len(); + // docids -= to_remove; + // if docids.is_empty() { + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.del_current()? }; + // } else if docids.len() != previous_len { + // let key = key.to_owned(); + // let val = &(original_value, docids); + // let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) + // .ok_or_else(|| SerializationError::Encoding { db_name })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &value_bytes)? }; - } - } - } - } + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.put_current(&key, &value_bytes)? 
}; + // } + // } + // } + // } Ok(()) } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 108acae4f..0926b63f4 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -136,11 +136,12 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; use crate::error::InternalError; -use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, }; -use crate::heed_codec::CboRoaringBitmapCodec; +// use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; use crate::{FieldId, Index, Result}; @@ -187,16 +188,18 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + let mut nested_wtxn = self.index.env.nested_write_txn(self.wtxn).unwrap(); + for field_id in faceted_fields { // Clear the facet string levels. - clear_field_string_levels( - self.wtxn, - self.index.facet_id_string_docids.remap_types::(), - field_id, - )?; + // clear_field_string_levels( + // &mut nested_wtxn, + // self.index.facet_id_string_docids.remap_types::(), + // field_id, + // )?; let (facet_string_levels, string_documents_ids) = compute_facet_strings_levels( - self.wtxn, + &mut nested_wtxn, self.index.facet_id_string_docids, self.chunk_compression_type, self.chunk_compression_level, @@ -206,13 +209,13 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; self.index.put_string_faceted_documents_ids( - self.wtxn, + &mut nested_wtxn, field_id, &string_documents_ids, )?; for facet_strings_level in facet_string_levels { write_into_lmdb_database( - self.wtxn, + &mut nested_wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_strings_level, |_, _| { @@ -221,11 +224,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; } - // Clear the facet number levels. - clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; + // // Clear the facet number levels. + // clear_field_number_levels(&mut nested_wtxn, self.index.facet_id_f64_docids, field_id)?; let (facet_number_levels, number_documents_ids) = compute_facet_number_levels( - self.wtxn, + &mut nested_wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, self.chunk_compression_level, @@ -235,14 +238,14 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; self.index.put_number_faceted_documents_ids( - self.wtxn, + &mut nested_wtxn, field_id, &number_documents_ids, )?; for facet_number_level in facet_number_levels { write_into_lmdb_database( - self.wtxn, + &mut nested_wtxn, *self.index.facet_id_f64_docids.as_polymorph(), facet_number_level, |_, _| { @@ -263,8 +266,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { /// that must be inserted into the database. /// 2. 
a roaring bitmap of all the document ids present in the database fn compute_facet_number_levels<'t>( - rtxn: &'t heed::RoTxn, - db: heed::Database, + rtxn: &'t mut heed::RwTxn, + db: heed::Database, FacetGroupValueCodec>, compression_type: CompressionType, compression_level: Option, level_group_size: NonZeroUsize, @@ -277,7 +280,7 @@ fn compute_facet_number_levels<'t>( .remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_start = (field_id, 0, f64::MIN, f64::MIN); + let level_0_start = FacetKey { field_id, level: 0, left_bound: f64::MIN }; // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. @@ -289,37 +292,31 @@ fn compute_facet_number_levels<'t>( let mut number_document_ids = RoaringBitmap::new(); if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = - recursive_compute_levels::( - rtxn, - db, - compression_type, - compression_level, - *top_level, - level_0_start, - &(level_0_start..), - first_level_size, - level_group_size, - &mut |bitmaps, _, _| { - for bitmap in bitmaps { - number_document_ids |= bitmap; - } - Ok(()) - }, - &|_i, (_field_id, _level, left, _right)| *left, - &|bitmap| bitmap, - &|writer, level, left, right, docids| { - write_number_entry(writer, field_id, level.get(), left, right, &docids)?; - Ok(()) - }, - )?; + let subwriters = recursive_compute_levels::( + rtxn, + db, + compression_type, + compression_level, + field_id, + *top_level, + level_0_start, + &(level_0_start..), + first_level_size, + level_group_size, + &mut |bitmaps, _| { + for bitmap in bitmaps { + number_document_ids |= bitmap; + } + Ok(()) + }, + )?; Ok((subwriters, number_document_ids)) } else { let mut documents_ids = RoaringBitmap::new(); for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, docids) = result?; - documents_ids |= docids; + let (_key, group_value) = result?; + documents_ids |= group_value.bitmap; } Ok((vec![], documents_ids)) @@ -333,8 +330,8 @@ fn compute_facet_number_levels<'t>( /// that must be inserted into the database. /// 2. a roaring bitmap of all the document ids present in the database fn compute_facet_strings_levels<'t>( - rtxn: &'t heed::RoTxn, - db: heed::Database, + rtxn: &'t mut heed::RwTxn, + db: heed::Database, FacetGroupValueCodec>, compression_type: CompressionType, compression_level: Option, level_group_size: NonZeroUsize, @@ -347,7 +344,7 @@ fn compute_facet_strings_levels<'t>( .remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_start = (field_id, ""); + let level_0_start = FacetKey { field_id, level: 0, left_bound: "" }; // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. 
@@ -359,40 +356,31 @@ fn compute_facet_strings_levels<'t>( let mut strings_document_ids = RoaringBitmap::new(); if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = recursive_compute_levels::< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - (u32, &str), - >( + let subwriters = recursive_compute_levels::( rtxn, db, compression_type, compression_level, + field_id, *top_level, level_0_start, &(level_0_start..), first_level_size, level_group_size, - &mut |bitmaps, _, _| { + &mut |bitmaps, _| { for bitmap in bitmaps { strings_document_ids |= bitmap; } Ok(()) }, - &|i, (_field_id, value)| (i as u32, *value), - &|value| value.1, - &|writer, level, start_bound, end_bound, docids| { - write_string_entry(writer, field_id, level, start_bound, end_bound, docids)?; - Ok(()) - }, )?; Ok((subwriters, strings_document_ids)) } else { let mut documents_ids = RoaringBitmap::new(); for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, (_original_value, docids)) = result?; - documents_ids |= docids; + let (_key, group_value) = result?; + documents_ids |= group_value.bitmap; } Ok((vec![], documents_ids)) @@ -436,29 +424,26 @@ from the level below were read/created. Its arguments are: A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` that must be inserted into the database. */ -fn recursive_compute_levels<'t, KeyCodec, ValueCodec, Bound>( - rtxn: &'t heed::RoTxn, - db: heed::Database, +fn recursive_compute_levels<'t, BoundCodec>( + rtxn: &'t mut heed::RwTxn, + db: heed::Database, FacetGroupValueCodec>, compression_type: CompressionType, compression_level: Option, + field_id: FieldId, level: u8, - level_0_start: >::DItem, - level_0_range: &'t RangeFrom<>::DItem>, + level_0_start: FacetKey<>::EItem>, + level_0_range: &'t RangeFrom>::EItem>>, level_0_size: usize, level_group_size: NonZeroUsize, - computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], Bound, Bound) -> Result<()>, - bound_from_db_key: &dyn for<'a> Fn(usize, &'a >::DItem) -> Bound, - bitmap_from_db_value: &dyn Fn(>::DItem) -> RoaringBitmap, - write_entry: &dyn Fn(&mut Writer, NonZeroU8, Bound, Bound, RoaringBitmap) -> Result<()>, + computed_group_bitmap: &mut dyn FnMut( + &[RoaringBitmap], + >::EItem, + ) -> Result<()>, ) -> Result>> where - KeyCodec: for<'a> BytesEncode<'a> - + for<'a> BytesDecode<'a, DItem = >::EItem>, - for<'a> >::EItem: Sized, - ValueCodec: for<'a> BytesEncode<'a> - + for<'a> BytesDecode<'a, DItem = >::EItem>, - for<'a> >::EItem: Sized, - Bound: Copy, + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + for<'a> >::EItem: Copy + Sized, { if level == 0 { // base case for the recursion @@ -468,31 +453,32 @@ where // 2. 
fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read let mut bitmaps = vec![]; - let mut start_bound = bound_from_db_key(0, &level_0_start); - let mut end_bound = bound_from_db_key(0, &level_0_start); + let mut start_bound = level_0_start.left_bound; + // let mut end_bound = level_0_start.bound; + let mut first_iteration_for_new_group = true; for (i, db_result_item) in db.range(rtxn, level_0_range)?.take(level_0_size).enumerate() { let (key, value) = db_result_item?; - let bound = bound_from_db_key(i, &key); - let docids = bitmap_from_db_value(value); + let bound = key.left_bound; + let docids = value.bitmap; if first_iteration_for_new_group { start_bound = bound; first_iteration_for_new_group = false; } - end_bound = bound; + // end_bound = bound; bitmaps.push(docids); if bitmaps.len() == level_group_size.get() { - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; + computed_group_bitmap(&bitmaps, start_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); } } // don't forget to give the leftover bitmaps as well if !bitmaps.is_empty() { - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; + computed_group_bitmap(&bitmaps, start_bound)?; bitmaps.clear(); } // level 0 is already stored in the DB @@ -516,48 +502,52 @@ where db, compression_type, compression_level, + field_id, level - 1, level_0_start, level_0_range, level_0_size, level_group_size, - &mut |sub_bitmaps: &[RoaringBitmap], start_range, end_range| { + &mut |sub_bitmaps: &[RoaringBitmap], + start_range: >::EItem| { let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; } - range_for_bitmaps.push((start_range, end_range)); + range_for_bitmaps.push(start_range); bitmaps.push(combined_bitmap); if bitmaps.len() == level_group_size.get() { - let start_bound = range_for_bitmaps.first().unwrap().0; - let end_bound = range_for_bitmaps.last().unwrap().1; - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; - for (bitmap, (start_bound, end_bound)) in - bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) + let start_bound = range_for_bitmaps.first().unwrap(); + computed_group_bitmap(&bitmaps, *start_bound)?; + for (bitmap, start_bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - write_entry( + write_entry::( &mut cur_writer, + field_id, NonZeroU8::new(level).unwrap(), start_bound, - end_bound, bitmap, )?; } } Ok(()) }, - bound_from_db_key, - bitmap_from_db_value, - write_entry, )?; + // don't forget to insert the leftover elements into the writer as well if !bitmaps.is_empty() { - let start_range = range_for_bitmaps.first().unwrap().0; - let end_range = range_for_bitmaps.last().unwrap().1; - computed_group_bitmap(&bitmaps, start_range, end_range)?; - for (bitmap, (left, right)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - write_entry(&mut cur_writer, NonZeroU8::new(level).unwrap(), left, right, bitmap)?; + let start_range = range_for_bitmaps.first().unwrap(); + let end_range = range_for_bitmaps.last().unwrap(); + computed_group_bitmap(&bitmaps, *start_range)?; + for (bitmap, bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { + write_entry( + &mut cur_writer, + field_id, + NonZeroU8::new(level).unwrap(), + bound, + bitmap, + )?; } } @@ -566,60 +556,25 @@ where } } -fn clear_field_number_levels<'t>( - wtxn: &'t mut heed::RwTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result<()> { - let left = (field_id, 1, f64::MIN, f64::MIN); - let right = (field_id, u8::MAX, f64::MAX, 
f64::MAX); - let range = left..=right; - db.delete_range(wtxn, &range).map(drop) -} - -fn clear_field_string_levels<'t>( - wtxn: &'t mut heed::RwTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result<()> { - let left = (field_id, NonZeroU8::new(1).unwrap(), u32::MIN, u32::MIN); - let right = (field_id, NonZeroU8::new(u8::MAX).unwrap(), u32::MAX, u32::MAX); - let range = left..=right; - db.remap_key_type::().delete_range(wtxn, &range).map(drop) -} - -fn write_number_entry( - writer: &mut Writer, - field_id: FieldId, - level: u8, - left: f64, - right: f64, - ids: &RoaringBitmap, -) -> Result<()> { - let key = (field_id, level, left, right); - let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) -} -fn write_string_entry( +fn write_entry( writer: &mut Writer, field_id: FieldId, level: NonZeroU8, - (left_id, left_value): (u32, &str), - (right_id, right_value): (u32, &str), + bound: >::EItem, docids: RoaringBitmap, -) -> Result<()> { - let key = (field_id, level, left_id, right_id); - let key = FacetLevelValueU32Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = match level.get() { - 1 => (Some((left_value, right_value)), docids), - _ => (None, docids), - }; - let data = FacetStringZeroBoundsValueCodec::::bytes_encode(&data) - .ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) +) -> Result<()> +where + for<'a> BoundCodec: BytesEncode<'a>, + for<'a> >::EItem: Copy + Sized, +{ + todo!() + // let key = FacetKey { field_id, level: level.get(), left_bound: bound }; + // let key_bytes = FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + // let value_bytes = + // FacetGroupValueCodec::bytes_encode(&FacetGroupValue { size: 4, bitmap: docids }) + // .ok_or(Error::Encoding)?; + // writer.insert(&key_bytes, &value_bytes)?; + // Ok(()) } #[cfg(test)] diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 61157fa35..c5424a346 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,7 +6,7 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FieldDocIdFacetF64Codec}; +use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. @@ -31,13 +31,14 @@ pub fn extract_facet_number_docids( let mut cursor = docid_fid_facet_number.into_cursor()?; while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ - let (field_id, document_id, number) = - FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); + todo!() + // let (field_id, document_id, number) = + // FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); - let key = (field_id, 0, number, number); - let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); + // let key = (field_id, 0, number, number); + // // let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); - facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + // facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } sorter_into_reader(facet_number_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index f7aa3730c..4e655329e 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -4,11 +4,9 @@ use std::{io, str}; use roaring::RoaringBitmap; -use super::helpers::{ - create_sorter, keep_first_prefix_value_merge_roaring_bitmaps, sorter_into_reader, - try_split_array_at, GrenadParameters, -}; -use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; +use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; +// use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; use crate::{FieldId, Result}; /// Extracts the facet string and the documents ids where this facet string appear. @@ -24,7 +22,7 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - keep_first_prefix_value_merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, // TODO: check indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -42,14 +40,16 @@ pub fn extract_facet_string_docids( let original_value = str::from_utf8(original_value_bytes)?; key_buffer.clear(); - FacetStringLevelZeroCodec::serialize_into( - field_id, - str::from_utf8(normalized_value_bytes)?, - &mut key_buffer, - ); + // TODO + // FacetStringLevelZeroCodec::serialize_into( + // field_id, + // str::from_utf8(normalized_value_bytes)?, + // &mut key_buffer, + // ); value_buffer.clear(); - encode_prefix_string(original_value, &mut value_buffer)?; + // TODO + // encode_prefix_string(original_value, &mut value_buffer)?; let bitmap = RoaringBitmap::from_iter(Some(document_id)); bitmap.serialize_into(&mut value_buffer)?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 50cc04610..1e414458f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -25,8 +25,8 @@ use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ - as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, - merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, + as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, + GrenadParameters, MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ 
-142,7 +142,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, - keep_first_prefix_value_merge_roaring_bitmaps, + merge_roaring_bitmaps, // TODO: check (cbo?) TypedChunk::FieldIdFacetStringDocids, "field-id-facet-string-docids", ); diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index dbe3c0344..cef27ab30 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -5,7 +5,7 @@ use std::result::Result as StdResult; use roaring::RoaringBitmap; use super::read_u32_ne_bytes; -use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; +// use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::Result; @@ -49,32 +49,32 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul } } -pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( - _key: &[u8], - values: &[Cow<'a, [u8]>], -) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - let original = decode_prefix_string(&values[0]).unwrap().0; - let merged_bitmaps = values - .iter() - .map(AsRef::as_ref) - .map(decode_prefix_string) - .map(Option::unwrap) - .map(|(_, bitmap_bytes)| bitmap_bytes) - .map(RoaringBitmap::deserialize_from) - .map(StdResult::unwrap) - .reduce(|a, b| a | b) - .unwrap(); +// pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( +// _key: &[u8], +// values: &[Cow<'a, [u8]>], +// ) -> Result> { +// if values.len() == 1 { +// Ok(values[0].clone()) +// } else { +// let original = decode_prefix_string(&values[0]).unwrap().0; +// let merged_bitmaps = values +// .iter() +// .map(AsRef::as_ref) +// .map(decode_prefix_string) +// .map(Option::unwrap) +// .map(|(_, bitmap_bytes)| bitmap_bytes) +// .map(RoaringBitmap::deserialize_from) +// .map(StdResult::unwrap) +// .reduce(|a, b| a | b) +// .unwrap(); - let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); - let mut buffer = Vec::with_capacity(cap); - encode_prefix_string(original, &mut buffer)?; - merged_bitmaps.serialize_into(&mut buffer)?; - Ok(Cow::Owned(buffer)) - } -} +// let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); +// let mut buffer = Vec::with_capacity(cap); +// encode_prefix_string(original, &mut buffer)?; +// merged_bitmaps.serialize_into(&mut buffer)?; +// Ok(Cow::Owned(buffer)) +// } +// } pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { Ok(values[0].clone()) diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 6466a636b..7e2ebd2d3 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -13,9 +13,9 @@ pub use grenad_helpers::{ writer_into_reader, GrenadParameters, MergeableReader, }; pub use merge_functions::{ - concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, - merge_cbo_roaring_bitmaps, merge_obkvs, merge_roaring_bitmaps, merge_two_obkvs, - roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, + concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs, + merge_roaring_bitmaps, merge_two_obkvs, roaring_bitmap_from_u32s_array, + serialize_roaring_bitmap, MergeFn, }; /// The maximum length a word 
can be diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 8464c98b6..7a9787bdb 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -13,7 +13,6 @@ use super::helpers::{ valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; -use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, @@ -197,13 +196,14 @@ pub(crate) fn write_typed_chunk_into_index( index_is_empty, |value, _buffer| Ok(value), |new_values, db_values, buffer| { - let (_, new_values) = decode_prefix_string(new_values).unwrap(); - let new_values = RoaringBitmap::deserialize_from(new_values)?; - let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); - let db_values = RoaringBitmap::deserialize_from(db_values)?; - let values = new_values | db_values; - encode_prefix_string(db_original, buffer)?; - Ok(values.serialize_into(buffer)?) + todo!() + // let (_, new_values) = decode_prefix_string(new_values).unwrap(); + // let new_values = RoaringBitmap::deserialize_from(new_values)?; + // let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); + // let db_values = RoaringBitmap::deserialize_from(db_values)?; + // let values = new_values | db_values; + // encode_prefix_string(db_original, buffer)?; + // Ok(values.serialize_into(buffer)?) }, )?; is_merged_database = true; From 7913d6365ca3dbb759c1e95dd47ac107aa0d7648 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 30 Aug 2022 14:03:18 +0200 Subject: [PATCH 02/58] Update Facets indexing to be compatible with new database structure --- milli/src/update/facets.rs | 631 ++++++------------------ milli/src/update/index_documents/mod.rs | 24 +- 2 files changed, 178 insertions(+), 477 deletions(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 0926b63f4..aaaa445da 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,168 +1,43 @@ -/*! -This module initialises the databases that are used to quickly get the list -of documents with a faceted field value falling within a certain range. For -example, they can be used to implement filters such as `x >= 3`. - -These databases are `facet_id_string_docids` and `facet_id_f64_docids`. - -## Example with numbers - -In the case of numbers, we start with a sorted list whose keys are -`(field_id, number_value)` and whose value is a roaring bitmap of the document ids -which contain the value `number_value` for the faceted field `field_id`. - -From this list, we want to compute two things: - -1. the bitmap of all documents that contain **any** number for each faceted field -2. a structure that allows us to use a (sort of) binary search to find all documents -containing numbers inside a certain range for a faceted field - -To achieve goal (2), we recursively split the list into chunks. Every time we split it, we -create a new "level" that is several times smaller than the level below it. The base level, -level 0, is the starting list. Level 1 is composed of chunks of up to N elements. Each element -contains a range and a bitmap of docids. Level 2 is composed of chunks up to N^2 elements, etc. - -For example, let's say we have 26 documents which we identify through the letters a-z. 
-We will focus on a single faceted field. When there are multiple faceted fields, the structure -described below is simply repeated for each field. - -What we want to obtain is the following structure for each faceted field: -```text -┌───────┐ ┌───────────────────────────────────────────────────────────────────────────────┐ -│ all │ │ [a, b, c, d, e, f, g, u, y, z] │ -└───────┘ └───────────────────────────────────────────────────────────────────────────────┘ - ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ -┌───────┐ │ 1.2 – 2 │ 3.4 – 100 │ 102 – 104 │ -│Level 2│ │ │ │ │ -└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ - ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ -┌───────┐ │ 1.2 – 1.3 │ 1.6 – 2 │ 3.4 – 12 │ 12.3 – 100 │ 102 – 104 │ -│Level 1│ │ │ │ │ │ │ -└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ - ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ -┌───────┐ │ 1.2 │ 1.3 │ 1.6 │ 2 │ 3.4 │ 12 │ 12.3 │ 100 │ 102 │ 104 │ -│Level 0│ │ │ │ │ │ │ │ │ │ │ │ -└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ - └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ -``` - -You can read more about this structure (for strings) in `[crate::search::facet::facet_strings]`. - -To create the levels, we use a recursive algorithm which makes sure that we only need to iterate -over the elements of level 0 once. It is implemented by [`recursive_compute_levels`]. - -## Encoding - -### Numbers -For numbers we use the same encoding for level 0 and the other levels. - -The key is given by `FacetLevelValueF64Codec`. It consists of: -1. The field id : u16 -2. The height of the level : u8 -3. The start bound : f64 -4. The end bound : f64 -Note that at level 0, we have start bound == end bound. - -The value is a serialised `RoaringBitmap`. - -### Strings - -For strings, we use a different encoding for level 0 and the other levels. - -At level 0, the key is given by `FacetStringLevelZeroCodec`. It consists of: -1. The field id : u16 -2. The height of the level : u8 <-- always == 0 -3. The normalised string value : &str - -And the value is given by `FacetStringLevelZeroValueCodec`. It consists of: -1. The original string -2. A serialised `RoaringBitmap` - -At level 1, the key is given by `FacetLevelValueU32Codec`. It consists of: -1. The field id : u16 -2. The height of the level : u8 <-- always >= 1 -3. The start bound : u32 -4. The end bound : u32 -where the bounds are indices inside level 0. - -The value is given by `FacetStringZeroBoundsValueCodec`. -If the level is 1, then it consists of: -1. The normalised string of the start bound -2. The normalised string of the end bound -3. A serialised `RoaringBitmap` - -If the level is higher, then it consists only of the serialised roaring bitmap. - -The distinction between the value encoding of level 1 and the levels above it -is to allow us to retrieve the value in level 0 quickly by reading the key of -level 1 (we obtain the string value of the bound and execute a prefix search -in the database). 
- -Therefore, for strings, the structure for a single faceted field looks more like this: -```text -┌───────┐ ┌───────────────────────────────────────────────────────────────────────────────┐ -│ all │ │ [a, b, c, d, e, f, g, u, y, z] │ -└───────┘ └───────────────────────────────────────────────────────────────────────────────┘ - - ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ -┌───────┐ │ 0 – 3 │ 4 – 7 │ 8 – 9 │ -│Level 2│ │ │ │ │ -└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ - ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ -┌───────┐ │ 0 – 1 │ 2 – 3 │ 4 – 5 │ 6 – 7 │ 8 – 9 │ -│Level 1│ │ "ab" – "ac" │ "ba" – "bac" │ "gaf" – "gal" │"form" – "wow" │ "woz" – "zz" │ -└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ - ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ -┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │ -│Level 0│ │ "AB" │ " Ac" │ "ba " │ "Bac" │ " GAF"│ "gal" │ "Form"│ " wow"│ "woz" │ "ZZ" │ -└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ - └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ - -The first line in a cell is its key (without the field id and level height) and the last two -lines are its values. -``` -*/ - use std::cmp; use std::fs::File; -use std::num::{NonZeroU8, NonZeroUsize}; -use std::ops::RangeFrom; +use std::num::NonZeroUsize; -use grenad::{CompressionType, Reader, Writer}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{BytesDecode, BytesEncode, Error}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn}; use log::debug; use roaring::RoaringBitmap; use time::OffsetDateTime; use crate::error::InternalError; -use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -use crate::heed_codec::facet::new::str_ref::StrRefCodec; use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; // use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; use crate::{FieldId, Index, Result}; -pub struct Facets<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, +pub struct Facets<'i> { index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, + level_group_size: usize, + min_level_size: usize, } -impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> { +impl<'i> Facets<'i> { + pub fn new( + index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, + ) -> Facets<'i> { Facets { - wtxn, index, + database, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - level_group_size: NonZeroUsize::new(4).unwrap(), - min_level_size: NonZeroUsize::new(5).unwrap(), + level_group_size: 4, + min_level_size: 5, } } @@ -170,413 +45,233 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { /// /// This setting is always greater than or equal to 2. 
pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); + self.level_group_size = cmp::max(value.get(), 2); self } /// The minimum number of elements that a level is allowed to have. pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.min_level_size = value; + self.min_level_size = value.get(); self } + fn clear_levels(&self, wtxn: &mut heed::RwTxn, field_id: FieldId) -> Result<()> { + let left = FacetKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; + let right = FacetKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; + let range = left..=right; + self.database.delete_range(wtxn, &range).map(drop)?; + Ok(()) + } + #[logging_timer::time("Facets::{}")] - pub fn execute(self) -> Result<()> { - self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; + pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { + self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // We get the faceted fields to be able to create the facet levels. - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - let mut nested_wtxn = self.index.env.nested_write_txn(self.wtxn).unwrap(); + for &field_id in faceted_fields.iter() { + self.clear_levels(wtxn, field_id)?; + } - for field_id in faceted_fields { - // Clear the facet string levels. - // clear_field_string_levels( - // &mut nested_wtxn, - // self.index.facet_id_string_docids.remap_types::(), - // field_id, - // )?; + let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?; - let (facet_string_levels, string_documents_ids) = compute_facet_strings_levels( - &mut nested_wtxn, - self.index.facet_id_string_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.level_group_size, - self.min_level_size, - field_id, - )?; + for &field_id in faceted_fields.iter() { + let (level_readers, all_docids) = + self.compute_levels_for_field_id(field_id, &nested_wtxn)?; - self.index.put_string_faceted_documents_ids( - &mut nested_wtxn, - field_id, - &string_documents_ids, - )?; - for facet_strings_level in facet_string_levels { + // TODO: this will need to be an argument to Facets as well + self.index.put_string_faceted_documents_ids(&mut nested_wtxn, field_id, &all_docids)?; + + for level_reader in level_readers { + // TODO: append instead of write with merge write_into_lmdb_database( &mut nested_wtxn, - *self.index.facet_id_string_docids.as_polymorph(), - facet_strings_level, + *self.database.as_polymorph(), + level_reader, |_, _| { Err(InternalError::IndexingMergingKeys { process: "facet string levels" })? }, )?; } - - // // Clear the facet number levels. 
- // clear_field_number_levels(&mut nested_wtxn, self.index.facet_id_f64_docids, field_id)?; - - let (facet_number_levels, number_documents_ids) = compute_facet_number_levels( - &mut nested_wtxn, - self.index.facet_id_f64_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.level_group_size, - self.min_level_size, - field_id, - )?; - - self.index.put_number_faceted_documents_ids( - &mut nested_wtxn, - field_id, - &number_documents_ids, - )?; - - for facet_number_level in facet_number_levels { - write_into_lmdb_database( - &mut nested_wtxn, - *self.index.facet_id_f64_docids.as_polymorph(), - facet_number_level, - |_, _| { - Err(InternalError::IndexingMergingKeys { process: "facet number levels" })? - }, - )?; - } } Ok(()) } -} -/// Compute the content of the database levels from its level 0 for the given field id. -/// -/// ## Returns: -/// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` -/// that must be inserted into the database. -/// 2. a roaring bitmap of all the document ids present in the database -fn compute_facet_number_levels<'t>( - rtxn: &'t mut heed::RwTxn, - db: heed::Database, FacetGroupValueCodec>, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, - field_id: FieldId, -) -> Result<(Vec>, RoaringBitmap)> { - let first_level_size = db - .remap_key_type::() - .prefix_iter(rtxn, &field_id.to_be_bytes())? - .remap_types::() - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - - let level_0_start = FacetKey { field_id, level: 0, left_bound: f64::MIN }; - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) - .collect::>(); - - let mut number_document_ids = RoaringBitmap::new(); - - if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = recursive_compute_levels::( - rtxn, - db, - compression_type, - compression_level, + fn compute_levels_for_field_id( + &self, + field_id: FieldId, + txn: &RoTxn, + ) -> Result<(Vec>, RoaringBitmap)> { + let algo = CreateFacetsAlgo { + rtxn: txn, + db: &self.database, field_id, - *top_level, - level_0_start, - &(level_0_start..), - first_level_size, - level_group_size, - &mut |bitmaps, _| { - for bitmap in bitmaps { - number_document_ids |= bitmap; - } - Ok(()) - }, - )?; + level_group_size: self.level_group_size, + min_level_size: self.min_level_size, + chunk_compression_type: self.chunk_compression_type, + chunk_compression_level: self.chunk_compression_level, + }; + // TODO: first check whether there is anything in level 0 - Ok((subwriters, number_document_ids)) - } else { - let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, group_value) = result?; - documents_ids |= group_value.bitmap; - } + let mut all_docids = RoaringBitmap::new(); + let subwriters = algo.compute_higher_levels(32, &mut |bitmaps, _| { + for bitmap in bitmaps { + all_docids |= bitmap; + } + Ok(()) + })?; + drop(algo); - Ok((vec![], documents_ids)) + Ok((subwriters, all_docids)) } } -/// Compute the content of the database levels from its level 0 for the given field id. -/// -/// ## Returns: -/// 1. 
a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` -/// that must be inserted into the database. -/// 2. a roaring bitmap of all the document ids present in the database -fn compute_facet_strings_levels<'t>( - rtxn: &'t mut heed::RwTxn, - db: heed::Database, FacetGroupValueCodec>, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, - field_id: FieldId, -) -> Result<(Vec>, RoaringBitmap)> { - let first_level_size = db - .remap_key_type::() - .prefix_iter(rtxn, &field_id.to_be_bytes())? - .remap_types::() - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - - let level_0_start = FacetKey { field_id, level: 0, left_bound: "" }; - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) - .collect::>(); - - let mut strings_document_ids = RoaringBitmap::new(); - - if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = recursive_compute_levels::( - rtxn, - db, - compression_type, - compression_level, - field_id, - *top_level, - level_0_start, - &(level_0_start..), - first_level_size, - level_group_size, - &mut |bitmaps, _| { - for bitmap in bitmaps { - strings_document_ids |= bitmap; - } - Ok(()) - }, - )?; - - Ok((subwriters, strings_document_ids)) - } else { - let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, group_value) = result?; - documents_ids |= group_value.bitmap; - } - - Ok((vec![], documents_ids)) - } +pub struct CreateFacetsAlgo<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, + field_id: u16, + level_group_size: usize, + min_level_size: usize, } - -/** -Compute a level from the levels below it, with the elements of level 0 already existing in the given `db`. - -This function is generic to work with both numbers and strings. The generic type parameters are: -* `KeyCodec`/`ValueCodec`: the codecs used to read the elements of the database. -* `Bound`: part of the range in the levels structure. For example, for numbers, the `Bound` is `f64` -because each chunk in a level contains a range such as (1.2 ..= 4.5). - -## Arguments -* `rtxn` : LMDB read transaction -* `db`: a database which already contains a `level 0` -* `compression_type`/`compression_level`: parameters used to create the `grenad::Writer` that -will contain the new levels -* `level` : the height of the level to create, or `0` to read elements from level 0. -* `level_0_start` : a key in the database that points to the beginning of its level 0 -* `level_0_range` : equivalent to `level_0_start..` -* `level_0_size` : the number of elements in level 0 -* `level_group_size` : the number of elements from the level below that are represented by a -single element of the new level -* `computed_group_bitmap` : a callback that is called whenever at most `level_group_size` elements -from the level below were read/created. Its arguments are: - 0. the list of bitmaps from each read/created element of the level below - 1. the start bound corresponding to the first element - 2. 
the end bound corresponding to the last element -* `bound_from_db_key` : finds the `Bound` from a key in the database -* `bitmap_from_db_value` : finds the `RoaringBitmap` from a value in the database -* `write_entry` : writes an element of a level into the writer. The arguments are: - 0. the writer - 1. the height of the level - 2. the start bound - 3. the end bound - 4. the docids of all elements between the start and end bound - -## Return -A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` -that must be inserted into the database. -*/ -fn recursive_compute_levels<'t, BoundCodec>( - rtxn: &'t mut heed::RwTxn, - db: heed::Database, FacetGroupValueCodec>, - compression_type: CompressionType, - compression_level: Option, - field_id: FieldId, - level: u8, - level_0_start: FacetKey<>::EItem>, - level_0_range: &'t RangeFrom>::EItem>>, - level_0_size: usize, - level_group_size: NonZeroUsize, - computed_group_bitmap: &mut dyn FnMut( - &[RoaringBitmap], - >::EItem, - ) -> Result<()>, -) -> Result>> -where - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, - for<'a> >::EItem: Copy + Sized, -{ - if level == 0 { - // base case for the recursion - +impl<'t> CreateFacetsAlgo<'t> { + fn read_level_0( + &self, + handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, + ) -> Result<()> { // we read the elements one by one and - // 1. keep track of the start and end bounds + // 1. keep track of the left bound // 2. fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read let mut bitmaps = vec![]; - let mut start_bound = level_0_start.left_bound; - // let mut end_bound = level_0_start.bound; + let mut level_0_prefix = vec![]; + level_0_prefix.extend_from_slice(&self.field_id.to_be_bytes()); + level_0_prefix.push(0); + let level_0_iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, level_0_prefix.as_slice())? + .remap_types::, FacetGroupValueCodec>(); + + let mut left_bound: &[u8] = &[]; let mut first_iteration_for_new_group = true; - for (i, db_result_item) in db.range(rtxn, level_0_range)?.take(level_0_size).enumerate() { - let (key, value) = db_result_item?; - + for el in level_0_iter { + let (key, value) = el?; let bound = key.left_bound; let docids = value.bitmap; if first_iteration_for_new_group { - start_bound = bound; + left_bound = bound; first_iteration_for_new_group = false; } - // end_bound = bound; bitmaps.push(docids); - if bitmaps.len() == level_group_size.get() { - computed_group_bitmap(&bitmaps, start_bound)?; + if bitmaps.len() == self.level_group_size { + handle_group(&bitmaps, left_bound); first_iteration_for_new_group = true; bitmaps.clear(); } } // don't forget to give the leftover bitmaps as well if !bitmaps.is_empty() { - computed_group_bitmap(&bitmaps, start_bound)?; + handle_group(&bitmaps, left_bound); bitmaps.clear(); } - // level 0 is already stored in the DB - return Ok(vec![]); - } else { + Ok(()) + } + + /// Compute the content of the database levels from its level 0 for the given field id. + /// + /// ## Returns: + /// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` + /// that must be inserted into the database. + /// 2. 
a roaring bitmap of all the document ids present in the database + fn compute_higher_levels( + &self, + level: u8, + handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, + ) -> Result>> { + if level == 0 { + self.read_level_0(handle_group); + // Level 0 is already in the database + return Ok(vec![]); + } // level >= 1 // we compute each element of this level based on the elements of the level below it - // once we have computed `level_group_size` elements, we give the start and end bounds + // once we have computed `level_group_size` elements, we give the left bound // of those elements, and their bitmaps, to the level above - let mut cur_writer = - create_writer(compression_type, compression_level, tempfile::tempfile()?); + let mut cur_writer = create_writer( + self.chunk_compression_type, + self.chunk_compression_level, + tempfile::tempfile()?, + ); + let mut cur_writer_len = 0; - let mut range_for_bitmaps = vec![]; + let mut group_sizes = vec![]; + let mut left_bounds = vec![]; let mut bitmaps = vec![]; // compute the levels below // in the callback, we fill `cur_writer` with the correct elements for this level - let mut sub_writers = recursive_compute_levels( - rtxn, - db, - compression_type, - compression_level, - field_id, - level - 1, - level_0_start, - level_0_range, - level_0_size, - level_group_size, - &mut |sub_bitmaps: &[RoaringBitmap], - start_range: >::EItem| { + let mut sub_writers = + self.compute_higher_levels(level - 1, &mut |sub_bitmaps, left_bound| { let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; } - range_for_bitmaps.push(start_range); + group_sizes.push(sub_bitmaps.len() as u8); + left_bounds.push(left_bound); bitmaps.push(combined_bitmap); - if bitmaps.len() == level_group_size.get() { - let start_bound = range_for_bitmaps.first().unwrap(); - computed_group_bitmap(&bitmaps, *start_bound)?; - for (bitmap, start_bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) - { - write_entry::( - &mut cur_writer, - field_id, - NonZeroU8::new(level).unwrap(), - start_bound, - bitmap, - )?; - } + if bitmaps.len() != self.level_group_size { + return Ok(()); + } + let left_bound = left_bounds.first().unwrap(); + handle_group(&bitmaps, left_bound)?; + + for ((bitmap, left_bound), group_size) in + bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) + { + let key = FacetKey { field_id: self.field_id, level, left_bound }; + let key = + FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + let value = FacetGroupValue { size: group_size, bitmap }; + let value = + FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; + cur_writer.insert(key, value)?; + cur_writer_len += 1; } Ok(()) - }, - )?; - + })?; // don't forget to insert the leftover elements into the writer as well - if !bitmaps.is_empty() { - let start_range = range_for_bitmaps.first().unwrap(); - let end_range = range_for_bitmaps.last().unwrap(); - computed_group_bitmap(&bitmaps, *start_range)?; - for (bitmap, bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - write_entry( - &mut cur_writer, - field_id, - NonZeroU8::new(level).unwrap(), - bound, - bitmap, - )?; + if !bitmaps.is_empty() && cur_writer_len >= self.level_group_size * self.min_level_size { + let left_bound = left_bounds.first().unwrap(); + handle_group(&bitmaps, left_bound)?; + for ((bitmap, left_bound), group_size) in + bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) + { + let key = FacetKey { field_id: 
self.field_id, level, left_bound }; + let key = + FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + let value = FacetGroupValue { size: group_size, bitmap }; + let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; + cur_writer.insert(key, value)?; + cur_writer_len += 1; } } - - sub_writers.push(writer_into_reader(cur_writer)?); + if cur_writer_len > self.level_group_size * self.min_level_size { + sub_writers.push(writer_into_reader(cur_writer)?); + } return Ok(sub_writers); } } -fn write_entry( - writer: &mut Writer, - field_id: FieldId, - level: NonZeroU8, - bound: >::EItem, - docids: RoaringBitmap, -) -> Result<()> -where - for<'a> BoundCodec: BytesEncode<'a>, - for<'a> >::EItem: Copy + Sized, -{ - todo!() - // let key = FacetKey { field_id, level: level.get(), left_bound: bound }; - // let key_bytes = FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; - // let value_bytes = - // FacetGroupValueCodec::bytes_encode(&FacetGroupValue { size: 4, bitmap: docids }) - // .ok_or(Error::Encoding)?; - // writer.insert(&key_bytes, &value_bytes)?; - // Ok(()) -} - #[cfg(test)] mod tests { use std::num::NonZeroUsize; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index f13ac13a8..5a9066eba 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -34,6 +34,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; +use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, @@ -431,16 +432,21 @@ where let mut databases_seen = MERGED_DATABASE_COUNT; // Run the facets update operation. 
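Before the `IndexDocuments` changes that follow, note that the facets update is now run once per facet database, with both databases remapped to a byte-slice keyed view so a single `Facets` implementation can serve numbers and strings alike. A minimal sketch of the shared key shape (the struct mirrors the `FacetKey` used throughout this patch; the field id and bound values are purely illustrative):

```rust
// The same key shape is used for both facet databases; only the type of the
// left bound (and the codec that serializes it) differs.
#[derive(Debug)]
struct FacetKey<T> {
    field_id: u16,
    level: u8,
    left_bound: T,
}

fn main() {
    // A number facet key and a string facet key for the same field and level.
    let number_key = FacetKey { field_id: 3, level: 0, left_bound: 32.0_f64 };
    let string_key = FacetKey { field_id: 3, level: 0, left_bound: "hello" };
    println!("{number_key:?} / {string_key:?}");
}
```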
- let mut builder = Facets::new(self.wtxn, self.index); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - if let Some(value) = self.config.facet_level_group_size { - builder.level_group_size(value); + for facet_db in [ + (&self.index.facet_id_string_docids).remap_key_type::>(), + (&self.index.facet_id_f64_docids).remap_key_type::>(), + ] { + let mut builder = Facets::new(self.index, facet_db); + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + if let Some(value) = self.config.facet_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.config.facet_min_level_size { + builder.min_level_size(value); + } + builder.execute(self.wtxn)?; } - if let Some(value) = self.config.facet_min_level_size { - builder.min_level_size(value); - } - builder.execute()?; databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { From 63ef0aba181387a76283edaab126e2210987d284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 30 Aug 2022 14:17:40 +0200 Subject: [PATCH 03/58] Start porting facet distribution and sort to new database structure --- .../search/facet/facet_distribution_iter.rs | 199 ++++++ milli/src/search/facet/facet_number.rs | 335 --------- .../src/search/facet/facet_sort_ascending.rs | 147 ++++ .../src/search/facet/facet_sort_descending.rs | 172 +++++ milli/src/search/facet/facet_string.rs | 649 ------------------ milli/src/search/facet/mod.rs | 74 +- milli/src/update/facets.rs | 8 +- 7 files changed, 594 insertions(+), 990 deletions(-) create mode 100644 milli/src/search/facet/facet_distribution_iter.rs delete mode 100644 milli/src/search/facet/facet_number.rs create mode 100644 milli/src/search/facet/facet_sort_ascending.rs create mode 100644 milli/src/search/facet/facet_sort_descending.rs delete mode 100644 milli/src/search/facet/facet_string.rs diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs new file mode 100644 index 000000000..2dfe3580f --- /dev/null +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -0,0 +1,199 @@ +use roaring::RoaringBitmap; +use std::ops::ControlFlow; + +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; + +use super::{get_first_facet_value, get_highest_level}; + +pub fn iterate_over_facet_distribution<'t, CB>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: &RoaringBitmap, + callback: CB, +) where + CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, +{ + let mut fd = FacetDistribution { rtxn, db, field_id, callback }; + let highest_level = + get_highest_level(rtxn, &db.remap_key_type::>(), field_id); + + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id) { + fd.iterate(candidates, highest_level, first_bound, usize::MAX); + return; + } else { + return; + } +} + +struct FacetDistribution<'t, CB> +where + CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, +{ + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + callback: CB, +} + +impl<'t, CB> FacetDistribution<'t, CB> +where + CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, +{ + fn iterate_level_0( + &mut self, + candidates: &RoaringBitmap, + starting_bound: &'t [u8], + group_size: usize, + ) -> 
ControlFlow<()> { + let starting_key = + FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound }; + let iter = self.db.range(self.rtxn, &(starting_key..)).unwrap().take(group_size); + for el in iter { + let (key, value) = el.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != self.field_id { + return ControlFlow::Break(()); + } + let docids_in_common = value.bitmap.intersection_len(candidates); + if docids_in_common > 0 { + match (self.callback)(key.left_bound, docids_in_common) { + ControlFlow::Continue(_) => {} + ControlFlow::Break(_) => return ControlFlow::Break(()), + } + } + } + return ControlFlow::Continue(()); + } + fn iterate( + &mut self, + candidates: &RoaringBitmap, + level: u8, + starting_bound: &'t [u8], + group_size: usize, + ) -> ControlFlow<()> { + if level == 0 { + return self.iterate_level_0(candidates, starting_bound, group_size); + } + let starting_key = FacetKey { field_id: self.field_id, level, left_bound: starting_bound }; + let iter = self.db.range(&self.rtxn, &(&starting_key..)).unwrap().take(group_size); + + for el in iter { + let (key, value) = el.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != self.field_id { + return ControlFlow::Break(()); + } + let docids_in_common = value.bitmap & candidates; + if docids_in_common.len() > 0 { + let cf = + self.iterate(&docids_in_common, level - 1, key.left_bound, value.size as usize); + match cf { + ControlFlow::Continue(_) => {} + ControlFlow::Break(_) => return ControlFlow::Break(()), + } + } + } + + return ControlFlow::Continue(()); + } +} + +#[cfg(test)] +mod tests { + use crate::{codec::U16Codec, Index}; + use heed::BytesDecode; + use roaring::RoaringBitmap; + use std::ops::ControlFlow; + + use super::iterate_over_facet_distribution; + + fn get_simple_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &i, &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = fastrand::Rng::with_seed(0); + let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as u16), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_distribution_all() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (0..=255).into_iter().collect::(); + let mut results = String::new(); + iterate_over_facet_distribution( + &txn, + &index.db.content, + 0, + &candidates, + |facet, count| { + let facet = U16Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {count}\n")); + ControlFlow::Continue(()) + }, + ); + 
insta::assert_snapshot!(format!("filter_distribution_{i}_all"), results); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_distribution_all_stop_early() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (0..=255).into_iter().collect::(); + let mut results = String::new(); + let mut nbr_facets = 0; + iterate_over_facet_distribution( + &txn, + &index.db.content, + 0, + &candidates, + |facet, count| { + let facet = U16Codec::bytes_decode(facet).unwrap(); + if nbr_facets == 100 { + return ControlFlow::Break(()); + } else { + nbr_facets += 1; + results.push_str(&format!("{facet}: {count}\n")); + + ControlFlow::Continue(()) + } + }, + ); + insta::assert_snapshot!(format!("filter_distribution_{i}_all_stop_early"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs deleted file mode 100644 index 5f7bd5325..000000000 --- a/milli/src/search/facet/facet_number.rs +++ /dev/null @@ -1,335 +0,0 @@ -// use std::ops::Bound::{self, Excluded, Included, Unbounded}; - -// use either::Either::{self, Left, Right}; -// use heed::types::{ByteSlice, DecodeIgnore}; -// use heed::{BytesDecode, BytesEncode, Database, Lazy, LazyDecode, RoRange, RoRevRange}; -// use obkv::Key; -// use roaring::RoaringBitmap; - -// use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -// use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; -// use crate::heed_codec::CboRoaringBitmapCodec; -// use crate::{FieldId, Index}; - -// pub struct FacetNumberRange<'t, 'e> { -// rtxn: &'t heed::RoTxn<'e>, -// db: Database, FacetGroupValueCodec>, -// iter: RoRange<'t, FacetKeyCodec, LazyDecode>, -// max_bound: f64, -// previous: Option<(FacetKey, Lazy<'t, FacetGroupValueCodec>)>, -// field_id: FieldId, -// end: Bound, -// } - -// impl<'t, 'e> FacetNumberRange<'t, 'e> { -// pub fn new( -// rtxn: &'t heed::RoTxn<'e>, -// db: Database, FacetGroupValueCodec>, -// field_id: FieldId, -// level: u8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let left_bound = match left { -// Included(left_bound) => Included(FacetKey { field_id, level, left_bound }), -// Excluded(left_bound) => Excluded(FacetKey { field_id, level, left_bound }), -// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), -// }; - -// let mut iter = db.lazily_decode_data().range(rtxn, &(left_bound, Unbounded))?; -// let mut previous = iter.next().transpose()?; - -// // Compute the maximum end bound by looking at the key of the last element in level 0 -// let mut prefix_level_0 = vec![]; -// prefix_level_0.extend_from_slice(&field_id.to_be_bytes()); -// prefix_level_0.push(level); - -// let mut rev_iter = -// db.as_polymorph().rev_prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, &prefix_level_0)?; - -// let rev_iter_first = rev_iter.next().transpose()?; -// let max_bound = if let Some((max_bound_key, _)) = rev_iter_first { -// let max_bound_key = -// FacetKeyCodec::::bytes_decode(max_bound_key).unwrap(); -// max_bound_key.left_bound -// } else { -// // I can't imagine when that would happen, but let's handle it correctly anyway -// // by making the iterator empty -// previous = None; -// 0.0 // doesn't matter since previous = None so the iterator will always early exit -// // and return None itself -// }; - -// Ok(FacetNumberRange { rtxn, db, iter, field_id, previous, max_bound, 
end: right }) -// } -// } - -// impl<'t, 'e> Iterator for FacetNumberRange<'t, 'e> { -// type Item = heed::Result<(FacetKey, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// // The idea here is to return the **previous** element only if the left -// // bound of the current key fits within the range given to the iter -// // if it doesn't, then there is still a chance that it must be returned, -// // but we need to check the actual right bound of the group by looking for -// // the key preceding the first key of the next group in level 0 - -// let (prev_key, prev_value) = self.previous?; - -// let (next_left_bound, next_previous) = if let Some(next) = self.iter.next() { -// let (key, group_value) = match next { -// Ok(n) => n, -// Err(e) => return Some(Err(e)), -// }; -// (key.left_bound, Some((key, group_value))) -// } else { -// // we're at the end of the level iter, so we need to fetch the max bound instead -// (self.max_bound, None) -// }; -// let must_be_returned = match self.end { -// Included(end) => next_left_bound <= end, -// Excluded(end) => next_left_bound < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match prev_value.decode() { -// Ok(group_value) => { -// self.previous = next_previous; -// Some(Ok((prev_key, group_value.bitmap))) -// } -// Err(e) => Some(Err(e)), -// } -// } else { -// // it still possible that we want to return the value (one last time) -// // but to do so, we need to fetch the right bound of the current group -// // this is done by getting the first element at level 0 of the next group -// // then iterating in reverse from it -// // once we have the right bound, we can compare it, and then return or not -// // then we still set self.previous to None so that no other element can return -// // from it? 
-// let mut level_0_key_prefix = vec![]; -// level_0_key_prefix.extend_from_slice(&self.field_id.to_be_bytes()); -// level_0_key_prefix.push(0); -// let key = -// FacetKey:: { field_id: self.field_id, level: 0, left_bound: next_left_bound }; -// let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); -// level_0_key_prefix.extend_from_slice(&key_bytes); - -// let mut rev_iter_next_group_level_0 = self -// .db -// .as_polymorph() -// .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&self.rtxn, &level_0_key_prefix) -// .unwrap(); -// let (key_for_right_bound, _) = rev_iter_next_group_level_0.next().unwrap().unwrap(); -// let key_for_right_bound = -// FacetKeyCodec::::bytes_decode(key_for_right_bound).unwrap(); -// let right_bound = key_for_right_bound.left_bound; -// let must_be_returned = match self.end { -// Included(end) => right_bound <= end, -// Excluded(end) => right_bound < end, -// Unbounded => unreachable!(), -// }; -// self.previous = None; -// if must_be_returned { -// match prev_value.decode() { -// Ok(group_value) => Some(Ok((prev_key, group_value.bitmap))), -// Err(e) => Some(Err(e)), -// } -// } else { -// None -// } -// } -// } -// } - -// pub struct FacetNumberRevRange<'t> { -// iter: RoRevRange<'t, FacetKeyCodec, LazyDecode>, -// end: Bound, -// } - -// impl<'t> FacetNumberRevRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, FacetGroupValueCodec>, -// field_id: FieldId, -// level: u8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let left_bound = match left { -// Included(left) => Included(FacetKey { field_id, level, left_bound: left }), -// Excluded(left) => Excluded(FacetKey { field_id, level, left_bound: left }), -// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), -// }; -// let right_bound = Included(FacetKey { field_id, level, left_bound: f64::MAX }); -// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; -// Ok(FacetNumberRevRange { iter, end: right }) -// } -// } - -// impl<'t> Iterator for FacetNumberRevRange<'t> { -// type Item = heed::Result<(FacetKey, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// loop { -// match self.iter.next() { -// Some(Ok((FacetKey { field_id, level, left_bound }, docids))) => { -// let must_be_returned = match self.end { -// Included(end) => todo!(), //right <= end, -// Excluded(end) => todo!(), //right < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match docids.decode() { -// Ok(docids) => { -// return Some(Ok(( -// FacetKey { field_id, level, left_bound }, -// docids.bitmap, -// ))) -// } -// Err(e) => return Some(Err(e)), -// } -// } -// continue; -// } -// Some(Err(e)) => return Some(Err(e)), -// None => return None, -// } -// } -// } -// } - -// pub struct FacetNumberIter<'t, 'e> { -// rtxn: &'t heed::RoTxn<'t>, -// db: Database, FacetGroupValueCodec>, -// field_id: FieldId, -// level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, -// must_reduce: bool, -// } - -// impl<'t, 'e> FacetNumberIter<'t, 'e> { -// /// Create a `FacetNumberIter` that will iterate on the different facet entries -// /// (facet value + documents ids) and that will reduce the given documents ids -// /// while iterating on the different facet levels. 
-// pub fn new_reducing( -// rtxn: &'t heed::RoTxn<'e>, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_f64_docids; -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// let highest_iter = -// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; -// let level_iters = vec![(documents_ids, Left(highest_iter))]; -// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) -// } - -// /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse -// /// (facet value + documents ids) and that will reduce the given documents ids -// /// while iterating on the different facet levels. -// pub fn new_reverse_reducing( -// rtxn: &'t heed::RoTxn<'e>, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_f64_docids; -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// let highest_iter = -// FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; -// let level_iters = vec![(documents_ids, Right(highest_iter))]; -// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) -// } - -// /// Create a `FacetNumberIter` that will iterate on the different facet entries -// /// (facet value + documents ids) and that will not reduce the given documents ids -// /// while iterating on the different facet levels, possibly returning multiple times -// /// a document id associated with multiple facet values. -// pub fn new_non_reducing( -// rtxn: &'t heed::RoTxn<'e>, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_f64_docids; -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// let highest_iter = -// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; -// let level_iters = vec![(documents_ids, Left(highest_iter))]; -// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) -// } - -// fn highest_level( -// rtxn: &'t heed::RoTxn, -// db: Database, X>, -// fid: FieldId, -// ) -> heed::Result> { -// let level = db -// .remap_types::() -// .prefix_iter(rtxn, &fid.to_be_bytes())? -// .remap_key_type::>() -// .last() -// .transpose()? -// .map(|(key, _)| key.level); -// Ok(level) -// } -// } - -// impl<'t, 'e> Iterator for FacetNumberIter<'t, 'e> { -// type Item = heed::Result<(f64, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// 'outer: loop { -// let (documents_ids, last) = self.level_iters.last_mut()?; -// let is_ascending = last.is_left(); -// for result in last { -// // If the last iterator must find an empty set of documents it means -// // that we found all the documents in the sub level iterations already, -// // we can pop this level iterator. 
-// if documents_ids.is_empty() { -// break; -// } - -// match result { -// Ok((key, mut docids)) => { -// docids &= &*documents_ids; -// if !docids.is_empty() { -// if self.must_reduce { -// *documents_ids -= &docids; -// } - -// if level == 0 { -// return Some(Ok((left, docids))); -// } - -// let rtxn = self.rtxn; -// let db = self.db; -// let fid = self.field_id; -// let left = Included(left); -// let right = Included(right); - -// let result = if is_ascending { -// FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) -// .map(Left) -// } else { -// FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) -// .map(Right) -// }; - -// match result { -// Ok(iter) => { -// self.level_iters.push((docids, iter)); -// continue 'outer; -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// Err(e) => return Some(Err(e)), -// } -// } -// self.level_iters.pop(); -// } -// } -// } diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs new file mode 100644 index 000000000..c9abd9556 --- /dev/null +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -0,0 +1,147 @@ +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; + +use super::{get_first_facet_value, get_highest_level}; + +pub fn ascending_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: RoaringBitmap, +) -> Box + 't> { + let highest_level = + get_highest_level(rtxn, &db.remap_key_type::>(), field_id); + if let Some(first_bound) = get_first_facet_value::( + rtxn, + &db.remap_key_type::>(), + field_id, + ) { + let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; + let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); + + Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] }) + } else { + return Box::new(std::iter::empty()); + } +} + +struct AscendingFacetSort<'t, 'e> { + rtxn: &'t heed::RoTxn<'e>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + stack: Vec<( + RoaringBitmap, + std::iter::Take, FacetGroupValueCodec>>, + )>, +} + +impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { + type Item = (&'t [u8], RoaringBitmap); + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, deepest_iter) = self.stack.last_mut()?; + for result in deepest_iter { + let ( + FacetKey { level, left_bound, field_id }, + FacetGroupValue { size: group_size, mut bitmap }, + ) = result.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if field_id != self.field_id { + return None; + } + + // If the last iterator found an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. 
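+                // (The pop itself happens through the `self.stack.pop()` call below,
+                // once this `for` loop has been exited.)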
+ if documents_ids.is_empty() { + break; + } + + bitmap &= &*documents_ids; + if !bitmap.is_empty() { + *documents_ids -= &bitmap; + + if level == 0 { + return Some((left_bound, bitmap)); + } + let starting_key_below = + FacetKey { field_id: self.field_id, level: level - 1, left_bound }; + let iter = self + .db + .range(&self.rtxn, &(starting_key_below..)) + .unwrap() + .take(group_size as usize); + + self.stack.push((bitmap, iter)); + continue 'outer; + } + } + self.stack.pop(); + } + } +} + +#[cfg(test)] +mod tests { + use crate::{ + ascending_facet_sort::ascending_facet_sort, codec::U16Codec, display_bitmap, Index, + }; + use heed::BytesDecode; + use roaring::RoaringBitmap; + + fn get_simple_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &i, &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = fastrand::Rng::with_seed(0); + let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as u16), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_sort() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, &index.db.content, 0, candidates); + for (facet, docids) in iter { + let facet = U16Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); + } + insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs new file mode 100644 index 000000000..d3c9d54f8 --- /dev/null +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -0,0 +1,172 @@ +use std::ops::Bound; + +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; + +use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; + +fn descending_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: RoaringBitmap, +) -> Box + 't> { + let highest_level = get_highest_level(rtxn, db, field_id); + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id) { + let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; + let last_bound = get_last_facet_value::(rtxn, db, field_id).unwrap(); + let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound }; + let iter = db.rev_range(rtxn, &(first_key..=last_key)).unwrap().take(usize::MAX); + Box::new(DescendingFacetSort { + rtxn, + db, + field_id, + stack: vec![(candidates, iter, Bound::Included(last_bound))], + }) + } else { + return Box::new(std::iter::empty()); + 
} +} + +struct DescendingFacetSort<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + stack: Vec<( + RoaringBitmap, + std::iter::Take, FacetGroupValueCodec>>, + Bound<&'t [u8]>, + )>, +} + +impl<'t> Iterator for DescendingFacetSort<'t> { + type Item = (&'t [u8], RoaringBitmap); + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?; + while let Some(result) = deepest_iter.next() { + let ( + FacetKey { level, left_bound, field_id }, + FacetGroupValue { size: group_size, mut bitmap }, + ) = result.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if field_id != self.field_id { + return None; + } + // If the last iterator found an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. + if documents_ids.is_empty() { + break; + } + + bitmap &= &*documents_ids; + if !bitmap.is_empty() { + *documents_ids -= &bitmap; + + if level == 0 { + return Some((left_bound, bitmap)); + } + let starting_key_below = FacetKey { field_id, level: level - 1, left_bound }; + + let end_key_kelow = match *right_bound { + Bound::Included(right) => Bound::Included(FacetKey { + field_id, + level: level - 1, + left_bound: right, + }), + Bound::Excluded(right) => Bound::Excluded(FacetKey { + field_id, + level: level - 1, + left_bound: right, + }), + Bound::Unbounded => Bound::Unbounded, + }; + let prev_right_bound = *right_bound; + *right_bound = Bound::Excluded(left_bound); + let iter = self + .db + .rev_range( + &self.rtxn, + &(Bound::Included(starting_key_below), end_key_kelow), + ) + .unwrap() + .take(group_size as usize); + + self.stack.push((bitmap, iter, prev_right_bound)); + continue 'outer; + } + *right_bound = Bound::Excluded(left_bound); + } + self.stack.pop(); + } + } +} + +#[cfg(test)] +mod tests { + use crate::{ + codec::{MyByteSlice, U16Codec}, + descending_facet_sort::descending_facet_sort, + display_bitmap, FacetKeyCodec, Index, + }; + use heed::BytesDecode; + use roaring::RoaringBitmap; + + fn get_simple_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &i, &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = fastrand::Rng::with_seed(0); + let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as u16), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_sort_descending() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let db = index.db.content.remap_key_type::>(); + let iter = descending_facet_sort(&txn, &db, 0, 
candidates); + for (facet, docids) in iter { + let facet = U16Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); + } + insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs deleted file mode 100644 index b01359503..000000000 --- a/milli/src/search/facet/facet_string.rs +++ /dev/null @@ -1,649 +0,0 @@ -// //! This module contains helpers iterators for facet strings. -// //! -// //! The purpose is to help iterate over the quite complex system of facets strings. A simple -// //! description of the system would be that every facet string value is stored into an LMDB database -// //! and that every value is associated with the document ids which are associated with this facet -// //! string value. -// //! -// //! In reality it is a little bit more complex as we have to create aggregations of runs of facet -// //! string values, those aggregations helps in choosing the right groups of facets to follow. -// //! -// //! ## A typical algorithm run -// //! -// //! If a group of aggregated facets values contains one of the documents ids, we must continue -// //! iterating over the sub-groups. -// //! -// //! If this group is the lowest level and contain at least one document id we yield the associated -// //! facet documents ids. -// //! -// //! If the group doesn't contain one of our documents ids, we continue to the next group at this -// //! same level. -// //! -// //! ## The complexity comes from the strings -// //! -// //! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create -// //! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the -// //! two numbers bounds, the left and the right bound of the group, both inclusive. -// //! -// //! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and -// //! puting two numbers big-endian encoded one after the other gives us ordered groups. The values -// //! are simple unions of the documents ids coming from the groups below. -// //! -// //! ### Example of what a facet number LMDB database contain -// //! -// //! | level | left-bound | right-bound | documents ids | -// //! |-------|------------|-------------|------------------| -// //! | 0 | 0 | _skipped_ | 1, 2 | -// //! | 0 | 1 | _skipped_ | 6, 7 | -// //! | 0 | 3 | _skipped_ | 4, 7 | -// //! | 0 | 5 | _skipped_ | 2, 3, 4 | -// //! | 1 | 0 | 1 | 1, 2, 6, 7 | -// //! | 1 | 3 | 5 | 2, 3, 4, 7 | -// //! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | -// //! -// //! As you can see the level 0 have two equal bounds, therefore we skip serializing the second -// //! bound, that's the base level where you can directly fetch the documents ids associated with an -// //! exact number. -// //! -// //! The next levels have two different bounds and the associated documents ids are simply the result -// //! of an union of all the documents ids associated with the aggregated groups above. -// //! -// //! ## The complexity of defining groups for facet strings -// //! -// //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in -// //! lexicographical order, it means that whatever the key represent the bytes are read in their raw -// //! form and a simple `strcmp` will define the order in which keys will be read from the store. -// //! -// //! 
That's easy for types with a known size, like floats or integers, they are 64 bytes long and -// //! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the -// //! first number then by the second if the the first number is equal on two keys. -// //! -// //! For strings it is a lot more complex as those types are unsized, it means that the size of facet -// //! strings is different for each facet value. -// //! -// //! ### Basic approach: padding the keys -// //! -// //! A first approach would be to simply define the maximum size of a facet string and pad the keys -// //! with zeroes. The big problem of this approach is that it: -// //! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the -// //! other. -// //! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB -// //! performances. -// //! -// //! ### Better approach: number the facet groups -// //! -// //! A better approach would be to number the groups, this way we don't have the downsides of the -// //! previously described approach but we need to be able to describe the groups by using a number. -// //! -// //! #### Example of facet strings with numbered groups -// //! -// //! | level | left-bound | right-bound | left-string | right-string | documents ids | -// //! |-------|------------|-------------|-------------|--------------|------------------| -// //! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | -// //! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | -// //! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | -// //! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | -// //! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | -// //! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | -// //! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | -// //! -// //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not -// //! need to store the facet string value two times. -// //! -// //! The number in the left-bound and right-bound columns are incremental numbers representing the -// //! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering -// //! of the LMDB keys. -// //! -// //! In the value, not in the key, you can see that we added two new values: the left-string and the -// //! right-string, which defines the original facet strings associated with the given group. -// //! -// //! We put those two strings inside of the value, this way we do not limit the maximum size of the -// //! facet string values, and the impact on performances is not important as, IIRC, LMDB put big -// //! values on another page, this helps in iterating over keys fast enough and only fetch the page -// //! with the values when required. -// //! -// //! The other little advantage with this solution is that there is no a big overhead, compared with -// //! the facet number levels, we only duplicate the facet strings once for the level 1. -// //! -// //! #### A typical algorithm run -// //! -// //! Note that the algorithm is always moving from the highest level to the lowest one, one level -// //! by one level, this is why it is ok to only store the facets string on the level 1. -// //! -// //! If a group of aggregated facets values, a group with numbers contains one of the documents ids, -// //! we must continue iterating over the sub-groups. To do so: -// //! 
- If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds -// //! and iterate over the facet groups defined by these numbers over the current level - 1. -// //! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the -// //! value and just do the same as with the facet numbers but with strings: iterate over the -// //! current level - 1 with both keys. -// //! -// //! If this group is the lowest level (level 0) and contain at least one document id we yield the -// //! associated facet documents ids. -// //! -// //! If the group doesn't contain one of our documents ids, we continue to the next group at this -// //! same level. -// //! - -// use std::num::NonZeroU8; -// use std::ops::Bound; -// use std::ops::Bound::{Excluded, Included, Unbounded}; - -// use either::{Either, Left, Right}; -// use heed::types::{ByteSlice, DecodeIgnore}; -// use heed::{Database, LazyDecode, RoRange, RoRevRange}; -// use roaring::RoaringBitmap; - -// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; -// use crate::heed_codec::CboRoaringBitmapCodec; -// use crate::{FieldId, Index}; - -// /// An iterator that is used to explore the facets level strings -// /// from the level 1 to infinity. -// /// -// /// It yields the level, group id that an entry covers, the optional group strings -// /// that it covers of the level 0 only if it is an entry from the level 1 and -// /// the roaring bitmap associated. -// pub struct FacetStringGroupRange<'t> { -// iter: RoRange< -// 't, -// FacetLevelValueU32Codec, -// LazyDecode>, -// >, -// end: Bound, -// } - -// impl<'t> FacetStringGroupRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// level: NonZeroU8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let db = db.remap_types::< -// FacetLevelValueU32Codec, -// FacetStringZeroBoundsValueCodec, -// >(); -// let left_bound = match left { -// Included(left) => Included((field_id, level, left, u32::MIN)), -// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), -// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), -// }; -// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); -// let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; -// Ok(FacetStringGroupRange { iter, end: right }) -// } -// } - -// impl<'t> Iterator for FacetStringGroupRange<'t> { -// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - -// fn next(&mut self) -> Option { -// match self.iter.next() { -// Some(Ok(((_fid, level, left, right), docids))) => { -// let must_be_returned = match self.end { -// Included(end) => right <= end, -// Excluded(end) => right < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match docids.decode() { -// Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), -// Err(e) => Some(Err(e)), -// } -// } else { -// None -// } -// } -// Some(Err(e)) => Some(Err(e)), -// None => None, -// } -// } -// } - -// pub struct FacetStringGroupRevRange<'t> { -// iter: RoRevRange< -// 't, -// FacetLevelValueU32Codec, -// LazyDecode>, -// >, -// end: Bound, -// } - -// impl<'t> FacetStringGroupRevRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// level: NonZeroU8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let db = db.remap_types::< -// FacetLevelValueU32Codec, -// FacetStringZeroBoundsValueCodec, 
-// >(); -// let left_bound = match left { -// Included(left) => Included((field_id, level, left, u32::MIN)), -// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), -// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), -// }; -// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); -// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; -// Ok(FacetStringGroupRevRange { iter, end: right }) -// } -// } - -// impl<'t> Iterator for FacetStringGroupRevRange<'t> { -// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - -// fn next(&mut self) -> Option { -// loop { -// match self.iter.next() { -// Some(Ok(((_fid, level, left, right), docids))) => { -// let must_be_returned = match self.end { -// Included(end) => right <= end, -// Excluded(end) => right < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match docids.decode() { -// Ok((bounds, docids)) => { -// return Some(Ok(((level, left, right), (bounds, docids)))) -// } -// Err(e) => return Some(Err(e)), -// } -// } -// continue; -// } -// Some(Err(e)) => return Some(Err(e)), -// None => return None, -// } -// } -// } -// } - -// /// An iterator that is used to explore the level 0 of the facets string database. -// /// -// /// It yields the facet string and the roaring bitmap associated with it. -// pub struct FacetStringLevelZeroRange<'t> { -// iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -// } - -// impl<'t> FacetStringLevelZeroRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// left: Bound<&str>, -// right: Bound<&str>, -// ) -> heed::Result> { -// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { -// buffer.extend_from_slice(&field_id.to_be_bytes()); -// buffer.push(0); -// buffer.extend_from_slice(value.as_bytes()); -// &buffer[..] -// } - -// let mut left_buffer = Vec::new(); -// let left_bound = match left { -// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), -// Unbounded => { -// left_buffer.extend_from_slice(&field_id.to_be_bytes()); -// left_buffer.push(0); -// Included(&left_buffer[..]) -// } -// }; - -// let mut right_buffer = Vec::new(); -// let right_bound = match right { -// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), -// Unbounded => { -// right_buffer.extend_from_slice(&field_id.to_be_bytes()); -// right_buffer.push(1); // we must only get the level 0 -// Excluded(&right_buffer[..]) -// } -// }; - -// let iter = db -// .remap_key_type::() -// .range(rtxn, &(left_bound, right_bound))? 
-// .remap_types::(); - -// Ok(FacetStringLevelZeroRange { iter }) -// } -// } - -// impl<'t> Iterator for FacetStringLevelZeroRange<'t> { -// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// match self.iter.next() { -// Some(Ok(((_fid, normalized), (original, docids)))) => { -// Some(Ok((normalized, original, docids))) -// } -// Some(Err(e)) => Some(Err(e)), -// None => None, -// } -// } -// } - -// pub struct FacetStringLevelZeroRevRange<'t> { -// iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -// } - -// impl<'t> FacetStringLevelZeroRevRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// left: Bound<&str>, -// right: Bound<&str>, -// ) -> heed::Result> { -// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { -// buffer.extend_from_slice(&field_id.to_be_bytes()); -// buffer.push(0); -// buffer.extend_from_slice(value.as_bytes()); -// &buffer[..] -// } - -// let mut left_buffer = Vec::new(); -// let left_bound = match left { -// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), -// Unbounded => { -// left_buffer.extend_from_slice(&field_id.to_be_bytes()); -// left_buffer.push(0); -// Included(&left_buffer[..]) -// } -// }; - -// let mut right_buffer = Vec::new(); -// let right_bound = match right { -// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), -// Unbounded => { -// right_buffer.extend_from_slice(&field_id.to_be_bytes()); -// right_buffer.push(1); // we must only get the level 0 -// Excluded(&right_buffer[..]) -// } -// }; - -// let iter = db -// .remap_key_type::() -// .rev_range(rtxn, &(left_bound, right_bound))? -// .remap_types::(); - -// Ok(FacetStringLevelZeroRevRange { iter }) -// } -// } - -// impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { -// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// match self.iter.next() { -// Some(Ok(((_fid, normalized), (original, docids)))) => { -// Some(Ok((normalized, original, docids))) -// } -// Some(Err(e)) => Some(Err(e)), -// None => None, -// } -// } -// } - -// type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; -// type EitherStringRevRange<'t> = -// Either, FacetStringLevelZeroRevRange<'t>>; - -// /// An iterator that is used to explore the facet strings level by level, -// /// it will only return facets strings that are associated with the -// /// candidates documents ids given. 
-// pub struct FacetStringIter<'t> { -// rtxn: &'t heed::RoTxn<'t>, -// db: Database, -// field_id: FieldId, -// level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, -// must_reduce: bool, -// } - -// impl<'t> FacetStringIter<'t> { -// pub fn new_reducing( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_string_docids.remap_types::(); -// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; -// Ok(FacetStringIter { -// rtxn, -// db, -// field_id, -// level_iters: vec![(documents_ids, Left(highest_iter))], -// must_reduce: true, -// }) -// } - -// pub fn new_reverse_reducing( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_string_docids.remap_types::(); -// let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; -// Ok(FacetStringIter { -// rtxn, -// db, -// field_id, -// level_iters: vec![(documents_ids, Right(highest_reverse_iter))], -// must_reduce: true, -// }) -// } - -// pub fn new_non_reducing( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_string_docids.remap_types::(); -// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; -// Ok(FacetStringIter { -// rtxn, -// db, -// field_id, -// level_iters: vec![(documents_ids, Left(highest_iter))], -// must_reduce: false, -// }) -// } - -// fn highest_level( -// rtxn: &'t heed::RoTxn, -// db: Database, -// fid: FieldId, -// ) -> heed::Result> { -// Ok(db -// .remap_types::() -// .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits -// .last() -// .transpose()? 
-// .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit -// } - -// fn highest_iter( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// db: Database, -// field_id: FieldId, -// ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// match NonZeroU8::new(highest_level) { -// Some(highest_level) => FacetStringGroupRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// highest_level, -// Unbounded, -// Unbounded, -// ) -// .map(Left), -// None => FacetStringLevelZeroRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// Unbounded, -// Unbounded, -// ) -// .map(Right), -// } -// } - -// fn highest_reverse_iter( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// db: Database, -// field_id: FieldId, -// ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// match NonZeroU8::new(highest_level) { -// Some(highest_level) => FacetStringGroupRevRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// highest_level, -// Unbounded, -// Unbounded, -// ) -// .map(Left), -// None => FacetStringLevelZeroRevRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// Unbounded, -// Unbounded, -// ) -// .map(Right), -// } -// } -// } - -// impl<'t> Iterator for FacetStringIter<'t> { -// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// 'outer: loop { -// let (documents_ids, last) = self.level_iters.last_mut()?; -// let is_ascending = last.is_left(); - -// // We remap the different iterator types to make -// // the algorithm less complex to understand. -// let last = match last { -// Left(ascending) => match ascending { -// Left(group) => Left(Left(group)), -// Right(zero_level) => Right(Left(zero_level)), -// }, -// Right(descending) => match descending { -// Left(group) => Left(Right(group)), -// Right(zero_level) => Right(Right(zero_level)), -// }, -// }; - -// match last { -// Left(group) => { -// for result in group { -// match result { -// Ok(((level, left, right), (string_bounds, mut docids))) => { -// docids &= &*documents_ids; -// if !docids.is_empty() { -// if self.must_reduce { -// *documents_ids -= &docids; -// } - -// let result = if is_ascending { -// match string_bounds { -// Some((left, right)) => FacetStringLevelZeroRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// Included(left), -// Included(right), -// ) -// .map(Right), -// None => FacetStringGroupRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// NonZeroU8::new(level.get() - 1).unwrap(), -// Included(left), -// Included(right), -// ) -// .map(Left), -// } -// .map(Left) -// } else { -// match string_bounds { -// Some((left, right)) => { -// FacetStringLevelZeroRevRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// Included(left), -// Included(right), -// ) -// .map(Right) -// } -// None => FacetStringGroupRevRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// NonZeroU8::new(level.get() - 1).unwrap(), -// Included(left), -// Included(right), -// ) -// .map(Left), -// } -// .map(Right) -// }; - -// match result { -// Ok(iter) => { -// self.level_iters.push((docids, iter)); -// continue 'outer; -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// Right(zero_level) => { -// // level zero only -// for result in zero_level { -// match 
result { -// Ok((normalized, original, mut docids)) => { -// docids &= &*documents_ids; -// if !docids.is_empty() { -// if self.must_reduce { -// *documents_ids -= &docids; -// } -// return Some(Ok((normalized, original, docids))); -// } -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// } - -// self.level_iters.pop(); -// } -// } -// } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 13b00d2de..ceedff1e0 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,9 +1,79 @@ +use heed::types::ByteSlice; +use heed::{BytesDecode, RoTxn}; + +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; + pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; // pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; // pub use self::facet_string::FacetStringIter; pub use self::filter::Filter; mod facet_distribution; -mod facet_number; -mod facet_string; +mod facet_distribution_iter; +mod facet_sort_ascending; +mod facet_sort_descending; mod filter; + +fn get_first_facet_value<'t, BoundCodec>( + txn: &'t RoTxn, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> Option +where + BoundCodec: BytesDecode<'t>, +{ + let mut level0prefix = vec![]; + level0prefix.extend_from_slice(&field_id.to_be_bytes()); + level0prefix.push(0); + let mut level0_iter_forward = db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) + .unwrap(); + if let Some(first) = level0_iter_forward.next() { + let (first_key, _) = first.unwrap(); + let first_key = FacetKeyCodec::::bytes_decode(first_key).unwrap(); + Some(first_key.left_bound) + } else { + None + } +} +fn get_last_facet_value<'t, BoundCodec>( + txn: &'t RoTxn, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> Option +where + BoundCodec: BytesDecode<'t>, +{ + let mut level0prefix = vec![]; + level0prefix.extend_from_slice(&field_id.to_be_bytes()); + level0prefix.push(0); + let mut level0_iter_backward = db + .as_polymorph() + .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) + .unwrap(); + if let Some(last) = level0_iter_backward.next() { + let (last_key, _) = last.unwrap(); + let last_key = FacetKeyCodec::::bytes_decode(last_key).unwrap(); + Some(last_key.left_bound) + } else { + None + } +} +fn get_highest_level<'t>( + txn: &'t RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> u8 { + let field_id_prefix = &field_id.to_be_bytes(); + db.as_polymorph() + .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix) + .unwrap() + .next() + .map(|el| { + let (key, _) = el.unwrap(); + let key = FacetKeyCodec::::bytes_decode(key).unwrap(); + key.level + }) + .unwrap_or(0) +} diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index aaaa445da..fe8c2855e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -64,7 +64,7 @@ impl<'i> Facets<'i> { } #[logging_timer::time("Facets::{}")] - pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // We get the faceted fields to be able to create the facet levels. 
let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); @@ -172,14 +172,14 @@ impl<'t> CreateFacetsAlgo<'t> { bitmaps.push(docids); if bitmaps.len() == self.level_group_size { - handle_group(&bitmaps, left_bound); + handle_group(&bitmaps, left_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); } } // don't forget to give the leftover bitmaps as well if !bitmaps.is_empty() { - handle_group(&bitmaps, left_bound); + handle_group(&bitmaps, left_bound)?; bitmaps.clear(); } Ok(()) @@ -197,7 +197,7 @@ impl<'t> CreateFacetsAlgo<'t> { handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result>> { if level == 0 { - self.read_level_0(handle_group); + self.read_level_0(handle_group)?; // Level 0 is already in the database return Ok(vec![]); } From b8a1caad5e8d9a55ba7c7807805a4ee2fbb6b980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 30 Aug 2022 15:22:39 +0200 Subject: [PATCH 04/58] Add range search and incremental indexing algorithm --- milli/Cargo.toml | 2 +- .../search/facet/facet_distribution_iter.rs | 70 +-- milli/src/search/facet/facet_range_search.rs | 451 +++++++++++++++++ .../src/search/facet/facet_sort_ascending.rs | 56 ++- .../src/search/facet/facet_sort_descending.rs | 73 +-- milli/src/search/facet/filter.rs | 1 - milli/src/search/facet/incremental_update.rs | 459 ++++++++++++++++++ milli/src/search/facet/mod.rs | 148 +++++- 8 files changed, 1145 insertions(+), 115 deletions(-) create mode 100644 milli/src/search/facet/facet_range_search.rs create mode 100644 milli/src/search/facet/incremental_update.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 835425714..658ef0d24 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -54,7 +54,7 @@ big_s = "1.0.2" insta = "1.21.0" maplit = "1.0.2" md5 = "0.7.0" -rand = "0.8.5" +rand = {version = "0.8.5", features = ["small_rng"] } [features] default = [ "charabia/default" ] diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 2dfe3580f..83079028c 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,8 +1,8 @@ +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; +use crate::Result; use roaring::RoaringBitmap; use std::ops::ControlFlow; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; - use super::{get_first_facet_value, get_highest_level}; pub fn iterate_over_facet_distribution<'t, CB>( @@ -11,18 +11,19 @@ pub fn iterate_over_facet_distribution<'t, CB>( field_id: u16, candidates: &RoaringBitmap, callback: CB, -) where +) -> Result<()> +where CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, { let mut fd = FacetDistribution { rtxn, db, field_id, callback }; let highest_level = - get_highest_level(rtxn, &db.remap_key_type::>(), field_id); + get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id) { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ fd.iterate(candidates, highest_level, first_bound, usize::MAX); - return; + return Ok(()); } else { - return; + return Ok(()); } } @@ -45,26 +46,26 @@ where candidates: &RoaringBitmap, starting_bound: &'t [u8], group_size: usize, - ) -> ControlFlow<()> { + ) -> Result> { let starting_key = FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound }; - let iter = self.db.range(self.rtxn, &(starting_key..)).unwrap().take(group_size); + let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size); for el in iter { - let (key, value) = el.unwrap(); + let (key, value) = el?; // The range is unbounded on the right and the group size for the highest level is MAX, // so we need to check that we are not iterating over the next field id if key.field_id != self.field_id { - return ControlFlow::Break(()); + return Ok(ControlFlow::Break(())); } let docids_in_common = value.bitmap.intersection_len(candidates); if docids_in_common > 0 { match (self.callback)(key.left_bound, docids_in_common) { ControlFlow::Continue(_) => {} - ControlFlow::Break(_) => return ControlFlow::Break(()), + ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } } } - return ControlFlow::Continue(()); + return Ok(ControlFlow::Continue(())); } fn iterate( &mut self, @@ -72,7 +73,7 @@ where level: u8, starting_bound: &'t [u8], group_size: usize, - ) -> ControlFlow<()> { + ) -> Result> { if level == 0 { return self.iterate_level_0(candidates, starting_bound, group_size); } @@ -84,34 +85,42 @@ where // The range is unbounded on the right and the group size for the highest level is MAX, // so we need to check that we are not iterating over the next field id if key.field_id != self.field_id { - return ControlFlow::Break(()); + return Ok(ControlFlow::Break(())); } let docids_in_common = value.bitmap & candidates; if docids_in_common.len() > 0 { - let cf = - self.iterate(&docids_in_common, level - 1, key.left_bound, value.size as usize); + let cf = self.iterate( + &docids_in_common, + level - 1, + key.left_bound, + value.size as usize, + )?; match cf { ControlFlow::Continue(_) => {} - ControlFlow::Break(_) => return ControlFlow::Break(()), + ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } } } - return ControlFlow::Continue(()); + return Ok(ControlFlow::Continue(())); } } #[cfg(test)] mod tests { - use crate::{codec::U16Codec, Index}; use heed::BytesDecode; + use rand::{rngs::SmallRng, Rng, SeedableRng}; use roaring::RoaringBitmap; use std::ops::ControlFlow; + use crate::{ + heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::test::FacetIndex, + }; + use super::iterate_over_facet_distribution; - fn get_simple_index() -> Index { - let index = Index::::new(4, 8); + fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -121,18 +130,19 @@ mod tests { txn.commit().unwrap(); index } - fn get_random_looking_index() -> Index { - let index = Index::::new(4, 8); + fn get_random_looking_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = fastrand::Rng::with_seed(0); - let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); 
bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as u16), &bitmap); + bitmap.insert(key + 100.); + index.insert(&mut txn, 0, &(key as f64), &bitmap); } txn.commit().unwrap(); index @@ -156,7 +166,7 @@ mod tests { 0, &candidates, |facet, count| { - let facet = U16Codec::bytes_decode(facet).unwrap(); + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); results.push_str(&format!("{facet}: {count}\n")); ControlFlow::Continue(()) }, @@ -180,7 +190,7 @@ mod tests { 0, &candidates, |facet, count| { - let facet = U16Codec::bytes_decode(facet).unwrap(); + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); if nbr_facets == 100 { return ControlFlow::Break(()); } else { diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs new file mode 100644 index 000000000..c01346b25 --- /dev/null +++ b/milli/src/search/facet/facet_range_search.rs @@ -0,0 +1,451 @@ +use heed::BytesEncode; +use roaring::RoaringBitmap; +use std::ops::Bound; +use std::ops::RangeBounds; + +use crate::heed_codec::facet::new::FacetGroupValueCodec; +use crate::heed_codec::facet::new::FacetKey; +use crate::heed_codec::facet::new::FacetKeyCodec; +use crate::heed_codec::facet::new::MyByteSlice; +use crate::Result; + +use super::get_first_facet_value; +use super::get_highest_level; +use super::get_last_facet_value; + +pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + left: &'t Bound<>::EItem>, + right: &'t Bound<>::EItem>, +) -> Result +where + BoundCodec: for<'a> BytesEncode<'a>, + for<'a> >::EItem: Sized, +{ + let inner; + let left = match left { + Bound::Included(left) => { + inner = BoundCodec::bytes_encode(left).unwrap(); + Bound::Included(inner.as_ref()) + } + Bound::Excluded(left) => { + inner = BoundCodec::bytes_encode(left).unwrap(); + Bound::Excluded(inner.as_ref()) + } + Bound::Unbounded => Bound::Unbounded, + }; + let inner; + let right = match right { + Bound::Included(right) => { + inner = BoundCodec::bytes_encode(right).unwrap(); + Bound::Included(inner.as_ref()) + } + Bound::Excluded(right) => { + inner = BoundCodec::bytes_encode(right).unwrap(); + Bound::Excluded(inner.as_ref()) + } + Bound::Unbounded => Bound::Unbounded, + }; + + let mut docids = RoaringBitmap::new(); + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; + let highest_level = get_highest_level(rtxn, db, field_id)?; + + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; + Ok(docids) + } else { + return Ok(RoaringBitmap::new()); + } +} + +/// Fetch the document ids that have a facet with a value between the two given bounds +struct FacetRangeSearch<'t, 'b, 'bitmap> { + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + left: Bound<&'b [u8]>, + right: Bound<&'b [u8]>, + docids: &'bitmap mut RoaringBitmap, +} +impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { + fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { + let left_key = + FacetKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; + let iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); + for el in iter { + let (key, value) = el?; + // the right side of the iter range is unbounded, so we need to make sure that we are not iterating + // on the next field id + if key.field_id != self.field_id { + return Ok(()); + } + let should_skip = { + match self.left { + Bound::Included(left) => left > key.left_bound, + Bound::Excluded(left) => left >= key.left_bound, + Bound::Unbounded => false, + } + }; + if should_skip { + continue; + } + let should_stop = { + match self.right { + Bound::Included(right) => right < key.left_bound, + Bound::Excluded(right) => right <= key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + break; + } + + if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { + *self.docids |= value.bitmap; + } + } + Ok(()) + } + + /// Recursive part of the algorithm for level > 0 + fn run( + &mut self, + level: u8, + starting_left_bound: &'t [u8], + rightmost_bound: Bound<&'t [u8]>, + group_size: usize, + ) -> Result<()> { + if level == 0 { + return self.run_level_0(starting_left_bound, group_size); + } + + let left_key = FacetKey { field_id: self.field_id, level, left_bound: starting_left_bound }; + let mut iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); + + let (mut previous_key, mut previous_value) = iter.next().unwrap()?; + for el in iter { + let (next_key, next_value) = el?; + // the right of the iter range is unbounded, so we need to make sure that we are not iterating + // on the next field id + if next_key.field_id != self.field_id { + return Ok(()); + } + // now, do we skip, stop, or visit? + let should_skip = { + match self.left { + Bound::Included(left) => left >= next_key.left_bound, + Bound::Excluded(left) => left >= next_key.left_bound, // TODO: use > instead? + Bound::Unbounded => false, + } + }; + if should_skip { + previous_key = next_key; + previous_value = next_value; + continue; + } + + // should we stop? + let should_stop = { + match self.right { + Bound::Included(right) => right < previous_key.left_bound, + Bound::Excluded(right) => right <= previous_key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + return Ok(()); + } + // should we take the whole thing, without recursing down? 
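+            // The group rooted at `previous_key` covers the facet values in
+            // `[previous_key.left_bound, next_key.left_bound)`. It can be added to the
+            // results as a whole only when that interval is fully contained in the
+            // query range; otherwise we recurse into the level below. This is also why
+            // both `Included` and `Excluded` right bounds are compared with `<=` below:
+            // the covered interval excludes `next_key.left_bound` itself.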
+ let should_take_whole_group = { + let left_condition = match self.left { + Bound::Included(left) => previous_key.left_bound >= left, + Bound::Excluded(left) => previous_key.left_bound > left, + Bound::Unbounded => true, + }; + let right_condition = match self.right { + Bound::Included(right) => next_key.left_bound <= right, + Bound::Excluded(right) => next_key.left_bound <= right, + Bound::Unbounded => true, + }; + left_condition && right_condition + }; + if should_take_whole_group { + *self.docids |= &previous_value.bitmap; + previous_key = next_key; + previous_value = next_value; + continue; + } + + let level = level - 1; + let starting_left_bound = previous_key.left_bound; + let rightmost_bound = Bound::Excluded(next_key.left_bound); + let group_size = previous_value.size as usize; + + self.run(level, starting_left_bound, rightmost_bound, group_size)?; + + previous_key = next_key; + previous_value = next_value; + } + // previous_key/previous_value are the last element + + // now, do we skip, stop, or visit? + let should_skip = { + match (self.left, rightmost_bound) { + (Bound::Included(left), Bound::Included(right)) => left > right, + (Bound::Included(left), Bound::Excluded(right)) => left >= right, + (Bound::Excluded(left), Bound::Included(right) | Bound::Excluded(right)) => { + left >= right + } + (Bound::Unbounded, _) => false, + (_, Bound::Unbounded) => false, // should never run? + } + }; + if should_skip { + return Ok(()); + } + + // should we stop? + let should_stop = { + match self.right { + Bound::Included(right) => right <= previous_key.left_bound, + Bound::Excluded(right) => right < previous_key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + return Ok(()); + } + // should we take the whole thing, without recursing down? + let should_take_whole_group = { + let left_condition = match self.left { + Bound::Included(left) => previous_key.left_bound >= left, + Bound::Excluded(left) => previous_key.left_bound > left, + Bound::Unbounded => true, + }; + let right_condition = match (self.right, rightmost_bound) { + (Bound::Included(right), Bound::Included(rightmost)) => rightmost <= right, + (Bound::Included(right), Bound::Excluded(rightmost)) => rightmost < right, + // e.g. x < 8 and rightmost is <= y + // condition met if rightmost < 8 + (Bound::Excluded(right), Bound::Included(rightmost)) => rightmost < right, + // e.g. x < 8 and rightmost is < y + // condition met only if y <= 8? + (Bound::Excluded(right), Bound::Excluded(rightmost)) => rightmost <= right, + // e.g. x < inf. , so yes we take the whole thing + (Bound::Unbounded, _) => true, + // e.g. x < 7 , righmost is inf + (_, Bound::Unbounded) => false, // panic? 
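+                    // Note: an `Unbounded` rightmost bound is not expected here: the initial
+                    // call passes `Included(last_bound)` and the recursive calls always pass
+                    // `Excluded(..)` or forward the bound they received.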
+ }; + left_condition && right_condition + }; + if should_take_whole_group { + *self.docids |= &previous_value.bitmap; + } else { + let level = level - 1; + let starting_left_bound = previous_key.left_bound; + let group_size = previous_value.size as usize; + + self.run(level, starting_left_bound, rightmost_bound, group_size)?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, + search::facet::test::FacetIndex, snapshot_tests::display_bitmap, + }; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + use std::ops::Bound; + + use super::find_docids_of_facet_within_bounds; + + fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_range_increasing() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Included(0.); + let end = Bound::Included(i); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + insta::assert_snapshot!( + format!("filter_range_{i}_increasing_included_bounds"), + results + ); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Excluded(0.); + let end = Bound::Excluded(i); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + insta::assert_snapshot!( + format!("filter_range_{i}_increasing_excluded_bounds"), + results + ); + txn.commit().unwrap(); + } + } + #[test] + fn filter_range_decreasing() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + + let mut results = String::new(); + + for i in (0..=255).into_iter().rev() { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Included(255.); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + insta::assert_snapshot!( + format!("filter_range_{i}_decreasing_included_bounds"), + results + ); + + let mut results = String::new(); + + for i in (0..=255).into_iter().rev() { + let i = i as f64; + let start = Bound::Excluded(i); + let end = 
Bound::Excluded(255.); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + insta::assert_snapshot!( + format!("filter_range_{i}_decreasing_excluded_bounds"), + results + ); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_range_pinch() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + + let mut results = String::new(); + + for i in (0..=128).into_iter().rev() { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Included(255. - i); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + insta::assert_snapshot!(format!("filter_range_{i}_pinch_included_bounds"), results); + + let mut results = String::new(); + + for i in (0..=128).into_iter().rev() { + let i = i as f64; + let start = Bound::Excluded(i); + let end = Bound::Excluded(255. - i); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + insta::assert_snapshot!(format!("filter_range_{i}_pinch_excluded_bounds"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index c9abd9556..73491d4ae 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -1,8 +1,8 @@ -use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; +use crate::Result; +use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; @@ -11,20 +11,20 @@ pub fn ascending_facet_sort<'t>( db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, -) -> Box + 't> { +) -> Result> + 't>> { let highest_level = - get_highest_level(rtxn, &db.remap_key_type::>(), field_id); + get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; if let Some(first_bound) = get_first_facet_value::( rtxn, &db.remap_key_type::>(), field_id, - ) { + )? 
{ let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); - Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] }) + Ok(Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] })) } else { - return Box::new(std::iter::empty()); + Ok(Box::new(std::iter::empty())) } } @@ -39,7 +39,7 @@ struct AscendingFacetSort<'t, 'e> { } impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { - type Item = (&'t [u8], RoaringBitmap); + type Item = Result<(&'t [u8], RoaringBitmap)>; fn next(&mut self) -> Option { 'outer: loop { @@ -67,15 +67,15 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { *documents_ids -= &bitmap; if level == 0 { - return Some((left_bound, bitmap)); + return Some(Ok((left_bound, bitmap))); } let starting_key_below = FacetKey { field_id: self.field_id, level: level - 1, left_bound }; - let iter = self - .db - .range(&self.rtxn, &(starting_key_below..)) - .unwrap() - .take(group_size as usize); + let iter = match self.db.range(&self.rtxn, &(starting_key_below..)) { + Ok(iter) => iter, + Err(e) => return Some(Err(e.into())), + } + .take(group_size as usize); self.stack.push((bitmap, iter)); continue 'outer; @@ -88,14 +88,19 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { - use crate::{ - ascending_facet_sort::ascending_facet_sort, codec::U16Codec, display_bitmap, Index, - }; use heed::BytesDecode; + use rand::Rng; + use rand::SeedableRng; use roaring::RoaringBitmap; - fn get_simple_index() -> Index { - let index = Index::::new(4, 8); + use crate::{ + heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, + search::facet::{facet_sort_ascending::ascending_facet_sort, test::FacetIndex}, + snapshot_tests::display_bitmap, + }; + + fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -105,18 +110,19 @@ mod tests { txn.commit().unwrap(); index } - fn get_random_looking_index() -> Index { - let index = Index::::new(4, 8); + fn get_random_looking_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = fastrand::Rng::with_seed(0); - let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as u16), &bitmap); + index.insert(&mut txn, 0, &(key as f64), &bitmap); } txn.commit().unwrap(); index @@ -136,7 +142,7 @@ mod tests { let mut results = String::new(); let iter = ascending_facet_sort(&txn, &index.db.content, 0, candidates); for (facet, docids) in iter { - let facet = U16Codec::bytes_decode(facet).unwrap(); + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); } insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index d3c9d54f8..81b0eb09d 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -1,10 +1,10 @@ use 
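With the sort entry points now returning a Result and the iterators yielding Result items, callers can no longer destructure (facet, docids) directly; every item has to be unwrapped or propagated. A small generic sketch of the consuming pattern, with placeholder key, value and error types (std only, not taken from the patch):

/// Collects a fallible iterator of (key, value) items, stopping at the first
/// error, which is what callers of the new sort iterators must do now that
/// each item is a `Result`.
fn drain_sorted<K, V, E, I>(iter: I) -> Result<Vec<(K, V)>, E>
where
    I: Iterator<Item = Result<(K, V), E>>,
{
    let mut out = Vec::new();
    for item in iter {
        let (key, value) = item?; // propagate the first storage error
        out.push((key, value));
    }
    Ok(out)
}

fn main() {
    let ok: Vec<Result<(u32, &str), String>> = vec![Ok((1, "a")), Ok((2, "b"))];
    assert_eq!(drain_sorted(ok.into_iter()).unwrap().len(), 2);

    let failing: Vec<Result<(u32, &str), String>> =
        vec![Ok((1, "a")), Err("lmdb read error".to_string())];
    assert!(drain_sorted(failing.into_iter()).is_err());
    println!("ok");
}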
std::ops::Bound; -use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; +use crate::Result; +use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; @@ -13,21 +13,21 @@ fn descending_facet_sort<'t>( db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, -) -> Box + 't> { - let highest_level = get_highest_level(rtxn, db, field_id); - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id) { +) -> Result> + 't>> { + let highest_level = get_highest_level(rtxn, db, field_id)?; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id).unwrap(); + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound }; - let iter = db.rev_range(rtxn, &(first_key..=last_key)).unwrap().take(usize::MAX); - Box::new(DescendingFacetSort { + let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); + Ok(Box::new(DescendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter, Bound::Included(last_bound))], - }) + })) } else { - return Box::new(std::iter::empty()); + Ok(Box::new(std::iter::empty())) } } @@ -43,7 +43,7 @@ struct DescendingFacetSort<'t> { } impl<'t> Iterator for DescendingFacetSort<'t> { - type Item = (&'t [u8], RoaringBitmap); + type Item = Result<(&'t [u8], RoaringBitmap)>; fn next(&mut self) -> Option { 'outer: loop { @@ -70,7 +70,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { *documents_ids -= &bitmap; if level == 0 { - return Some((left_bound, bitmap)); + return Some(Ok((left_bound, bitmap))); } let starting_key_below = FacetKey { field_id, level: level - 1, left_bound }; @@ -89,14 +89,14 @@ impl<'t> Iterator for DescendingFacetSort<'t> { }; let prev_right_bound = *right_bound; *right_bound = Bound::Excluded(left_bound); - let iter = self - .db - .rev_range( - &self.rtxn, - &(Bound::Included(starting_key_below), end_key_kelow), - ) - .unwrap() - .take(group_size as usize); + let iter = match self.db.rev_range( + &self.rtxn, + &(Bound::Included(starting_key_below), end_key_kelow), + ) { + Ok(iter) => iter, + Err(e) => return Some(Err(e.into())), + } + .take(group_size as usize); self.stack.push((bitmap, iter, prev_right_bound)); continue 'outer; @@ -110,16 +110,20 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { - use crate::{ - codec::{MyByteSlice, U16Codec}, - descending_facet_sort::descending_facet_sort, - display_bitmap, FacetKeyCodec, Index, - }; + use heed::BytesDecode; + use rand::Rng; + use rand::SeedableRng; use roaring::RoaringBitmap; - fn get_simple_index() -> Index { - let index = Index::::new(4, 8); + use crate::{ + heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec, MyByteSlice}, + search::facet::{facet_sort_descending::descending_facet_sort, test::FacetIndex}, + snapshot_tests::display_bitmap, + }; + + fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -129,18 +133,19 @@ mod tests { txn.commit().unwrap(); index } - fn get_random_looking_index() -> Index { - let index = Index::::new(4, 8); + fn get_random_looking_index() -> 
FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = fastrand::Rng::with_seed(0); - let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as u16), &bitmap); + bitmap.insert(key + 100.); + index.insert(&mut txn, 0, &(key as f64), &bitmap); } txn.commit().unwrap(); index @@ -161,7 +166,7 @@ mod tests { let db = index.db.content.remap_key_type::>(); let iter = descending_facet_sort(&txn, &db, 0, candidates); for (facet, docids) in iter { - let facet = U16Codec::bytes_decode(facet).unwrap(); + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); } insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index e911dfb15..dd34abe6d 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -7,7 +7,6 @@ use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; use heed::LazyDecode; -use log::debug; use roaring::RoaringBitmap; // use super::FacetNumberRange; diff --git a/milli/src/search/facet/incremental_update.rs b/milli/src/search/facet/incremental_update.rs new file mode 100644 index 000000000..a437efb2d --- /dev/null +++ b/milli/src/search/facet/incremental_update.rs @@ -0,0 +1,459 @@ +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; +use crate::Result; +use heed::Error; +use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; +use roaring::RoaringBitmap; + +use super::get_highest_level; + +enum InsertionResult { + InPlace, + Insert, +} +enum DeletionResult { + InPlace, + Reduce { prev: Option>, next: Option> }, + Remove { prev: Option>, next: Option> }, +} + +struct IncrementalFacetUpdate<'i> { + db: &'i heed::Database, FacetGroupValueCodec>, + group_size: usize, + min_level_size: usize, + max_group_size: usize, +} +impl<'i> IncrementalFacetUpdate<'i> { + fn find_insertion_key_value<'a>( + &self, + field_id: u16, + level: u8, + search_key: &[u8], + txn: &RoTxn, + ) -> Result<(FacetKey>, FacetGroupValue)> { + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(level); + prefix.extend_from_slice(search_key); + + let mut prefix_iter = self + .db + .as_polymorph() + .prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>(txn, &prefix.as_slice())?; + if let Some(e) = prefix_iter.next() { + let (key_bytes, value) = e?; + let key = FacetKeyCodec::::bytes_decode(&key_bytes) + .ok_or(heed::Error::Encoding)?; + Ok(( + FacetKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)? + .into_owned(), + value, + )) + } else { + let key = FacetKey { field_id, level, left_bound: search_key }; + match self.db.get_lower_than(txn, &key)? 
{ + Some((key, value)) => { + if key.level != level || key.field_id != field_id { + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(level); + + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>( + txn, + &prefix.as_slice(), + )?; + let (key_bytes, value) = iter.next().unwrap()?; + Ok(( + FacetKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)? + .into_owned(), + value, + )) + } else { + Ok((key.into_owned(), value)) + } + } + None => panic!(), + } + } + } + + fn insert_in_level_0<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + new_key: &[u8], + new_values: &RoaringBitmap, + ) -> Result { + let key = FacetKey { field_id, level: 0, left_bound: new_key }; + let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 }; + + let mut level0_prefix = vec![]; + level0_prefix.extend_from_slice(&field_id.to_be_bytes()); + level0_prefix.push(0); + + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level0_prefix)?; + + if iter.next().is_none() { + drop(iter); + self.db.put(txn, &key, &value)?; + return Ok(InsertionResult::Insert); + } else { + drop(iter); + let old_value = self.db.get(&txn, &key)?; + match old_value { + Some(mut updated_value) => { + // now merge the two + updated_value.bitmap |= value.bitmap; + self.db.put(txn, &key, &updated_value)?; + Ok(InsertionResult::InPlace) + } + None => { + self.db.put(txn, &key, &value)?; + Ok(InsertionResult::Insert) + } + } + } + } + fn insert_in_level<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + level: u8, + new_key: &[u8], + new_values: &RoaringBitmap, + ) -> Result { + if level == 0 { + return self.insert_in_level_0(txn, field_id, new_key, new_values); + } + + let max_group_size = self.max_group_size; + + let (insertion_key, insertion_value) = + self.find_insertion_key_value(field_id, level, new_key, txn)?; + + let result = self.insert_in_level(txn, field_id, level - 1, new_key.clone(), new_values)?; + // level below inserted an element + + let insertion_key = { + let mut new_insertion_key = insertion_key.clone(); + let mut modified = false; + + if new_key < insertion_key.left_bound.as_slice() { + new_insertion_key.left_bound = new_key.to_vec(); + modified = true; + } + if modified { + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; + } + new_insertion_key + }; + + match result { + // TODO: this could go above the block recomputing insertion key + // because we know that if we inserted in place, the key is not a new one + // thus it doesn't extend a group + InsertionResult::InPlace => { + let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); + updated_value.bitmap |= new_values; + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; + + return Ok(InsertionResult::InPlace); + } + InsertionResult::Insert => {} + } + let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); + + updated_value.size += 1; + if updated_value.size as usize == max_group_size { + // need to split it + // recompute left element and right element + // replace current group by left element + // add one more group to the right + + let size_left = max_group_size / 2; + let size_right = max_group_size - size_left; + + let level_below = level - 1; + + let (start_key, _) = self + .db + .get_greater_than_or_equal_to( + &txn, + &FacetKey { + field_id, + 
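insert_in_level_0 is a plain upsert: if the bound already exists its bitmap is merged in place, otherwise a new entry is created, and only the latter case ("Insert") can force the levels above to grow. A toy version with a BTreeMap and u64 masks standing in for the LMDB database and RoaringBitmaps (illustrative only):

use std::collections::BTreeMap;

#[derive(Debug, PartialEq)]
enum Insertion {
    InPlace, // the bound already existed, bitmaps were merged
    Insert,  // a brand new level-0 entry was created
}

/// Level-0 upsert: either merge the new docids into the existing entry for
/// this bound, or create the entry.
fn insert_level0(level0: &mut BTreeMap<u64, u64>, bound: u64, docids: u64) -> Insertion {
    match level0.get_mut(&bound) {
        Some(existing) => {
            *existing |= docids;
            Insertion::InPlace
        }
        None => {
            level0.insert(bound, docids);
            Insertion::Insert
        }
    }
}

fn main() {
    let mut level0 = BTreeMap::new();
    assert_eq!(insert_level0(&mut level0, 42, 0b01), Insertion::Insert);
    assert_eq!(insert_level0(&mut level0, 42, 0b10), Insertion::InPlace);
    assert_eq!(level0[&42], 0b11);
    println!("ok");
}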
level: level_below, + left_bound: insertion_key.left_bound.as_slice(), + }, + )? + .unwrap(); + + let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size); + + let group_left = { + let mut values_left = RoaringBitmap::new(); + + let mut i = 0; + while let Some(next) = iter.next() { + let (_key, value) = next?; + i += 1; + values_left |= &value.bitmap; + if i == size_left { + break; + } + } + + let key = + FacetKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; + let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; + (key, value) + }; + + let group_right = { + let mut values_right = RoaringBitmap::new(); + let mut right_start_key = None; + + while let Some(next) = iter.next() { + let (key, value) = next?; + if right_start_key.is_none() { + right_start_key = Some(key.left_bound); + } + values_right |= &value.bitmap; + } + + let key = + FacetKey { field_id, level, left_bound: right_start_key.unwrap().to_vec() }; + let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; + (key, value) + }; + drop(iter); + + let _ = self.db.delete(txn, &insertion_key.as_ref())?; + + self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; + self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; + + Ok(InsertionResult::Insert) + } else { + let mut value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); + value.bitmap |= new_values; + value.size += 1; + self.db.put(txn, &insertion_key.as_ref(), &value).unwrap(); + + Ok(InsertionResult::InPlace) + } + } + + pub fn insert<'a, 't>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + new_key: &[u8], + new_values: &RoaringBitmap, + ) -> Result<()> { + if new_values.is_empty() { + return Ok(()); + } + let group_size = self.group_size; + + let highest_level = get_highest_level(&txn, &self.db, field_id)?; + + let result = + self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?; + match result { + InsertionResult::InPlace => return Ok(()), + InsertionResult::Insert => {} + } + + let mut highest_level_prefix = vec![]; + highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); + highest_level_prefix.push(highest_level); + + let size_highest_level = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? 
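When a group reaches max_group_size, it is replaced by two half-sized groups whose bounds and bitmaps are recomputed from the children on the level below: the left half keeps the old left bound, the right half starts at its first child's bound. A toy version of that bookkeeping over an in-memory slice, with u64 masks standing in for RoaringBitmaps (the real code streams the children out of LMDB):

/// Splits an oversized group's children (already sorted by bound) into a left
/// half of `max / 2` entries and a right half with the rest, returning
/// (left_bound, merged_bits, len) for each new group.
fn split_group(children: &[(u64, u64)], max: usize) -> ((u64, u64, usize), (u64, u64, usize)) {
    assert!(children.len() == max && max >= 2);
    let size_left = max / 2;
    let (left, right) = children.split_at(size_left);
    let merge = |part: &[(u64, u64)]| {
        let bound = part[0].0; // the group inherits its first child's bound
        let bits = part.iter().fold(0u64, |acc, (_, b)| acc | b);
        (bound, bits, part.len())
    };
    (merge(left), merge(right))
}

fn main() {
    let children: Vec<(u64, u64)> = (0..8).map(|i| (i * 10, 1 << i)).collect();
    let (l, r) = split_group(&children, 8);
    assert_eq!((l.0, l.2), (0, 4));   // left group keeps bound 0 and 4 children
    assert_eq!((r.0, r.2), (40, 4));  // right group starts at its first child's bound
    assert_eq!(l.1 | r.1, 0b1111_1111); // no documents are lost in the split
    println!("ok");
}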
+ .count(); + + if size_highest_level < self.min_level_size { + return Ok(()); + } + + let mut groups_iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &highest_level_prefix)?; + + let mut to_add = vec![]; + for _ in 0..group_size { + let mut first_key = None; + let mut values = RoaringBitmap::new(); + for _ in 0..group_size { + let (key_bytes, value_i) = groups_iter.next().unwrap()?; + let key_i = FacetKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)?; + + if first_key.is_none() { + first_key = Some(key_i); + } + values |= value_i.bitmap; + } + let key = FacetKey { + field_id, + level: highest_level + 1, + left_bound: first_key.unwrap().left_bound, + }; + let value = FacetGroupValue { size: group_size as u8, bitmap: values }; + to_add.push((key.into_owned(), value)); + } + drop(groups_iter); + for (key, value) in to_add { + self.db.put(txn, &key.as_ref(), &value)?; + } + Ok(()) + } + + fn delete_in_level<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + level: u8, + key: &[u8], + value: u32, + ) -> Result { + if level == 0 { + return self.delete_in_level_0(txn, field_id, key, value); + } + let (deletion_key, mut bitmap) = + self.find_insertion_key_value(field_id, level, key, txn)?; + + let result = self.delete_in_level(txn, field_id, level - 1, key.clone(), value)?; + + let mut decrease_size = false; + let (prev_key, next_key) = match result { + DeletionResult::InPlace => { + bitmap.bitmap.remove(value); + self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; + return Ok(DeletionResult::InPlace); + } + DeletionResult::Reduce { prev, next } => (prev, next), + DeletionResult::Remove { prev, next } => { + decrease_size = true; + (prev, next) + } + }; + + let mut updated_value = bitmap; + if decrease_size { + updated_value.size -= 1; + } + + if updated_value.size == 0 { + self.db.delete(txn, &deletion_key.as_ref())?; + Ok(DeletionResult::Remove { prev: prev_key, next: next_key }) + } else { + let mut updated_deletion_key = deletion_key.clone(); + if key == deletion_key.left_bound { + updated_deletion_key.left_bound = next_key.clone().unwrap(); + } + updated_value.bitmap.remove(value); + let _ = self.db.delete(txn, &deletion_key.as_ref())?; + self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; + + Ok(DeletionResult::Reduce { prev: prev_key, next: next_key }) + } + } + + fn delete_in_level_0<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + key: &[u8], + value: u32, + ) -> Result { + let key = FacetKey { field_id, level: 0, left_bound: key }; + let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; + bitmap.remove(value); + + if bitmap.is_empty() { + let mut prev_key = None; + let mut next_key = None; + + if let Some(prev) = self.db.get_lower_than(&txn, &key)? { + prev_key = Some(prev.0.left_bound.to_vec()); + } + if let Some(next) = self.db.get_greater_than(&txn, &key)? 
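Once the insertion has bubbled up and the highest level has grown large enough, a new level is created on top by merging every group_size consecutive entries of the current top level into one parent, keyed by its first child's bound. A toy version of that grouping (std only, u64 masks instead of RoaringBitmaps):

/// Builds one new level on top of an existing one by grouping every
/// `group_size` consecutive entries into a parent whose bound is the first
/// child's bound and whose bitmap is the union of its children's.
fn build_parent_level(level: &[(u64, u64)], group_size: usize) -> Vec<(u64, u64, usize)> {
    level
        .chunks(group_size)
        .map(|chunk| {
            let bound = chunk[0].0;
            let bits = chunk.iter().fold(0u64, |acc, (_, b)| acc | b);
            (bound, bits, chunk.len())
        })
        .collect()
}

fn main() {
    let level0: Vec<(u64, u64)> = (0..16).map(|i| (i, 1 << i)).collect();
    let level1 = build_parent_level(&level0, 4);
    assert_eq!(level1.len(), 4);
    assert_eq!(level1[1], (4, 0b1111_0000, 4)); // parent of children 4..8
    println!("ok");
}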
{ + if next.0.level == 0 { + next_key = Some(next.0.left_bound.to_vec()); + } + } + self.db.delete(txn, &key)?; + Ok(DeletionResult::Remove { prev: prev_key, next: next_key }) + } else { + self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?; + Ok(DeletionResult::InPlace) + } + } + + pub fn delete<'a, 't>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + key: &[u8], + value: u32, + ) -> Result<()> { + if self.db.get(txn, &FacetKey { field_id, level: 0, left_bound: key })?.is_none() { + return Ok(()); + } + let highest_level = get_highest_level(&txn, &self.db, field_id)?; + + // let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + + let result = self.delete_in_level(txn, field_id, highest_level as u8, key, value)?; + match result { + DeletionResult::InPlace => return Ok(()), + DeletionResult::Reduce { .. } => {} + DeletionResult::Remove { .. } => {} + } + let mut highest_level_prefix = vec![]; + highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); + highest_level_prefix.push(highest_level); + + if highest_level == 0 + || self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? + .count() + >= self.group_size + { + return Ok(()); + } + let mut to_delete = vec![]; + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)?; + while let Some(el) = iter.next() { + let (k, _) = el?; + to_delete.push( + FacetKeyCodec::::bytes_decode(k).ok_or(Error::Encoding)?.into_owned(), + ); + } + drop(iter); + for k in to_delete { + self.db.delete(txn, &k.as_ref())?; + } + Ok(()) + } +} diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ceedff1e0..d27206af2 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -10,38 +10,39 @@ pub use self::filter::Filter; mod facet_distribution; mod facet_distribution_iter; +mod facet_range_search; mod facet_sort_ascending; mod facet_sort_descending; mod filter; +mod incremental_update; -fn get_first_facet_value<'t, BoundCodec>( +pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> Option +) -> crate::Result> where BoundCodec: BytesDecode<'t>, { let mut level0prefix = vec![]; level0prefix.extend_from_slice(&field_id.to_be_bytes()); level0prefix.push(0); - let mut level0_iter_forward = db - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) - .unwrap(); + let mut level0_iter_forward = + db.as_polymorph().prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; if let Some(first) = level0_iter_forward.next() { - let (first_key, _) = first.unwrap(); - let first_key = FacetKeyCodec::::bytes_decode(first_key).unwrap(); - Some(first_key.left_bound) + let (first_key, _) = first?; + let first_key = + FacetKeyCodec::::bytes_decode(first_key).ok_or(heed::Error::Encoding)?; + Ok(Some(first_key.left_bound)) } else { - None + Ok(None) } } -fn get_last_facet_value<'t, BoundCodec>( +pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> Option +) -> crate::Result> where BoundCodec: BytesDecode<'t>, { @@ -50,30 +51,129 @@ where level0prefix.push(0); let mut level0_iter_backward = db .as_polymorph() - .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) - .unwrap(); + .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; if let Some(last) = 
level0_iter_backward.next() { - let (last_key, _) = last.unwrap(); - let last_key = FacetKeyCodec::::bytes_decode(last_key).unwrap(); - Some(last_key.left_bound) + let (last_key, _) = last?; + let last_key = + FacetKeyCodec::::bytes_decode(last_key).ok_or(heed::Error::Encoding)?; + Ok(Some(last_key.left_bound)) } else { - None + Ok(None) } } -fn get_highest_level<'t>( +pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> u8 { +) -> crate::Result { let field_id_prefix = &field_id.to_be_bytes(); - db.as_polymorph() - .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix) - .unwrap() + Ok(db + .as_polymorph() + .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix)? .next() .map(|el| { let (key, _) = el.unwrap(); let key = FacetKeyCodec::::bytes_decode(key).unwrap(); key.level }) - .unwrap_or(0) + .unwrap_or(0)) +} + +#[cfg(test)] +mod test { + use std::{fmt::Display, marker::PhantomData, rc::Rc}; + + use heed::{BytesDecode, BytesEncode, Env}; + use tempfile::TempDir; + + use crate::{ + heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, + }, + snapshot_tests::display_bitmap, + }; + + pub struct FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + pub env: Env, + pub db: Database, + _phantom: PhantomData, + } + + pub struct Database { + pub content: heed::Database, FacetGroupValueCodec>, + pub group_size: usize, + pub max_group_size: usize, + _tempdir: Rc, + } + + impl FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + pub fn open_from_tempdir( + tempdir: Rc, + group_size: u8, + max_group_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)) as usize; + let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 10 * 100); + unsafe { + options.flag(heed::flags::Flags::MdbAlwaysFreePages); + } + let env = options.open(tempdir.path()).unwrap(); + let content = env.open_database(None).unwrap().unwrap(); + + FacetIndex { + db: Database { content, group_size, max_group_size, _tempdir: tempdir }, + env, + _phantom: PhantomData, + } + } + pub fn new(group_size: u8, max_group_size: u8) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)) as usize; + let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 100); + let tempdir = tempfile::TempDir::new_in("databases/").unwrap(); + let env = options.open(tempdir.path()).unwrap(); + let content = env.create_database(None).unwrap(); + + FacetIndex { + db: Database { content, group_size, max_group_size, _tempdir: Rc::new(tempdir) }, + env, + _phantom: PhantomData, + } + } + } + + impl Display for FacetIndex + where + for<'a> >::EItem: Sized + Display, + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let txn = self.env.read_txn().unwrap(); + let mut iter = self.db.content.iter(&txn).unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let FacetKey { field_id, level, left_bound: bound } = key; + let bound = BoundCodec::bytes_decode(bound).unwrap(); + let FacetGroupValue { size, bitmap } = 
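get_highest_level relies purely on the key layout: keys are laid out as [field_id BE | level | left_bound...], so within one field every level-0 key sorts before every level-1 key, and the last key under the field's prefix carries the highest level in its third byte. A std-only sketch of that reverse-prefix lookup, using a BTreeMap in place of LMDB and one-byte bounds for brevity (illustrative only):

use std::collections::BTreeMap;

fn key(field_id: u16, level: u8, bound: u8) -> Vec<u8> {
    let mut k = field_id.to_be_bytes().to_vec();
    k.push(level);
    k.push(bound);
    k
}

/// Reverse prefix scan: the last key under [field_id] has the highest level
/// byte (ignores the field_id == u16::MAX edge case for brevity).
fn highest_level(db: &BTreeMap<Vec<u8>, u64>, field_id: u16) -> u8 {
    let start = field_id.to_be_bytes().to_vec();
    let end = (field_id + 1).to_be_bytes().to_vec();
    db.range(start..end).next_back().map(|(key, _)| key[2]).unwrap_or(0)
}

fn main() {
    let mut db = BTreeMap::new();
    for bound in 0..8u8 {
        db.insert(key(5, 0, bound), 1); // level-0 entries for field 5
    }
    db.insert(key(5, 1, 0), 1); // one level-1 group on top
    db.insert(key(6, 0, 0), 1); // another field, must not interfere
    assert_eq!(highest_level(&db, 5), 1);
    assert_eq!(highest_level(&db, 6), 0);
    println!("ok");
}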
value; + writeln!( + f, + "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}", + values = display_bitmap(&bitmap) + )?; + } + Ok(()) + } + } } From 5a904cf29d77ccb7bbeda88373f9017f9c0e388c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 07:50:18 +0200 Subject: [PATCH 05/58] Reintroduce facet distribution functionality --- milli/src/search/facet/facet_distribution.rs | 132 +++++++++--------- .../search/facet/facet_distribution_iter.rs | 4 +- milli/src/search/facet/mod.rs | 6 +- 3 files changed, 72 insertions(+), 70 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index fddf93d4b..670719a9b 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,13 +1,18 @@ use std::collections::{BTreeMap, HashSet}; -use std::ops::Bound::Unbounded; +use std::ops::ControlFlow; use std::{fmt, mem}; use heed::types::ByteSlice; +use heed::BytesDecode; use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +use crate::search::facet::facet_distribution_iter; // use crate::search::facet::FacetStringIter; use crate::{FieldId, Index, Result}; @@ -131,22 +136,21 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - todo!() - // let iter = - // FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - - // for result in iter { - // let (value, mut docids) = result?; - // docids &= candidates; - // if !docids.is_empty() { - // distribution.insert(value.to_string(), docids.len()); - // } - // if distribution.len() == self.max_values_per_facet { - // break; - // } - // } - - // Ok(()) + facet_distribution_iter::iterate_over_facet_distribution( + self.rtxn, + &self.index.facet_id_f64_docids.remap_key_type::>(), + field_id, + candidates, + |facet_key, nbr_docids| { + let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap(); + distribution.insert(facet_key.to_string(), nbr_docids); + if distribution.len() == self.max_values_per_facet { + ControlFlow::Break(()) + } else { + ControlFlow::Continue(()) + } + }, + ) } fn facet_strings_distribution_from_facet_levels( @@ -155,22 +159,21 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - todo!() - // let iter = - // FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - - // for result in iter { - // let (_normalized, original, mut docids) = result?; - // docids &= candidates; - // if !docids.is_empty() { - // distribution.insert(original.to_string(), docids.len()); - // } - // if distribution.len() == self.max_values_per_facet { - // break; - // } - // } - - // Ok(()) + facet_distribution_iter::iterate_over_facet_distribution( + self.rtxn, + &self.index.facet_id_string_docids.remap_key_type::>(), + field_id, + candidates, + |facet_key, nbr_docids| { + let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); + distribution.insert(facet_key.to_string(), nbr_docids); + if distribution.len() == self.max_values_per_facet { + ControlFlow::Break(()) + } else { + 
ControlFlow::Continue(()) + } + }, + ) } /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the @@ -179,43 +182,42 @@ impl<'a> FacetDistribution<'a> { &self, field_id: FieldId, ) -> heed::Result> { - todo!() - // let mut distribution = BTreeMap::new(); + let mut distribution = BTreeMap::new(); - // let db = self.index.facet_id_f64_docids; - // let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + let db = self.index.facet_id_f64_docids; + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(0); + let iter = db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? + .remap_types::, FacetGroupValueCodec>(); - // for result in range { - // let ((_, _, value, _), docids) = result?; - // distribution.insert(value.to_string(), docids.len()); - // if distribution.len() == self.max_values_per_facet { - // break; - // } - // } + for result in iter { + let (key, value) = result?; + distribution.insert(key.left_bound.to_string(), value.bitmap.len()); + if distribution.len() == self.max_values_per_facet { + break; + } + } - // let iter = self - // .index - // .facet_id_string_docids - // .remap_key_type::() - // .prefix_iter(self.rtxn, &field_id.to_be_bytes())? - // .remap_key_type::(); + let iter = self + .index + .facet_id_string_docids + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? + .remap_types::, FacetGroupValueCodec>(); - // let mut normalized_distribution = BTreeMap::new(); - // for result in iter { - // let ((_, normalized_value), group_value) = result?; - // normalized_distribution - // .insert(normalized_value, (normalized_value, group_value.bitmap.len())); - // if normalized_distribution.len() == self.max_values_per_facet { - // break; - // } - // } + // TODO: get the original value of the facet somewhere (in the documents DB?) + for result in iter { + let (key, value) = result?; + distribution.insert(key.left_bound.to_owned(), value.bitmap.len()); + if distribution.len() == self.max_values_per_facet { + break; + } + } - // let iter = normalized_distribution - // .into_iter() - // .map(|(_normalized, (original, count))| (original.to_string(), count)); - // distribution.extend(iter); - - // Ok(distribution) + Ok(distribution) } fn facet_values(&self, field_id: FieldId) -> heed::Result> { diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 83079028c..9f1031a85 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,5 +1,5 @@ use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; -use crate::Result; +use heed::Result; use roaring::RoaringBitmap; use std::ops::ControlFlow; @@ -20,7 +20,7 @@ where get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
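The distribution callbacks above use ControlFlow to stop the traversal as soon as max_values_per_facet entries have been collected, instead of threading a boolean flag through the visitor. A self-contained sketch of that driver/callback shape (std only, plain in-memory pairs instead of the facet databases):

use std::collections::BTreeMap;
use std::ops::ControlFlow;

/// Hands (facet, count) pairs to a callback until it asks to stop, which is
/// how the distribution visitor caps the number of returned values.
fn visit_counts<F>(pairs: &[(&str, u64)], mut callback: F)
where
    F: FnMut(&str, u64) -> ControlFlow<()>,
{
    for (facet, count) in pairs.iter().copied() {
        if callback(facet, count).is_break() {
            return;
        }
    }
}

fn main() {
    let pairs = [("blue", 12), ("green", 7), ("red", 3), ("yellow", 1)];
    let max_values_per_facet = 2;
    let mut distribution = BTreeMap::new();
    visit_counts(&pairs, |facet, count| {
        distribution.insert(facet.to_string(), count);
        if distribution.len() == max_values_per_facet {
            ControlFlow::Break(())
        } else {
            ControlFlow::Continue(())
        }
    });
    assert_eq!(distribution.len(), 2);
    println!("{distribution:?}");
}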
{ - fd.iterate(candidates, highest_level, first_bound, usize::MAX); + fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; return Ok(()); } else { return Ok(()); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index d27206af2..023d433ad 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -20,7 +20,7 @@ pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> crate::Result> +) -> heed::Result> where BoundCodec: BytesDecode<'t>, { @@ -42,7 +42,7 @@ pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> crate::Result> +) -> heed::Result> where BoundCodec: BytesDecode<'t>, { @@ -65,7 +65,7 @@ pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> crate::Result { +) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); Ok(db .as_polymorph() From 6cc91824c1d831950187cfa6b4ca047cf0b89683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 07:51:11 +0200 Subject: [PATCH 06/58] Remove unused heed codec files --- .../facet_string_zero_bounds_value_codec.rs | 114 ------------------ .../facet/facet_value_string_codec.rs | 35 ------ milli/src/heed_codec/facet/mod.rs | 4 +- milli/src/update/delete_documents.rs | 2 +- 4 files changed, 3 insertions(+), 152 deletions(-) delete mode 100644 milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs delete mode 100644 milli/src/heed_codec/facet/facet_value_string_codec.rs diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs deleted file mode 100644 index 337433c2b..000000000 --- a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs +++ /dev/null @@ -1,114 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::{marker, str}; - -use super::try_split_at; - -/// A codec that optionally encodes two strings in front of the value. -/// -/// The usecase is for the facet string levels algorithm where we must -/// know the origin of a group, the group left and right bounds are stored -/// in the value to not break the lexicographical ordering of the LMDB keys. 
-pub struct FacetStringZeroBoundsValueCodec(marker::PhantomData); - -impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec -where - C: heed::BytesDecode<'a>, -{ - type DItem = (Option<(&'a str, &'a str)>, C::DItem); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (contains_bounds, bytes) = bytes.split_first()?; - - if *contains_bounds != 0 { - let (left_len, bytes) = try_split_at(bytes, 2)?; - let (right_len, bytes) = try_split_at(bytes, 2)?; - - let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?; - let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?; - - let (left, bytes) = try_split_at(bytes, left_len as usize)?; - let (right, bytes) = try_split_at(bytes, right_len as usize)?; - - let left = str::from_utf8(left).ok()?; - let right = str::from_utf8(right).ok()?; - - C::bytes_decode(bytes).map(|item| (Some((left, right)), item)) - } else { - C::bytes_decode(bytes).map(|item| (None, item)) - } - } -} - -impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec -where - C: heed::BytesEncode<'a>, -{ - type EItem = (Option<(&'a str, &'a str)>, C::EItem); - - fn bytes_encode((bounds, value): &'a Self::EItem) -> Option> { - let mut bytes = Vec::new(); - - match bounds { - Some((left, right)) => { - bytes.push(u8::max_value()); - - if left.is_empty() || right.is_empty() { - return None; - } - - let left_len: u16 = left.len().try_into().ok()?; - let right_len: u16 = right.len().try_into().ok()?; - - bytes.extend_from_slice(&left_len.to_be_bytes()); - bytes.extend_from_slice(&right_len.to_be_bytes()); - - bytes.extend_from_slice(left.as_bytes()); - bytes.extend_from_slice(right.as_bytes()); - - let value_bytes = C::bytes_encode(&value)?; - bytes.extend_from_slice(&value_bytes[..]); - - Some(Cow::Owned(bytes)) - } - None => { - bytes.push(0); - let value_bytes = C::bytes_encode(&value)?; - bytes.extend_from_slice(&value_bytes[..]); - Some(Cow::Owned(bytes)) - } - } - } -} - -#[cfg(test)] -mod tests { - use heed::types::Unit; - use heed::{BytesDecode, BytesEncode}; - use roaring::RoaringBitmap; - - use super::*; - use crate::CboRoaringBitmapCodec; - - #[test] - fn deserialize_roaring_bitmaps() { - let bounds = Some(("abc", "def")); - let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); - let key = (bounds, docids.clone()); - let bytes = - FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); - let (out_bounds, out_docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_bounds, out_docids), (bounds, docids)); - } - - #[test] - fn deserialize_unit() { - let bounds = Some(("abc", "def")); - let key = (bounds, ()); - let bytes = FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); - let (out_bounds, out_unit) = - FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_bounds, out_unit), (bounds, ())); - } -} diff --git a/milli/src/heed_codec/facet/facet_value_string_codec.rs b/milli/src/heed_codec/facet/facet_value_string_codec.rs deleted file mode 100644 index 54abb7886..000000000 --- a/milli/src/heed_codec/facet/facet_value_string_codec.rs +++ /dev/null @@ -1,35 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, FieldId}; - -pub struct FacetValueStringCodec; - -impl FacetValueStringCodec { - pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { - out.reserve(value.len() + 2); - out.extend_from_slice(&field_id.to_be_bytes()); - out.extend_from_slice(value.as_bytes()); - } -} - -impl<'a> 
heed::BytesDecode<'a> for FacetValueStringCodec { - type DItem = (FieldId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let value = str::from_utf8(bytes).ok()?; - Some((field_id, value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec { - type EItem = (FieldId, &'a str); - - fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::new(); - FacetValueStringCodec::serialize_into(*field_id, value, &mut bytes); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index d23ab391e..e145e311e 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -2,7 +2,7 @@ // mod facet_level_value_u32_codec; // mod facet_string_level_zero_codec; // mod facet_string_level_zero_value_codec; -mod facet_string_zero_bounds_value_codec; +// mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; @@ -16,7 +16,7 @@ use heed::types::OwnedType; // pub use self::facet_string_level_zero_value_codec::{ // decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, // }; -pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; +// pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; use crate::BEU16; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index bb30f24c9..5eebff913 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -10,7 +10,7 @@ use time::OffsetDateTime; use super::ClearDocuments; use crate::error::{InternalError, SerializationError, UserError}; -use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; +// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ From 22d80eeaf9262f9f97135d147bf45258240d9a3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 08:10:45 +0200 Subject: [PATCH 07/58] Reintroduce facet deletion functionality --- milli/src/update/delete_documents.rs | 139 ++++++++++++--------------- 1 file changed, 62 insertions(+), 77 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 5eebff913..32b2ac986 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -3,19 +3,21 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; use heed::{BytesDecode, BytesEncode, Database}; +use obkv::Key; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; -use super::ClearDocuments; +use super::{ClearDocuments, Facets}; use crate::error::{InternalError, SerializationError, UserError}; // use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, - RoaringBitmapCodec, 
SmallString32, BEU32, + fields_ids_map, DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, + FieldsIdsMap, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, }; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -62,6 +64,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { pub fn execute(mut self) -> Result { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; + // We retrieve the current documents ids that are in the database. let mut documents_ids = self.index.documents_ids(self.wtxn)?; let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?; @@ -439,25 +442,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; } + remove_docids_from_facet_id_docids( + self.wtxn, + self.index, + facet_id_f64_docids.remap_key_type::>(), + &self.to_delete_docids, + fields_ids_map.clone(), + )?; + remove_docids_from_facet_id_docids( + self.wtxn, + self.index, + facet_id_string_docids.remap_key_type::>(), + &self.to_delete_docids, + fields_ids_map.clone(), + )?; // We delete the documents ids that are under the facet field id values. - // TODO: remove_docids_from_facet_field_id_docids( - // self.wtxn, - // facet_id_f64_docids, - // &self.to_delete_docids, - // )?; - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_docids( + remove_docids_from_facet_id_exists_docids( self.wtxn, facet_id_exists_docids, &self.to_delete_docids, )?; - remove_docids_from_facet_field_id_string_docids( - self.wtxn, - facet_id_string_docids, - &self.to_delete_docids, - )?; - // Remove the documents ids from the faceted documents ids. for field_id in self.index.faceted_fields_ids(self.wtxn)? { // Remove docids from the number faceted documents ids @@ -580,67 +585,7 @@ where Ok(()) } -fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( - wtxn: &'a mut heed::RwTxn, - db: &heed::Database, - to_remove: &RoaringBitmap, -) -> crate::Result<()> { - // let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); - // let mut iter = db.remap_types::().iter_mut(wtxn)?; - // while let Some(result) = iter.next() { - // let (key, val) = result?; - // match FacetLevelValueU32Codec::bytes_decode(key) { - // Some(_) => { - // // If we are able to parse this key it means it is a facet string group - // // level key. We must then parse the value using the appropriate codec. - // let (group, mut docids) = - // FacetStringZeroBoundsValueCodec::::bytes_decode(val) - // .ok_or_else(|| SerializationError::Decoding { db_name })?; - - // let previous_len = docids.len(); - // docids -= to_remove; - // if docids.is_empty() { - // // safety: we don't keep references from inside the LMDB database. - // unsafe { iter.del_current()? }; - // } else if docids.len() != previous_len { - // let key = key.to_owned(); - // let val = &(group, docids); - // let value_bytes = - // FacetStringZeroBoundsValueCodec::::bytes_encode(val) - // .ok_or_else(|| SerializationError::Encoding { db_name })?; - - // // safety: we don't keep references from inside the LMDB database. - // unsafe { iter.put_current(&key, &value_bytes)? }; - // } - // } - // None => { - // // The key corresponds to a level zero facet string. 
- // let (original_value, mut docids) = - // FacetStringLevelZeroValueCodec::bytes_decode(val) - // .ok_or_else(|| SerializationError::Decoding { db_name })?; - - // let previous_len = docids.len(); - // docids -= to_remove; - // if docids.is_empty() { - // // safety: we don't keep references from inside the LMDB database. - // unsafe { iter.del_current()? }; - // } else if docids.len() != previous_len { - // let key = key.to_owned(); - // let val = &(original_value, docids); - // let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) - // .ok_or_else(|| SerializationError::Encoding { db_name })?; - - // // safety: we don't keep references from inside the LMDB database. - // unsafe { iter.put_current(&key, &value_bytes)? }; - // } - // } - // } - // } - - Ok(()) -} - -fn remove_docids_from_facet_field_id_docids<'a, C>( +fn remove_docids_from_facet_id_exists_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, to_remove: &RoaringBitmap, @@ -665,6 +610,46 @@ where Ok(()) } +fn remove_docids_from_facet_id_docids<'a>( + wtxn: &'a mut heed::RwTxn, + index: &Index, + db: heed::Database, FacetGroupValueCodec>, + to_remove: &RoaringBitmap, + fields_ids_map: FieldsIdsMap, +) -> Result<()> { + let mut modified = false; + for field_id in fields_ids_map.ids() { + let mut level0_prefix = vec![]; + level0_prefix.extend_from_slice(&field_id.to_be_bytes()); + level0_prefix.push(0); + let mut iter = db + .as_polymorph() + .prefix_iter_mut::<_, ByteSlice, FacetGroupValueCodec>(wtxn, &level0_prefix)?; + + while let Some(result) = iter.next() { + let (bytes, mut value) = result?; + let previous_len = value.bitmap.len(); + value.bitmap -= to_remove; + if value.bitmap.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + modified = true; + } else if value.bitmap.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &value)? 
}; + modified = true; + } + } + } + if !modified { + return Ok(()); + } + let builder = Facets::new(index, db); + builder.execute(wtxn)?; + + Ok(()) +} #[cfg(test)] mod tests { From 39a4a0a362f4803016072b54a0cbcf88ccb3a55f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 08:27:16 +0200 Subject: [PATCH 08/58] Reintroduce filter range search and facet extractors --- milli/src/search/facet/facet_range_search.rs | 12 +- milli/src/search/facet/filter.rs | 248 +++++------------- milli/src/update/delete_documents.rs | 10 +- .../extract/extract_facet_number_docids.rs | 13 +- .../extract/extract_facet_string_docids.rs | 40 +-- 5 files changed, 92 insertions(+), 231 deletions(-) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index c01346b25..75db9fda2 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -15,7 +15,7 @@ use super::get_last_facet_value; pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, @@ -48,13 +48,13 @@ where } Bound::Unbounded => Bound::Unbounded, }; - + let db = db.remap_key_type::>(); let mut docids = RoaringBitmap::new(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; - let highest_level = get_highest_level(rtxn, db, field_id)?; + let mut f = FacetRangeSearch { rtxn, db: &db, field_id, left, right, docids: &mut docids }; + let highest_level = get_highest_level(rtxn, &db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + if let Some(first_bound) = get_first_facet_value::(rtxn, &db, field_id)? { + let last_bound = get_last_facet_value::(rtxn, &db, field_id)?.unwrap(); f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; Ok(docids) } else { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index dd34abe6d..79d7f5e0f 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,22 +1,17 @@ -use std::collections::HashSet; -use std::fmt::{Debug, Display}; -use std::ops::Bound::{self, Excluded, Included}; -use std::ops::RangeBounds; - use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; -use heed::LazyDecode; use roaring::RoaringBitmap; +use std::collections::HashSet; +use std::fmt::{Debug, Display}; +use std::ops::Bound::{self, Excluded, Included}; -// use super::FacetNumberRange; use crate::error::{Error, UserError}; use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; -// use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::{ - distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, -}; +use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; + +use super::facet_range_search; /// The maximum number of filters the filter AST can process. 
const MAX_FILTER_DEPTH: usize = 2000; @@ -147,158 +142,15 @@ impl<'a> Filter<'a> { } } -fn explore_facet_number_levels( - rtxn: &heed::RoTxn, - db: heed::Database, FacetGroupValueCodec>, - field_id: FieldId, -) { -} - impl<'a> Filter<'a> { - /// Aggregates the documents ids that are part of the specified range automatically - /// going deeper through the levels. - fn explore_facet_number_levels( - rtxn: &heed::RoTxn, - db: heed::Database, CboRoaringBitmapCodec>, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - output: &mut RoaringBitmap, - ) -> Result<()> { - // level must be > 0, I'll create a separate function for level 0 - // if level == 0 { - // call that function - //} - match (left, right) { - // If the request is an exact value we must go directly to the deepest level. - (Included(l), Included(r)) if l == r && level > 0 => { - return Self::explore_facet_number_levels( - rtxn, db, field_id, 0, left, right, output, - ); - } - // lower TO upper when lower > upper must return no result - (Included(l), Included(r)) if l > r => return Ok(()), - (Included(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Included(r)) if l >= r => return Ok(()), - (_, _) => (), - } - let range_start_key = FacetKey { - field_id, - level, - left_bound: match left { - Included(l) => l, - Excluded(l) => l, - Bound::Unbounded => f64::MIN, - }, - }; - let mut range_iter = db - .remap_data_type::>() - .range(rtxn, &(range_start_key..))?; + pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { + // to avoid doing this for each recursive call we're going to do it ONCE ahead of time + let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; + let filterable_fields = index.filterable_fields(rtxn)?; - let (mut previous_facet_key, mut previous_value) = range_iter.next().unwrap()?; - while let Some(el) = range_iter.next() { - let (facet_key, value) = el?; - let range = (Included(previous_facet_key.left_bound), Excluded(facet_key.left_bound)); - // if the current range intersects with the query range, then go deeper - // what does it mean for two ranges to intersect? - let gte_left = match left { - Included(l) => previous_facet_key.left_bound >= l, - Excluded(l) => previous_facet_key.left_bound > l, // TODO: not true? - Bound::Unbounded => true, - }; - let lte_right = match right { - Included(r) => facet_key.left_bound <= r, - Excluded(r) => facet_key.left_bound < r, - Bound::Unbounded => true, - }; - } - // at this point, previous_facet_key and previous_value are the last groups in the level - // we must also check whether we should visit this group - - todo!(); - - // let mut left_found = None; - // let mut right_found = None; - - // // We must create a custom iterator to be able to iterate over the - // // requested range as the range iterator cannot express some conditions. - // let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; - - // debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - // for (i, result) in iter.enumerate() { - // let ((_fid, level, l, r), docids) = result?; - // debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - // *output |= docids; - // // We save the leftest and rightest bounds we actually found at this level. - // if i == 0 { - // left_found = Some(l); - // } - // right_found = Some(r); - // } - - // // Can we go deeper? 
- // let deeper_level = match level.checked_sub(1) { - // Some(level) => level, - // None => return Ok(()), - // }; - - // // We must refine the left and right bounds of this range by retrieving the - // // missing part in a deeper level. - // match left_found.zip(right_found) { - // Some((left_found, right_found)) => { - // // If the bound is satisfied we avoid calling this function again. - // if !matches!(left, Included(l) if l == left_found) { - // let sub_right = Excluded(left_found); - // debug!( - // "calling left with {:?} to {:?} (level {})", - // left, sub_right, deeper_level - // ); - // Self::explore_facet_number_levels( - // rtxn, - // db, - // field_id, - // deeper_level, - // left, - // sub_right, - // output, - // )?; - // } - // if !matches!(right, Included(r) if r == right_found) { - // let sub_left = Excluded(right_found); - // debug!( - // "calling right with {:?} to {:?} (level {})", - // sub_left, right, deeper_level - // ); - // Self::explore_facet_number_levels( - // rtxn, - // db, - // field_id, - // deeper_level, - // sub_left, - // right, - // output, - // )?; - // } - // } - // None => { - // // If we found nothing at this level it means that we must find - // // the same bounds but at a deeper, more precise level. - // Self::explore_facet_number_levels( - // rtxn, - // db, - // field_id, - // deeper_level, - // left, - // right, - // output, - // )?; - // } - // } - - // Ok(()) + // and finally we delete all the soft_deleted_documents, again, only once at the very end + self.inner_evaluate(rtxn, index, &filterable_fields) + .map(|result| result - soft_deleted_documents) } fn evaluate_operator( @@ -337,15 +189,15 @@ impl<'a> Filter<'a> { Some(n) => { let n = Included(n); let mut output = RoaringBitmap::new(); - // Self::explore_facet_number_levels( - // rtxn, - // numbers_db, - // field_id, - // 0, - // n, - // n, - // &mut output, - // )?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + 0, + n, + n, + &mut output, + )?; output } None => RoaringBitmap::new(), @@ -381,29 +233,53 @@ impl<'a> Filter<'a> { match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - // Self::explore_facet_number_levels( - // rtxn, - // numbers_db, - // field_id, - // level, - // left, - // right, - // &mut output, - // )?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + level, + left, + right, + &mut output, + )?; Ok(output) } None => Ok(RoaringBitmap::new()), } } - pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { - // to avoid doing this for each recursive call we're going to do it ONCE ahead of time - let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; - let filterable_fields = index.filterable_fields(rtxn)?; + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. + fn explore_facet_number_levels( + rtxn: &heed::RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> Result<()> { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. 
+ (Included(l), Included(r)) if l == r && level > 0 => { + return Self::explore_facet_number_levels( + rtxn, db, field_id, 0, left, right, output, + ); + } + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + let x = facet_range_search::find_docids_of_facet_within_bounds::( + rtxn, &db, field_id, &left, &right, + )?; + // TODO: the facet range search should take a mutable roaring bitmap as argument + *output = x; - // and finally we delete all the soft_deleted_documents, again, only once at the very end - self.inner_evaluate(rtxn, index, &filterable_fields) - .map(|result| result - soft_deleted_documents) + Ok(()) } fn inner_evaluate( diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 32b2ac986..e16d98e74 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,22 +2,20 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::{BytesDecode, BytesEncode, Database}; -use obkv::Key; +use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; use super::{ClearDocuments, Facets}; -use crate::error::{InternalError, SerializationError, UserError}; -// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; +use crate::error::{InternalError, UserError}; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - fields_ids_map, DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, - FieldsIdsMap, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, + DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, + RoaringBitmapCodec, SmallString32, BEU32, }; pub struct DeleteDocuments<'t, 'u, 'i> { diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index c5424a346..eece08ee3 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,6 +6,8 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::Result; @@ -31,14 +33,13 @@ pub fn extract_facet_number_docids( let mut cursor = docid_fid_facet_number.into_cursor()?; while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ - todo!() - // let (field_id, document_id, number) = - // FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); + let (field_id, document_id, number) = + FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); - // let key = (field_id, 0, number, number); - // // let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); + let key = FacetKey { field_id, level: 0, left_bound: number }; + let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); - // facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } sorter_into_reader(facet_number_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 4e655329e..51d2df923 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -1,13 +1,11 @@ -use std::fs::File; -use std::iter::FromIterator; -use std::{io, str}; - -use roaring::RoaringBitmap; - use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; use crate::update::index_documents::merge_cbo_roaring_bitmaps; -// use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; use crate::{FieldId, Result}; +use heed::BytesEncode; +use std::fs::File; +use std::io; /// Extracts the facet string and the documents ids where this facet string appear. /// @@ -22,38 +20,26 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_cbo_roaring_bitmaps, // TODO: check + merge_cbo_roaring_bitmaps, // TODO: check that it is correct indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); - let mut key_buffer = Vec::new(); - let mut value_buffer = Vec::new(); let mut cursor = docid_fid_facet_string.into_cursor()?; - while let Some((key, original_value_bytes)) = cursor.move_on_next()? { + while let Some((key, _original_value_bytes)) = cursor.move_on_next()? 
{ let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); - let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap(); - let document_id = u32::from_be_bytes(document_id_bytes); - let original_value = str::from_utf8(original_value_bytes)?; - key_buffer.clear(); - // TODO - // FacetStringLevelZeroCodec::serialize_into( - // field_id, - // str::from_utf8(normalized_value_bytes)?, - // &mut key_buffer, - // ); + let (document_id_bytes, normalized_value_bytes) = + try_split_array_at::<_, 4>(bytes).unwrap(); - value_buffer.clear(); - // TODO - // encode_prefix_string(original_value, &mut value_buffer)?; - let bitmap = RoaringBitmap::from_iter(Some(document_id)); - bitmap.serialize_into(&mut value_buffer)?; + let normalised_value = std::str::from_utf8(normalized_value_bytes)?; + let key = FacetKey { field_id, level: 0, left_bound: normalised_value }; + let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); - facet_string_docids_sorter.insert(&key_buffer, &value_buffer)?; + facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?; } sorter_into_reader(facet_string_docids_sorter, indexer) From bd2c0e1ab6393550d7cdd8439c9b605ff2dd7fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 08:39:01 +0200 Subject: [PATCH 09/58] Remove unused code --- milli/src/search/facet/incremental_update.rs | 2 -- .../src/update/index_documents/typed_chunk.rs | 25 ++++++------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/milli/src/search/facet/incremental_update.rs b/milli/src/search/facet/incremental_update.rs index a437efb2d..f01b19dab 100644 --- a/milli/src/search/facet/incremental_update.rs +++ b/milli/src/search/facet/incremental_update.rs @@ -43,8 +43,6 @@ impl<'i> IncrementalFacetUpdate<'i> { .prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>(txn, &prefix.as_slice())?; if let Some(e) = prefix_iter.next() { let (key_bytes, value) = e?; - let key = FacetKeyCodec::::bytes_decode(&key_bytes) - .ok_or(heed::Error::Encoding)?; Ok(( FacetKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)? diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 7a9787bdb..3c7a78d95 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -189,23 +189,14 @@ pub(crate) fn write_typed_chunk_into_index( } } TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { - append_entries_into_database( - facet_id_string_docids, - &index.facet_id_string_docids, - wtxn, - index_is_empty, - |value, _buffer| Ok(value), - |new_values, db_values, buffer| { - todo!() - // let (_, new_values) = decode_prefix_string(new_values).unwrap(); - // let new_values = RoaringBitmap::deserialize_from(new_values)?; - // let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); - // let db_values = RoaringBitmap::deserialize_from(db_values)?; - // let values = new_values | db_values; - // encode_prefix_string(db_original, buffer)?; - // Ok(values.serialize_into(buffer)?) - }, - )?; + // facet_id_string_docids contains the thing that the extractor put into it, + // so: (FacetKey { field id, level: 0, left_bound } , docids: RoaringBitmap ) + // now we need to either: + // 1. incrementally add the keys/docids pairs into the DB + // 2. 
add the keys/docids into level 0 and then call Facets::execute + // the choice of solution should be determined by their performance + // characteristics + is_merged_database = true; } TypedChunk::GeoPoints(geo_points) => { From e570c23153f4b4ce91dfd0fe80ed03802a396563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 09:36:19 +0200 Subject: [PATCH 10/58] Reintroduce asc/desc functionality --- milli/src/search/criteria/asc_desc.rs | 33 ++++++++++--------- milli/src/search/facet/facet_distribution.rs | 4 +-- .../search/facet/facet_distribution_iter.rs | 6 ++-- milli/src/search/facet/facet_range_search.rs | 12 +++---- .../src/search/facet/facet_sort_ascending.rs | 21 +++++------- .../src/search/facet/facet_sort_descending.rs | 31 ++++++++--------- milli/src/search/facet/filter.rs | 2 +- milli/src/search/facet/incremental_update.rs | 4 +-- milli/src/search/facet/mod.rs | 10 +++--- 9 files changed, 60 insertions(+), 63 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index bd08c54a5..a5ea9b058 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -6,7 +6,10 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; +use crate::search::facet::facet_sort_ascending::ascending_facet_sort; +use crate::search::facet::facet_sort_descending::descending_facet_sort; // use crate::search::facet::FacetStringIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -186,24 +189,22 @@ fn facet_ordered<'t>( iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) } else { - todo!() - // let facet_number_fn = if is_ascending { - // FacetNumberIter::new_reducing - // } else { - // FacetNumberIter::new_reverse_reducing - // }; - // let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())? - // .map(|res| res.map(|(_, docids)| docids)); + let make_iter = if is_ascending { ascending_facet_sort } else { descending_facet_sort }; - // let facet_string_fn = if is_ascending { - // FacetStringIter::new_reducing - // } else { - // FacetStringIter::new_reverse_reducing - // }; - // let string_iter = facet_string_fn(rtxn, index, field_id, candidates)? 
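As a usage illustration of the `make_iter` branch above, the following editorial sketch (not part of the patch; `ordered_docids` is a hypothetical helper, but the codecs and call shape mirror the new code) flattens the ascending sort into one ordered candidate list:

    use roaring::RoaringBitmap;

    use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice};
    use crate::search::facet::facet_sort_ascending::ascending_facet_sort;
    use crate::Index;

    /// Hypothetical helper: candidate ids ordered by ascending value of a numeric facet.
    fn ordered_docids(
        rtxn: &heed::RoTxn,
        index: &Index,
        field_id: u16,
        candidates: RoaringBitmap,
    ) -> heed::Result<Vec<u32>> {
        let db = index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>();
        let mut ordered = Vec::new();
        for bucket in ascending_facet_sort(rtxn, db, field_id, candidates)? {
            // Each bucket is the subset of candidates sharing the next smallest facet values.
            ordered.extend(bucket?);
        }
        Ok(ordered)
    }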
- // .map(|res| res.map(|(_, _, docids)| docids)); + let number_iter = make_iter( + rtxn, + index.facet_id_f64_docids.remap_key_type::>(), + field_id, + candidates.clone(), + )?; + let string_iter = make_iter( + rtxn, + index.facet_id_f64_docids.remap_key_type::>(), + field_id, + candidates, + )?; - // Ok(Box::new(number_iter.chain(string_iter))) + Ok(Box::new(number_iter.chain(string_iter))) } } diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 670719a9b..c7619c609 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -138,7 +138,7 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { facet_distribution_iter::iterate_over_facet_distribution( self.rtxn, - &self.index.facet_id_f64_docids.remap_key_type::>(), + self.index.facet_id_f64_docids.remap_key_type::>(), field_id, candidates, |facet_key, nbr_docids| { @@ -161,7 +161,7 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { facet_distribution_iter::iterate_over_facet_distribution( self.rtxn, - &self.index.facet_id_string_docids.remap_key_type::>(), + self.index.facet_id_string_docids.remap_key_type::>(), field_id, candidates, |facet_key, nbr_docids| { diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 9f1031a85..f347b9d7e 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -7,7 +7,7 @@ use super::{get_first_facet_value, get_highest_level}; pub fn iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: &RoaringBitmap, callback: CB, @@ -17,7 +17,7 @@ where { let mut fd = FacetDistribution { rtxn, db, field_id, callback }; let highest_level = - get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; + get_highest_level(rtxn, db.remap_key_type::>(), field_id)?; if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; @@ -32,7 +32,7 @@ where CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, { rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, callback: CB, } diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 75db9fda2..b05a3c275 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -15,7 +15,7 @@ use super::get_last_facet_value; pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, @@ -50,11 +50,11 @@ where }; let db = db.remap_key_type::>(); let mut docids = RoaringBitmap::new(); - let mut f = FacetRangeSearch { rtxn, db: &db, field_id, left, right, docids: &mut docids }; - let highest_level = get_highest_level(rtxn, &db, field_id)?; + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; + let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, &db, field_id)? 
{ - let last_bound = get_last_facet_value::(rtxn, &db, field_id)?.unwrap(); + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; Ok(docids) } else { @@ -65,7 +65,7 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 73491d4ae..e4b77c691 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -1,24 +1,19 @@ use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; -use crate::Result; +use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; pub fn ascending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, -) -> Result> + 't>> { - let highest_level = - get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; - if let Some(first_bound) = get_first_facet_value::( - rtxn, - &db.remap_key_type::>(), - field_id, - )? { +) -> Result> + 't>> { + let highest_level = get_highest_level(rtxn, db, field_id)?; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); @@ -30,7 +25,7 @@ pub fn ascending_facet_sort<'t>( struct AscendingFacetSort<'t, 'e> { rtxn: &'t heed::RoTxn<'e>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, @@ -39,7 +34,7 @@ struct AscendingFacetSort<'t, 'e> { } impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { - type Item = Result<(&'t [u8], RoaringBitmap)>; + type Item = Result; fn next(&mut self) -> Option { 'outer: loop { @@ -67,7 +62,7 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { *documents_ids -= &bitmap; if level == 0 { - return Some(Ok((left_bound, bitmap))); + return Some(Ok(bitmap)); } let starting_key_below = FacetKey { field_id: self.field_id, level: level - 1, left_bound }; diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 81b0eb09d..fc62b894f 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -3,17 +3,17 @@ use std::ops::Bound; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; -use crate::Result; +use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -fn descending_facet_sort<'t>( +pub fn descending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, -) -> Result> + 't>> { +) -> Result> + 't>> { let highest_level = get_highest_level(rtxn, db, field_id)?; if let 
Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; @@ -33,7 +33,7 @@ fn descending_facet_sort<'t>( struct DescendingFacetSort<'t> { rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, @@ -43,7 +43,7 @@ struct DescendingFacetSort<'t> { } impl<'t> Iterator for DescendingFacetSort<'t> { - type Item = Result<(&'t [u8], RoaringBitmap)>; + type Item = Result; fn next(&mut self) -> Option { 'outer: loop { @@ -70,7 +70,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { *documents_ids -= &bitmap; if level == 0 { - return Some(Ok((left_bound, bitmap))); + return Some(Ok(bitmap)); } let starting_key_below = FacetKey { field_id, level: level - 1, left_bound }; @@ -89,14 +89,15 @@ impl<'t> Iterator for DescendingFacetSort<'t> { }; let prev_right_bound = *right_bound; *right_bound = Bound::Excluded(left_bound); - let iter = match self.db.rev_range( - &self.rtxn, - &(Bound::Included(starting_key_below), end_key_kelow), - ) { - Ok(iter) => iter, - Err(e) => return Some(Err(e.into())), - } - .take(group_size as usize); + let iter = + match self.db.remap_key_type::>().rev_range( + &self.rtxn, + &(Bound::Included(starting_key_below), end_key_kelow), + ) { + Ok(iter) => iter, + Err(e) => return Some(Err(e.into())), + } + .take(group_size as usize); self.stack.push((bitmap, iter, prev_right_bound)); continue 'outer; diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 79d7f5e0f..6ec626a5c 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -274,7 +274,7 @@ impl<'a> Filter<'a> { (_, _) => (), } let x = facet_range_search::find_docids_of_facet_within_bounds::( - rtxn, &db, field_id, &left, &right, + rtxn, db, field_id, &left, &right, )?; // TODO: the facet range search should take a mutable roaring bitmap as argument *output = x; diff --git a/milli/src/search/facet/incremental_update.rs b/milli/src/search/facet/incremental_update.rs index f01b19dab..fd4e1eeb5 100644 --- a/milli/src/search/facet/incremental_update.rs +++ b/milli/src/search/facet/incremental_update.rs @@ -264,7 +264,7 @@ impl<'i> IncrementalFacetUpdate<'i> { } let group_size = self.group_size; - let highest_level = get_highest_level(&txn, &self.db, field_id)?; + let highest_level = get_highest_level(&txn, *self.db, field_id)?; let result = self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?; @@ -413,7 +413,7 @@ impl<'i> IncrementalFacetUpdate<'i> { if self.db.get(txn, &FacetKey { field_id, level: 0, left_bound: key })?.is_none() { return Ok(()); } - let highest_level = get_highest_level(&txn, &self.db, field_id)?; + let highest_level = get_highest_level(&txn, *self.db, field_id)?; // let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 023d433ad..8405c0141 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -11,14 +11,14 @@ pub use self::filter::Filter; mod facet_distribution; mod facet_distribution_iter; mod facet_range_search; -mod facet_sort_ascending; -mod facet_sort_descending; +pub mod facet_sort_ascending; +pub mod facet_sort_descending; mod filter; mod incremental_update; pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, 
FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -40,7 +40,7 @@ where } pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -63,7 +63,7 @@ where } pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); From fb8d23deb3690e412217a59b43e16c34b4bfb938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 12:53:53 +0200 Subject: [PATCH 11/58] Reintroduce db_snap! for facet databases --- milli/src/snapshot_tests.rs | 55 ++++++++++--------------------------- 1 file changed, 15 insertions(+), 40 deletions(-) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 4031c9b06..17ee3f392 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -6,6 +6,7 @@ use heed::types::ByteSlice; use heed::BytesDecode; use roaring::RoaringBitmap; +use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index}; @@ -229,48 +230,22 @@ pub fn snap_word_prefix_position_docids(index: &Index) -> String { snap } pub fn snap_facet_id_f64_docids(index: &Index) -> String { - todo!() - // let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( - // (facet_id, level, left, right), - // b, - // )| { - // &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) - // }); - // snap + let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( + FacetKey { field_id, level, left_bound }, + FacetGroupValue { size, bitmap }, + )| { + &format!("{field_id:<3} {level:<2} {left_bound:<6} {size:<2} {}", display_bitmap(&bitmap)) + }); + snap } pub fn snap_facet_id_string_docids(index: &Index) -> String { - todo!() - // let rtxn = index.read_txn().unwrap(); - // let bytes_db = index.facet_id_string_docids.remap_types::(); - // let iter = bytes_db.iter(&rtxn).unwrap(); - // let mut snap = String::new(); - - // for x in iter { - // let (key, value) = x.unwrap(); - // if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { - // let (orig_string, docids) = - // FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); - // snap.push_str(&format!( - // "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", - // display_bitmap(&docids) - // )); - // } else if let Some((field_id, level, left, right)) = - // FacetLevelValueU32Codec::bytes_decode(key) - // { - // snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); - // let (bounds, docids) = - // FacetStringZeroBoundsValueCodec::::bytes_decode(value) - // .unwrap(); - // if let Some((left, right)) = bounds { - // snap.push_str(&format!("{left:<8} {right:<8} ")); - // } - // snap.push_str(&display_bitmap(&docids)); - // snap.push('\n'); - // } else { - // panic!(); - // } - // } - // snap + let snap = make_db_snap_from_iter!(index, facet_id_string_docids, |( + FacetKey { field_id, level, left_bound }, + FacetGroupValue { size, bitmap }, + )| { + &format!("{field_id:<3} {level:<2} {left_bound:<12} {size:<2} {}", display_bitmap(&bitmap)) + }); + snap } pub fn snap_documents_ids(index: &Index) -> String { let rtxn = 
index.read_txn().unwrap(); From e8a156d68287db90841109328f8dd3ba70f10433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 13:03:36 +0200 Subject: [PATCH 12/58] Reorganise facets database indexing code --- http-ui/src/main.rs | 1 + milli/src/search/facet/mod.rs | 3 -- milli/src/search/mod.rs | 2 +- milli/src/update/delete_documents.rs | 4 +- milli/src/update/{facets.rs => facet/bulk.rs} | 37 +++++++++---------- .../facet/incremental.rs} | 11 +++--- milli/src/update/facet/mod.rs | 2 + milli/src/update/index_documents/mod.rs | 6 +-- milli/src/update/mod.rs | 4 +- 9 files changed, 33 insertions(+), 37 deletions(-) create mode 100644 http-ui/src/main.rs rename milli/src/update/{facets.rs => facet/bulk.rs} (97%) rename milli/src/{search/facet/incremental_update.rs => update/facet/incremental.rs} (98%) create mode 100644 milli/src/update/facet/mod.rs diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/http-ui/src/main.rs @@ -0,0 +1 @@ + diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 8405c0141..12074cc12 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -4,8 +4,6 @@ use heed::{BytesDecode, RoTxn}; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; -// pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; -// pub use self::facet_string::FacetStringIter; pub use self::filter::Filter; mod facet_distribution; @@ -14,7 +12,6 @@ mod facet_range_search; pub mod facet_sort_ascending; pub mod facet_sort_descending; mod filter; -mod incremental_update; pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index d05e807df..e6651737c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -32,7 +32,7 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); mod criteria; mod distinct; -mod facet; +pub mod facet; mod fst_utils; mod matches; mod query_tree; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e16d98e74..1d1745d82 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; -use super::{ClearDocuments, Facets}; +use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::CboRoaringBitmapCodec; @@ -643,7 +643,7 @@ fn remove_docids_from_facet_id_docids<'a>( if !modified { return Ok(()); } - let builder = Facets::new(index, db); + let builder = FacetsUpdateBulk::new(index, db); builder.execute(wtxn)?; Ok(()) diff --git a/milli/src/update/facets.rs b/milli/src/update/facet/bulk.rs similarity index 97% rename from milli/src/update/facets.rs rename to milli/src/update/facet/bulk.rs index fe8c2855e..587dc95ab 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,23 +1,20 @@ -use std::cmp; -use std::fs::File; -use std::num::NonZeroUsize; - +use crate::error::InternalError; +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; +use crate::update::index_documents::{create_writer, 
write_into_lmdb_database, writer_into_reader}; +use crate::{FieldId, Index, Result}; use grenad::CompressionType; use heed::types::ByteSlice; use heed::{BytesEncode, Error, RoTxn}; use log::debug; use roaring::RoaringBitmap; +use std::cmp; +use std::fs::File; +use std::num::NonZeroUsize; use time::OffsetDateTime; -use crate::error::InternalError; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, -}; -// use crate::heed_codec::CboRoaringBitmapCodec; -use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; -use crate::{FieldId, Index, Result}; - -pub struct Facets<'i> { +pub struct FacetsUpdateBulk<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, pub(crate) chunk_compression_type: CompressionType, @@ -26,12 +23,12 @@ pub struct Facets<'i> { min_level_size: usize, } -impl<'i> Facets<'i> { +impl<'i> FacetsUpdateBulk<'i> { pub fn new( index: &'i Index, database: heed::Database, FacetGroupValueCodec>, - ) -> Facets<'i> { - Facets { + ) -> FacetsUpdateBulk<'i> { + FacetsUpdateBulk { index, database, chunk_compression_type: CompressionType::None, @@ -63,7 +60,7 @@ impl<'i> Facets<'i> { Ok(()) } - #[logging_timer::time("Facets::{}")] + #[logging_timer::time("FacetsUpdateBulk::{}")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // We get the faceted fields to be able to create the facet levels. @@ -105,7 +102,7 @@ impl<'i> Facets<'i> { field_id: FieldId, txn: &RoTxn, ) -> Result<(Vec>, RoaringBitmap)> { - let algo = CreateFacetsAlgo { + let algo = FacetsUpdateBulkAlgorithm { rtxn: txn, db: &self.database, field_id, @@ -129,7 +126,7 @@ impl<'i> Facets<'i> { } } -pub struct CreateFacetsAlgo<'t> { +pub struct FacetsUpdateBulkAlgorithm<'t> { rtxn: &'t heed::RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, chunk_compression_type: CompressionType, @@ -138,7 +135,7 @@ pub struct CreateFacetsAlgo<'t> { level_group_size: usize, min_level_size: usize, } -impl<'t> CreateFacetsAlgo<'t> { +impl<'t> FacetsUpdateBulkAlgorithm<'t> { fn read_level_0( &self, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, diff --git a/milli/src/search/facet/incremental_update.rs b/milli/src/update/facet/incremental.rs similarity index 98% rename from milli/src/search/facet/incremental_update.rs rename to milli/src/update/facet/incremental.rs index fd4e1eeb5..d2fb3755f 100644 --- a/milli/src/search/facet/incremental_update.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,13 +1,12 @@ use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; +use crate::search::facet::get_highest_level; use crate::Result; use heed::Error; use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use super::get_highest_level; - enum InsertionResult { InPlace, Insert, @@ -18,14 +17,14 @@ enum DeletionResult { Remove { prev: Option>, next: Option> }, } -struct IncrementalFacetUpdate<'i> { - db: &'i heed::Database, FacetGroupValueCodec>, +struct FacetUpdateIncremental { + db: heed::Database, FacetGroupValueCodec>, group_size: usize, min_level_size: usize, max_group_size: usize, } -impl<'i> IncrementalFacetUpdate<'i> { - fn find_insertion_key_value<'a>( +impl FacetUpdateIncremental { + fn find_insertion_key_value( &self, field_id: u16, level: u8, diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs new file 
mode 100644 index 000000000..ecde3a248 --- /dev/null +++ b/milli/src/update/facet/mod.rs @@ -0,0 +1,2 @@ +pub mod bulk; +pub mod incremental; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 5a9066eba..be9b1e3c5 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -37,8 +37,8 @@ use crate::error::UserError; use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, - WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, + self, FacetsUpdateBulk, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, + PrefixWordPairsProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; @@ -436,7 +436,7 @@ where (&self.index.facet_id_string_docids).remap_key_type::>(), (&self.index.facet_id_f64_docids).remap_key_type::>(), ] { - let mut builder = Facets::new(self.index, facet_db); + let mut builder = FacetsUpdateBulk::new(self.index, facet_db); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; builder.chunk_compression_level = self.indexer_config.chunk_compression_level; if let Some(value) = self.config.facet_level_group_size { diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 3ddc01cef..cd96d3e88 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -1,7 +1,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; -pub use self::facets::Facets; +pub use self::facet::bulk::FacetsUpdateBulk; pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; @@ -16,7 +16,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; mod delete_documents; -mod facets; +mod facet; mod index_documents; mod indexer_config; mod prefix_word_pairs; From d30c89e3451dd22b9b507e877cbea8b7473ff145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 14:19:52 +0200 Subject: [PATCH 13/58] Fix compile error+warnings in new tests --- .../search/facet/facet_distribution_iter.rs | 22 ++++++++-------- milli/src/search/facet/facet_range_search.rs | 25 ++++++++++--------- .../src/search/facet/facet_sort_ascending.rs | 15 ++++++----- .../src/search/facet/facet_sort_descending.rs | 18 ++++++------- milli/src/search/facet/mod.rs | 16 ++++++++++-- milli/src/snapshot_tests.rs | 11 +++----- milli/src/update/facet/incremental.rs | 13 +++++++--- milli/src/update/mod.rs | 1 + 8 files changed, 67 insertions(+), 54 deletions(-) diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index f347b9d7e..16b83c2db 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -109,7 +109,7 @@ where #[cfg(test)] mod tests { use heed::BytesDecode; - use rand::{rngs::SmallRng, Rng, SeedableRng}; + use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; use std::ops::ControlFlow; @@ -125,7 +125,7 @@ mod tests { for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); - index.insert(&mut txn, 0, &i, &bitmap); + index.insert(&mut txn, 0, 
&(i as f64), &bitmap); } txn.commit().unwrap(); index @@ -134,14 +134,14 @@ mod tests { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); let keys = std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); - bitmap.insert(key + 100.); + bitmap.insert(key + 100); index.insert(&mut txn, 0, &(key as f64), &bitmap); } txn.commit().unwrap(); @@ -156,13 +156,13 @@ mod tests { #[test] fn filter_distribution_all() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); iterate_over_facet_distribution( &txn, - &index.db.content, + index.db.content, 0, &candidates, |facet, count| { @@ -170,7 +170,8 @@ mod tests { results.push_str(&format!("{facet}: {count}\n")); ControlFlow::Continue(()) }, - ); + ) + .unwrap(); insta::assert_snapshot!(format!("filter_distribution_{i}_all"), results); txn.commit().unwrap(); @@ -179,14 +180,14 @@ mod tests { #[test] fn filter_distribution_all_stop_early() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); let mut nbr_facets = 0; iterate_over_facet_distribution( &txn, - &index.db.content, + index.db.content, 0, &candidates, |facet, count| { @@ -200,7 +201,8 @@ mod tests { ControlFlow::Continue(()) } }, - ); + ) + .unwrap(); insta::assert_snapshot!(format!("filter_distribution_{i}_all_stop_early"), results); txn.commit().unwrap(); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index b05a3c275..7e7c5e713 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -259,8 +259,9 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { #[cfg(test)] mod tests { use crate::{ - heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, - search::facet::test::FacetIndex, snapshot_tests::display_bitmap, + heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec}, + search::facet::test::FacetIndex, + snapshot_tests::display_bitmap, }; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; @@ -283,7 +284,7 @@ mod tests { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); let keys = std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); @@ -305,7 +306,7 @@ mod tests { #[test] fn filter_range_increasing() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let mut results = String::new(); for i in 0..=255 { @@ -314,7 +315,7 @@ mod tests { let end = Bound::Included(i); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, 
&end, @@ -333,7 +334,7 @@ mod tests { let end = Bound::Excluded(i); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -351,7 +352,7 @@ mod tests { #[test] fn filter_range_decreasing() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let mut results = String::new(); @@ -362,7 +363,7 @@ mod tests { let end = Bound::Included(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -384,7 +385,7 @@ mod tests { let end = Bound::Excluded(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -404,7 +405,7 @@ mod tests { #[test] fn filter_range_pinch() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let mut results = String::new(); @@ -415,7 +416,7 @@ mod tests { let end = Bound::Included(255. - i); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -434,7 +435,7 @@ mod tests { let end = Bound::Excluded(255. - i); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index e4b77c691..8af191089 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -83,7 +83,6 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { - use heed::BytesDecode; use rand::Rng; use rand::SeedableRng; use roaring::RoaringBitmap; @@ -100,7 +99,7 @@ mod tests { for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); - index.insert(&mut txn, 0, &i, &bitmap); + index.insert(&mut txn, 0, &(i as f64), &bitmap); } txn.commit().unwrap(); index @@ -109,7 +108,7 @@ mod tests { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); let keys = std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); @@ -131,14 +130,14 @@ mod tests { #[test] fn filter_sort() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); - let iter = ascending_facet_sort(&txn, &index.db.content, 0, candidates); - for (facet, docids) in iter { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); + let iter = ascending_facet_sort(&txn, index.db.content, 0, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); } insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results); diff --git a/milli/src/search/facet/facet_sort_descending.rs 
b/milli/src/search/facet/facet_sort_descending.rs index fc62b894f..5ce55ec6d 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -111,8 +111,6 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { - - use heed::BytesDecode; use rand::Rng; use rand::SeedableRng; use roaring::RoaringBitmap; @@ -129,7 +127,7 @@ mod tests { for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); - index.insert(&mut txn, 0, &i, &bitmap); + index.insert(&mut txn, 0, &(i as f64), &bitmap); } txn.commit().unwrap(); index @@ -138,14 +136,14 @@ mod tests { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); let keys = std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); - bitmap.insert(key + 100.); + bitmap.insert(key + 100); index.insert(&mut txn, 0, &(key as f64), &bitmap); } txn.commit().unwrap(); @@ -160,15 +158,15 @@ mod tests { #[test] fn filter_sort_descending() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); let db = index.db.content.remap_key_type::>(); - let iter = descending_facet_sort(&txn, &db, 0, candidates); - for (facet, docids) in iter { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); + let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); } insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 12074cc12..2ca6c0689 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -78,9 +78,10 @@ pub(crate) fn get_highest_level<'t>( #[cfg(test)] mod test { + use crate::update::FacetsUpdateIncremental; + use heed::{BytesDecode, BytesEncode, Env, RwTxn}; + use roaring::RoaringBitmap; use std::{fmt::Display, marker::PhantomData, rc::Rc}; - - use heed::{BytesDecode, BytesEncode, Env}; use tempfile::TempDir; use crate::{ @@ -148,6 +149,17 @@ mod test { _phantom: PhantomData, } } + pub fn insert<'a>( + &self, + rwtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncremental::new(self.db.content); + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.insert(rwtxn, field_id, &key_bytes, docids).unwrap(); + } } impl Display for FacetIndex diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 17ee3f392..c6b83eeb6 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -1,15 +1,10 @@ +use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; +use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; +use roaring::RoaringBitmap; use std::borrow::Cow; use std::fmt::Write; use std::path::Path; -use heed::types::ByteSlice; -use heed::BytesDecode; -use roaring::RoaringBitmap; - -use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; -use 
crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; -use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index}; - #[track_caller] pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Settings { let mut settings = insta::Settings::clone_current(); diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index d2fb3755f..df0b93839 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -17,13 +17,18 @@ enum DeletionResult { Remove { prev: Option>, next: Option> }, } -struct FacetUpdateIncremental { +pub struct FacetsUpdateIncremental { db: heed::Database, FacetGroupValueCodec>, group_size: usize, min_level_size: usize, max_group_size: usize, } -impl FacetUpdateIncremental { +impl FacetsUpdateIncremental { + pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { + Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } + } +} +impl FacetsUpdateIncremental { fn find_insertion_key_value( &self, field_id: u16, @@ -263,7 +268,7 @@ impl FacetUpdateIncremental { } let group_size = self.group_size; - let highest_level = get_highest_level(&txn, *self.db, field_id)?; + let highest_level = get_highest_level(&txn, self.db, field_id)?; let result = self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?; @@ -412,7 +417,7 @@ impl FacetUpdateIncremental { if self.db.get(txn, &FacetKey { field_id, level: 0, left_bound: key })?.is_none() { return Ok(()); } - let highest_level = get_highest_level(&txn, *self.db, field_id)?; + let highest_level = get_highest_level(&txn, self.db, field_id)?; // let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index cd96d3e88..8fba16d3d 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -2,6 +2,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facet::bulk::FacetsUpdateBulk; +pub use self::facet::incremental::FacetsUpdateIncremental; pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; From 85824ee203a3f6c99a0335c9a11c275cb6dc37f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 08:17:27 +0200 Subject: [PATCH 14/58] Try to make facet indexing incremental --- milli/src/search/facet/facet_range_search.rs | 2 +- milli/src/update/delete_documents.rs | 7 +++- milli/src/update/facet/bulk.rs | 18 ++++++-- .../extract/extract_facet_string_docids.rs | 4 ++ milli/src/update/index_documents/mod.rs | 18 -------- .../src/update/index_documents/typed_chunk.rs | 41 ++++++++++++++----- 6 files changed, 55 insertions(+), 35 deletions(-) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 7e7c5e713..523b3853c 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -138,7 +138,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let should_skip = { match self.left { Bound::Included(left) => left >= next_key.left_bound, - Bound::Excluded(left) => left >= next_key.left_bound, // TODO: use > instead? 
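// (Editorial note, not part of the patch.) Dropping the `TODO` above looks right: the
// group being tested here ends, exclusively, at `next_key.left_bound`, so under an
// excluded lower bound `left` it can only contain a value strictly greater than `left`
// when `next_key.left_bound > left`. Skipping exactly when `left >= next_key.left_bound`
// is therefore the tightest safe test; with `>` the equality case would descend into a
// group that cannot match. For example, with groups covering [0, 10) and [10, 20) and
// the bound Excluded(10.0), the first group is skipped (10.0 >= 10.0) because none of
// its values exceed 10.0.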
+ Bound::Excluded(left) => left >= next_key.left_bound, Bound::Unbounded => false, } }; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 1d1745d82..bb18ed80f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,7 +2,7 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::Database; +use heed::{Database, RwTxn}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -446,6 +446,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { facet_id_f64_docids.remap_key_type::>(), &self.to_delete_docids, fields_ids_map.clone(), + Index::put_number_faceted_documents_ids, )?; remove_docids_from_facet_id_docids( self.wtxn, @@ -453,6 +454,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { facet_id_string_docids.remap_key_type::>(), &self.to_delete_docids, fields_ids_map.clone(), + Index::put_string_faceted_documents_ids, )?; // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_id_exists_docids( @@ -614,6 +616,7 @@ fn remove_docids_from_facet_id_docids<'a>( db: heed::Database, FacetGroupValueCodec>, to_remove: &RoaringBitmap, fields_ids_map: FieldsIdsMap, + put_faceted_docids_in_main: fn(&Index, &mut RwTxn, FieldId, &RoaringBitmap) -> heed::Result<()>, ) -> Result<()> { let mut modified = false; for field_id in fields_ids_map.ids() { @@ -643,7 +646,7 @@ fn remove_docids_from_facet_id_docids<'a>( if !modified { return Ok(()); } - let builder = FacetsUpdateBulk::new(index, db); + let builder = FacetsUpdateBulk::new(index, db, put_faceted_docids_in_main); builder.execute(wtxn)?; Ok(()) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 587dc95ab..b3e932dc2 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -6,7 +6,7 @@ use crate::update::index_documents::{create_writer, write_into_lmdb_database, wr use crate::{FieldId, Index, Result}; use grenad::CompressionType; use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn}; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; use log::debug; use roaring::RoaringBitmap; use std::cmp; @@ -21,12 +21,19 @@ pub struct FacetsUpdateBulk<'i> { pub(crate) chunk_compression_level: Option, level_group_size: usize, min_level_size: usize, + put_faceted_docids_in_main: fn(&Index, &mut RwTxn, FieldId, &RoaringBitmap) -> heed::Result<()>, } impl<'i> FacetsUpdateBulk<'i> { pub fn new( index: &'i Index, database: heed::Database, FacetGroupValueCodec>, + put_faceted_docids_in_main: fn( + &Index, + &mut RwTxn, + FieldId, + &RoaringBitmap, + ) -> heed::Result<()>, ) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, @@ -35,6 +42,7 @@ impl<'i> FacetsUpdateBulk<'i> { chunk_compression_level: None, level_group_size: 4, min_level_size: 5, + put_faceted_docids_in_main, } } @@ -78,8 +86,12 @@ impl<'i> FacetsUpdateBulk<'i> { let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &nested_wtxn)?; - // TODO: this will need to be an argument to Facets as well - self.index.put_string_faceted_documents_ids(&mut nested_wtxn, field_id, &all_docids)?; + (self.put_faceted_docids_in_main)( + &self.index, + &mut nested_wtxn, + field_id, + &all_docids, + )?; for level_reader in level_readers { // TODO: append instead of write with merge diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs 
b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 51d2df923..0bb83c29a 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -32,6 +32,10 @@ pub fn extract_facet_string_docids( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); + // document_id_bytes is a big-endian u32 + // merge_cbo_roaring_bitmap works with native endian u32s + // that is a problem, I think + let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index be9b1e3c5..1ab1bd38d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -34,7 +34,6 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; -use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, FacetsUpdateBulk, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, @@ -431,23 +430,6 @@ where // Merged databases are already been indexed, we start from this count; let mut databases_seen = MERGED_DATABASE_COUNT; - // Run the facets update operation. - for facet_db in [ - (&self.index.facet_id_string_docids).remap_key_type::>(), - (&self.index.facet_id_f64_docids).remap_key_type::>(), - ] { - let mut builder = FacetsUpdateBulk::new(self.index, facet_db); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - if let Some(value) = self.config.facet_level_group_size { - builder.level_group_size(value); - } - if let Some(value) = self.config.facet_min_level_size { - builder.min_level_size(value); - } - builder.execute(self.wtxn)?; - } - databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 3c7a78d95..7aa306183 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -13,7 +13,9 @@ use super::helpers::{ valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; +use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; use crate::update::index_documents::helpers::as_cloneable_grenad; +use crate::update::FacetsUpdateIncremental; use crate::{ lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, @@ -146,6 +148,34 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { + // merge cbo roaring bitmaps is not the correct merger because the data in the DB + // is FacetGroupValue and not RoaringBitmap + // so I need to create my own merging function + + // facet_id_string_docids is encoded as: + // key: FacetKeyCodec + // value: CboRoaringBitmapCodec + // basically + + // TODO: a condition saying "if I have more than 1/50th of the DB to add, + // then I do it in bulk, otherwise I do it incrementally". 
But instead of 1/50, + // it is a ratio I determine empirically + + // for now I only do it incrementally, to see if things work + let builder = FacetsUpdateIncremental::new( + index.facet_id_string_docids.remap_key_type::>(), + ); + let mut cursor = facet_id_string_docids.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let key = + FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; + let value = + CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + builder.insert(wtxn, key.field_id, key.left_bound, &value)?; + } + is_merged_database = true; + } TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { append_entries_into_database( facet_id_exists_docids, @@ -188,17 +218,6 @@ pub(crate) fn write_typed_chunk_into_index( } } } - TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { - // facet_id_string_docids contains the thing that the extractor put into it, - // so: (FacetKey { field id, level: 0, left_bound } , docids: RoaringBitmap ) - // now we need to either: - // 1. incrementally add the keys/docids pairs into the DB - // 2. add the keys/docids into level 0 and then call Facets::execute - // the choice of solution should be determined by their performance - // characteristics - - is_merged_database = true; - } TypedChunk::GeoPoints(geo_points) => { let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?; From 68cbcdf08b860ce42458ffc0868a00086696b10b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 08:34:26 +0200 Subject: [PATCH 15/58] Fix compile errors/warnings in http-ui and infos --- milli/src/search/facet/mod.rs | 4 +-- milli/src/update/delete_documents.rs | 46 +++++++++++++++------------- milli/src/update/facet/bulk.rs | 38 +++++++++++------------ 3 files changed, 44 insertions(+), 44 deletions(-) diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 2ca6c0689..b03302ca1 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -82,7 +82,6 @@ mod test { use heed::{BytesDecode, BytesEncode, Env, RwTxn}; use roaring::RoaringBitmap; use std::{fmt::Display, marker::PhantomData, rc::Rc}; - use tempfile::TempDir; use crate::{ heed_codec::facet::new::{ @@ -113,8 +112,9 @@ mod test { for<'a> BoundCodec: BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, { + #[cfg(all(test, fuzzing))] pub fn open_from_tempdir( - tempdir: Rc, + tempdir: Rc, group_size: u8, max_group_size: u8, ) -> FacetIndex { diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index bb18ed80f..531fd2b74 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,7 +2,7 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::{Database, RwTxn}; +use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -10,6 +10,7 @@ use time::OffsetDateTime; use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; +use crate::facet::FacetType; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; @@ -185,9 +186,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, - 
facet_id_f64_docids, + facet_id_f64_docids: _, facet_id_exists_docids, - facet_id_string_docids, + facet_id_string_docids: _, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -440,22 +441,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; } - remove_docids_from_facet_id_docids( - self.wtxn, - self.index, - facet_id_f64_docids.remap_key_type::>(), - &self.to_delete_docids, - fields_ids_map.clone(), - Index::put_number_faceted_documents_ids, - )?; - remove_docids_from_facet_id_docids( - self.wtxn, - self.index, - facet_id_string_docids.remap_key_type::>(), - &self.to_delete_docids, - fields_ids_map.clone(), - Index::put_string_faceted_documents_ids, - )?; + for facet_type in [FacetType::Number, FacetType::String] { + remove_docids_from_facet_id_docids( + self.wtxn, + self.index, + &self.to_delete_docids, + fields_ids_map.clone(), + facet_type, + )?; + } + // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_id_exists_docids( self.wtxn, @@ -613,11 +608,18 @@ where fn remove_docids_from_facet_id_docids<'a>( wtxn: &'a mut heed::RwTxn, index: &Index, - db: heed::Database, FacetGroupValueCodec>, to_remove: &RoaringBitmap, fields_ids_map: FieldsIdsMap, - put_faceted_docids_in_main: fn(&Index, &mut RwTxn, FieldId, &RoaringBitmap) -> heed::Result<()>, + facet_type: FacetType, ) -> Result<()> { + let db = match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; let mut modified = false; for field_id in fields_ids_map.ids() { let mut level0_prefix = vec![]; @@ -646,7 +648,7 @@ fn remove_docids_from_facet_id_docids<'a>( if !modified { return Ok(()); } - let builder = FacetsUpdateBulk::new(index, db, put_faceted_docids_in_main); + let builder = FacetsUpdateBulk::new(index, facet_type); builder.execute(wtxn)?; Ok(()) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index b3e932dc2..b8acffbaf 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,4 +1,5 @@ use crate::error::InternalError; +use crate::facet::FacetType; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; @@ -6,7 +7,7 @@ use crate::update::index_documents::{create_writer, write_into_lmdb_database, wr use crate::{FieldId, Index, Result}; use grenad::CompressionType; use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use heed::{BytesEncode, Error, RoTxn}; use log::debug; use roaring::RoaringBitmap; use std::cmp; @@ -21,28 +22,26 @@ pub struct FacetsUpdateBulk<'i> { pub(crate) chunk_compression_level: Option, level_group_size: usize, min_level_size: usize, - put_faceted_docids_in_main: fn(&Index, &mut RwTxn, FieldId, &RoaringBitmap) -> heed::Result<()>, + facet_type: FacetType, } impl<'i> FacetsUpdateBulk<'i> { - pub fn new( - index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, - put_faceted_docids_in_main: fn( - &Index, - &mut RwTxn, - FieldId, - &RoaringBitmap, - ) -> heed::Result<()>, - ) -> FacetsUpdateBulk<'i> { + pub fn new(index: &'i Index, facet_type: FacetType) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, - database, + database: match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } 
+ }, chunk_compression_type: CompressionType::None, chunk_compression_level: None, level_group_size: 4, min_level_size: 5, - put_faceted_docids_in_main, + facet_type, } } @@ -86,12 +85,11 @@ impl<'i> FacetsUpdateBulk<'i> { let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &nested_wtxn)?; - (self.put_faceted_docids_in_main)( - &self.index, - &mut nested_wtxn, - field_id, - &all_docids, - )?; + let put_docids_fn = match self.facet_type { + FacetType::Number => Index::put_number_faceted_documents_ids, + FacetType::String => Index::put_string_faceted_documents_ids, + }; + put_docids_fn(&self.index, &mut nested_wtxn, field_id, &all_docids)?; for level_reader in level_readers { // TODO: append instead of write with merge From 61252248fb991557cce1e0de25e4dfe13ff00388 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 09:51:43 +0200 Subject: [PATCH 16/58] Fix some facet indexing bugs --- milli/src/search/facet/mod.rs | 2 +- milli/src/snapshot_tests.rs | 12 ++++ milli/src/update/facet/incremental.rs | 5 +- .../extract/extract_facet_string_docids.rs | 3 +- milli/src/update/index_documents/mod.rs | 49 ++++++++++++++- .../src/update/index_documents/typed_chunk.rs | 62 +++++++++++++++---- 6 files changed, 115 insertions(+), 18 deletions(-) diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index b03302ca1..0ed80dd92 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -139,7 +139,7 @@ mod test { let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 100); - let tempdir = tempfile::TempDir::new_in("databases/").unwrap(); + let tempdir = tempfile::TempDir::new().unwrap(); let env = options.open(tempdir.path()).unwrap(); let content = env.create_database(None).unwrap(); diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index c6b83eeb6..933f68837 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -242,6 +242,15 @@ pub fn snap_facet_id_string_docids(index: &Index) -> String { }); snap } +pub fn snap_field_id_docid_facet_strings(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, field_id_docid_facet_strings, |( + (field_id, doc_id, string), + other_string, + )| { + &format!("{field_id:<3} {doc_id:<4} {string:<12} {other_string}") + }); + snap +} pub fn snap_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let documents_ids = index.documents_ids(&rtxn).unwrap(); @@ -423,6 +432,9 @@ macro_rules! 
full_snap_of_db { ($index:ident, facet_id_string_docids) => {{ $crate::snapshot_tests::snap_facet_id_string_docids(&$index) }}; + ($index:ident, field_id_docid_facet_strings) => {{ + $crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index) + }}; ($index:ident, documents_ids) => {{ $crate::snapshot_tests::snap_documents_ids(&$index) }}; diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index df0b93839..a0d426d7a 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,8 +1,9 @@ +use crate::facet::FacetType; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; use crate::search::facet::get_highest_level; -use crate::Result; +use crate::{Index, Result}; use heed::Error; use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; use roaring::RoaringBitmap; @@ -287,7 +288,7 @@ impl FacetsUpdateIncremental { .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? .count(); - if size_highest_level < self.min_level_size { + if size_highest_level < self.group_size * self.min_level_size { return Ok(()); } diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 0bb83c29a..fe42801e7 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -38,12 +38,13 @@ pub fn extract_facet_string_docids( let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); let normalised_value = std::str::from_utf8(normalized_value_bytes)?; let key = FacetKey { field_id, level: 0, left_bound: normalised_value }; let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); - facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?; + facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?; } sorter_into_reader(facet_string_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 1ab1bd38d..2a2511362 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -592,7 +592,7 @@ mod tests { use crate::index::tests::TempIndex; use crate::search::TermsMatchingStrategy; use crate::update::DeleteDocuments; - use crate::BEU16; + use crate::{db_snap, BEU16}; #[test] fn simple_document_replacement() { @@ -1379,6 +1379,25 @@ mod tests { }) .unwrap(); + db_snap!(index, facet_id_string_docids, @r###" + 3 0 first 1 [1, ] + 3 0 second 1 [2, ] + 3 0 third 1 [3, ] + 3 0 zeroth 1 [0, ] + "###); + db_snap!(index, field_id_docid_facet_strings, @r###" + 3 0 zeroth zeroth + 3 1 first first + 3 2 second second + 3 3 third third + "###); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let hidden = index.faceted_fields(&rtxn).unwrap(); @@ -1399,6 +1418,15 @@ mod tests { }) .unwrap(); + db_snap!(index, facet_id_string_docids, @""); + db_snap!(index, field_id_docid_facet_strings, @""); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let facets = index.faceted_fields(&rtxn).unwrap(); @@ -1412,6 +1440,25 @@ mod tests { }) .unwrap(); + db_snap!(index, 
facet_id_string_docids, @r###" + 3 0 first 1 [1, ] + 3 0 second 1 [2, ] + 3 0 third 1 [3, ] + 3 0 zeroth 1 [0, ] + "###); + db_snap!(index, field_id_docid_facet_strings, @r###" + 3 0 zeroth zeroth + 3 1 first first + 3 2 second second + 3 3 third third + "###); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let facets = index.faceted_fields(&rtxn).unwrap(); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 7aa306183..df98724da 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::convert::TryInto; use std::fs::File; use std::io; @@ -17,8 +18,8 @@ use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::update::FacetsUpdateIncremental; use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, - Result, + lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, + Index, Result, }; pub(crate) enum TypedChunk { @@ -138,14 +139,41 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { - append_entries_into_database( - facet_id_f64_docids_iter, - &index.facet_id_f64_docids, - wtxn, - index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, - )?; + // merge cbo roaring bitmaps is not the correct merger because the data in the DB + // is FacetGroupValue and not RoaringBitmap + // so I need to create my own merging function + + // facet_id_string_docids is encoded as: + // key: FacetKeyCodec + // value: CboRoaringBitmapCodec + // basically + + // TODO: a condition saying "if I have more than 1/50th of the DB to add, + // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50, + // it is a ratio I determine empirically + + // for now I only do it incrementally, to see if things work + let indexer = FacetsUpdateIncremental::new( + index.facet_id_f64_docids.remap_key_type::>(), + ); + + let mut new_faceted_docids = HashMap::::default(); + + let mut cursor = facet_id_f64_docids_iter.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let key = + FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; + let docids = + CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + for (field_id, new_docids) in new_faceted_docids { + let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?; + docids |= new_docids; + index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?; + } + is_merged_database = true; } TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { @@ -163,16 +191,24 @@ pub(crate) fn write_typed_chunk_into_index( // it is a ratio I determine empirically // for now I only do it incrementally, to see if things work - let builder = FacetsUpdateIncremental::new( + let indexer = FacetsUpdateIncremental::new( index.facet_id_string_docids.remap_key_type::>(), ); + let mut new_faceted_docids = HashMap::::default(); + let mut cursor = facet_id_string_docids.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { let key = FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; - let value = + let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - builder.insert(wtxn, key.field_id, key.left_bound, &value)?; + indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + for (field_id, new_docids) in new_faceted_docids { + let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?; + docids |= new_docids; + index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?; } is_merged_database = true; } From 07ff92c663014d61ddc67b0726c6e7051a0a5efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 11:09:01 +0200 Subject: [PATCH 17/58] Add more snapshots from facet tests --- .../search/facet/facet_distribution_iter.rs | 14 +- milli/src/search/facet/facet_range_search.rs | 27 +-- .../src/search/facet/facet_sort_ascending.rs | 13 +- .../src/search/facet/facet_sort_descending.rs | 13 +- .../filter_distribution_all/0.snap | 228 ++++++++++++++++++ .../filter_distribution_all/1.snap | 100 ++++++++ .../filter_distribution_all_stop_early/0.snap | 104 ++++++++ .../filter_distribution_all_stop_early/1.snap | 100 ++++++++ .../random_looking_index_snap.hash.snap | 4 + .../filter_range_decreasing/0.hash.snap | 4 + .../filter_range_decreasing/1.hash.snap | 4 + .../filter_range_increasing/0.hash.snap | 4 + .../filter_range_increasing/1.hash.snap | 4 + .../filter_range_pinch/0.hash.snap | 4 + .../filter_range_pinch/1.hash.snap | 4 + .../random_looking_index_snap.hash.snap | 4 + .../filter_sort/0.snap | 28 +++ .../filter_sort/1.snap | 53 ++++ .../random_looking_index_snap.hash.snap | 4 + .../filter_sort_descending/0.snap | 16 ++ .../filter_sort_descending/1.snap | 49 ++++ .../random_looking_index_snap.hash.snap | 4 + milli/src/snapshot_tests.rs | 69 +++++- milli/src/update/facet/incremental.rs | 3 +- .../default/facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../default/facet_id_string_docids.hash.snap | 4 + .../facet_id_string_docids.hash.snap | 4 + .../default/facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - 
.../facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - .../default/facet_id_string_docids.hash.snap | 4 - .../facet_id_string_docids.hash.snap | 4 - 40 files changed, 840 insertions(+), 81 deletions(-) create mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap create mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap create mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap create mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap create mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap delete mode 100644 
milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 16b83c2db..9e251103c 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -108,15 +108,15 @@ where #[cfg(test)] mod tests { + use crate::milli_snap; + use crate::{ + heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::test::FacetIndex, + }; use heed::BytesDecode; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; use std::ops::ControlFlow; - use crate::{ - heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::test::FacetIndex, - }; - use super::iterate_over_facet_distribution; fn get_simple_index() -> FacetIndex { @@ -151,7 +151,7 @@ mod tests { #[test] fn random_looking_index_snap() { let index = get_random_looking_index(); - insta::assert_display_snapshot!(index) + milli_snap!(format!("{index}")); } #[test] fn filter_distribution_all() { @@ -172,7 +172,7 @@ mod tests { }, ) .unwrap(); - insta::assert_snapshot!(format!("filter_distribution_{i}_all"), results); + milli_snap!(results, i); txn.commit().unwrap(); } @@ -203,7 +203,7 @@ mod tests { }, ) .unwrap(); - insta::assert_snapshot!(format!("filter_distribution_{i}_all_stop_early"), results); + milli_snap!(results, i); txn.commit().unwrap(); } diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 523b3853c..38c6acdec 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -258,6 +258,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { #[cfg(test)] mod tests { + use crate::milli_snap; use crate::{ heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec}, search::facet::test::FacetIndex, @@ -301,7 +302,7 @@ mod tests { #[test] fn random_looking_index_snap() { let index = get_random_looking_index(); - insta::assert_display_snapshot!(index) + milli_snap!(format!("{index}")); } #[test] fn filter_range_increasing() { @@ -323,10 +324,7 @@ mod tests { .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!( - format!("filter_range_{i}_increasing_included_bounds"), - results - ); + milli_snap!(results, i); let mut results = String::new(); for i in 0..=255 { let i = i as f64; @@ -342,10 +340,7 @@ mod tests { .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!( - format!("filter_range_{i}_increasing_excluded_bounds"), - results - ); + milli_snap!(results, i); txn.commit().unwrap(); } } @@ -372,10 +367,7 @@ mod tests 
{ results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!( - format!("filter_range_{i}_decreasing_included_bounds"), - results - ); + milli_snap!(results, i); let mut results = String::new(); @@ -394,10 +386,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!( - format!("filter_range_{i}_decreasing_excluded_bounds"), - results - ); + milli_snap!(results, i); txn.commit().unwrap(); } @@ -425,7 +414,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!(format!("filter_range_{i}_pinch_included_bounds"), results); + milli_snap!(results, i); let mut results = String::new(); @@ -444,7 +433,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!(format!("filter_range_{i}_pinch_excluded_bounds"), results); + milli_snap!(results, i); txn.commit().unwrap(); } diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 8af191089..e8618c302 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -83,15 +83,15 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { - use rand::Rng; - use rand::SeedableRng; - use roaring::RoaringBitmap; - + use crate::milli_snap; use crate::{ heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::{facet_sort_ascending::ascending_facet_sort, test::FacetIndex}, snapshot_tests::display_bitmap, }; + use rand::Rng; + use rand::SeedableRng; + use roaring::RoaringBitmap; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); @@ -125,7 +125,7 @@ mod tests { #[test] fn random_looking_index_snap() { let index = get_random_looking_index(); - insta::assert_display_snapshot!(index) + milli_snap!(format!("{index}")); } #[test] fn filter_sort() { @@ -138,8 +138,9 @@ mod tests { for el in iter { let docids = el.unwrap(); results.push_str(&display_bitmap(&docids)); + results.push('\n'); } - insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results); + milli_snap!(results, i); txn.commit().unwrap(); } diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 5ce55ec6d..b8bae2f9d 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -111,15 +111,15 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { - use rand::Rng; - use rand::SeedableRng; - use roaring::RoaringBitmap; - + use crate::milli_snap; use crate::{ heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec, MyByteSlice}, search::facet::{facet_sort_descending::descending_facet_sort, test::FacetIndex}, snapshot_tests::display_bitmap, }; + use rand::Rng; + use rand::SeedableRng; + use roaring::RoaringBitmap; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); @@ -153,7 +153,7 @@ mod tests { #[test] fn random_looking_index_snap() { let index = get_random_looking_index(); - insta::assert_display_snapshot!(index) + milli_snap!(format!("{index}")); } #[test] fn filter_sort_descending() { @@ -167,8 +167,9 @@ mod tests { for el in iter { let docids = el.unwrap(); results.push_str(&display_bitmap(&docids)); + results.push('\n'); } - insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results); + milli_snap!(results, i); txn.commit().unwrap(); } diff --git 
a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap new file mode 100644 index 000000000..fe5f69d7d --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap @@ -0,0 +1,228 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +0: 1 +1: 1 +2: 1 +3: 1 +4: 1 +5: 1 +6: 1 +7: 1 +8: 1 +9: 1 +10: 1 +11: 1 +12: 1 +13: 1 +14: 1 +15: 1 +16: 1 +17: 1 +18: 1 +19: 1 +20: 1 +21: 1 +22: 1 +23: 1 +24: 1 +25: 1 +26: 1 +27: 1 +28: 1 +29: 1 +30: 1 +31: 1 +32: 1 +33: 1 +34: 1 +35: 1 +36: 1 +37: 1 +38: 1 +39: 1 +40: 1 +41: 1 +42: 1 +43: 1 +44: 1 +45: 1 +46: 1 +47: 1 +48: 1 +49: 1 +50: 1 +51: 1 +52: 1 +53: 1 +54: 1 +55: 1 +56: 1 +57: 1 +58: 1 +59: 1 +60: 1 +61: 1 +62: 1 +63: 1 +64: 1 +65: 1 +66: 1 +67: 1 +68: 1 +69: 1 +70: 1 +71: 1 +72: 1 +73: 1 +74: 1 +75: 1 +76: 1 +77: 1 +78: 1 +79: 1 +80: 1 +81: 1 +82: 1 +83: 1 +84: 1 +85: 1 +86: 1 +87: 1 +88: 1 +89: 1 +90: 1 +91: 1 +92: 1 +93: 1 +94: 1 +95: 1 +96: 1 +97: 1 +98: 1 +99: 1 +100: 1 +101: 1 +102: 1 +103: 1 +104: 1 +105: 1 +106: 1 +107: 1 +108: 1 +109: 1 +110: 1 +111: 1 +112: 1 +113: 1 +114: 1 +115: 1 +116: 1 +117: 1 +118: 1 +119: 1 +120: 1 +121: 1 +122: 1 +123: 1 +124: 1 +125: 1 +126: 1 +127: 1 +128: 1 +129: 1 +130: 1 +131: 1 +132: 1 +133: 1 +134: 1 +135: 1 +136: 1 +137: 1 +138: 1 +139: 1 +140: 1 +141: 1 +142: 1 +143: 1 +144: 1 +145: 1 +146: 1 +147: 1 +148: 1 +149: 1 +150: 1 +151: 1 +152: 1 +153: 1 +154: 1 +155: 1 +156: 1 +157: 1 +158: 1 +159: 1 +160: 1 +161: 1 +162: 1 +163: 1 +164: 1 +165: 1 +166: 1 +167: 1 +168: 1 +169: 1 +170: 1 +171: 1 +172: 1 +173: 1 +174: 1 +175: 1 +176: 1 +177: 1 +178: 1 +179: 1 +180: 1 +181: 1 +182: 1 +183: 1 +184: 1 +185: 1 +186: 1 +187: 1 +188: 1 +189: 1 +190: 1 +191: 1 +192: 1 +193: 1 +194: 1 +195: 1 +196: 1 +197: 1 +198: 1 +199: 1 +200: 1 +201: 1 +202: 1 +203: 1 +204: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +211: 1 +212: 1 +213: 1 +214: 1 +215: 1 +216: 1 +217: 1 +218: 1 +219: 1 +220: 1 +221: 1 +222: 1 +223: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap new file mode 100644 index 000000000..dd5e761ea --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap @@ -0,0 +1,100 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +3: 2 +5: 2 +6: 2 +9: 2 +10: 2 +11: 2 +14: 2 +18: 2 +19: 2 +24: 2 +26: 2 +28: 2 +29: 2 +32: 2 +33: 2 +35: 2 +36: 2 +37: 2 +38: 2 +39: 2 +41: 2 +46: 2 +47: 2 +49: 2 +52: 2 +53: 2 +55: 2 +59: 2 +61: 2 +64: 2 +68: 2 +71: 2 +74: 2 +75: 2 +76: 2 +81: 2 +83: 2 +85: 2 +86: 2 +88: 2 +90: 2 +91: 2 +92: 2 +98: 2 +99: 2 +101: 2 +102: 2 +103: 2 +107: 2 +111: 2 +115: 2 +119: 2 +123: 2 +124: 2 +130: 2 +131: 2 +133: 2 +135: 2 +136: 2 +137: 2 +139: 2 +141: 2 +143: 2 +144: 2 +147: 2 +150: 2 +156: 1 +158: 1 +160: 1 +162: 1 +163: 1 +164: 1 +167: 1 +169: 1 +173: 1 +177: 1 +178: 1 +179: 1 +181: 1 +182: 1 +186: 1 +189: 1 +192: 1 +193: 1 +195: 1 +197: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +216: 1 +219: 1 +220: 1 +226: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap new file mode 100644 index 000000000..7170dab89 --- 
/dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap @@ -0,0 +1,104 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +0: 1 +1: 1 +2: 1 +3: 1 +4: 1 +5: 1 +6: 1 +7: 1 +8: 1 +9: 1 +10: 1 +11: 1 +12: 1 +13: 1 +14: 1 +15: 1 +16: 1 +17: 1 +18: 1 +19: 1 +20: 1 +21: 1 +22: 1 +23: 1 +24: 1 +25: 1 +26: 1 +27: 1 +28: 1 +29: 1 +30: 1 +31: 1 +32: 1 +33: 1 +34: 1 +35: 1 +36: 1 +37: 1 +38: 1 +39: 1 +40: 1 +41: 1 +42: 1 +43: 1 +44: 1 +45: 1 +46: 1 +47: 1 +48: 1 +49: 1 +50: 1 +51: 1 +52: 1 +53: 1 +54: 1 +55: 1 +56: 1 +57: 1 +58: 1 +59: 1 +60: 1 +61: 1 +62: 1 +63: 1 +64: 1 +65: 1 +66: 1 +67: 1 +68: 1 +69: 1 +70: 1 +71: 1 +72: 1 +73: 1 +74: 1 +75: 1 +76: 1 +77: 1 +78: 1 +79: 1 +80: 1 +81: 1 +82: 1 +83: 1 +84: 1 +85: 1 +86: 1 +87: 1 +88: 1 +89: 1 +90: 1 +91: 1 +92: 1 +93: 1 +94: 1 +95: 1 +96: 1 +97: 1 +98: 1 +99: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap new file mode 100644 index 000000000..dd5e761ea --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap @@ -0,0 +1,100 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +3: 2 +5: 2 +6: 2 +9: 2 +10: 2 +11: 2 +14: 2 +18: 2 +19: 2 +24: 2 +26: 2 +28: 2 +29: 2 +32: 2 +33: 2 +35: 2 +36: 2 +37: 2 +38: 2 +39: 2 +41: 2 +46: 2 +47: 2 +49: 2 +52: 2 +53: 2 +55: 2 +59: 2 +61: 2 +64: 2 +68: 2 +71: 2 +74: 2 +75: 2 +76: 2 +81: 2 +83: 2 +85: 2 +86: 2 +88: 2 +90: 2 +91: 2 +92: 2 +98: 2 +99: 2 +101: 2 +102: 2 +103: 2 +107: 2 +111: 2 +115: 2 +119: 2 +123: 2 +124: 2 +130: 2 +131: 2 +133: 2 +135: 2 +136: 2 +137: 2 +139: 2 +141: 2 +143: 2 +144: 2 +147: 2 +150: 2 +156: 1 +158: 1 +160: 1 +162: 1 +163: 1 +164: 1 +167: 1 +169: 1 +173: 1 +177: 1 +178: 1 +179: 1 +181: 1 +182: 1 +186: 1 +189: 1 +192: 1 +193: 1 +195: 1 +197: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +216: 1 +219: 1 +220: 1 +226: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap new file mode 100644 index 000000000..da2b49adc --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +ea4022977d09c7854c833146276348de diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap new file mode 100644 index 000000000..e835d8934 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +52d0b31f312572c10959418434e36581 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap new file mode 100644 index 000000000..150f00f7b --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap @@ -0,0 +1,4 @@ +--- +source: 
milli/src/search/facet/facet_range_search.rs +--- +2cb9e819529823d488e141edb4307f97 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap new file mode 100644 index 000000000..4f05823f4 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +38a4352c48905f5b121d1217734862da diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap new file mode 100644 index 000000000..d2c8a3559 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +aefc1ec120fa884cc8396a68bd7de42f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap new file mode 100644 index 000000000..3fb0c94b0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9e360d7bcd29ac2c23bc241df941fd23 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap new file mode 100644 index 000000000..44fa88004 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +f0606b9af67de9ede9d469514ea1741f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap new file mode 100644 index 000000000..cf4b29ba3 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ea4022977d09c7854c833146276348de diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap new file mode 100644 index 000000000..9dcd92ed7 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap @@ -0,0 +1,28 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[200, ] +[201, ] +[202, ] +[203, ] +[204, ] +[205, ] +[206, ] +[207, ] +[208, ] +[209, ] +[210, ] +[211, ] +[212, ] +[213, ] +[214, ] +[215, ] +[216, ] +[217, ] +[218, ] +[219, ] +[220, ] +[221, ] +[222, ] +[223, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap new file mode 100644 index 000000000..a81e7377b --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap @@ -0,0 +1,53 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, ] +[202, ] +[203, ] +[207, ] +[211, ] +[215, ] +[219, ] 
+[223, ] +[224, ] +[230, ] +[231, ] +[233, ] +[235, ] +[236, ] +[237, ] +[239, ] +[241, ] +[243, ] +[244, ] +[247, ] +[250, ] +[256, ] +[258, ] +[260, ] +[262, ] +[263, ] +[264, ] +[267, ] +[269, ] +[273, ] +[277, ] +[278, ] +[279, ] +[281, ] +[282, ] +[286, ] +[289, ] +[292, ] +[293, ] +[295, ] +[297, ] +[205, ] +[206, ] +[208, ] +[209, ] +[210, ] +[216, ] +[220, ] +[226, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap new file mode 100644 index 000000000..785ff325c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +ea4022977d09c7854c833146276348de diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap new file mode 100644 index 000000000..05a18f000 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[247, ] +[246, ] +[245, ] +[244, ] +[207, ] +[206, ] +[205, ] +[204, ] +[203, ] +[202, ] +[201, ] +[200, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap new file mode 100644 index 000000000..9890c1aab --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap @@ -0,0 +1,49 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[243, ] +[235, ] +[226, ] +[209, ] +[208, ] +[207, ] +[206, ] +[205, ] +[297, ] +[295, ] +[293, ] +[292, ] +[289, ] +[286, ] +[282, ] +[281, ] +[279, ] +[278, ] +[277, ] +[273, ] +[269, ] +[267, ] +[264, ] +[263, ] +[262, ] +[260, ] +[258, ] +[256, ] +[250, ] +[247, ] +[244, ] +[241, ] +[239, ] +[237, ] +[236, ] +[233, ] +[231, ] +[230, ] +[224, ] +[223, ] +[215, ] +[211, ] +[203, ] +[202, ] +[201, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap new file mode 100644 index 000000000..b68843376 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +ea4022977d09c7854c833146276348de diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 933f68837..f35bda2e7 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -6,7 +6,7 @@ use std::fmt::Write; use std::path::Path; #[track_caller] -pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Settings { +pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) { let mut settings = insta::Settings::clone_current(); settings.set_prepend_module_to_snapshot(false); let path = Path::new(std::panic::Location::caller().file()); @@ -16,12 +16,63 @@ pub fn default_db_snapshot_settings_for_test(name: Option<&str>) 
-> insta::Setti if let Some(name) = name { settings - .set_snapshot_path(Path::new("snapshots").join(filename).join(test_name).join(name)); + .set_snapshot_path(Path::new("snapshots").join(filename).join(&test_name).join(name)); } else { - settings.set_snapshot_path(Path::new("snapshots").join(filename).join(test_name)); + settings.set_snapshot_path(Path::new("snapshots").join(filename).join(&test_name)); } - settings + (settings, test_name) +} +#[macro_export] +macro_rules! milli_snap { + ($value:expr, $name:expr) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", $name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($value:expr) => { + let (settings, test_name) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", test_name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($value:expr, @$inline:literal) => { + let (settings, test_name) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", test_name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; + ($value:expr, $name:expr, @$inline:literal) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", $name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; } /** @@ -92,7 +143,7 @@ db_snap!(index, word_docids, "some_identifier", @""); #[macro_export] macro_rules! db_snap { ($index:ident, $db_name:ident, $name:expr) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some( + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some( &format!("{}", $name), )); settings.bind(|| { @@ -104,7 +155,7 @@ macro_rules! db_snap { }); }; ($index:ident, $db_name:ident) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); settings.bind(|| { let snap = $crate::full_snap_of_db!($index, $db_name); let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false); @@ -114,7 +165,7 @@ macro_rules! db_snap { }); }; ($index:ident, $db_name:ident, @$inline:literal) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); settings.bind(|| { let snap = $crate::full_snap_of_db!($index, $db_name); let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); @@ -127,8 +178,8 @@ macro_rules! 
db_snap { } }); }; - ($index:ident, $db_name:ident, $name:literal, @$inline:literal) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(&format!("{}", $name))); + ($index:ident, $db_name:ident, $name:expr, @$inline:literal) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(&format!("{}", $name))); settings.bind(|| { let snap = $crate::full_snap_of_db!($index, $db_name); let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index a0d426d7a..6dd1f7ac5 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,9 +1,8 @@ -use crate::facet::FacetType; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; use crate::search::facet::get_highest_level; -use crate::{Index, Result}; +use crate::Result; use heed::Error; use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; use roaring::RoaringBitmap; diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ 
b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..bc0668408 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +834f27a924de1acbd3cd94c0d7f10315 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..bc0668408 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +834f27a924de1acbd3cd94c0d7f10315 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap deleted file mode 100644 index 373455db6..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -587899707db2848da3f18399e14ed4d0 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index c3415c320..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -02bbf2ca1663cccea0e4c06d5ad06a45 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 78dad29f1..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -e68ea591e1af3e53e544dff9a1648e88 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 
61a5908f4..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -12a4bb0f5b95d7629c2b9a915150c0cf diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 961346de5..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -6438e94bc7fada13022e0efccdf294e0 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 2b7c1ef9c..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -5348bbc46b5384455b6a900666d2a502 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap deleted file mode 100644 index 901b86255..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -faddef9eae5f2efacfec51f20f2e8cd6 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap deleted file mode 100644 index aa6c85461..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -ddb8fc987c5dc892337682595043858e From 36296bbb20e9c545d131117d85b1d3718d985378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 11:33:50 +0200 Subject: [PATCH 18/58] Add facet incremental indexing snapshot tests + fix bug --- milli/src/search/facet/mod.rs | 13 +- .../filter_distribution_all/0.snap | 32 + .../filter_distribution_all/1.snap | 5 + .../filter_distribution_all_stop_early/1.snap | 4 + .../random_looking_index_snap.hash.snap | 2 +- .../filter_range_decreasing/0.hash.snap | 2 +- .../filter_range_decreasing/1.hash.snap | 2 +- .../filter_range_increasing/0.hash.snap | 2 +- .../filter_range_increasing/1.hash.snap | 2 +- .../filter_range_pinch/0.hash.snap | 2 +- .../filter_range_pinch/1.hash.snap | 2 +- .../random_looking_index_snap.hash.snap | 2 +- .../filter_sort/0.snap | 32 + .../filter_sort/1.snap | 1 + .../random_looking_index_snap.hash.snap | 2 +- .../filter_sort_descending/0.snap | 44 ++ .../filter_sort_descending/1.snap | 9 +- .../random_looking_index_snap.hash.snap | 2 +- milli/src/update/facet/incremental.rs | 679 +++++++++++++++++- .../default/facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 
2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../default/facet_id_string_docids.hash.snap | 2 +- .../facet_id_string_docids.hash.snap | 2 +- .../incremental.rs/append/append.hash.snap | 4 + .../incremental.rs/delete_from_end/0.snap | 4 + .../delete_from_end/100.hash.snap | 4 + .../incremental.rs/delete_from_end/15.snap | 23 + .../delete_from_end/150.hash.snap | 4 + .../incremental.rs/delete_from_end/17.snap | 26 + .../delete_from_end/200.hash.snap | 4 + .../delete_from_start/127.hash.snap | 4 + .../incremental.rs/delete_from_start/215.snap | 54 ++ .../incremental.rs/delete_from_start/255.snap | 4 + .../delete_shuffled/127.hash.snap | 4 + .../delete_shuffled/215.hash.snap | 4 + .../incremental.rs/delete_shuffled/255.snap | 4 + .../in_place_level0_delete.hash.snap | 4 + .../in_place_level0_insert.snap | 20 + .../many_field_ids_append.hash.snap | 4 + .../many_field_ids_prepend.hash.snap | 4 + .../merge_values/merge_values.hash.snap | 4 + .../incremental.rs/prepend/prepend.hash.snap | 4 + .../shuffle_merge_string/1.hash.snap | 4 + .../shuffle_merge_string/2.hash.snap | 4 + .../shuffled/shuffled.hash.snap | 4 + 49 files changed, 1028 insertions(+), 22 deletions(-) create mode 100644 milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 0ed80dd92..42c0f065a 100644 --- a/milli/src/search/facet/mod.rs +++ 
b/milli/src/search/facet/mod.rs @@ -77,7 +77,7 @@ pub(crate) fn get_highest_level<'t>( } #[cfg(test)] -mod test { +pub mod test { use crate::update::FacetsUpdateIncremental; use heed::{BytesDecode, BytesEncode, Env, RwTxn}; use roaring::RoaringBitmap; @@ -160,6 +160,17 @@ mod test { let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.insert(rwtxn, field_id, &key_bytes, docids).unwrap(); } + pub fn delete<'a>( + &self, + rwtxn: &'a mut RwTxn, + field_id: u16, + key: &'a <BoundCodec as BytesEncode<'a>>::EItem, + value: u32, + ) { + let update = FacetsUpdateIncremental::new(self.db.content); + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.delete(rwtxn, field_id, &key_bytes, value).unwrap(); + } } impl<BoundCodec> Display for FacetIndex<BoundCodec> diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap index fe5f69d7d..2b6123289 100644 --- a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap @@ -225,4 +225,36 @@ source: milli/src/search/facet/facet_distribution_iter.rs 221: 1 222: 1 223: 1 +224: 1 +225: 1 +226: 1 +227: 1 +228: 1 +229: 1 +230: 1 +231: 1 +232: 1 +233: 1 +234: 1 +235: 1 +236: 1 +237: 1 +238: 1 +239: 1 +240: 1 +241: 1 +242: 1 +243: 1 +244: 1 +245: 1 +246: 1 +247: 1 +248: 1 +249: 1 +250: 1 +251: 1 +252: 1 +253: 1 +254: 1 +255: 1 diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap index dd5e761ea..d0c0dd98d 100644 --- a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap @@ -96,5 +96,10 @@ source: milli/src/search/facet/facet_distribution_iter.rs 216: 1 219: 1 220: 1 +223: 1 226: 1 +235: 1 +236: 1 +238: 1 +243: 1 diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap index dd5e761ea..95c719bb0 100644 --- a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap @@ -96,5 +96,9 @@ source: milli/src/search/facet/facet_distribution_iter.rs 216: 1 219: 1 220: 1 +223: 1 226: 1 +235: 1 +236: 1 +238: 1 diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap index da2b49adc..661e1a35b 100644 --- a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_distribution_iter.rs --- -ea4022977d09c7854c833146276348de +3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap index e835d8934..7bf13e05c 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -52d0b31f312572c10959418434e36581 +fcedc563a82c1c61f50174a5f3f982b6 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap index 150f00f7b..100b928d7 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -2cb9e819529823d488e141edb4307f97 +6cc26e77fc6bd9145deedf14cf422b03 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap index 4f05823f4..db11ce952 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -38a4352c48905f5b121d1217734862da +c1c7a0bb91d53d33724583b6d4a99f16 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap index d2c8a3559..f5a81c121 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -aefc1ec120fa884cc8396a68bd7de42f +12213d3f1047a0c3d08e4670a7d688e7 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap index 3fb0c94b0..07664807e 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -9e360d7bcd29ac2c23bc241df941fd23 +3456db9a1bb94c33c1e9f656184ee711 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap index 44fa88004..ef530faa1 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -f0606b9af67de9ede9d469514ea1741f +2127cd818b457e0611e0c8e1a871602a diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap index cf4b29ba3..67a2f6bd9 
100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -ea4022977d09c7854c833146276348de +3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap index 9dcd92ed7..2d0f6e213 100644 --- a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap @@ -25,4 +25,36 @@ source: milli/src/search/facet/facet_sort_ascending.rs [221, ] [222, ] [223, ] +[224, ] +[225, ] +[226, ] +[227, ] +[228, ] +[229, ] +[230, ] +[231, ] +[232, ] +[233, ] +[234, ] +[235, ] +[236, ] +[237, ] +[238, ] +[239, ] +[240, ] +[241, ] +[242, ] +[243, ] +[244, ] +[245, ] +[246, ] +[247, ] +[248, ] +[249, ] +[250, ] +[251, ] +[252, ] +[253, ] +[254, ] +[255, ] diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap index a81e7377b..20d666494 100644 --- a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap @@ -50,4 +50,5 @@ source: milli/src/search/facet/facet_sort_ascending.rs [216, ] [220, ] [226, ] +[238, ] diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap index 785ff325c..64ff762db 100644 --- a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_sort_ascending.rs --- -ea4022977d09c7854c833146276348de +3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap index 05a18f000..032763c74 100644 --- a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap @@ -1,10 +1,54 @@ --- source: milli/src/search/facet/facet_sort_descending.rs --- +[255, ] +[254, ] +[253, ] +[252, ] +[251, ] +[250, ] +[249, ] +[248, ] [247, ] [246, ] [245, ] [244, ] +[243, ] +[242, ] +[241, ] +[240, ] +[239, ] +[238, ] +[237, ] +[236, ] +[235, ] +[234, ] +[233, ] +[232, ] +[231, ] +[230, ] +[229, ] +[228, ] +[227, ] +[226, ] +[225, ] +[224, ] +[223, ] +[222, ] +[221, ] +[220, ] +[219, ] +[218, ] +[217, ] +[216, ] +[215, ] +[214, ] +[213, ] +[212, ] +[211, ] +[210, ] +[209, ] +[208, ] [207, ] [206, ] [205, ] diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap index 9890c1aab..4c62cfee4 100644 --- 
a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap @@ -2,8 +2,15 @@ source: milli/src/search/facet/facet_sort_descending.rs --- [243, ] +[238, ] +[236, ] [235, ] [226, ] +[223, ] +[220, ] +[219, ] +[216, ] +[210, ] [209, ] [208, ] [207, ] @@ -35,12 +42,10 @@ source: milli/src/search/facet/facet_sort_descending.rs [241, ] [239, ] [237, ] -[236, ] [233, ] [231, ] [230, ] [224, ] -[223, ] [215, ] [211, ] [203, ] diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap index b68843376..0649e3c5d 100644 --- a/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_sort_descending.rs --- -ea4022977d09c7854c833146276348de +3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 6dd1f7ac5..712d7271c 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -297,7 +297,7 @@ impl FacetsUpdateIncremental { .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &highest_level_prefix)?; let mut to_add = vec![]; - for _ in 0..group_size { + for _ in 0..self.min_level_size { let mut first_key = None; let mut values = RoaringBitmap::new(); for _ in 0..group_size { @@ -459,3 +459,680 @@ impl FacetsUpdateIncremental { Ok(()) } } + +#[cfg(test)] +mod tests { + use crate::milli_snap; + use crate::{ + heed_codec::facet::new::{ + ordered_f64_codec::OrderedF64Codec, str_ref::StrRefCodec, FacetGroupValueCodec, + FacetKeyCodec, MyByteSlice, + }, + search::facet::{get_highest_level, test::FacetIndex}, + }; + use heed::{types::ByteSlice, BytesDecode, BytesEncode}; + use rand::Rng; + use rand::{seq::SliceRandom, SeedableRng}; + use roaring::RoaringBitmap; + + pub fn verify_structure_validity<C>(index: &FacetIndex<C>, field_id: u16) + where + for<'a> C: BytesDecode<'a> + BytesEncode<'a, EItem = <C as BytesDecode<'a>>::DItem>, + { + let FacetIndex { env, db, ..
} = index; + + let txn = env.write_txn().unwrap(); + let mut field_id_prefix = vec![]; + field_id_prefix.extend_from_slice(&field_id.to_be_bytes()); + + let highest_level = get_highest_level(&txn, index.db.content, field_id).unwrap(); + txn.commit().unwrap(); + + let txn = env.read_txn().unwrap(); + for level_no in (1..=highest_level).rev() { + let mut level_no_prefix = vec![]; + level_no_prefix.extend_from_slice(&field_id.to_be_bytes()); + level_no_prefix.push(level_no); + + let mut iter = db + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level_no_prefix) + .unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let key = FacetKeyCodec::<MyByteSlice>::bytes_decode(&key).unwrap(); + + let mut prefix_start_below = vec![]; + prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); + prefix_start_below.push(level_no - 1); + prefix_start_below.extend_from_slice(&key.left_bound); + + let start_below = { + let mut start_below_iter = db + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + &txn, + &prefix_start_below, + ) + .unwrap(); + let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); + FacetKeyCodec::<MyByteSlice>::bytes_decode(&key_bytes).unwrap() + }; + + assert!(value.size > 0 && (value.size as usize) < db.max_group_size); + + let mut actual_size = 0; + let mut values_below = RoaringBitmap::new(); + let mut iter_below = + db.content.range(&txn, &(start_below..)).unwrap().take(value.size as usize); + while let Some(el) = iter_below.next() { + let (_, value) = el.unwrap(); + actual_size += 1; + values_below |= value.bitmap; + } + assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}"); + + assert_eq!(value.bitmap, values_below); + } + } + } + #[test] + fn append() { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}")); + } + #[test] + fn many_field_ids_append() { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 2, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 1, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + verify_structure_validity(&index, 1); + verify_structure_validity(&index, 2); + milli_snap!(format!("{index}")); + } + #[test] + fn many_field_ids_prepend() { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 2, &(i as f64), 
&bitmap); + txn.commit().unwrap(); + } + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 1, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + verify_structure_validity(&index, 1); + verify_structure_validity(&index, 2); + milli_snap!(format!("{index}")); + } + + #[test] + fn prepend() { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + txn.commit().unwrap(); + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}")); + } + + #[test] + fn shuffled() { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = (0..256).into_iter().collect::<Vec<_>>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + txn.commit().unwrap(); + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}")); + } + + #[test] + fn merge_values() { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + + let mut keys = (0..256).into_iter().collect::<Vec<_>>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(rng.gen_range(256..512)); + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + txn.commit().unwrap(); + } + + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}")); + } + + #[test] + fn delete_from_end() { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + for i in 0..256 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(&(i as f64)), &bitmap); + txn.commit().unwrap(); + } + + for i in (200..256).into_iter().rev() { + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(i as f64), i as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 200); + + for i in (150..200).into_iter().rev() { + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(i as f64), i as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 150); + + for i in (100..150).into_iter().rev() { + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(i as f64), i as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 100); + + for i in (17..100).into_iter().rev() { + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(i as f64), i as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 17); + + let mut txn = index.env.write_txn().unwrap(); + for i in (15..17).into_iter().rev() { + 
index.delete(&mut txn, 0, &(i as f64), i as u32); + } + txn.commit().unwrap(); + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 15); + for i in (0..15).into_iter().rev() { + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(i as f64), i as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 0); + } + + #[test] + fn delete_from_start() { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + + for i in 0..256 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + + for i in 0..128 { + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(i as f64), i as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 127); + for i in 128..216 { + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(i as f64), i as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 215); + for i in 216..256 { + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(i as f64), i as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 255); + } + + #[test] + fn delete_shuffled() { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + + for i in 0..256 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + + let mut keys = (0..256).into_iter().collect::<Vec<_>>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for i in 0..128 { + let key = keys[i]; + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(key as f64), key as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 127); + for i in 128..216 { + let key = keys[i]; + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(key as f64), key as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 215); + for i in 216..256 { + let key = keys[i]; + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(key as f64), key as u32); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 255); + } + + #[test] + fn in_place_level0_insert() { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + let mut keys = (0..16).into_iter().collect::<Vec<_>>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + for i in 0..4 { + for &key in keys.iter() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(rng.gen_range(i * 256..(i + 1) * 256)); + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + txn.commit().unwrap(); + } + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}")); + } + + #[test] + fn 
in_place_level0_delete() { + let index = FacetIndex::::new(4, 8); + + let mut keys = (0..64).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for &key in keys.iter() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}")); + + for &key in keys.iter() { + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &(key as f64), key + 100); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}")); + } + + #[test] + fn shuffle_merge_string() { + let index = FacetIndex::::new(4, 8); + + let mut keys = (1000..1064).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for &key in keys.iter() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &format!("{key:x}").as_str(), &bitmap); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 1); + + for &key in keys.iter() { + verify_structure_validity(&index, 0); + let mut txn = index.env.write_txn().unwrap(); + index.delete(&mut txn, 0, &format!("{key:x}").as_str(), key + 100); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}"), 2); + } + + // fuzz tests +} +// #[cfg(all(test, fuzzing))] +// mod fuzz { +// use crate::codec::U16Codec; + +// use super::tests::verify_structure_validity; +// use super::*; +// use fuzzcheck::mutators::integer_within_range::U16WithinRangeMutator; +// use fuzzcheck::DefaultMutator; +// use roaring::RoaringBitmap; +// use std::collections::BTreeMap; +// use std::collections::HashMap; + +// #[derive(Default)] +// pub struct TrivialDatabase { +// pub elements: BTreeMap>, +// } +// impl TrivialDatabase +// where +// T: Ord + Clone + Copy + Eq + std::fmt::Debug, +// { +// pub fn insert(&mut self, field_id: u16, new_key: T, new_values: &RoaringBitmap) { +// if new_values.is_empty() { +// return; +// } +// let values_field_id = self.elements.entry(field_id).or_default(); +// let values = values_field_id.entry(new_key).or_default(); +// *values |= new_values; +// } +// pub fn delete(&mut self, field_id: u16, key: T, value: u32) { +// if let Some(values_field_id) = self.elements.get_mut(&field_id) { +// if let Some(values) = values_field_id.get_mut(&key) { +// values.remove(value); +// if values.is_empty() { +// values_field_id.remove(&key); +// } +// } +// if values_field_id.is_empty() { +// self.elements.remove(&field_id); +// } +// } +// } +// } +// #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] +// struct Operation { +// key: Key, +// #[field_mutator(U16WithinRangeMutator = { U16WithinRangeMutator::new(..=3) })] +// field_id: u16, +// kind: OperationKind, +// } +// #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] +// enum OperationKind { +// Insert(Vec), +// Delete(u8), +// } + +// fn compare_with_trivial_database( +// tempdir: Rc, +// group_size: u8, +// max_group_size: u8, +// operations: &[Operation], +// ) { +// let index = 
FacetIndex::::open_from_tempdir(tempdir, group_size, max_group_size); +// let mut trivial_db = TrivialDatabase::::default(); +// let mut value_to_keys = HashMap::>::new(); +// let mut txn = index.env.write_txn().unwrap(); +// for Operation { key, field_id, kind } in operations { +// match kind { +// OperationKind::Insert(values) => { +// let mut bitmap = RoaringBitmap::new(); +// for value in values { +// bitmap.insert(*value as u32); +// value_to_keys.entry(*value).or_default().push(*key); +// } +// index.insert(&mut txn, *field_id, key, &bitmap); +// trivial_db.insert(*field_id, *key, &bitmap); +// } +// OperationKind::Delete(value) => { +// if let Some(keys) = value_to_keys.get(value) { +// for key in keys { +// index.delete(&mut txn, *field_id, key, *value as u32); +// trivial_db.delete(*field_id, *key, *value as u32); +// } +// } +// } +// } +// } +// for (field_id, values_field_id) in trivial_db.elements.iter() { +// let level0iter = index +// .db +// .content +// .as_polymorph() +// .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( +// &mut txn, +// &field_id.to_be_bytes(), +// ) +// .unwrap(); + +// for ((key, values), group) in values_field_id.iter().zip(level0iter) { +// let (group_key, group_values) = group.unwrap(); +// let group_key = FacetKeyCodec::::bytes_decode(group_key).unwrap(); +// assert_eq!(key, &group_key.left_bound); +// assert_eq!(values, &group_values.bitmap); +// } +// } + +// txn.commit().unwrap(); +// let mut txn = index.env.write_txn().unwrap(); +// for (field_id, values_field_id) in trivial_db.elements.iter() { +// let level0iter = index +// .db +// .content +// .as_polymorph() +// .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) +// .unwrap(); + +// for ((key, values), group) in values_field_id.iter().zip(level0iter) { +// let (group_key, group_values) = group.unwrap(); +// let group_key = FacetKeyCodec::::bytes_decode(group_key).unwrap(); +// assert_eq!(key, &group_key.left_bound); +// assert_eq!(values, &group_values.bitmap); +// } +// verify_structure_validity(&index, *field_id); +// } + +// index.db.content.clear(&mut txn).unwrap(); +// txn.commit().unwrap(); +// } + +// #[test] +// fn fuzz() { +// let tempdir = Rc::new(TempDir::new().unwrap()); +// let tempdir_cloned = tempdir.clone(); +// let result = fuzzcheck::fuzz_test(move |x: &(u8, u8, Vec>)| { +// compare_with_trivial_database(tempdir_cloned.clone(), x.0, x.1, &x.2) +// }) +// .default_mutator() +// .serde_serializer() +// .default_sensor_and_pool_with_custom_filter(|file, function| { +// if file.is_relative() +// && !function.contains("serde") +// && !function.contains("tests::") +// && !function.contains("fuzz::") +// && !function.contains("display_bitmap") +// { +// true +// } else { +// false +// } +// }) +// .arguments_from_cargo_fuzzcheck() +// .launch(); +// assert!(!result.found_test_failure); +// } + +// #[test] +// fn reproduce_bug() { +// let operations = r#" +// [ +// {"key":0, "field_id": 0, "kind":{"Insert":[109]}}, +// {"key":143, "field_id": 0, "kind":{"Insert":[243]}}, +// {"key":90, "field_id": 0, "kind":{"Insert":[217]}}, +// {"key":172, "field_id": 0, "kind":{"Insert":[94]}}, +// {"key":27, "field_id": 0, "kind":{"Insert":[4]}}, +// {"key":124, "field_id": 0, "kind":{"Insert":[0]}}, +// {"key":123, "field_id": 0, "kind":{"Insert":[0]}}, +// {"key":67, "field_id": 0, "kind":{"Insert":[109]}}, +// {"key":13, "field_id": 0, "kind":{"Insert":[0]}}, +// {"key":162, "field_id": 0, "kind":{"Insert":[213]}}, +// {"key":235, "field_id": 0, 
"kind":{"Insert":[67]}}, +// {"key":251, "field_id": 0, "kind":{"Insert":[50]}}, +// {"key":218, "field_id": 0, "kind":{"Insert":[164]}}, +// {"key":166, "field_id": 0, "kind":{"Insert":[67]}}, +// {"key":64, "field_id": 0, "kind":{"Insert":[61]}}, +// {"key":183, "field_id": 0, "kind":{"Insert":[210]}}, +// {"key":250, "field_id": 0, "kind":{"Delete":50}} +// ] +// "#; +// let operations: Vec> = serde_json::from_str(operations).unwrap(); +// let tempdir = TempDir::new().unwrap(); +// compare_with_trivial_database(Rc::new(tempdir), 4, 8, &operations); +// } + +// #[test] +// fn reproduce_bug2() { +// let operations = r#" +// [ +// {"key":102, "field_id": 0, "kind":{"Insert":[122]}}, +// {"key":73, "field_id": 0, "kind":{"Insert":[132]}}, +// {"key":20, "field_id": 0, "kind":{"Insert":[215]}}, +// {"key":39, "field_id": 0, "kind":{"Insert":[152]}}, +// {"key":151, "field_id": 0, "kind":{"Insert":[226]}}, +// {"key":17, "field_id": 0, "kind":{"Insert":[101]}}, +// {"key":74, "field_id": 0, "kind":{"Insert":[210]}}, +// {"key":2, "field_id": 0, "kind":{"Insert":[130]}}, +// {"key":64, "field_id": 0, "kind":{"Insert":[180]}}, +// {"key":83, "field_id": 0, "kind":{"Insert":[250]}}, +// {"key":80, "field_id": 0, "kind":{"Insert":[210]}}, +// {"key":113, "field_id": 0, "kind":{"Insert":[63]}}, +// {"key":201, "field_id": 0, "kind":{"Insert":[210]}}, +// {"key":200, "field_id": 0, "kind":{"Insert":[5]}}, +// {"key":93, "field_id": 0, "kind":{"Insert":[98]}}, +// {"key":162, "field_id": 0, "kind":{"Insert":[5]}}, +// {"key":80, "field_id": 0, "kind":{"Delete":210}} +// ] +// "#; +// let operations: Vec> = serde_json::from_str(operations).unwrap(); +// let tempdir = TempDir::new().unwrap(); +// compare_with_trivial_database(Rc::new(tempdir), 4, 8, &operations); +// } +// #[test] +// fn reproduce_bug3() { +// let operations = r#" +// [ +// {"key":27488, "field_id": 0, "kind":{"Insert":[206]}}, +// {"key":64716, "field_id": 0, "kind":{"Insert":[216]}}, +// {"key":60886, "field_id": 0, "kind":{"Insert":[206]}}, +// {"key":59509, "field_id": 0, "kind":{"Insert":[187,231]}}, +// {"key":55057, "field_id": 0, "kind":{"Insert":[37]}}, +// {"key":45200, "field_id": 0, "kind":{"Insert":[206]}}, +// {"key":55056, "field_id": 0, "kind":{"Insert":[37]}}, +// {"key":63679, "field_id": 0, "kind":{"Insert":[206]}}, +// {"key":52155, "field_id": 0, "kind":{"Insert":[74]}}, +// {"key":20648, "field_id": 0, "kind":{"Insert":[47,138,157]}} +// ] +// "#; +// let operations: Vec> = serde_json::from_str(operations).unwrap(); +// let tempdir = TempDir::new().unwrap(); +// compare_with_trivial_database(Rc::new(tempdir), 0, 7, &operations); +// } + +// #[test] +// fn reproduce_bug4() { +// let operations = r#" +// [{"key":63499, "field_id": 0, "kind":{"Insert":[87]}},{"key":25374, "field_id": 0, "kind":{"Insert":[14]}},{"key":64481, "field_id": 0, "kind":{"Delete":87}},{"key":23038, "field_id": 0, "kind":{"Insert":[173]}},{"key":14862, "field_id": 0, "kind":{"Insert":[8]}},{"key":13145, "field_id": 0, "kind":{"Insert":[5,64]}},{"key":23446, "field_id": 0, "kind":{"Insert":[86,59]}},{"key":17972, "field_id": 0, "kind":{"Insert":[58,137]}},{"key":21273, "field_id": 0, "kind":{"Insert":[121,132,81,147]}},{"key":28264, "field_id": 0, "kind":{"Insert":[36]}},{"key":46659, "field_id": 0, "kind":{"Insert":[]}}] +// "#; +// let operations: Vec> = serde_json::from_str(operations).unwrap(); +// let tempdir = TempDir::new().unwrap(); +// compare_with_trivial_database(Rc::new(tempdir), 2, 1, &operations); +// } +// } diff --git 
a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- 
a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap index bc0668408..7ed43424a 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -834f27a924de1acbd3cd94c0d7f10315 +5ce8009d3eb023e4b9c0a6e7fa4e6262 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap index bc0668408..7ed43424a 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -834f27a924de1acbd3cd94c0d7f10315 +5ce8009d3eb023e4b9c0a6e7fa4e6262 diff --git a/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap new file mode 100644 index 000000000..919f3fe7c --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +5dbfa134cc44abeb3ab6242fc182e48e diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap new file mode 100644 index 000000000..bdeeefc13 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +6ed7bf5d440599b3b10b37549a271fdf diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap new file mode 100644 index 000000000..08534cbd4 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap @@ -0,0 +1,23 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[0, ]" +0 0 k1 1 "[1, ]" +0 0 k2 1 "[2, ]" +0 0 k3 1 "[3, ]" +0 0 k4 1 "[4, ]" +0 0 k5 1 "[5, ]" +0 0 k6 1 "[6, ]" +0 0 k7 1 "[7, ]" +0 0 k8 1 "[8, ]" +0 0 k9 1 "[9, ]" +0 0 k10 1 "[10, ]" +0 0 k11 1 "[11, ]" +0 0 k12 1 "[12, ]" +0 0 k13 1 "[13, ]" +0 0 k14 1 "[14, ]" +0 1 k0 4 "[0, 1, 2, 3, ]" +0 1 k4 4 "[4, 5, 6, 7, ]" +0 1 k8 4 "[8, 9, 10, 
11, ]" +0 1 k12 3 "[12, 13, 14, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap new file mode 100644 index 000000000..e9ccc990f --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b5203f0df0036ebaa133dd77d63a00eb diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap new file mode 100644 index 000000000..a98803604 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap @@ -0,0 +1,26 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[0, ]" +0 0 k1 1 "[1, ]" +0 0 k2 1 "[2, ]" +0 0 k3 1 "[3, ]" +0 0 k4 1 "[4, ]" +0 0 k5 1 "[5, ]" +0 0 k6 1 "[6, ]" +0 0 k7 1 "[7, ]" +0 0 k8 1 "[8, ]" +0 0 k9 1 "[9, ]" +0 0 k10 1 "[10, ]" +0 0 k11 1 "[11, ]" +0 0 k12 1 "[12, ]" +0 0 k13 1 "[13, ]" +0 0 k14 1 "[14, ]" +0 0 k15 1 "[15, ]" +0 0 k16 1 "[16, ]" +0 1 k0 4 "[0, 1, 2, 3, ]" +0 1 k4 4 "[4, 5, 6, 7, ]" +0 1 k8 4 "[8, 9, 10, 11, ]" +0 1 k12 4 "[12, 13, 14, 15, ]" +0 1 k16 1 "[16, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap new file mode 100644 index 000000000..bb07123a9 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +95497d8579740868ee0bfc655b0bf782 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap new file mode 100644 index 000000000..8714af061 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +d565c2f7bbd9e13e12de40cfbbfba6bb diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap new file mode 100644 index 000000000..1bba99454 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap @@ -0,0 +1,54 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k216 1 "[216, ]" +0 0 k217 1 "[217, ]" +0 0 k218 1 "[218, ]" +0 0 k219 1 "[219, ]" +0 0 k220 1 "[220, ]" +0 0 k221 1 "[221, ]" +0 0 k222 1 "[222, ]" +0 0 k223 1 "[223, ]" +0 0 k224 1 "[224, ]" +0 0 k225 1 "[225, ]" +0 0 k226 1 "[226, ]" +0 0 k227 1 "[227, ]" +0 0 k228 1 "[228, ]" +0 0 k229 1 "[229, ]" +0 0 k230 1 "[230, ]" +0 0 k231 1 "[231, ]" +0 0 k232 1 "[232, ]" +0 0 k233 1 "[233, ]" +0 0 k234 1 "[234, ]" +0 0 k235 1 "[235, ]" +0 0 k236 1 "[236, ]" +0 0 k237 1 "[237, ]" +0 0 k238 1 "[238, ]" +0 0 k239 1 "[239, ]" +0 0 k240 1 "[240, ]" +0 0 k241 1 "[241, ]" +0 0 k242 1 "[242, ]" +0 0 k243 1 "[243, ]" +0 0 k244 1 "[244, ]" +0 0 k245 1 "[245, ]" +0 0 k246 1 "[246, ]" +0 0 k247 1 "[247, ]" +0 0 k248 1 "[248, ]" +0 0 k249 1 "[249, ]" +0 0 k250 1 "[250, ]" +0 0 k251 1 "[251, ]" +0 0 k252 1 "[252, ]" +0 0 k253 1 "[253, ]" +0 0 k254 1 "[254, ]" +0 0 k255 1 "[255, ]" +0 1 k216 4 "[216, 217, 218, 219, ]" +0 1 k220 4 "[220, 221, 222, 223, ]" +0 1 k224 4 "[224, 225, 226, 227, ]" +0 1 k228 4 "[228, 229, 230, 231, ]" 
+0 1 k232 4 "[232, 233, 234, 235, ]" +0 1 k236 4 "[236, 237, 238, 239, ]" +0 1 k240 4 "[240, 241, 242, 243, ]" +0 1 k244 4 "[244, 245, 246, 247, ]" +0 1 k248 4 "[248, 249, 250, 251, ]" +0 1 k252 4 "[252, 253, 254, 255, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap new file mode 100644 index 000000000..6815ee609 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7cb503827ba17e9670296cc9531a1380 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap new file mode 100644 index 000000000..6860385ee --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b061f43e379e16f0617c05d3313d0078 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap new file mode 100644 index 000000000..f96b42b27 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +81fc9489d6b163935b97433477dea63b diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap new file mode 100644 index 000000000..82a7ce716 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap @@ -0,0 +1,20 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[3, 435, 583, 849, ]" +0 0 k1 1 "[35, 494, 693, 796, ]" +0 0 k2 1 "[76, 420, 526, 909, ]" +0 0 k3 1 "[133, 451, 653, 806, ]" +0 0 k4 1 "[131, 464, 656, 853, ]" +0 0 k5 1 "[61, 308, 701, 903, ]" +0 0 k6 1 "[144, 449, 674, 794, ]" +0 0 k7 1 "[182, 451, 735, 941, ]" +0 0 k8 1 "[6, 359, 679, 1003, ]" +0 0 k9 1 "[197, 418, 659, 904, ]" +0 0 k10 1 "[88, 297, 567, 800, ]" +0 0 k11 1 "[150, 309, 530, 946, ]" +0 0 k12 1 "[156, 466, 567, 892, ]" +0 0 k13 1 "[46, 425, 610, 807, ]" +0 0 k14 1 "[236, 433, 549, 891, ]" +0 0 k15 1 "[207, 472, 603, 974, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap 
b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap new file mode 100644 index 000000000..fd4beeca8 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7f8aa18d2b3a6422d55c03bede0563db diff --git a/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap new file mode 100644 index 000000000..fd4beeca8 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7f8aa18d2b3a6422d55c03bede0563db diff --git a/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap new file mode 100644 index 000000000..d055892f5 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b3e2de9020d9e0f3941bc3a179c795ba diff --git a/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap new file mode 100644 index 000000000..919f3fe7c --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +5dbfa134cc44abeb3ab6242fc182e48e diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap new file mode 100644 index 000000000..2b6805676 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +4fc800f49201a336295af0542fdf01ab diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap new file mode 100644 index 000000000..1802eb952 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +9343355bf535ed4a0c956df2b229d5e6 diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap new file mode 100644 index 000000000..5ef88bfb4 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +fd65ce7d96a07aafb0ef6cfb5bf016b8 From a7201ece04e1acc47801be7f7dd1c93388751718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 11:40:29 +0200 Subject: [PATCH 19/58] cargo fmt --- milli/src/heed_codec/facet/new/mod.rs | 5 +++- .../heed_codec/facet/new/ordered_f64_codec.rs | 3 +- .../search/facet/facet_distribution_iter.rs | 17 ++++++----- milli/src/search/facet/facet_range_search.rs | 30 ++++++++----------- .../src/search/facet/facet_sort_ascending.rs | 21 +++++++------ .../src/search/facet/facet_sort_descending.rs | 22 
+++++++------- milli/src/search/facet/filter.rs | 12 ++++---- milli/src/search/facet/mod.rs | 18 +++++------ milli/src/snapshot_tests.rs | 8 +++-- milli/src/update/facet/bulk.rs | 20 +++++++------ milli/src/update/facet/incremental.rs | 29 +++++++++--------- .../extract/extract_facet_string_docids.rs | 8 +++-- 12 files changed, 99 insertions(+), 94 deletions(-) diff --git a/milli/src/heed_codec/facet/new/mod.rs b/milli/src/heed_codec/facet/new/mod.rs index 5ed6a61f6..04a545564 100644 --- a/milli/src/heed_codec/facet/new/mod.rs +++ b/milli/src/heed_codec/facet/new/mod.rs @@ -1,6 +1,9 @@ +use std::borrow::Cow; +use std::convert::TryFrom; +use std::marker::PhantomData; + use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; -use std::{borrow::Cow, convert::TryFrom, marker::PhantomData}; pub mod ordered_f64_codec; pub mod str_ref; diff --git a/milli/src/heed_codec/facet/new/ordered_f64_codec.rs b/milli/src/heed_codec/facet/new/ordered_f64_codec.rs index 856a9c0d1..5ac9ffcfc 100644 --- a/milli/src/heed_codec/facet/new/ordered_f64_codec.rs +++ b/milli/src/heed_codec/facet/new/ordered_f64_codec.rs @@ -1,4 +1,5 @@ -use std::{borrow::Cow, convert::TryInto}; +use std::borrow::Cow; +use std::convert::TryInto; use heed::BytesDecode; diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 9e251103c..13ba28019 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,9 +1,10 @@ -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; -use heed::Result; -use roaring::RoaringBitmap; use std::ops::ControlFlow; +use heed::Result; +use roaring::RoaringBitmap; + use super::{get_first_facet_value, get_highest_level}; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; pub fn iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, @@ -108,16 +109,16 @@ where #[cfg(test)] mod tests { - use crate::milli_snap; - use crate::{ - heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::test::FacetIndex, - }; + use std::ops::ControlFlow; + use heed::BytesDecode; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use std::ops::ControlFlow; use super::iterate_over_facet_distribution; + use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::milli_snap; + use crate::search::facet::test::FacetIndex; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 38c6acdec..20ad23a37 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -1,18 +1,12 @@ +use std::ops::{Bound, RangeBounds}; + use heed::BytesEncode; use roaring::RoaringBitmap; -use std::ops::Bound; -use std::ops::RangeBounds; -use crate::heed_codec::facet::new::FacetGroupValueCodec; -use crate::heed_codec::facet::new::FacetKey; -use crate::heed_codec::facet::new::FacetKeyCodec; -use crate::heed_codec::facet::new::MyByteSlice; +use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; use crate::Result; -use super::get_first_facet_value; -use super::get_highest_level; -use super::get_last_facet_value; - pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( rtxn: &'t 
heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, @@ -258,17 +252,17 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { #[cfg(test)] mod tests { - use crate::milli_snap; - use crate::{ - heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec}, - search::facet::test::FacetIndex, - snapshot_tests::display_bitmap, - }; - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; use std::ops::Bound; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + use super::find_docids_of_facet_within_bounds; + use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::new::FacetKeyCodec; + use crate::milli_snap; + use crate::search::facet::test::FacetIndex; + use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index e8618c302..b3cae5d28 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -1,10 +1,10 @@ -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, -}; use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; pub fn ascending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, @@ -83,16 +83,15 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { - use crate::milli_snap; - use crate::{ - heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, - search::facet::{facet_sort_ascending::ascending_facet_sort, test::FacetIndex}, - snapshot_tests::display_bitmap, - }; - use rand::Rng; - use rand::SeedableRng; + use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; + use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::milli_snap; + use crate::search::facet::facet_sort_ascending::ascending_facet_sort; + use crate::search::facet::test::FacetIndex; + use crate::snapshot_tests::display_bitmap; + fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index b8bae2f9d..d68c9bdad 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -1,12 +1,12 @@ use std::ops::Bound; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, -}; use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; pub fn descending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, @@ -111,16 +111,16 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { - use crate::milli_snap; - use crate::{ - heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec, MyByteSlice}, - search::facet::{facet_sort_descending::descending_facet_sort, test::FacetIndex}, - snapshot_tests::display_bitmap, - }; - use rand::Rng; - use rand::SeedableRng; + use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; + use 
crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; + use crate::milli_snap; + use crate::search::facet::facet_sort_descending::descending_facet_sort; + use crate::search::facet::test::FacetIndex; + use crate::snapshot_tests::display_bitmap; + fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 6ec626a5c..6a10b7097 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,18 +1,18 @@ -use either::Either; -pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; -use heed::types::DecodeIgnore; -use roaring::RoaringBitmap; use std::collections::HashSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; +use either::Either; +pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; +use heed::types::DecodeIgnore; +use roaring::RoaringBitmap; + +use super::facet_range_search; use crate::error::{Error, UserError}; use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; -use super::facet_range_search; - /// The maximum number of filters the filter AST can process. const MAX_FILTER_DEPTH: usize = 2000; diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 42c0f065a..78cd8fd4b 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,10 +1,9 @@ use heed::types::ByteSlice; use heed::{BytesDecode, RoTxn}; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; - pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::Filter; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; mod facet_distribution; mod facet_distribution_iter; @@ -78,17 +77,18 @@ pub(crate) fn get_highest_level<'t>( #[cfg(test)] pub mod test { - use crate::update::FacetsUpdateIncremental; + use std::fmt::Display; + use std::marker::PhantomData; + use std::rc::Rc; + use heed::{BytesDecode, BytesEncode, Env, RwTxn}; use roaring::RoaringBitmap; - use std::{fmt::Display, marker::PhantomData, rc::Rc}; - use crate::{ - heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, - }, - snapshot_tests::display_bitmap, + use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; + use crate::snapshot_tests::display_bitmap; + use crate::update::FacetsUpdateIncremental; pub struct FacetIndex where diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f35bda2e7..d054e63b5 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -1,10 +1,12 @@ -use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; -use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; -use roaring::RoaringBitmap; use std::borrow::Cow; use std::fmt::Write; use std::path::Path; +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; +use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; + #[track_caller] pub fn default_db_snapshot_settings_for_test(name: Option<&str>) 
-> (insta::Settings, String) { let mut settings = insta::Settings::clone_current(); diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index b8acffbaf..f93ee735e 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,3 +1,14 @@ +use std::cmp; +use std::fs::File; +use std::num::NonZeroUsize; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn}; +use log::debug; +use roaring::RoaringBitmap; +use time::OffsetDateTime; + use crate::error::InternalError; use crate::facet::FacetType; use crate::heed_codec::facet::new::{ @@ -5,15 +16,6 @@ use crate::heed_codec::facet::new::{ }; use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; use crate::{FieldId, Index, Result}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn}; -use log::debug; -use roaring::RoaringBitmap; -use std::cmp; -use std::fs::File; -use std::num::NonZeroUsize; -use time::OffsetDateTime; pub struct FacetsUpdateBulk<'i> { index: &'i Index, diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 712d7271c..3493db0f7 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,11 +1,12 @@ +use heed::types::ByteSlice; +use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use roaring::RoaringBitmap; + use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; use crate::search::facet::get_highest_level; use crate::Result; -use heed::Error; -use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; -use roaring::RoaringBitmap; enum InsertionResult { InPlace, @@ -462,19 +463,19 @@ impl FacetsUpdateIncremental { #[cfg(test)] mod tests { - use crate::milli_snap; - use crate::{ - heed_codec::facet::new::{ - ordered_f64_codec::OrderedF64Codec, str_ref::StrRefCodec, FacetGroupValueCodec, - FacetKeyCodec, MyByteSlice, - }, - search::facet::{get_highest_level, test::FacetIndex}, - }; - use heed::{types::ByteSlice, BytesDecode, BytesEncode}; - use rand::Rng; - use rand::{seq::SliceRandom, SeedableRng}; + use heed::types::ByteSlice; + use heed::{BytesDecode, BytesEncode}; + use rand::seq::SliceRandom; + use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; + use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::new::str_ref::StrRefCodec; + use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; + use crate::milli_snap; + use crate::search::facet::get_highest_level; + use crate::search::facet::test::FacetIndex; + pub fn verify_structure_validity(index: &FacetIndex, field_id: u16) where for<'a> C: BytesDecode<'a> + BytesEncode<'a, EItem = >::DItem>, diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index fe42801e7..591f44c74 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -1,11 +1,13 @@ +use std::fs::File; +use std::io; + +use heed::BytesEncode; + use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::new::str_ref::StrRefCodec; use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; use crate::update::index_documents::merge_cbo_roaring_bitmaps; 
use crate::{FieldId, Result}; -use heed::BytesEncode; -use std::fs::File; -use std::io; /// Extracts the facet string and the documents ids where this facet string appear. /// From afdf87f6f75cb5977c2b1ecec91406951e3d3256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 12:51:54 +0200 Subject: [PATCH 20/58] Fix bugs in asc/desc criterion and facet indexing --- milli/src/search/criteria/asc_desc.rs | 3 ++- milli/src/update/index_documents/extract/mod.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index a5ea9b058..23dd860e1 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -197,9 +197,10 @@ fn facet_ordered<'t>( field_id, candidates.clone(), )?; + let string_iter = make_iter( rtxn, - index.facet_id_f64_docids.remap_key_type::>(), + index.facet_id_string_docids.remap_key_type::>(), field_id, candidates, )?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 1e414458f..208dfc74d 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -142,7 +142,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, - merge_roaring_bitmaps, // TODO: check (cbo?) + merge_cbo_roaring_bitmaps, // TODO: check (cbo?) TypedChunk::FieldIdFacetStringDocids, "field-id-facet-string-docids", ); From 079ed4a992df4db94d7c4b555b164cf89ab4bf1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 12:57:29 +0200 Subject: [PATCH 21/58] Add more snapshots --- milli/src/search/facet/facet_range_search.rs | 12 ++++++------ .../{0.hash.snap => excluded_0.hash.snap} | 0 .../{1.hash.snap => excluded_1.hash.snap} | 0 .../filter_range_decreasing/included_0.hash.snap | 4 ++++ .../filter_range_decreasing/included_1.hash.snap | 4 ++++ .../{0.hash.snap => excluded_0.hash.snap} | 0 .../{1.hash.snap => excluded_1.hash.snap} | 0 .../filter_range_increasing/included_0.hash.snap | 4 ++++ .../filter_range_increasing/included_1.hash.snap | 4 ++++ .../{0.hash.snap => excluded_0.hash.snap} | 0 .../{1.hash.snap => excluded_1.hash.snap} | 0 .../filter_range_pinch/included_0.hash.snap | 4 ++++ .../filter_range_pinch/included_1.hash.snap | 4 ++++ milli/src/update/facet/incremental.rs | 10 +++++----- ...evel0_delete.hash.snap => after_delete.hash.snap} | 0 .../in_place_level0_delete/before_delete.hash.snap | 4 ++++ .../after_delete.hash.snap} | 0 .../before_delete.hash.snap} | 0 18 files changed, 39 insertions(+), 11 deletions(-) rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/{0.hash.snap => excluded_0.hash.snap} (100%) rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/{1.hash.snap => excluded_1.hash.snap} (100%) create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/{0.hash.snap => excluded_0.hash.snap} (100%) rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/{1.hash.snap => excluded_1.hash.snap} (100%) create mode 100644 
milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/{0.hash.snap => excluded_0.hash.snap} (100%) rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/{1.hash.snap => excluded_1.hash.snap} (100%) create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap rename milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/{in_place_level0_delete.hash.snap => after_delete.hash.snap} (100%) create mode 100644 milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap rename milli/src/update/facet/snapshots/incremental.rs/{shuffle_merge_string/2.hash.snap => shuffle_merge_string_and_delete/after_delete.hash.snap} (100%) rename milli/src/update/facet/snapshots/incremental.rs/{shuffle_merge_string/1.hash.snap => shuffle_merge_string_and_delete/before_delete.hash.snap} (100%) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 20ad23a37..039cd5c8d 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -318,7 +318,7 @@ mod tests { .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("included_{i}")); let mut results = String::new(); for i in 0..=255 { let i = i as f64; @@ -334,7 +334,7 @@ mod tests { .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("excluded_{i}")); txn.commit().unwrap(); } } @@ -361,7 +361,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("included_{i}")); let mut results = String::new(); @@ -380,7 +380,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("excluded_{i}")); txn.commit().unwrap(); } @@ -408,7 +408,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("included_{i}")); let mut results = String::new(); @@ -427,7 +427,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("excluded_{i}")); txn.commit().unwrap(); } diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap rename to 
milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap new file mode 100644 index 000000000..be0b06ded --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +57d35cfa419a19a1a1f8d7c8ef096e0f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap new file mode 100644 index 000000000..93fe17b0c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3dbe0547b42759795e9b16989df72cee diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap new file mode 100644 index 000000000..fa7242056 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ca59f20e043a4d52c49e15b10adf96bb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap new file mode 100644 index 000000000..a7611d8c1 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +cb69e0fe10fb299bafe77514204379cb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap new file mode 100644 index 000000000..db8a314b0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +b976551ceff412bfb2ec9bfbda320bbb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap new file mode 100644 index 000000000..2b82e07e8 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +7620ca1a96882c7147d3fd996570f9b3 diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 3493db0f7..e32a6baf1 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -853,7 +853,7 @@ mod tests { txn.commit().unwrap(); } verify_structure_validity(&index, 0); - milli_snap!(format!("{index}")); + milli_snap!(format!("{index}"), "before_delete"); for &key in keys.iter() { verify_structure_validity(&index, 0); @@ -862,11 +862,11 @@ mod tests { txn.commit().unwrap(); } verify_structure_validity(&index, 0); - milli_snap!(format!("{index}")); + milli_snap!(format!("{index}"), "after_delete"); } #[test] - fn shuffle_merge_string() { + fn shuffle_merge_string_and_delete() { let index = FacetIndex::::new(4, 8); let mut keys = (1000..1064).into_iter().collect::>(); @@ -883,7 +883,7 @@ mod tests { txn.commit().unwrap(); } verify_structure_validity(&index, 0); - milli_snap!(format!("{index}"), 1); + milli_snap!(format!("{index}"), "before_delete"); for &key in keys.iter() { verify_structure_validity(&index, 0); @@ -892,7 +892,7 @@ mod tests { txn.commit().unwrap(); } verify_structure_validity(&index, 0); - milli_snap!(format!("{index}"), 2); + milli_snap!(format!("{index}"), "after_delete"); } // fuzz tests diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/after_delete.hash.snap similarity index 100% rename from milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap rename to milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/after_delete.hash.snap diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap new file mode 100644 index 000000000..c57ca72eb --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b17b2c4ec87a778aae07854c96c08b48 diff --git 
a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/after_delete.hash.snap similarity index 100% rename from milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap rename to milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/after_delete.hash.snap diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/before_delete.hash.snap similarity index 100% rename from milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap rename to milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/before_delete.hash.snap From 982efab88ff0f1e34346c55f7a113299b93a46e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 12:50:31 +0200 Subject: [PATCH 22/58] Fix encoding bugs in facet databases --- milli/src/heed_codec/facet/new/mod.rs | 47 +++++---------------------- 1 file changed, 8 insertions(+), 39 deletions(-) diff --git a/milli/src/heed_codec/facet/new/mod.rs b/milli/src/heed_codec/facet/new/mod.rs index 04a545564..bcb2957fc 100644 --- a/milli/src/heed_codec/facet/new/mod.rs +++ b/milli/src/heed_codec/facet/new/mod.rs @@ -5,6 +5,8 @@ use std::marker::PhantomData; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; +use crate::CboRoaringBitmapCodec; + pub mod ordered_f64_codec; pub mod str_ref; // TODO: these codecs were quickly written and not fast/resilient enough @@ -35,6 +37,7 @@ impl<'a> FacetKey> { } } +#[derive(Debug)] pub struct FacetGroupValue { pub size: u8, pub bitmap: RoaringBitmap, @@ -56,7 +59,7 @@ where v.extend_from_slice(&value.field_id.to_be_bytes()); v.extend_from_slice(&[value.level]); - let bound = T::bytes_encode(&value.left_bound).unwrap(); + let bound = T::bytes_encode(&value.left_bound)?; v.extend_from_slice(&bound); Some(Cow::Owned(v)) @@ -69,9 +72,9 @@ where type DItem = FacetKey; fn bytes_decode(bytes: &'a [u8]) -> Option { - let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).unwrap()); + let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?); let level = bytes[2]; - let bound = T::bytes_decode(&bytes[3..]).unwrap(); + let bound = T::bytes_decode(&bytes[3..])?; Some(FacetKey { field_id: fid, level, left_bound: bound }) } } @@ -83,7 +86,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { fn bytes_encode(value: &'a Self::EItem) -> Option> { let mut v = vec![]; v.push(value.size); - value.bitmap.serialize_into(&mut v).unwrap(); + CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); Some(Cow::Owned(v)) } } @@ -91,7 +94,7 @@ impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { type DItem = FacetGroupValue; fn bytes_decode(bytes: &'a [u8]) -> Option { let size = bytes[0]; - let bitmap = RoaringBitmap::deserialize_from(&bytes[1..]).unwrap(); + let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?; Some(FacetGroupValue { size, bitmap }) } } @@ -115,37 +118,3 @@ impl<'a> BytesDecode<'a> for MyByteSlice { Some(bytes) } } - -// I won't need these ones anymore -// pub struct U16Codec; -// impl<'a> BytesEncode<'a> for U16Codec { -// type EItem = u16; - -// fn bytes_encode(item: &'a Self::EItem) -> Option> { -// Some(Cow::Owned(item.to_be_bytes().to_vec())) -// } -// } -// impl<'a> BytesDecode<'a> for U16Codec { -// type DItem = u16; - -// fn 
bytes_decode(bytes: &'a [u8]) -> Option { -// Some(u16::from_be_bytes(bytes[0..=1].try_into().unwrap())) -// } -// } - -// pub struct StrCodec; -// impl<'a> BytesEncode<'a> for StrCodec { -// type EItem = &'a str; - -// fn bytes_encode(item: &'a &'a str) -> Option> { -// Some(Cow::Borrowed(item.as_bytes())) -// } -// } -// impl<'a> BytesDecode<'a> for StrCodec { -// type DItem = &'a str; - -// fn bytes_decode(bytes: &'a [u8]) -> Option { -// let s = std::str::from_utf8(bytes).unwrap(); -// Some(s) -// } -// } From 3d145d7f48b35739c02e3cf3a44c624cf94ce8d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 12:51:40 +0200 Subject: [PATCH 23/58] Merge the two _faceted_documents_ids methods into one --- milli/src/index.rs | 71 ++++++++------------------- milli/src/search/criteria/asc_desc.rs | 7 ++- milli/src/snapshot_tests.rs | 5 +- milli/src/update/clear_documents.rs | 16 ++++-- milli/src/update/delete_documents.rs | 22 +++++++-- 5 files changed, 59 insertions(+), 62 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 0561a77ac..40e78bf10 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -12,6 +12,7 @@ use rstar::RTree; use time::OffsetDateTime; use crate::error::{InternalError, UserError}; +use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; use crate::heed_codec::facet::new::str_ref::StrRefCodec; @@ -780,68 +781,38 @@ impl Index { /* faceted documents ids */ - /// Writes the documents ids that are faceted with numbers under this field id. - pub(crate) fn put_number_faceted_documents_ids( + /// Writes the documents ids that are faceted under this field id for the given facet type. + pub fn put_faceted_documents_ids( &self, wtxn: &mut RwTxn, field_id: FieldId, + facet_type: FacetType, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = - [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); + let key = match facet_type { + FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, + FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, + }; + let mut buffer = vec![0u8; key.len() + size_of::()]; + buffer[..key.len()].copy_from_slice(key.as_bytes()); + buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } - /// Retrieve all the documents ids that faceted with numbers under this field id. - pub fn number_faceted_documents_ids( + /// Retrieve all the documents ids that are faceted under this field id for the given facet type. + pub fn faceted_documents_ids( &self, rtxn: &RoTxn, field_id: FieldId, + facet_type: FacetType, ) -> heed::Result { - let mut buffer = - [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); - match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? 
{ - Some(docids) => Ok(docids), - None => Ok(RoaringBitmap::new()), - } - } - - /// Writes the documents ids that are faceted with strings under this field id. - pub(crate) fn put_string_faceted_documents_ids( - &self, - wtxn: &mut RwTxn, - field_id: FieldId, - docids: &RoaringBitmap, - ) -> heed::Result<()> { - let mut buffer = - [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); - self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) - } - - /// Retrieve all the documents ids that faceted with strings under this field id. - pub fn string_faceted_documents_ids( - &self, - rtxn: &RoTxn, - field_id: FieldId, - ) -> heed::Result { - let mut buffer = - [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); + let key = match facet_type { + FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, + FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, + }; + let mut buffer = vec![0u8; key.len() + size_of::()]; + buffer[..key.len()].copy_from_slice(key.as_bytes()); + buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 23dd860e1..ccf66889e 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -6,6 +6,7 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::facet::FacetType; use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; @@ -62,8 +63,10 @@ impl<'t> AscDesc<'t> { let field_id = fields_ids_map.id(&field_name); let faceted_candidates = match field_id { Some(field_id) => { - let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?; - let string_faceted = index.string_faceted_documents_ids(rtxn, field_id)?; + let number_faceted = + index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?; + let string_faceted = + index.faceted_documents_ids(rtxn, field_id, FacetType::String)?; number_faceted | string_faceted } None => RoaringBitmap::default(), diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index d054e63b5..57fd2e5fe 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -4,6 +4,7 @@ use std::path::Path; use roaring::RoaringBitmap; +use crate::facet::FacetType; use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; @@ -370,7 +371,7 @@ pub fn snap_number_faceted_documents_ids(index: &Index) -> String { let mut snap = String::new(); for field_id in fields_ids_map.ids() { let number_faceted_documents_ids = - index.number_faceted_documents_ids(&rtxn, field_id).unwrap(); + 
index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap(); writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) .unwrap(); } @@ -383,7 +384,7 @@ pub fn snap_string_faceted_documents_ids(index: &Index) -> String { let mut snap = String::new(); for field_id in fields_ids_map.ids() { let string_faceted_documents_ids = - index.string_faceted_documents_ids(&rtxn, field_id).unwrap(); + index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap(); writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) .unwrap(); } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ba59c14cf..7d89ca89a 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,7 +1,7 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; +use crate::{facet::FacetType, ExternalDocumentsIds, FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -55,8 +55,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We clean all the faceted documents ids. for field_id in faceted_fields { - self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?; - self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::Number, + &empty_roaring, + )?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::String, + &empty_roaring, + )?; } // Clear the other databases. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 531fd2b74..ffa63f0a7 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -461,9 +461,15 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // Remove the documents ids from the faceted documents ids. for field_id in self.index.faceted_fields_ids(self.wtxn)? 
{ // Remove docids from the number faceted documents ids - let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?; + let mut docids = + self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::Number)?; docids -= &self.to_delete_docids; - self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::Number, + &docids, + )?; remove_docids_from_field_id_docid_facet_value( self.wtxn, @@ -474,9 +480,15 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { )?; // Remove docids from the string faceted documents ids - let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?; + let mut docids = + self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::String)?; docids -= &self.to_delete_docids; - self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::String, + &docids, + )?; remove_docids_from_field_id_docid_facet_value( self.wtxn, @@ -648,7 +660,7 @@ fn remove_docids_from_facet_id_docids<'a>( if !modified { return Ok(()); } - let builder = FacetsUpdateBulk::new(index, facet_type); + let builder = FacetsUpdateBulk::new_not_updating_level_0(index, facet_type); builder.execute(wtxn)?; Ok(()) From 9b55e582cd70c4f64e3739323255e91fb433be44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 12:52:05 +0200 Subject: [PATCH 24/58] Add FacetsUpdate type that wraps incremental and bulk indexing methods --- milli/src/update/facet/bulk.rs | 156 ++++++++++++------ milli/src/update/facet/mod.rs | 88 ++++++++++ .../default/facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../default/facet_id_string_docids.hash.snap | 2 +- .../facet_id_string_docids.hash.snap | 2 +- .../src/update/index_documents/typed_chunk.rs | 85 ++-------- 11 files changed, 216 insertions(+), 129 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index f93ee735e..0a4b7db45 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,10 +1,11 @@ +use std::borrow::Cow; use std::cmp; use std::fs::File; use std::num::NonZeroUsize; use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn}; use log::debug; use roaring::RoaringBitmap; use time::OffsetDateTime; @@ -14,21 +15,27 @@ use crate::facet::FacetType; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; -use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; -use crate::{FieldId, Index, Result}; +use crate::update::index_documents::{ + create_writer, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, +}; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; pub struct FacetsUpdateBulk<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, level_group_size: usize, min_level_size: usize, facet_type: FacetType, + // None if level 0 does not need to be updated + new_data: Option>, } 
impl<'i> FacetsUpdateBulk<'i> { - pub fn new(index: &'i Index, facet_type: FacetType) -> FacetsUpdateBulk<'i> { + pub fn new( + index: &'i Index, + facet_type: FacetType, + new_data: grenad::Reader, + ) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, database: match facet_type { @@ -39,11 +46,31 @@ impl<'i> FacetsUpdateBulk<'i> { index.facet_id_f64_docids.remap_key_type::>() } }, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, level_group_size: 4, min_level_size: 5, facet_type, + new_data: Some(new_data), + } + } + + pub fn new_not_updating_level_0( + index: &'i Index, + facet_type: FacetType, + ) -> FacetsUpdateBulk<'i> { + FacetsUpdateBulk { + index, + database: match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }, + level_group_size: 4, + min_level_size: 5, + facet_type, + new_data: None, } } @@ -70,39 +97,84 @@ impl<'i> FacetsUpdateBulk<'i> { } #[logging_timer::time("FacetsUpdateBulk::{}")] - pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + // We get the faceted fields to be able to create the facet levels. let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - for &field_id in faceted_fields.iter() { self.clear_levels(wtxn, field_id)?; } + self.update_level0(wtxn)?; - let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?; + // let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?; for &field_id in faceted_fields.iter() { - let (level_readers, all_docids) = - self.compute_levels_for_field_id(field_id, &nested_wtxn)?; + let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; - let put_docids_fn = match self.facet_type { - FacetType::Number => Index::put_number_faceted_documents_ids, - FacetType::String => Index::put_string_faceted_documents_ids, - }; - put_docids_fn(&self.index, &mut nested_wtxn, field_id, &all_docids)?; + self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &all_docids)?; for level_reader in level_readers { - // TODO: append instead of write with merge - write_into_lmdb_database( - &mut nested_wtxn, - *self.database.as_polymorph(), - level_reader, - |_, _| { - Err(InternalError::IndexingMergingKeys { process: "facet string levels" })? - }, - )?; + let mut cursor = level_reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + let key = FacetKeyCodec::::bytes_decode(k).unwrap(); + let value = FacetGroupValueCodec::bytes_decode(v).unwrap(); + println!("inserting {key:?} {value:?}"); + + self.database.remap_types::().put(wtxn, k, v)?; + } + } + } + + Ok(()) + } + + fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { + let new_data = match self.new_data.take() { + Some(x) => x, + None => return Ok(()), + }; + if self.database.is_empty(wtxn)? { + let mut buffer = Vec::new(); + let mut database = self.database.iter_mut(wtxn)?.remap_types::(); + let mut cursor = new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + if valid_lmdb_key(key) { + buffer.clear(); + // the group size for level 0 + buffer.push(1); + // then we extend the buffer with the docids bitmap + buffer.extend_from_slice(value); + unsafe { database.append(key, &buffer)? }; + } + } + } else { + let mut buffer = Vec::new(); + let database = self.database.remap_types::(); + + let mut cursor = new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + if valid_lmdb_key(key) { + buffer.clear(); + // the group size for level 0 + buffer.push(1); + // then we extend the buffer with the docids bitmap + match database.get(wtxn, key)? { + Some(prev_value) => { + let old_bitmap = &prev_value[1..]; + CboRoaringBitmapCodec::merge_into( + &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], + &mut buffer, + )?; + } + None => { + buffer.extend_from_slice(value); + } + }; + database.put(wtxn, key, &buffer)?; + } } } @@ -114,16 +186,14 @@ impl<'i> FacetsUpdateBulk<'i> { field_id: FieldId, txn: &RoTxn, ) -> Result<(Vec>, RoaringBitmap)> { - let algo = FacetsUpdateBulkAlgorithm { + // TODO: first check whether there is anything in level 0 + let algo = ComputeHigherLevels { rtxn: txn, db: &self.database, field_id, level_group_size: self.level_group_size, min_level_size: self.min_level_size, - chunk_compression_type: self.chunk_compression_type, - chunk_compression_level: self.chunk_compression_level, }; - // TODO: first check whether there is anything in level 0 let mut all_docids = RoaringBitmap::new(); let subwriters = algo.compute_higher_levels(32, &mut |bitmaps, _| { @@ -138,16 +208,14 @@ impl<'i> FacetsUpdateBulk<'i> { } } -pub struct FacetsUpdateBulkAlgorithm<'t> { +struct ComputeHigherLevels<'t> { rtxn: &'t heed::RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, field_id: u16, level_group_size: usize, min_level_size: usize, } -impl<'t> FacetsUpdateBulkAlgorithm<'t> { +impl<'t> ComputeHigherLevels<'t> { fn read_level_0( &self, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, @@ -215,11 +283,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> { // once we have computed `level_group_size` elements, we give the left bound // of those elements, and their bitmaps, to the level above - let mut cur_writer = create_writer( - self.chunk_compression_type, - self.chunk_compression_level, - tempfile::tempfile()?, - ); + let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); let mut cur_writer_len = 0; let mut group_sizes = vec![]; @@ -259,7 +323,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> { Ok(()) })?; // don't forget to insert the leftover elements into the writer as well - if !bitmaps.is_empty() && cur_writer_len >= self.level_group_size * self.min_level_size { + if !bitmaps.is_empty() && cur_writer_len >= self.min_level_size { let left_bound = left_bounds.first().unwrap(); handle_group(&bitmaps, left_bound)?; for ((bitmap, left_bound), group_size) in @@ -274,7 +338,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> { cur_writer_len += 1; } } - if cur_writer_len > self.level_group_size * self.min_level_size { + if cur_writer_len > self.min_level_size { sub_writers.push(writer_into_reader(cur_writer)?); } return Ok(sub_writers); @@ -315,9 +379,9 @@ mod tests { documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); } let documents = documents_batch_reader_from_objects(documents); - + dbg!(); index.add_documents(documents).unwrap(); - + dbg!(); db_snap!(index, facet_id_f64_docids, 
name); }; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index ecde3a248..00964a406 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -1,2 +1,90 @@ +use std::{collections::HashMap, fs::File}; + +use grenad::CompressionType; +use heed::BytesDecode; +use roaring::RoaringBitmap; + +use crate::{ + facet::FacetType, + heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}, + CboRoaringBitmapCodec, FieldId, Index, Result, +}; + +use super::{FacetsUpdateBulk, FacetsUpdateIncremental}; + pub mod bulk; pub mod incremental; + +pub struct FacetsUpdate<'i> { + index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, + level_group_size: u8, + max_level_group_size: u8, + min_level_size: u8, + facet_type: FacetType, + new_data: grenad::Reader, +} +impl<'i> FacetsUpdate<'i> { + pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { + let database = match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; + Self { + index, + database, + level_group_size: 4, + max_level_group_size: 8, + min_level_size: 5, + facet_type, + new_data, + } + } + + // /// The number of elements from the level below that are represented by a single element in the level above + // /// + // /// This setting is always greater than or equal to 2. + // pub fn level_group_size(&mut self, value: u8) -> &mut Self { + // self.level_group_size = std::cmp::max(value, 2); + // self + // } + + // /// The minimum number of elements that a level is allowed to have. + // pub fn min_level_size(&mut self, value: u8) -> &mut Self { + // self.min_level_size = std::cmp::max(value, 1); + // self + // } + + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + if self.database.is_empty(wtxn)? { + let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data); + bulk_update.execute(wtxn)?; + } else { + let indexer = FacetsUpdateIncremental::new(self.database); + + let mut new_faceted_docids = HashMap::::default(); + + let mut cursor = self.new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let key = + FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; + let docids = + CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + + for (field_id, new_docids) in new_faceted_docids { + let mut docids = + self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; + docids |= new_docids; + self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; + } + } + Ok(()) + } +} diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap +++ 
b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap index 7ed43424a..574a3c393 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -5ce8009d3eb023e4b9c0a6e7fa4e6262 +3e6a91b3c54c614a4787224ac4278ed3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap index 7ed43424a..574a3c393 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -5ce8009d3eb023e4b9c0a6e7fa4e6262 +3e6a91b3c54c614a4787224ac4278ed3 diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index df98724da..16784bd92 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::collections::HashMap; use std::convert::TryInto; use std::fs::File; use std::io; @@ -14,12 +13,12 @@ use super::helpers::{ valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; -use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; +use crate::facet::FacetType; +use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::as_cloneable_grenad; -use crate::update::FacetsUpdateIncremental; use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, - Index, Result, + lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, + Result, }; pub(crate) enum TypedChunk { @@ -138,78 +137,14 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { - // merge cbo roaring bitmaps is not the correct merger because the data in the DB - // is FacetGroupValue and not RoaringBitmap - // so I need to create my own merging function - - // facet_id_string_docids is encoded as: - 
// key: FacetKeyCodec - // value: CboRoaringBitmapCodec - // basically - - // TODO: a condition saying "if I have more than 1/50th of the DB to add, - // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50, - // it is a ratio I determine empirically - - // for now I only do it incrementally, to see if things work - let indexer = FacetsUpdateIncremental::new( - index.facet_id_f64_docids.remap_key_type::>(), - ); - - let mut new_faceted_docids = HashMap::::default(); - - let mut cursor = facet_id_f64_docids_iter.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let key = - FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; - let docids = - CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; - *new_faceted_docids.entry(key.field_id).or_default() |= docids; - } - for (field_id, new_docids) in new_faceted_docids { - let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?; - docids |= new_docids; - index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?; - } - + TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { + let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); + indexer.execute(wtxn)?; is_merged_database = true; } - TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { - // merge cbo roaring bitmaps is not the correct merger because the data in the DB - // is FacetGroupValue and not RoaringBitmap - // so I need to create my own merging function - - // facet_id_string_docids is encoded as: - // key: FacetKeyCodec - // value: CboRoaringBitmapCodec - // basically - - // TODO: a condition saying "if I have more than 1/50th of the DB to add, - // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50, - // it is a ratio I determine empirically - - // for now I only do it incrementally, to see if things work - let indexer = FacetsUpdateIncremental::new( - index.facet_id_string_docids.remap_key_type::>(), - ); - let mut new_faceted_docids = HashMap::::default(); - - let mut cursor = facet_id_string_docids.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? 
{ - let key = - FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; - let docids = - CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; - *new_faceted_docids.entry(key.field_id).or_default() |= docids; - } - for (field_id, new_docids) in new_faceted_docids { - let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?; - docids |= new_docids; - index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?; - } + TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => { + let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter); + indexer.execute(wtxn)?; is_merged_database = true; } TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { From 485a72306d6e599f5d602887a0fa02822087527d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 13:01:36 +0200 Subject: [PATCH 25/58] Refactor facet-related codecs --- milli/src/heed_codec/facet/mod.rs | 134 +++++++++++++++--- milli/src/heed_codec/facet/new/mod.rs | 120 ---------------- .../facet/{new => }/ordered_f64_codec.rs | 0 .../src/heed_codec/facet/{new => }/str_ref.rs | 0 milli/src/index.rs | 17 +-- milli/src/search/criteria/asc_desc.rs | 6 +- milli/src/search/distinct/facet_distinct.rs | 6 +- milli/src/search/facet/facet_distribution.rs | 15 +- .../search/facet/facet_distribution_iter.rs | 16 +-- milli/src/search/facet/facet_range_search.rs | 32 ++--- .../src/search/facet/facet_sort_ascending.rs | 20 +-- .../src/search/facet/facet_sort_descending.rs | 34 ++--- milli/src/search/facet/filter.rs | 16 ++- milli/src/search/facet/mod.rs | 22 +-- milli/src/snapshot_tests.rs | 6 +- milli/src/update/delete_documents.rs | 6 +- milli/src/update/facet/bulk.rs | 32 ++--- milli/src/update/facet/incremental.rs | 52 +++---- milli/src/update/facet/mod.rs | 27 ++-- .../extract/extract_facet_number_docids.rs | 8 +- .../extract/extract_facet_string_docids.rs | 8 +- .../word_pair_proximity_docids.hash.snap | 4 + 22 files changed, 280 insertions(+), 301 deletions(-) delete mode 100644 milli/src/heed_codec/facet/new/mod.rs rename milli/src/heed_codec/facet/{new => }/ordered_f64_codec.rs (100%) rename milli/src/heed_codec/facet/{new => }/str_ref.rs (100%) create mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index e145e311e..299aeceb4 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,25 +1,19 @@ -// mod facet_level_value_f64_codec; -// mod facet_level_value_u32_codec; -// mod facet_string_level_zero_codec; -// mod facet_string_level_zero_value_codec; -// mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; +mod ordered_f64_codec; +mod str_ref; -pub mod new; - -use heed::types::OwnedType; - -// pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; -// pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; -// pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; -// pub use self::facet_string_level_zero_value_codec::{ -// decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, -// }; -// pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub 
use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; -use crate::BEU16; +pub use self::ordered_f64_codec::OrderedF64Codec; +pub use self::str_ref::StrRefCodec; +use crate::{CboRoaringBitmapCodec, BEU16}; +use heed::types::OwnedType; +use heed::{BytesDecode, BytesEncode}; +use roaring::RoaringBitmap; +use std::borrow::Cow; +use std::convert::TryFrom; +use std::marker::PhantomData; pub type FieldIdCodec = OwnedType<BEU16>; @@ -32,3 +26,109 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct FacetGroupKey<T> { + pub field_id: u16, + pub level: u8, + pub left_bound: T, +} +impl<'a> FacetGroupKey<&'a [u8]> { + pub fn into_owned(self) -> FacetGroupKey<Vec<u8>> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.to_vec(), + } + } +} + +impl<'a> FacetGroupKey<Vec<u8>> { + pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.as_slice(), + } + } +} + +#[derive(Debug)] +pub struct FacetGroupValue { + pub size: u8, + pub bitmap: RoaringBitmap, +} + +pub struct FacetGroupKeyCodec<T> { + _phantom: PhantomData<T>, +} + +impl<'a, T> heed::BytesEncode<'a> for FacetGroupKeyCodec<T> +where + T: BytesEncode<'a>, + T::EItem: Sized, +{ + type EItem = FacetGroupKey<T::EItem>; + + fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { + let mut v = vec![]; + v.extend_from_slice(&value.field_id.to_be_bytes()); + v.extend_from_slice(&[value.level]); + + let bound = T::bytes_encode(&value.left_bound)?; + v.extend_from_slice(&bound); + + Some(Cow::Owned(v)) + } +} +impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T> +where + T: BytesDecode<'a>, +{ + type DItem = FacetGroupKey<T::DItem>; + + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?); + let level = bytes[2]; + let bound = T::bytes_decode(&bytes[3..])?; + Some(FacetGroupKey { field_id: fid, level, left_bound: bound }) + } +} + +pub struct FacetGroupValueCodec; +impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { + type EItem = FacetGroupValue; + + fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { + let mut v = vec![]; + v.push(value.size); + CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); + Some(Cow::Owned(v)) + } +} +impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { + type DItem = FacetGroupValue; + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + let size = bytes[0]; + let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?; + Some(FacetGroupValue { size, bitmap }) + } +} + +pub struct ByteSliceRef; + +impl<'a> BytesEncode<'a> for ByteSliceRef { + type EItem = &'a [u8]; + + fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { + Some(Cow::Borrowed(item)) + } +} + +impl<'a> BytesDecode<'a> for ByteSliceRef { + type DItem = &'a [u8]; + + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + Some(bytes) + } +} diff --git a/milli/src/heed_codec/facet/new/mod.rs b/milli/src/heed_codec/facet/new/mod.rs deleted file mode 100644 index bcb2957fc..000000000 --- a/milli/src/heed_codec/facet/new/mod.rs +++ /dev/null @@ -1,120 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryFrom; -use std::marker::PhantomData; - -use heed::{BytesDecode, BytesEncode}; -use roaring::RoaringBitmap; - -use crate::CboRoaringBitmapCodec; - -pub mod ordered_f64_codec; -pub mod str_ref; -// TODO: these codecs were quickly written and not fast/resilient enough - -#[derive(Debug,
Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub struct FacetKey { - pub field_id: u16, - pub level: u8, - pub left_bound: T, -} -impl<'a> FacetKey<&'a [u8]> { - pub fn into_owned(self) -> FacetKey> { - FacetKey { - field_id: self.field_id, - level: self.level, - left_bound: self.left_bound.to_vec(), - } - } -} - -impl<'a> FacetKey> { - pub fn as_ref(&self) -> FacetKey<&[u8]> { - FacetKey { - field_id: self.field_id, - level: self.level, - left_bound: self.left_bound.as_slice(), - } - } -} - -#[derive(Debug)] -pub struct FacetGroupValue { - pub size: u8, - pub bitmap: RoaringBitmap, -} - -pub struct FacetKeyCodec { - _phantom: PhantomData, -} - -impl<'a, T> heed::BytesEncode<'a> for FacetKeyCodec -where - T: BytesEncode<'a>, - T::EItem: Sized, -{ - type EItem = FacetKey; - - fn bytes_encode(value: &'a Self::EItem) -> Option> { - let mut v = vec![]; - v.extend_from_slice(&value.field_id.to_be_bytes()); - v.extend_from_slice(&[value.level]); - - let bound = T::bytes_encode(&value.left_bound)?; - v.extend_from_slice(&bound); - - Some(Cow::Owned(v)) - } -} -impl<'a, T> heed::BytesDecode<'a> for FacetKeyCodec -where - T: BytesDecode<'a>, -{ - type DItem = FacetKey; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?); - let level = bytes[2]; - let bound = T::bytes_decode(&bytes[3..])?; - Some(FacetKey { field_id: fid, level, left_bound: bound }) - } -} - -pub struct FacetGroupValueCodec; -impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { - type EItem = FacetGroupValue; - - fn bytes_encode(value: &'a Self::EItem) -> Option> { - let mut v = vec![]; - v.push(value.size); - CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); - Some(Cow::Owned(v)) - } -} -impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { - type DItem = FacetGroupValue; - fn bytes_decode(bytes: &'a [u8]) -> Option { - let size = bytes[0]; - let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?; - Some(FacetGroupValue { size, bitmap }) - } -} - -// TODO: get rid of this codec as it is named confusingly + should really be part of heed -// or even replace the current ByteSlice codec -pub struct MyByteSlice; - -impl<'a> BytesEncode<'a> for MyByteSlice { - type EItem = &'a [u8]; - - fn bytes_encode(item: &'a Self::EItem) -> Option> { - Some(Cow::Borrowed(item)) - } -} - -impl<'a> BytesDecode<'a> for MyByteSlice { - type DItem = &'a [u8]; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - Some(bytes) - } -} diff --git a/milli/src/heed_codec/facet/new/ordered_f64_codec.rs b/milli/src/heed_codec/facet/ordered_f64_codec.rs similarity index 100% rename from milli/src/heed_codec/facet/new/ordered_f64_codec.rs rename to milli/src/heed_codec/facet/ordered_f64_codec.rs diff --git a/milli/src/heed_codec/facet/new/str_ref.rs b/milli/src/heed_codec/facet/str_ref.rs similarity index 100% rename from milli/src/heed_codec/facet/new/str_ref.rs rename to milli/src/heed_codec/facet/str_ref.rs diff --git a/milli/src/index.rs b/milli/src/index.rs index 40e78bf10..66a53d98c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -14,15 +14,10 @@ use time::OffsetDateTime; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -use crate::heed_codec::facet::new::str_ref::StrRefCodec; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec}; -use crate::heed_codec::facet::{ - // 
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, - FieldIdCodec, -}; +use crate::heed_codec::facet::OrderedF64Codec; +use crate::heed_codec::facet::StrRefCodec; +use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec}; +use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec}; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, @@ -130,9 +125,9 @@ pub struct Index { pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. - pub facet_id_f64_docids: Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>, + pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>, /// Maps the facet field id and ranges of strings with the docids that corresponds to them. - pub facet_id_string_docids: Database<FacetKeyCodec<StrRefCodec>, FacetGroupValueCodec>, + pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index ccf66889e..2908f0e78 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; -use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; +use crate::heed_codec::facet::{FacetGroupKeyCodec, ByteSliceRef}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; use crate::search::facet::facet_sort_descending::descending_facet_sort; @@ -196,14 +196,14 @@ fn facet_ordered<'t>( let number_iter = make_iter( rtxn, - index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), + index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(), field_id, candidates.clone(), )?; let string_iter = make_iter( rtxn, - index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), + index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(), field_id, candidates, )?; diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 4a4815775..b9d584eb6 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use super::{Distinct, DocIter}; use crate::error::InternalError; -use crate::heed_codec::facet::new::FacetKey; +use crate::heed_codec::facet::FacetGroupKey; use crate::heed_codec::facet::*; use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; @@ -48,7 +48,7 @@ impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> { self.index .facet_id_string_docids - .get(self.txn, &FacetKey { field_id: self.distinct, level: 0, left_bound: key }) + .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key }) .map(|opt| opt.map(|v| v.bitmap)) } @@ -56,7 +56,7 @@ impl<'a> FacetDistinctIter<'a> { // get facet docids on level 0 self.index .facet_id_f64_docids - .get(self.txn, &FacetKey { field_id: self.distinct, level: 0, left_bound: key }) + .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key }) .map(|opt| opt.map(|v| v.bitmap)) } diff --git
a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index c7619c609..10b995d97 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -8,12 +8,11 @@ use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; -use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -use crate::heed_codec::facet::new::str_ref::StrRefCodec; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; +use crate::heed_codec::facet::OrderedF64Codec; +use crate::heed_codec::facet::StrRefCodec; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; use crate::search::facet::facet_distribution_iter; -// use crate::search::facet::FacetStringIter; use crate::{FieldId, Index, Result}; /// The default number of values by facets that will @@ -138,7 +137,7 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { facet_distribution_iter::iterate_over_facet_distribution( self.rtxn, - self.index.facet_id_f64_docids.remap_key_type::>(), + self.index.facet_id_f64_docids.remap_key_type::>(), field_id, candidates, |facet_key, nbr_docids| { @@ -161,7 +160,7 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { facet_distribution_iter::iterate_over_facet_distribution( self.rtxn, - self.index.facet_id_string_docids.remap_key_type::>(), + self.index.facet_id_string_docids.remap_key_type::>(), field_id, candidates, |facet_key, nbr_docids| { @@ -191,7 +190,7 @@ impl<'a> FacetDistribution<'a> { let iter = db .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? - .remap_types::, FacetGroupValueCodec>(); + .remap_types::, FacetGroupValueCodec>(); for result in iter { let (key, value) = result?; @@ -206,7 +205,7 @@ impl<'a> FacetDistribution<'a> { .facet_id_string_docids .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? - .remap_types::, FacetGroupValueCodec>(); + .remap_types::, FacetGroupValueCodec>(); // TODO: get the original value of the facet somewhere (in the documents DB?) for result in iter { diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 13ba28019..151304029 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -4,11 +4,11 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupValueCodec, FacetGroupKeyCodec}; pub fn iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: &RoaringBitmap, callback: CB, @@ -18,9 +18,9 @@ where { let mut fd = FacetDistribution { rtxn, db, field_id, callback }; let highest_level = - get_highest_level(rtxn, db.remap_key_type::>(), field_id)?; + get_highest_level(rtxn, db.remap_key_type::>(), field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; return Ok(()); } else { @@ -33,7 +33,7 @@ where CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, callback: CB, } @@ -49,7 +49,7 @@ where group_size: usize, ) -> Result> { let starting_key = - FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound }; + FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_bound }; let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size); for el in iter { let (key, value) = el?; @@ -78,7 +78,7 @@ where if level == 0 { return self.iterate_level_0(candidates, starting_bound, group_size); } - let starting_key = FacetKey { field_id: self.field_id, level, left_bound: starting_bound }; + let starting_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound }; let iter = self.db.range(&self.rtxn, &(&starting_key..)).unwrap().take(group_size); for el in iter { @@ -116,7 +116,7 @@ mod tests { use roaring::RoaringBitmap; use super::iterate_over_facet_distribution; - use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::test::FacetIndex; diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 039cd5c8d..a0e6d8e03 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -4,12 +4,12 @@ use heed::BytesEncode; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; +use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef}; use crate::Result; pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, @@ -42,13 +42,13 @@ where } Bound::Unbounded => Bound::Unbounded, }; - let db = db.remap_key_type::>(); + let db = db.remap_key_type::>(); let mut docids = RoaringBitmap::new(); let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; Ok(docids) } else { @@ -59,7 +59,7 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, @@ -68,7 +68,7 @@ struct FacetRangeSearch<'t, 'b, 'bitmap> { impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { let left_key = - FacetKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; + FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; let iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); for el in iter { let (key, value) = el?; @@ -117,7 +117,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { return self.run_level_0(starting_left_bound, group_size); } - let left_key = FacetKey { field_id: self.field_id, level, left_bound: starting_left_bound }; + let left_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound }; let mut iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); let (mut previous_key, mut previous_value) = iter.next().unwrap()?; @@ -258,8 +258,8 @@ mod tests { use roaring::RoaringBitmap; use super::find_docids_of_facet_within_bounds; - use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; - use crate::heed_codec::facet::new::FacetKeyCodec; + use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::FacetGroupKeyCodec; use crate::milli_snap; use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; @@ -310,7 +310,7 @@ mod tests { let end = Bound::Included(i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -326,7 +326,7 @@ mod tests { let end = Bound::Excluded(i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -352,7 +352,7 @@ mod tests { let end = Bound::Included(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -371,7 +371,7 @@ mod tests { let end = Bound::Excluded(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -399,7 +399,7 @@ mod tests { let end = Bound::Included(255. - i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -418,7 +418,7 @@ mod tests { let end = Bound::Excluded(255. 
- i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.db.content.remap_key_type::>(), 0, &start, &end, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index b3cae5d28..b601242e8 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -2,19 +2,19 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +use crate::heed_codec::facet::{ + FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, }; pub fn ascending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, ) -> Result> + 't>> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); Ok(Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] })) @@ -25,11 +25,11 @@ pub fn ascending_facet_sort<'t>( struct AscendingFacetSort<'t, 'e> { rtxn: &'t heed::RoTxn<'e>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, - std::iter::Take, FacetGroupValueCodec>>, + std::iter::Take, FacetGroupValueCodec>>, )>, } @@ -41,7 +41,7 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { let (documents_ids, deepest_iter) = self.stack.last_mut()?; for result in deepest_iter { let ( - FacetKey { level, left_bound, field_id }, + FacetGroupKey { level, left_bound, field_id }, FacetGroupValue { size: group_size, mut bitmap }, ) = result.unwrap(); // The range is unbounded on the right and the group size for the highest level is MAX, @@ -65,7 +65,7 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { return Some(Ok(bitmap)); } let starting_key_below = - FacetKey { field_id: self.field_id, level: level - 1, left_bound }; + FacetGroupKey { field_id: self.field_id, level: level - 1, left_bound }; let iter = match self.db.range(&self.rtxn, &(starting_key_below..)) { Ok(iter) => iter, Err(e) => return Some(Err(e.into())), @@ -86,7 +86,7 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; use crate::search::facet::test::FacetIndex; diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index d68c9bdad..088f8d2fa 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -4,21 +4,21 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, 
FacetKeyCodec, MyByteSlice, +use crate::heed_codec::facet::{ + FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, }; pub fn descending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, ) -> Result> + 't>> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); - let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound }; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); Ok(Box::new(DescendingFacetSort { rtxn, @@ -33,11 +33,11 @@ pub fn descending_facet_sort<'t>( struct DescendingFacetSort<'t> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, - std::iter::Take, FacetGroupValueCodec>>, + std::iter::Take, FacetGroupValueCodec>>, Bound<&'t [u8]>, )>, } @@ -50,7 +50,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?; while let Some(result) = deepest_iter.next() { let ( - FacetKey { level, left_bound, field_id }, + FacetGroupKey { level, left_bound, field_id }, FacetGroupValue { size: group_size, mut bitmap }, ) = result.unwrap(); // The range is unbounded on the right and the group size for the highest level is MAX, @@ -72,15 +72,15 @@ impl<'t> Iterator for DescendingFacetSort<'t> { if level == 0 { return Some(Ok(bitmap)); } - let starting_key_below = FacetKey { field_id, level: level - 1, left_bound }; + let starting_key_below = FacetGroupKey { field_id, level: level - 1, left_bound }; let end_key_kelow = match *right_bound { - Bound::Included(right) => Bound::Included(FacetKey { + Bound::Included(right) => Bound::Included(FacetGroupKey { field_id, level: level - 1, left_bound: right, }), - Bound::Excluded(right) => Bound::Excluded(FacetKey { + Bound::Excluded(right) => Bound::Excluded(FacetGroupKey { field_id, level: level - 1, left_bound: right, @@ -90,7 +90,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { let prev_right_bound = *right_bound; *right_bound = Bound::Excluded(left_bound); let iter = - match self.db.remap_key_type::>().rev_range( + match self.db.remap_key_type::>().rev_range( &self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow), ) { @@ -114,8 +114,8 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; - use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; + use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::{FacetGroupKeyCodec, ByteSliceRef}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::test::FacetIndex; @@ -162,7 +162,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let 
candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); - let db = index.db.content.remap_key_type::>(); + let db = index.db.content.remap_key_type::>(); let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); for el in iter { let docids = el.unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 6a10b7097..1b40f6db1 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -9,8 +9,8 @@ use roaring::RoaringBitmap; use super::facet_range_search; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; +use crate::heed_codec::facet::OrderedF64Codec; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; /// The maximum number of filters the filter AST can process. @@ -180,7 +180,11 @@ impl<'a> Filter<'a> { let string_docids = strings_db .get( rtxn, - &FacetKey { field_id, level: 0, left_bound: &val.value().to_lowercase() }, + &FacetGroupKey { + field_id, + level: 0, + left_bound: &val.value().to_lowercase(), + }, )? .map(|v| v.bitmap) .unwrap_or_default(); @@ -218,10 +222,10 @@ impl<'a> Filter<'a> { .remap_data_type::() .get_lower_than_or_equal_to( rtxn, - &FacetKey { field_id, level: u8::MAX, left_bound: f64::MAX }, + &FacetGroupKey { field_id, level: u8::MAX, left_bound: f64::MAX }, )? .and_then( - |(FacetKey { field_id: id, level, .. }, _)| { + |(FacetGroupKey { field_id: id, level, .. }, _)| { if id == field_id { Some(level) } else { @@ -252,7 +256,7 @@ impl<'a> Filter<'a> { /// going deeper through the levels. 
fn explore_facet_number_levels( rtxn: &heed::RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: FieldId, level: u8, left: Bound, diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 78cd8fd4b..ec5caa2a8 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -3,7 +3,7 @@ use heed::{BytesDecode, RoTxn}; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::Filter; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; +use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; mod facet_distribution; mod facet_distribution_iter; @@ -14,7 +14,7 @@ mod filter; pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -28,7 +28,7 @@ where if let Some(first) = level0_iter_forward.next() { let (first_key, _) = first?; let first_key = - FacetKeyCodec::::bytes_decode(first_key).ok_or(heed::Error::Encoding)?; + FacetGroupKeyCodec::::bytes_decode(first_key).ok_or(heed::Error::Encoding)?; Ok(Some(first_key.left_bound)) } else { Ok(None) @@ -36,7 +36,7 @@ where } pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -51,7 +51,7 @@ where if let Some(last) = level0_iter_backward.next() { let (last_key, _) = last?; let last_key = - FacetKeyCodec::::bytes_decode(last_key).ok_or(heed::Error::Encoding)?; + FacetGroupKeyCodec::::bytes_decode(last_key).ok_or(heed::Error::Encoding)?; Ok(Some(last_key.left_bound)) } else { Ok(None) @@ -59,7 +59,7 @@ where } pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); @@ -69,7 +69,7 @@ pub(crate) fn get_highest_level<'t>( .next() .map(|el| { let (key, _) = el.unwrap(); - let key = FacetKeyCodec::::bytes_decode(key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); key.level }) .unwrap_or(0)) @@ -84,8 +84,8 @@ pub mod test { use heed::{BytesDecode, BytesEncode, Env, RwTxn}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, + use crate::heed_codec::facet::{ + FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, }; use crate::snapshot_tests::display_bitmap; use crate::update::FacetsUpdateIncremental; @@ -101,7 +101,7 @@ pub mod test { } pub struct Database { - pub content: heed::Database, FacetGroupValueCodec>, + pub content: heed::Database, FacetGroupValueCodec>, pub group_size: usize, pub max_group_size: usize, _tempdir: Rc, @@ -184,7 +184,7 @@ pub mod test { let mut iter = self.db.content.iter(&txn).unwrap(); while let Some(el) = iter.next() { let (key, value) = el.unwrap(); - let FacetKey { field_id, level, left_bound: bound } = key; + let FacetGroupKey { field_id, level, left_bound: bound } = key; let bound = BoundCodec::bytes_decode(bound).unwrap(); let FacetGroupValue { size, bitmap } = value; writeln!( diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 57fd2e5fe..ab9dddaf2 100644 --- a/milli/src/snapshot_tests.rs 
+++ b/milli/src/snapshot_tests.rs @@ -5,7 +5,7 @@ use std::path::Path; use roaring::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; +use crate::heed_codec::facet::{FacetGroupValue, FacetGroupKey}; use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; #[track_caller] @@ -280,7 +280,7 @@ pub fn snap_word_prefix_position_docids(index: &Index) -> String { } pub fn snap_facet_id_f64_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( - FacetKey { field_id, level, left_bound }, + FacetGroupKey { field_id, level, left_bound }, FacetGroupValue { size, bitmap }, )| { &format!("{field_id:<3} {level:<2} {left_bound:<6} {size:<2} {}", display_bitmap(&bitmap)) @@ -289,7 +289,7 @@ pub fn snap_facet_id_f64_docids(index: &Index) -> String { } pub fn snap_facet_id_string_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, facet_id_string_docids, |( - FacetKey { field_id, level, left_bound }, + FacetGroupKey { field_id, level, left_bound }, FacetGroupValue { size, bitmap }, )| { &format!("{field_id:<3} {level:<2} {left_bound:<12} {size:<2} {}", display_bitmap(&bitmap)) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index ffa63f0a7..5b9e99d77 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -11,7 +11,7 @@ use time::OffsetDateTime; use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; +use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ @@ -626,10 +626,10 @@ fn remove_docids_from_facet_id_docids<'a>( ) -> Result<()> { let db = match facet_type { FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() + index.facet_id_string_docids.remap_key_type::>() } FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; let mut modified = false; diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 0a4b7db45..38017a83d 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -12,8 +12,8 @@ use time::OffsetDateTime; use crate::error::InternalError; use crate::facet::FacetType; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +use crate::heed_codec::facet::{ + FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, }; use crate::update::index_documents::{ create_writer, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, @@ -22,7 +22,7 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; pub struct FacetsUpdateBulk<'i> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, + database: heed::Database, FacetGroupValueCodec>, level_group_size: usize, min_level_size: usize, facet_type: FacetType, @@ -40,10 +40,10 @@ impl<'i> FacetsUpdateBulk<'i> { index, database: match facet_type { FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() + index.facet_id_string_docids.remap_key_type::>() } FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }, 
level_group_size: 4, @@ -61,10 +61,10 @@ impl<'i> FacetsUpdateBulk<'i> { index, database: match facet_type { FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() + index.facet_id_string_docids.remap_key_type::>() } FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }, level_group_size: 4, @@ -89,8 +89,8 @@ impl<'i> FacetsUpdateBulk<'i> { } fn clear_levels(&self, wtxn: &mut heed::RwTxn, field_id: FieldId) -> Result<()> { - let left = FacetKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; - let right = FacetKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; + let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; + let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; let range = left..=right; self.database.delete_range(wtxn, &range).map(drop)?; Ok(()) @@ -119,7 +119,7 @@ impl<'i> FacetsUpdateBulk<'i> { for level_reader in level_readers { let mut cursor = level_reader.into_cursor()?; while let Some((k, v)) = cursor.move_on_next()? { - let key = FacetKeyCodec::::bytes_decode(k).unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(k).unwrap(); let value = FacetGroupValueCodec::bytes_decode(v).unwrap(); println!("inserting {key:?} {value:?}"); @@ -210,7 +210,7 @@ impl<'i> FacetsUpdateBulk<'i> { struct ComputeHigherLevels<'t> { rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, level_group_size: usize, min_level_size: usize, @@ -233,7 +233,7 @@ impl<'t> ComputeHigherLevels<'t> { .db .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, level_0_prefix.as_slice())? - .remap_types::, FacetGroupValueCodec>(); + .remap_types::, FacetGroupValueCodec>(); let mut left_bound: &[u8] = &[]; let mut first_iteration_for_new_group = true; @@ -311,9 +311,9 @@ impl<'t> ComputeHigherLevels<'t> { for ((bitmap, left_bound), group_size) in bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { - let key = FacetKey { field_id: self.field_id, level, left_bound }; + let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; let key = - FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + FacetGroupKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; @@ -329,9 +329,9 @@ impl<'t> ComputeHigherLevels<'t> { for ((bitmap, left_bound), group_size) in bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { - let key = FacetKey { field_id: self.field_id, level, left_bound }; + let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; let key = - FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + FacetGroupKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; cur_writer.insert(key, value)?; diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index e32a6baf1..e86aa4402 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -2,8 +2,8 @@ use heed::types::ByteSlice; use heed::{BytesDecode, Error, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, 
FacetKeyCodec, MyByteSlice, +use crate::heed_codec::facet::{ + FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, }; use crate::search::facet::get_highest_level; use crate::Result; @@ -19,13 +19,13 @@ enum DeletionResult { } pub struct FacetsUpdateIncremental { - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, group_size: usize, min_level_size: usize, max_group_size: usize, } impl FacetsUpdateIncremental { - pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { + pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } } } @@ -36,7 +36,7 @@ impl FacetsUpdateIncremental { level: u8, search_key: &[u8], txn: &RoTxn, - ) -> Result<(FacetKey>, FacetGroupValue)> { + ) -> Result<(FacetGroupKey>, FacetGroupValue)> { let mut prefix = vec![]; prefix.extend_from_slice(&field_id.to_be_bytes()); prefix.push(level); @@ -45,17 +45,17 @@ impl FacetsUpdateIncremental { let mut prefix_iter = self .db .as_polymorph() - .prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>(txn, &prefix.as_slice())?; + .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(txn, &prefix.as_slice())?; if let Some(e) = prefix_iter.next() { let (key_bytes, value) = e?; Ok(( - FacetKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)? .into_owned(), value, )) } else { - let key = FacetKey { field_id, level, left_bound: search_key }; + let key = FacetGroupKey { field_id, level, left_bound: search_key }; match self.db.get_lower_than(txn, &key)? { Some((key, value)) => { if key.level != level || key.field_id != field_id { @@ -66,13 +66,13 @@ impl FacetsUpdateIncremental { let mut iter = self .db .as_polymorph() - .prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>( + .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>( txn, &prefix.as_slice(), )?; let (key_bytes, value) = iter.next().unwrap()?; Ok(( - FacetKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)? 
.into_owned(), value, @@ -93,7 +93,7 @@ impl FacetsUpdateIncremental { new_key: &[u8], new_values: &RoaringBitmap, ) -> Result { - let key = FacetKey { field_id, level: 0, left_bound: new_key }; + let key = FacetGroupKey { field_id, level: 0, left_bound: new_key }; let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 }; let mut level0_prefix = vec![]; @@ -193,7 +193,7 @@ impl FacetsUpdateIncremental { .db .get_greater_than_or_equal_to( &txn, - &FacetKey { + &FacetGroupKey { field_id, level: level_below, left_bound: insertion_key.left_bound.as_slice(), @@ -217,7 +217,7 @@ impl FacetsUpdateIncremental { } let key = - FacetKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; + FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; (key, value) }; @@ -235,7 +235,7 @@ impl FacetsUpdateIncremental { } let key = - FacetKey { field_id, level, left_bound: right_start_key.unwrap().to_vec() }; + FacetGroupKey { field_id, level, left_bound: right_start_key.unwrap().to_vec() }; let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; (key, value) }; @@ -303,7 +303,7 @@ impl FacetsUpdateIncremental { let mut values = RoaringBitmap::new(); for _ in 0..group_size { let (key_bytes, value_i) = groups_iter.next().unwrap()?; - let key_i = FacetKeyCodec::::bytes_decode(&key_bytes) + let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)?; if first_key.is_none() { @@ -311,7 +311,7 @@ impl FacetsUpdateIncremental { } values |= value_i.bitmap; } - let key = FacetKey { + let key = FacetGroupKey { field_id, level: highest_level + 1, left_bound: first_key.unwrap().left_bound, @@ -384,7 +384,7 @@ impl FacetsUpdateIncremental { key: &[u8], value: u32, ) -> Result { - let key = FacetKey { field_id, level: 0, left_bound: key }; + let key = FacetGroupKey { field_id, level: 0, left_bound: key }; let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; bitmap.remove(value); @@ -415,7 +415,7 @@ impl FacetsUpdateIncremental { key: &[u8], value: u32, ) -> Result<()> { - if self.db.get(txn, &FacetKey { field_id, level: 0, left_bound: key })?.is_none() { + if self.db.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: key })?.is_none() { return Ok(()); } let highest_level = get_highest_level(&txn, self.db, field_id)?; @@ -450,7 +450,7 @@ impl FacetsUpdateIncremental { while let Some(el) = iter.next() { let (k, _) = el?; to_delete.push( - FacetKeyCodec::::bytes_decode(k).ok_or(Error::Encoding)?.into_owned(), + FacetGroupKeyCodec::::bytes_decode(k).ok_or(Error::Encoding)?.into_owned(), ); } drop(iter); @@ -469,9 +469,9 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; - use crate::heed_codec::facet::new::str_ref::StrRefCodec; - use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; + use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::str_ref::StrRefCodec; + use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; use crate::milli_snap; use crate::search::facet::get_highest_level; use crate::search::facet::test::FacetIndex; @@ -502,7 +502,7 @@ mod tests { .unwrap(); while let Some(el) = iter.next() { let (key, value) = el.unwrap(); - let key = FacetKeyCodec::::bytes_decode(&key).unwrap(); + let key = 
FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); let mut prefix_start_below = vec![]; prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); @@ -519,7 +519,7 @@ mod tests { ) .unwrap(); let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); - FacetKeyCodec::::bytes_decode(&key_bytes).unwrap() + FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() }; assert!(value.size > 0 && (value.size as usize) < db.max_group_size); @@ -996,7 +996,7 @@ mod tests { // for ((key, values), group) in values_field_id.iter().zip(level0iter) { // let (group_key, group_values) = group.unwrap(); -// let group_key = FacetKeyCodec::::bytes_decode(group_key).unwrap(); +// let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); // assert_eq!(key, &group_key.left_bound); // assert_eq!(values, &group_values.bitmap); // } @@ -1014,7 +1014,7 @@ mod tests { // for ((key, values), group) in values_field_id.iter().zip(level0iter) { // let (group_key, group_values) = group.unwrap(); -// let group_key = FacetKeyCodec::::bytes_decode(group_key).unwrap(); +// let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); // assert_eq!(key, &group_key.left_bound); // assert_eq!(values, &group_values.bitmap); // } diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 00964a406..77b42f355 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -1,23 +1,20 @@ -use std::{collections::HashMap, fs::File}; - +use super::{FacetsUpdateBulk, FacetsUpdateIncremental}; +use crate::{ + facet::FacetType, + heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}, + CboRoaringBitmapCodec, FieldId, Index, Result, +}; use grenad::CompressionType; use heed::BytesDecode; use roaring::RoaringBitmap; - -use crate::{ - facet::FacetType, - heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}, - CboRoaringBitmapCodec, FieldId, Index, Result, -}; - -use super::{FacetsUpdateBulk, FacetsUpdateIncremental}; +use std::{collections::HashMap, fs::File}; pub mod bulk; pub mod incremental; pub struct FacetsUpdate<'i> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, + database: heed::Database, FacetGroupValueCodec>, level_group_size: u8, max_level_group_size: u8, min_level_size: u8, @@ -28,10 +25,10 @@ impl<'i> FacetsUpdate<'i> { pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { let database = match facet_type { FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() + index.facet_id_string_docids.remap_key_type::>() } FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; Self { @@ -70,8 +67,8 @@ impl<'i> FacetsUpdate<'i> { let mut cursor = self.new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ - let key = - FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; + let key = FacetGroupKeyCodec::::bytes_decode(key) + .ok_or(heed::Error::Encoding)?; let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index eece08ee3..9a89691b1 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,9 +6,9 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; -use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; use crate::heed_codec::facet::FieldDocIdFacetF64Codec; +use crate::heed_codec::facet::OrderedF64Codec; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. @@ -36,8 +36,8 @@ pub fn extract_facet_number_docids( let (field_id, document_id, number) = FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); - let key = FacetKey { field_id, level: 0, left_bound: number }; - let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); + let key = FacetGroupKey { field_id, level: 0, left_bound: number }; + let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 591f44c74..078a82335 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -4,8 +4,8 @@ use std::io; use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; -use crate::heed_codec::facet::new::str_ref::StrRefCodec; -use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; +use crate::heed_codec::facet::StrRefCodec; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{FieldId, Result}; @@ -43,8 +43,8 @@ pub fn extract_facet_string_docids( let document_id = u32::from_be_bytes(document_id_bytes); let normalised_value = std::str::from_utf8(normalized_value_bytes)?; - let key = FacetKey { field_id, level: 0, left_bound: normalised_value }; - let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); + let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; + let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?; } diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..e50e50347 --- /dev/null +++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap @@ -0,0 
+1,4 @@ +--- +source: milli/src/update/word_prefix_pair_proximity_docids.rs +--- +6873ff1f78d08f2b1a13bb9e37349c01 From 330c9eb1b28ad84cb7f710f58682b254fae1d06d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 13:49:52 +0200 Subject: [PATCH 26/58] Rename facet codecs and refine FacetsUpdate API --- .../search/facet/facet_distribution_iter.rs | 19 +-- milli/src/search/facet/facet_range_search.rs | 9 +- .../src/search/facet/facet_sort_ascending.rs | 4 +- .../src/search/facet/facet_sort_descending.rs | 27 ++-- milli/src/update/facet/bulk.rs | 134 ++++++++---------- milli/src/update/facet/incremental.rs | 50 ++++--- milli/src/update/facet/mod.rs | 24 +--- .../incremental.rs/delete_from_end/15.snap | 4 - 8 files changed, 133 insertions(+), 138 deletions(-) diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 151304029..6eec64b25 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -4,7 +4,9 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupValueCodec, FacetGroupKeyCodec}; +use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, +}; pub fn iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, @@ -78,7 +80,8 @@ where if level == 0 { return self.iterate_level_0(candidates, starting_bound, group_size); } - let starting_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound }; + let starting_key = + FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound }; let iter = self.db.range(&self.rtxn, &(&starting_key..)).unwrap().take(group_size); for el in iter { @@ -109,16 +112,14 @@ where #[cfg(test)] mod tests { - use std::ops::ControlFlow; - + use super::iterate_over_facet_distribution; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::milli_snap; + use crate::search::facet::test::FacetIndex; use heed::BytesDecode; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - - use super::iterate_over_facet_distribution; - use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; - use crate::milli_snap; - use crate::search::facet::test::FacetIndex; + use std::ops::ControlFlow; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index a0e6d8e03..d9a6c5fd4 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -4,7 +4,9 @@ use heed::BytesEncode; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef}; +use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, +}; use crate::Result; pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( @@ -117,7 +119,8 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { return self.run_level_0(starting_left_bound, group_size); } - let left_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound }; + let left_key = + FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound }; let mut iter = 
self.db.range(&self.rtxn, &(left_key..))?.take(group_size); let (mut previous_key, mut previous_value) = iter.next().unwrap()?; @@ -258,8 +261,8 @@ mod tests { use roaring::RoaringBitmap; use super::find_docids_of_facet_within_bounds; - use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; use crate::heed_codec::facet::FacetGroupKeyCodec; + use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index b601242e8..e620f9f1d 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -3,7 +3,7 @@ use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; pub fn ascending_facet_sort<'t>( @@ -86,7 +86,7 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; use crate::search::facet::test::FacetIndex; diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 088f8d2fa..5425a5051 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; pub fn descending_facet_sort<'t>( @@ -37,7 +37,9 @@ struct DescendingFacetSort<'t> { field_id: u16, stack: Vec<( RoaringBitmap, - std::iter::Take, FacetGroupValueCodec>>, + std::iter::Take< + heed::RoRevRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + >, Bound<&'t [u8]>, )>, } @@ -72,7 +74,8 @@ impl<'t> Iterator for DescendingFacetSort<'t> { if level == 0 { return Some(Ok(bitmap)); } - let starting_key_below = FacetGroupKey { field_id, level: level - 1, left_bound }; + let starting_key_below = + FacetGroupKey { field_id, level: level - 1, left_bound }; let end_key_kelow = match *right_bound { Bound::Included(right) => Bound::Included(FacetGroupKey { @@ -89,15 +92,17 @@ impl<'t> Iterator for DescendingFacetSort<'t> { }; let prev_right_bound = *right_bound; *right_bound = Bound::Excluded(left_bound); - let iter = - match self.db.remap_key_type::>().rev_range( + let iter = match self + .db + .remap_key_type::>() + .rev_range( &self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow), ) { - Ok(iter) => iter, - Err(e) => return Some(Err(e.into())), - } - .take(group_size as usize); + Ok(iter) => iter, + Err(e) => return Some(Err(e.into())), + } + .take(group_size as usize); self.stack.push((bitmap, iter, prev_right_bound)); continue 'outer; @@ -114,8 +119,8 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; - use crate::heed_codec::facet::{FacetGroupKeyCodec, ByteSliceRef}; + use 
crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::test::FacetIndex; diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 38017a83d..70392b7db 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,30 +1,24 @@ +use crate::facet::FacetType; +use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::update::index_documents::{create_writer, writer_into_reader}; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use log::debug; +use roaring::RoaringBitmap; use std::borrow::Cow; use std::cmp; use std::fs::File; -use std::num::NonZeroUsize; - -use grenad::CompressionType; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; -use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::error::InternalError; -use crate::facet::FacetType; -use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, -}; -use crate::update::index_documents::{ - create_writer, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, -}; -use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; - pub struct FacetsUpdateBulk<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, - level_group_size: usize, - min_level_size: usize, + level_group_size: u8, + min_level_size: u8, facet_type: FacetType, // None if level 0 does not need to be updated new_data: Option>, @@ -39,9 +33,9 @@ impl<'i> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, database: match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), FacetType::Number => { index.facet_id_f64_docids.remap_key_type::>() } @@ -60,9 +54,9 @@ impl<'i> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, database: match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), FacetType::Number => { index.facet_id_f64_docids.remap_key_type::>() } @@ -77,14 +71,14 @@ impl<'i> FacetsUpdateBulk<'i> { /// The number of elements from the level below that are represented by a single element in the level above /// /// This setting is always greater than or equal to 2. - pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.level_group_size = cmp::max(value.get(), 2); + pub fn level_group_size(mut self, value: u8) -> Self { + self.level_group_size = cmp::max(value, 2); self } /// The minimum number of elements that a level is allowed to have. 
- pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.min_level_size = value.get(); + pub fn min_level_size(mut self, value: u8) -> Self { + self.min_level_size = cmp::max(value, 1); self } @@ -109,8 +103,6 @@ impl<'i> FacetsUpdateBulk<'i> { } self.update_level0(wtxn)?; - // let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?; - for &field_id in faceted_fields.iter() { let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; @@ -119,10 +111,6 @@ impl<'i> FacetsUpdateBulk<'i> { for level_reader in level_readers { let mut cursor = level_reader.into_cursor()?; while let Some((k, v)) = cursor.move_on_next()? { - let key = FacetGroupKeyCodec::::bytes_decode(k).unwrap(); - let value = FacetGroupValueCodec::bytes_decode(v).unwrap(); - println!("inserting {key:?} {value:?}"); - self.database.remap_types::().put(wtxn, k, v)?; } } @@ -141,14 +129,12 @@ impl<'i> FacetsUpdateBulk<'i> { let mut database = self.database.iter_mut(wtxn)?.remap_types::(); let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { - if valid_lmdb_key(key) { - buffer.clear(); - // the group size for level 0 - buffer.push(1); - // then we extend the buffer with the docids bitmap - buffer.extend_from_slice(value); - unsafe { database.append(key, &buffer)? }; - } + buffer.clear(); + // the group size for level 0 + buffer.push(1); + // then we extend the buffer with the docids bitmap + buffer.extend_from_slice(value); + unsafe { database.append(key, &buffer)? }; } } else { let mut buffer = Vec::new(); @@ -156,25 +142,24 @@ impl<'i> FacetsUpdateBulk<'i> { let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { - if valid_lmdb_key(key) { - buffer.clear(); - // the group size for level 0 - buffer.push(1); - // then we extend the buffer with the docids bitmap - match database.get(wtxn, key)? { - Some(prev_value) => { - let old_bitmap = &prev_value[1..]; - CboRoaringBitmapCodec::merge_into( - &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], - &mut buffer, - )?; - } - None => { - buffer.extend_from_slice(value); - } - }; - database.put(wtxn, key, &buffer)?; - } + // the value is a CboRoaringBitmap, but I still need to prepend the + // group size for level 0 (= 1) to it + buffer.clear(); + buffer.push(1); + // then we extend the buffer with the docids bitmap + match database.get(wtxn, key)? { + Some(prev_value) => { + let old_bitmap = &prev_value[1..]; + CboRoaringBitmapCodec::merge_into( + &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], + &mut buffer, + )?; + } + None => { + buffer.extend_from_slice(value); + } + }; + database.put(wtxn, key, &buffer)?; } } @@ -186,7 +171,7 @@ impl<'i> FacetsUpdateBulk<'i> { field_id: FieldId, txn: &RoTxn, ) -> Result<(Vec>, RoaringBitmap)> { - // TODO: first check whether there is anything in level 0 + // TODO: first check whether there is anything in level 0? 
let algo = ComputeHigherLevels { rtxn: txn, db: &self.database, @@ -212,8 +197,8 @@ struct ComputeHigherLevels<'t> { rtxn: &'t heed::RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, - level_group_size: usize, - min_level_size: usize, + level_group_size: u8, + min_level_size: u8, } impl<'t> ComputeHigherLevels<'t> { fn read_level_0( @@ -248,7 +233,7 @@ impl<'t> ComputeHigherLevels<'t> { } bitmaps.push(docids); - if bitmaps.len() == self.level_group_size { + if bitmaps.len() == self.level_group_size as usize { handle_group(&bitmaps, left_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); @@ -265,9 +250,8 @@ impl<'t> ComputeHigherLevels<'t> { /// Compute the content of the database levels from its level 0 for the given field id. /// /// ## Returns: - /// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` + /// A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` /// that must be inserted into the database. - /// 2. a roaring bitmap of all the document ids present in the database fn compute_higher_levels( &self, level: u8, @@ -302,7 +286,7 @@ impl<'t> ComputeHigherLevels<'t> { left_bounds.push(left_bound); bitmaps.push(combined_bitmap); - if bitmaps.len() != self.level_group_size { + if bitmaps.len() != self.level_group_size as usize { return Ok(()); } let left_bound = left_bounds.first().unwrap(); @@ -312,8 +296,8 @@ impl<'t> ComputeHigherLevels<'t> { bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; - let key = - FacetGroupKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + let key = FacetGroupKeyCodec::::bytes_encode(&key) + .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; @@ -330,8 +314,8 @@ impl<'t> ComputeHigherLevels<'t> { bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; - let key = - FacetGroupKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + let key = FacetGroupKeyCodec::::bytes_encode(&key) + .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; cur_writer.insert(key, value)?; @@ -340,6 +324,10 @@ impl<'t> ComputeHigherLevels<'t> { } if cur_writer_len > self.min_level_size { sub_writers.push(writer_into_reader(cur_writer)?); + } else { + if !bitmaps.is_empty() { + handle_group(&bitmaps, left_bounds.first().unwrap())?; + } } return Ok(sub_writers); } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index e86aa4402..bcde3bc53 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -3,7 +3,7 @@ use heed::{BytesDecode, Error, RoTxn, RwTxn}; use roaring::RoaringBitmap; use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::search::facet::get_highest_level; use crate::Result; @@ -20,14 +20,26 @@ enum DeletionResult { pub struct FacetsUpdateIncremental { db: heed::Database, FacetGroupValueCodec>, - group_size: usize, - min_level_size: usize, - max_group_size: usize, + group_size: u8, + 
min_level_size: u8, + max_group_size: u8, } impl FacetsUpdateIncremental { pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } } + pub fn group_size(mut self, size: u8) -> Self { + self.group_size = size; + self + } + pub fn min_level_size(mut self, size: u8) -> Self { + self.min_level_size = size; + self + } + pub fn max_group_size(mut self, size: u8) -> Self { + self.max_group_size = size; + self + } } impl FacetsUpdateIncremental { fn find_insertion_key_value( @@ -178,12 +190,7 @@ impl FacetsUpdateIncremental { let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); updated_value.size += 1; - if updated_value.size as usize == max_group_size { - // need to split it - // recompute left element and right element - // replace current group by left element - // add one more group to the right - + if updated_value.size == max_group_size { let size_left = max_group_size / 2; let size_right = max_group_size - size_left; @@ -201,7 +208,7 @@ impl FacetsUpdateIncremental { )? .unwrap(); - let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size); + let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize); let group_left = { let mut values_left = RoaringBitmap::new(); @@ -234,8 +241,11 @@ impl FacetsUpdateIncremental { values_right |= &value.bitmap; } - let key = - FacetGroupKey { field_id, level, left_bound: right_start_key.unwrap().to_vec() }; + let key = FacetGroupKey { + field_id, + level, + left_bound: right_start_key.unwrap().to_vec(), + }; let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; (key, value) }; @@ -288,7 +298,7 @@ impl FacetsUpdateIncremental { .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? .count(); - if size_highest_level < self.group_size * self.min_level_size { + if size_highest_level < self.group_size as usize * self.min_level_size as usize { return Ok(()); } @@ -438,7 +448,7 @@ impl FacetsUpdateIncremental { .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? .count() - >= self.group_size + >= self.min_level_size as usize { return Ok(()); } @@ -450,7 +460,9 @@ impl FacetsUpdateIncremental { while let Some(el) = iter.next() { let (k, _) = el?; to_delete.push( - FacetGroupKeyCodec::::bytes_decode(k).ok_or(Error::Encoding)?.into_owned(), + FacetGroupKeyCodec::::bytes_decode(k) + .ok_or(Error::Encoding)? 
+ .into_owned(), ); } drop(iter); @@ -469,9 +481,9 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; - use crate::heed_codec::facet::str_ref::StrRefCodec; - use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::facet::StrRefCodec; + use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::milli_snap; use crate::search::facet::get_highest_level; use crate::search::facet::test::FacetIndex; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 77b42f355..04810cb48 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -4,7 +4,6 @@ use crate::{ heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}, CboRoaringBitmapCodec, FieldId, Index, Result, }; -use grenad::CompressionType; use heed::BytesDecode; use roaring::RoaringBitmap; use std::{collections::HashMap, fs::File}; @@ -42,26 +41,17 @@ impl<'i> FacetsUpdate<'i> { } } - // /// The number of elements from the level below that are represented by a single element in the level above - // /// - // /// This setting is always greater than or equal to 2. - // pub fn level_group_size(&mut self, value: u8) -> &mut Self { - // self.level_group_size = std::cmp::max(value, 2); - // self - // } - - // /// The minimum number of elements that a level is allowed to have. - // pub fn min_level_size(&mut self, value: u8) -> &mut Self { - // self.min_level_size = std::cmp::max(value, 1); - // self - // } - pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + // here, come up with a better condition! if self.database.is_empty(wtxn)? 
{ - let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data); + let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data) + .level_group_size(self.level_group_size) + .min_level_size(self.min_level_size); bulk_update.execute(wtxn)?; } else { - let indexer = FacetsUpdateIncremental::new(self.database); + let indexer = FacetsUpdateIncremental::new(self.database) + .max_group_size(self.max_level_group_size) + .min_level_size(self.min_level_size); let mut new_faceted_docids = HashMap::::default(); diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap index 08534cbd4..e037c0295 100644 --- a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap @@ -16,8 +16,4 @@ source: milli/src/update/facet/incremental.rs 0 0 k12 1 "[12, ]" 0 0 k13 1 "[13, ]" 0 0 k14 1 "[14, ]" -0 1 k0 4 "[0, 1, 2, 3, ]" -0 1 k4 4 "[4, 5, 6, 7, ]" -0 1 k8 4 "[8, 9, 10, 11, ]" -0 1 k12 3 "[12, 13, 14, ]" From 9026867d17744a0a95ea4086d0efff48ddd323af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 17:31:26 +0200 Subject: [PATCH 27/58] Give same interface to bulk and incremental facet indexing types + cargo fmt, oops, sorry for the bad history :( --- milli/src/heed_codec/facet/mod.rs | 14 ++- milli/src/index.rs | 8 +- milli/src/search/criteria/asc_desc.rs | 2 +- milli/src/search/distinct/facet_distinct.rs | 3 +- milli/src/search/facet/facet_distribution.rs | 8 +- .../search/facet/facet_distribution_iter.rs | 14 ++- milli/src/search/facet/facet_range_search.rs | 7 +- .../src/search/facet/facet_sort_ascending.rs | 4 +- .../src/search/facet/facet_sort_descending.rs | 7 +- milli/src/search/facet/filter.rs | 5 +- milli/src/search/facet/mod.rs | 59 ++++++--- milli/src/snapshot_tests.rs | 2 +- milli/src/update/clear_documents.rs | 3 +- milli/src/update/delete_documents.rs | 2 +- milli/src/update/facet/bulk.rs | 118 ++++++++++++++++-- milli/src/update/facet/incremental.rs | 117 ++++++++++++----- milli/src/update/facet/mod.rs | 68 +++++----- .../default/facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../default/facet_id_string_docids.hash.snap | 4 + .../facet_id_string_docids.hash.snap | 4 + .../extract/extract_facet_number_docids.rs | 6 +- .../extract/extract_facet_string_docids.rs | 3 +- .../index_documents/helpers/grenad_helpers.rs | 32 +---- .../src/update/index_documents/helpers/mod.rs | 4 +- milli/src/update/index_documents/mod.rs | 3 +- milli/src/update/mod.rs | 2 +- 27 files changed, 333 insertions(+), 174 deletions(-) create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 299aeceb4..40e395881 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -3,17 +3,19 @@ mod field_doc_id_facet_string_codec; mod ordered_f64_codec; 
mod str_ref; +use std::borrow::Cow; +use std::convert::TryFrom; +use std::marker::PhantomData; + +use heed::types::OwnedType; +use heed::{BytesDecode, BytesEncode}; +use roaring::RoaringBitmap; + pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; pub use self::ordered_f64_codec::OrderedF64Codec; pub use self::str_ref::StrRefCodec; use crate::{CboRoaringBitmapCodec, BEU16}; -use heed::types::OwnedType; -use heed::{BytesDecode, BytesEncode}; -use roaring::RoaringBitmap; -use std::borrow::Cow; -use std::convert::TryFrom; -use std::marker::PhantomData; pub type FieldIdCodec = OwnedType; diff --git a/milli/src/index.rs b/milli/src/index.rs index 66a53d98c..893817d59 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -14,10 +14,10 @@ use time::OffsetDateTime; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::StrRefCodec; -use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec}; -use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec}; +use crate::heed_codec::facet::{ + FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FieldIdCodec, OrderedF64Codec, StrRefCodec, +}; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 2908f0e78..bb2788cc8 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupKeyCodec, ByteSliceRef}; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; use crate::search::facet::facet_sort_descending::descending_facet_sort; diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index b9d584eb6..1725346be 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,8 +6,7 @@ use roaring::RoaringBitmap; use super::{Distinct, DocIter}; use crate::error::InternalError; -use crate::heed_codec::facet::FacetGroupKey; -use crate::heed_codec::facet::*; +use crate::heed_codec::facet::{FacetGroupKey, *}; use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 10b995d97..7c554d368 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -8,10 +8,10 @@ use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::StrRefCodec; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; -use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKeyCodec, 
FacetGroupValueCodec, FieldDocIdFacetF64Codec, + FieldDocIdFacetStringCodec, OrderedF64Codec, StrRefCodec, +}; use crate::search::facet::facet_distribution_iter; use crate::{FieldId, Index, Result}; diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 6eec64b25..2eebffbcd 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -112,17 +112,19 @@ where #[cfg(test)] mod tests { + use std::ops::ControlFlow; + + use heed::BytesDecode; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + use super::iterate_over_facet_distribution; use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::test::FacetIndex; - use heed::BytesDecode; - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - use std::ops::ControlFlow; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -133,7 +135,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index d9a6c5fd4..bb555e1ab 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -261,14 +261,13 @@ mod tests { use roaring::RoaringBitmap; use super::find_docids_of_facet_within_bounds; - use crate::heed_codec::facet::FacetGroupKeyCodec; - use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -279,7 +278,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index e620f9f1d..fc5fd3d04 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -93,7 +93,7 @@ mod tests { use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -104,7 +104,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 5425a5051..42bae42a6 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ 
b/milli/src/search/facet/facet_sort_descending.rs @@ -119,15 +119,14 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::OrderedF64Codec; - use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; + use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -138,7 +137,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 1b40f6db1..15edafb03 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -9,8 +9,9 @@ use roaring::RoaringBitmap; use super::facet_range_search; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, +}; use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; /// The maximum number of filters the filter AST can process. diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ec5caa2a8..ef72658ec 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -3,7 +3,7 @@ use heed::{BytesDecode, RoTxn}; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::Filter; -use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; mod facet_distribution; mod facet_distribution_iter; @@ -27,8 +27,8 @@ where db.as_polymorph().prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; if let Some(first) = level0_iter_forward.next() { let (first_key, _) = first?; - let first_key = - FacetGroupKeyCodec::::bytes_decode(first_key).ok_or(heed::Error::Encoding)?; + let first_key = FacetGroupKeyCodec::::bytes_decode(first_key) + .ok_or(heed::Error::Encoding)?; Ok(Some(first_key.left_bound)) } else { Ok(None) @@ -50,8 +50,8 @@ where .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; if let Some(last) = level0_iter_backward.next() { let (last_key, _) = last?; - let last_key = - FacetGroupKeyCodec::::bytes_decode(last_key).ok_or(heed::Error::Encoding)?; + let last_key = FacetGroupKeyCodec::::bytes_decode(last_key) + .ok_or(heed::Error::Encoding)?; Ok(Some(last_key.left_bound)) } else { Ok(None) @@ -85,11 +85,12 @@ pub mod test { use roaring::RoaringBitmap; use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::snapshot_tests::display_bitmap; - use crate::update::FacetsUpdateIncremental; + use 
crate::update::FacetsUpdateIncrementalInner; + // A dummy index that only contains the facet database, used for testing pub struct FacetIndex where for<'a> BoundCodec: @@ -100,10 +101,12 @@ pub mod test { _phantom: PhantomData, } + // The faecet database and its settings pub struct Database { pub content: heed::Database, FacetGroupValueCodec>, - pub group_size: usize, - pub max_group_size: usize, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, _tempdir: Rc, } @@ -117,9 +120,12 @@ pub mod test { tempdir: Rc, group_size: u8, max_group_size: u8, + min_level_size: u8, ) -> FacetIndex { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)) as usize; - let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 10 * 100); unsafe { @@ -129,14 +135,25 @@ pub mod test { let content = env.open_database(None).unwrap().unwrap(); FacetIndex { - db: Database { content, group_size, max_group_size, _tempdir: tempdir }, + db: Database { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: tempdir, + }, env, _phantom: PhantomData, } } - pub fn new(group_size: u8, max_group_size: u8) -> FacetIndex { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)) as usize; - let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); + pub fn new( + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 100); let tempdir = tempfile::TempDir::new().unwrap(); @@ -144,7 +161,13 @@ pub mod test { let content = env.create_database(None).unwrap(); FacetIndex { - db: Database { content, group_size, max_group_size, _tempdir: Rc::new(tempdir) }, + db: Database { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: Rc::new(tempdir), + }, env, _phantom: PhantomData, } @@ -156,7 +179,7 @@ pub mod test { key: &'a >::EItem, docids: &RoaringBitmap, ) { - let update = FacetsUpdateIncremental::new(self.db.content); + let update = FacetsUpdateIncrementalInner::new(self.db.content); let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.insert(rwtxn, field_id, &key_bytes, docids).unwrap(); } @@ -167,7 +190,7 @@ pub mod test { key: &'a >::EItem, value: u32, ) { - let update = FacetsUpdateIncremental::new(self.db.content); + let update = FacetsUpdateIncrementalInner::new(self.db.content); let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.delete(rwtxn, field_id, &key_bytes, value).unwrap(); } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index ab9dddaf2..9bc39d882 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -5,7 +5,7 @@ use std::path::Path; use roaring::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupValue, FacetGroupKey}; +use 
crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; #[track_caller] diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 7d89ca89a..adeea11fa 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,7 +1,8 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::{facet::FacetType, ExternalDocumentsIds, FieldDistribution, Index, Result}; +use crate::facet::FacetType; +use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 5b9e99d77..14ef5fd6a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -11,7 +11,7 @@ use time::OffsetDateTime; use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 70392b7db..ad97ed2de 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,18 +1,20 @@ +use std::borrow::Cow; +use std::cmp; +use std::fs::File; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use log::debug; +use roaring::RoaringBitmap; +use time::OffsetDateTime; + use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; -use roaring::RoaringBitmap; -use std::borrow::Cow; -use std::cmp; -use std::fs::File; -use time::OffsetDateTime; pub struct FacetsUpdateBulk<'i> { index: &'i Index, @@ -367,9 +369,7 @@ mod tests { documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); } let documents = documents_batch_reader_from_objects(documents); - dbg!(); index.add_documents(documents).unwrap(); - dbg!(); db_snap!(index, facet_id_f64_docids, name); }; @@ -421,4 +421,100 @@ mod tests { test("default", None, None); test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); } + + #[test] + fn test_facets_number_incremental_update() { + let test = + |name: &str, group_size: Option, min_level_size: Option| { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB + index.index_documents_config.autogenerate_docids = true; + index.index_documents_config.facet_level_group_size = group_size; + index.index_documents_config.facet_min_level_size = min_level_size; + + index + .update_settings(|settings| { + settings.set_filterable_fields( + IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) + .collect(), + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 0..100 { + 
documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + let mut documents = vec![]; + for i in 1000..1010 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 100..110 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + db_snap!(index, facet_id_f64_docids, name); + }; + + test("default", None, None); + test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + } + + #[test] + fn test_facets_number_delete_facet_id_then_bulk_update() { + let test = + |name: &str, group_size: Option, min_level_size: Option| { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB + index.index_documents_config.autogenerate_docids = true; + index.index_documents_config.facet_level_group_size = group_size; + index.index_documents_config.facet_min_level_size = min_level_size; + + index + .update_settings(|settings| { + settings.set_filterable_fields( + IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) + .collect(), + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 0..100 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + // 1100 facets -> how long is the DB? 
+ + let mut documents = vec![]; + for i in 1000..1010 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 100..110 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + db_snap!(index, facet_id_f64_docids, name); + }; + + test("default", None, None); + test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + } } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index bcde3bc53..75ca5d55b 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,12 +1,16 @@ +use std::collections::HashMap; +use std::fs::File; + use heed::types::ByteSlice; use heed::{BytesDecode, Error, RoTxn, RwTxn}; use roaring::RoaringBitmap; +use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::search::facet::get_highest_level; -use crate::Result; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; enum InsertionResult { InPlace, @@ -18,30 +22,79 @@ enum DeletionResult { Remove { prev: Option>, next: Option> }, } -pub struct FacetsUpdateIncremental { +pub struct FacetsUpdateIncremental<'i> { + index: &'i Index, + inner: FacetsUpdateIncrementalInner, + facet_type: FacetType, + new_data: grenad::Reader, +} + +impl<'i> FacetsUpdateIncremental<'i> { + pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { + FacetsUpdateIncremental { + index, + inner: FacetsUpdateIncrementalInner { + db: match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => index + .facet_id_f64_docids + .remap_key_type::>(), + }, + group_size: 4, + max_group_size: 8, + min_level_size: 5, + }, + facet_type, + new_data, + } + } + pub fn group_size(mut self, size: u8) -> Self { + self.inner.group_size = size; + self + } + pub fn min_level_size(mut self, size: u8) -> Self { + self.inner.min_level_size = size; + self + } + pub fn max_group_size(mut self, size: u8) -> Self { + self.inner.max_group_size = size; + self + } + pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { + let mut new_faceted_docids = HashMap::::default(); + + let mut cursor = self.new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let key = FacetGroupKeyCodec::::bytes_decode(key) + .ok_or(heed::Error::Encoding)?; + let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + + for (field_id, new_docids) in new_faceted_docids { + let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; + docids |= new_docids; + self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; + } + Ok(()) + } +} + +pub struct FacetsUpdateIncrementalInner { db: heed::Database, FacetGroupValueCodec>, group_size: u8, min_level_size: u8, max_group_size: u8, } -impl FacetsUpdateIncremental { +impl FacetsUpdateIncrementalInner { pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } } - pub fn group_size(mut self, size: u8) -> Self { - self.group_size = size; - self - } - pub fn min_level_size(mut self, size: u8) -> Self { - self.min_level_size = size; - self - } - pub fn max_group_size(mut self, size: u8) -> Self { - self.max_group_size = size; - self - } } -impl FacetsUpdateIncremental { +impl FacetsUpdateIncrementalInner { fn find_insertion_key_value( &self, field_id: u16, @@ -481,9 +534,9 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::OrderedF64Codec; - use crate::heed_codec::facet::StrRefCodec; - use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; + use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, StrRefCodec, + }; use crate::milli_snap; use crate::search::facet::get_highest_level; use crate::search::facet::test::FacetIndex; @@ -534,7 +587,7 @@ mod tests { FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() }; - assert!(value.size > 0 && (value.size as usize) < db.max_group_size); + assert!(value.size > 0 && value.size < db.max_group_size); let mut actual_size = 0; let mut values_below = RoaringBitmap::new(); @@ -553,7 +606,7 @@ mod tests { } #[test] fn append() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); @@ -566,7 +619,7 @@ mod tests { } #[test] fn many_field_ids_append() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); @@ -595,7 +648,7 @@ mod tests { } #[test] fn many_field_ids_prepend() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in (0..256).into_iter().rev() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); @@ -625,7 +678,7 @@ mod tests { #[test] fn prepend() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in (0..256).into_iter().rev() { @@ -640,7 +693,7 @@ mod tests { #[test] fn shuffled() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut keys = (0..256).into_iter().collect::>(); @@ -659,7 +712,7 @@ mod tests { #[test] fn merge_values() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut keys = (0..256).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ 
-680,7 +733,7 @@ mod tests { #[test] fn delete_from_end() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); @@ -745,7 +798,7 @@ mod tests { #[test] fn delete_from_start() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); @@ -783,7 +836,7 @@ mod tests { #[test] fn delete_shuffled() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); @@ -829,7 +882,7 @@ mod tests { #[test] fn in_place_level0_insert() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut keys = (0..16).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); @@ -849,7 +902,7 @@ mod tests { #[test] fn in_place_level0_delete() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut keys = (0..64).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -879,7 +932,7 @@ mod tests { #[test] fn shuffle_merge_string_and_delete() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut keys = (1000..1064).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 04810cb48..3b46bb421 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -1,12 +1,9 @@ -use super::{FacetsUpdateBulk, FacetsUpdateIncremental}; -use crate::{ - facet::FacetType, - heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}, - CboRoaringBitmapCodec, FieldId, Index, Result, -}; -use heed::BytesDecode; -use roaring::RoaringBitmap; -use std::{collections::HashMap, fs::File}; +use self::incremental::FacetsUpdateIncremental; +use super::FacetsUpdateBulk; +use crate::facet::FacetType; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::{Index, Result}; +use std::fs::File; pub mod bulk; pub mod incremental; @@ -14,11 +11,13 @@ pub mod incremental; pub struct FacetsUpdate<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, + facet_type: FacetType, + new_data: grenad::Reader, + // Options: + // there's no way to change these for now level_group_size: u8, max_level_group_size: u8, min_level_size: u8, - facet_type: FacetType, - new_data: grenad::Reader, } impl<'i> FacetsUpdate<'i> { pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { @@ -42,36 +41,37 @@ impl<'i> FacetsUpdate<'i> { } pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + if self.new_data.is_empty() { + return Ok(()); + } // here, come up with a better condition! - if self.database.is_empty(wtxn)? { + // ideally we'd choose which method to use for each field id individually + // but I dont' think it's worth the effort yet + // As a first requirement, we ask that the length of the new data is less + // than a 1/50th of the length of the database in order to use the incremental + // method. + if self.new_data.len() >= (self.database.len(wtxn)? 
as u64 / 50) { let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data) .level_group_size(self.level_group_size) .min_level_size(self.min_level_size); bulk_update.execute(wtxn)?; } else { - let indexer = FacetsUpdateIncremental::new(self.database) - .max_group_size(self.max_level_group_size) - .min_level_size(self.min_level_size); - - let mut new_faceted_docids = HashMap::::default(); - - let mut cursor = self.new_data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let key = FacetGroupKeyCodec::::bytes_decode(key) - .ok_or(heed::Error::Encoding)?; - let docids = - CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; - *new_faceted_docids.entry(key.field_id).or_default() |= docids; - } - - for (field_id, new_docids) in new_faceted_docids { - let mut docids = - self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; - docids |= new_docids; - self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; - } + let incremental_update = + FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data) + .group_size(self.level_group_size) + .max_group_size(self.max_level_group_size) + .min_level_size(self.min_level_size); + incremental_update.execute(wtxn)?; } Ok(()) } } + +#[cfg(test)] +mod tests { + // here I want to create a benchmark + // to find out at which point it is faster to do it incrementally + + #[test] + fn update() {} +} diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..c2b3896c4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..c2b3896c4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..c9f8951ac --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..c9f8951ac --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ 
+--- +source: milli/src/update/facet/bulk.rs +--- +b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 9a89691b1..1d415166d 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,9 +6,9 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; -use crate::heed_codec::facet::FieldDocIdFacetF64Codec; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, +}; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 078a82335..e6a41067b 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -4,8 +4,7 @@ use std::io; use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; -use crate::heed_codec::facet::StrRefCodec; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, StrRefCodec}; use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{FieldId, Result}; diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index e18cb4e16..03f15945a 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::time::Instant; -use grenad::{CompressionType, Reader, Sorter}; +use grenad::{CompressionType, Sorter}; use heed::types::ByteSlice; use log::debug; @@ -208,36 +208,6 @@ pub fn grenad_obkv_into_chunks( Ok(std::iter::from_fn(move || transposer().transpose())) } -pub fn write_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - reader: Reader, - merge: MergeFn, -) -> Result<()> { - debug!("Writing MTBL stores..."); - let before = Instant::now(); - - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; - let val = merge(k, vals)?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? 
}; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - } - - debug!("MTBL stores merged in {:.02?}!", before.elapsed()); - Ok(()) -} - pub fn sorter_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 7e2ebd2d3..8fb629cae 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -9,8 +9,8 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, - merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, - writer_into_reader, GrenadParameters, MergeableReader, + merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader, + GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2a2511362..96bea9589 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -27,8 +27,7 @@ pub use self::enrich::{ pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, - ClonableMmap, MergeFn, + sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 8fba16d3d..b13118e09 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -2,7 +2,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facet::bulk::FacetsUpdateBulk; -pub use self::facet::incremental::FacetsUpdateIncremental; +pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; From b2f01ad2042ce102fe03141cbf2b3ef65762aced Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 6 Sep 2022 11:52:57 +0200 Subject: [PATCH 28/58] Refactor facet database tests --- .../search/facet/facet_distribution_iter.rs | 51 +-- milli/src/search/facet/facet_range_search.rs | 14 +- .../src/search/facet/facet_sort_ascending.rs | 4 +- .../src/search/facet/facet_sort_descending.rs | 4 +- milli/src/search/facet/mod.rs | 146 ------- milli/src/update/facet/bulk.rs | 407 ++++++++---------- milli/src/update/facet/incremental.rs | 272 +++++------- milli/src/update/facet/mod.rs | 244 ++++++++++- .../default.hash.snap} | 2 +- .../large_group_small_min_level.hash.snap} | 2 +- .../odd_group_odd_min_level.hash.snap} | 2 +- .../small_group_large_min_level.hash.snap} | 2 +- .../small_group_small_min_level.hash.snap | 4 + .../default.hash.snap | 4 + .../large_group_small_min_level.hash.snap | 4 + .../odd_group_odd_min_level.hash.snap | 4 + .../small_group_large_min_level.hash.snap 
| 4 + .../small_group_small_min_level.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - .../default/facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - .../default/facet_id_string_docids.hash.snap | 4 - .../facet_id_string_docids.hash.snap | 4 - .../default/facet_id_string_docids.hash.snap | 4 - .../facet_id_string_docids.hash.snap | 4 - milli/src/update/mod.rs | 2 +- .../word_pair_proximity_docids.hash.snap | 4 - 28 files changed, 568 insertions(+), 644 deletions(-) rename milli/src/update/facet/snapshots/bulk.rs/{test_facets_number/default/facet_id_f64_docids.hash.snap => insert/default.hash.snap} (58%) rename milli/src/update/facet/snapshots/bulk.rs/{test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap => insert/large_group_small_min_level.hash.snap} (58%) rename milli/src/update/facet/snapshots/bulk.rs/{test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap => insert/odd_group_odd_min_level.hash.snap} (58%) rename milli/src/update/facet/snapshots/bulk.rs/{test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap => insert/small_group_large_min_level.hash.snap} (58%) create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 2eebffbcd..3379d1abe 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -114,14 +114,13 @@ where mod tests { use std::ops::ControlFlow; - use heed::BytesDecode; - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - use 
super::iterate_over_facet_distribution; use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; - use crate::search::facet::test::FacetIndex; + use crate::update::facet::tests::FacetIndex; + use heed::BytesDecode; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -164,17 +163,11 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); - iterate_over_facet_distribution( - &txn, - index.db.content, - 0, - &candidates, - |facet, count| { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - results.push_str(&format!("{facet}: {count}\n")); - ControlFlow::Continue(()) - }, - ) + iterate_over_facet_distribution(&txn, index.content, 0, &candidates, |facet, count| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {count}\n")); + ControlFlow::Continue(()) + }) .unwrap(); milli_snap!(results, i); @@ -189,23 +182,17 @@ mod tests { let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); let mut nbr_facets = 0; - iterate_over_facet_distribution( - &txn, - index.db.content, - 0, - &candidates, - |facet, count| { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - if nbr_facets == 100 { - return ControlFlow::Break(()); - } else { - nbr_facets += 1; - results.push_str(&format!("{facet}: {count}\n")); + iterate_over_facet_distribution(&txn, index.content, 0, &candidates, |facet, count| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + if nbr_facets == 100 { + return ControlFlow::Break(()); + } else { + nbr_facets += 1; + results.push_str(&format!("{facet}: {count}\n")); - ControlFlow::Continue(()) - } - }, - ) + ControlFlow::Continue(()) + } + }) .unwrap(); milli_snap!(results, i); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index bb555e1ab..cb5fd14d2 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -263,8 +263,8 @@ mod tests { use super::find_docids_of_facet_within_bounds; use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; - use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; + use crate::update::facet::tests::FacetIndex; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -312,7 +312,7 @@ mod tests { let end = Bound::Included(i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -328,7 +328,7 @@ mod tests { let end = Bound::Excluded(i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -354,7 +354,7 @@ mod tests { let end = Bound::Included(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -373,7 +373,7 @@ mod tests { let end = Bound::Excluded(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -401,7 +401,7 @@ mod tests { let end = Bound::Included(255. 
- i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -420,7 +420,7 @@ mod tests { let end = Bound::Excluded(255. - i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index fc5fd3d04..f320f9e77 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -89,8 +89,8 @@ mod tests { use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; - use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; + use crate::update::facet::tests::FacetIndex; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -133,7 +133,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); - let iter = ascending_facet_sort(&txn, index.db.content, 0, candidates).unwrap(); + let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap(); for el in iter { let docids = el.unwrap(); results.push_str(&display_bitmap(&docids)); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 42bae42a6..be5fe7841 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -122,8 +122,8 @@ mod tests { use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; - use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; + use crate::update::facet::tests::FacetIndex; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -166,7 +166,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); - let db = index.db.content.remap_key_type::>(); + let db = index.content.remap_key_type::>(); let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); for el in iter { let docids = el.unwrap(); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ef72658ec..fc71acf37 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -74,149 +74,3 @@ pub(crate) fn get_highest_level<'t>( }) .unwrap_or(0)) } - -#[cfg(test)] -pub mod test { - use std::fmt::Display; - use std::marker::PhantomData; - use std::rc::Rc; - - use heed::{BytesDecode, BytesEncode, Env, RwTxn}; - use roaring::RoaringBitmap; - - use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, - }; - use crate::snapshot_tests::display_bitmap; - use crate::update::FacetsUpdateIncrementalInner; - - // A dummy index that only contains the facet database, used for testing - pub struct FacetIndex - where - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, - { - pub env: Env, - pub db: Database, - _phantom: PhantomData, - } - - // The faecet database and its settings - pub struct Database { - pub content: heed::Database, FacetGroupValueCodec>, - pub group_size: u8, - pub 
min_level_size: u8, - pub max_group_size: u8, - _tempdir: Rc, - } - - impl FacetIndex - where - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, - { - #[cfg(all(test, fuzzing))] - pub fn open_from_tempdir( - tempdir: Rc, - group_size: u8, - max_group_size: u8, - min_level_size: u8, - ) -> FacetIndex { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 - let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 - let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf - - let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 10 * 100); - unsafe { - options.flag(heed::flags::Flags::MdbAlwaysFreePages); - } - let env = options.open(tempdir.path()).unwrap(); - let content = env.open_database(None).unwrap().unwrap(); - - FacetIndex { - db: Database { - content, - group_size, - max_group_size, - min_level_size, - _tempdir: tempdir, - }, - env, - _phantom: PhantomData, - } - } - pub fn new( - group_size: u8, - max_group_size: u8, - min_level_size: u8, - ) -> FacetIndex { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 - let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 - let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf - let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 100); - let tempdir = tempfile::TempDir::new().unwrap(); - let env = options.open(tempdir.path()).unwrap(); - let content = env.create_database(None).unwrap(); - - FacetIndex { - db: Database { - content, - group_size, - max_group_size, - min_level_size, - _tempdir: Rc::new(tempdir), - }, - env, - _phantom: PhantomData, - } - } - pub fn insert<'a>( - &self, - rwtxn: &'a mut RwTxn, - field_id: u16, - key: &'a >::EItem, - docids: &RoaringBitmap, - ) { - let update = FacetsUpdateIncrementalInner::new(self.db.content); - let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - update.insert(rwtxn, field_id, &key_bytes, docids).unwrap(); - } - pub fn delete<'a>( - &self, - rwtxn: &'a mut RwTxn, - field_id: u16, - key: &'a >::EItem, - value: u32, - ) { - let update = FacetsUpdateIncrementalInner::new(self.db.content); - let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - update.delete(rwtxn, field_id, &key_bytes, value).unwrap(); - } - } - - impl Display for FacetIndex - where - for<'a> >::EItem: Sized + Display, - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, - { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let txn = self.env.read_txn().unwrap(); - let mut iter = self.db.content.iter(&txn).unwrap(); - while let Some(el) = iter.next() { - let (key, value) = el.unwrap(); - let FacetGroupKey { field_id, level, left_bound: bound } = key; - let bound = BoundCodec::bytes_decode(bound).unwrap(); - let FacetGroupValue { size, bitmap } = value; - writeln!( - f, - "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}", - values = display_bitmap(&bitmap) - )?; - } - Ok(()) - } - } -} diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index ad97ed2de..321ae52d4 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -19,7 +19,7 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; pub struct FacetsUpdateBulk<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, - 
level_group_size: u8, + group_size: u8, min_level_size: u8, facet_type: FacetType, // None if level 0 does not need to be updated @@ -42,7 +42,7 @@ impl<'i> FacetsUpdateBulk<'i> { index.facet_id_f64_docids.remap_key_type::>() } }, - level_group_size: 4, + group_size: 4, min_level_size: 5, facet_type, new_data: Some(new_data), @@ -63,7 +63,7 @@ impl<'i> FacetsUpdateBulk<'i> { index.facet_id_f64_docids.remap_key_type::>() } }, - level_group_size: 4, + group_size: 4, min_level_size: 5, facet_type, new_data: None, @@ -74,61 +74,85 @@ impl<'i> FacetsUpdateBulk<'i> { /// /// This setting is always greater than or equal to 2. pub fn level_group_size(mut self, value: u8) -> Self { - self.level_group_size = cmp::max(value, 2); + self.group_size = cmp::max(value, 2); self } /// The minimum number of elements that a level is allowed to have. pub fn min_level_size(mut self, value: u8) -> Self { - self.min_level_size = cmp::max(value, 1); + self.min_level_size = cmp::max(value, 2); self } + #[logging_timer::time("FacetsUpdateBulk::{}")] + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + + let Self { index, database, group_size, min_level_size, facet_type, new_data } = self; + + index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + + let inner = FacetsUpdateBulkInner { db: database, new_data, group_size, min_level_size }; + + let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); + + inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { + index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; + Ok(()) + })?; + + Ok(()) + } +} + +pub(crate) struct FacetsUpdateBulkInner { + pub db: heed::Database, FacetGroupValueCodec>, + pub new_data: Option>, + pub group_size: u8, + pub min_level_size: u8, +} +impl FacetsUpdateBulkInner { + pub fn update( + mut self, + wtxn: &mut RwTxn, + field_ids: &[u16], + mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>, + ) -> Result<()> { + self.update_level0(wtxn)?; + for &field_id in field_ids.iter() { + self.clear_levels(wtxn, field_id)?; + } + + for &field_id in field_ids.iter() { + let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; + + handle_all_docids(wtxn, field_id, all_docids)?; + + for level_reader in level_readers { + let mut cursor = level_reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + self.db.remap_types::().put(wtxn, k, v)?; + } + } + } + Ok(()) + } + fn clear_levels(&self, wtxn: &mut heed::RwTxn, field_id: FieldId) -> Result<()> { let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; let range = left..=right; - self.database.delete_range(wtxn, &range).map(drop)?; + self.db.delete_range(wtxn, &range).map(drop)?; Ok(()) } - - #[logging_timer::time("FacetsUpdateBulk::{}")] - pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { - self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - - // We get the faceted fields to be able to create the facet levels. 
- let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); - - for &field_id in faceted_fields.iter() { - self.clear_levels(wtxn, field_id)?; - } - self.update_level0(wtxn)?; - - for &field_id in faceted_fields.iter() { - let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; - - self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &all_docids)?; - - for level_reader in level_readers { - let mut cursor = level_reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - self.database.remap_types::().put(wtxn, k, v)?; - } - } - } - - Ok(()) - } - fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { let new_data = match self.new_data.take() { Some(x) => x, None => return Ok(()), }; - if self.database.is_empty(wtxn)? { + if self.db.is_empty(wtxn)? { let mut buffer = Vec::new(); - let mut database = self.database.iter_mut(wtxn)?.remap_types::(); + let mut database = self.db.iter_mut(wtxn)?.remap_types::(); let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { buffer.clear(); @@ -140,7 +164,7 @@ impl<'i> FacetsUpdateBulk<'i> { } } else { let mut buffer = Vec::new(); - let database = self.database.remap_types::(); + let database = self.db.remap_types::(); let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { @@ -164,47 +188,29 @@ impl<'i> FacetsUpdateBulk<'i> { database.put(wtxn, key, &buffer)?; } } - Ok(()) } - fn compute_levels_for_field_id( &self, field_id: FieldId, txn: &RoTxn, ) -> Result<(Vec>, RoaringBitmap)> { // TODO: first check whether there is anything in level 0? - let algo = ComputeHigherLevels { - rtxn: txn, - db: &self.database, - field_id, - level_group_size: self.level_group_size, - min_level_size: self.min_level_size, - }; let mut all_docids = RoaringBitmap::new(); - let subwriters = algo.compute_higher_levels(32, &mut |bitmaps, _| { + let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { for bitmap in bitmaps { all_docids |= bitmap; } Ok(()) })?; - drop(algo); Ok((subwriters, all_docids)) } -} - -struct ComputeHigherLevels<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, - field_id: u16, - level_group_size: u8, - min_level_size: u8, -} -impl<'t> ComputeHigherLevels<'t> { - fn read_level_0( + fn read_level_0<'t>( &self, + rtxn: &'t RoTxn, + field_id: u16, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result<()> { // we read the elements one by one and @@ -213,13 +219,13 @@ impl<'t> ComputeHigherLevels<'t> { let mut bitmaps = vec![]; let mut level_0_prefix = vec![]; - level_0_prefix.extend_from_slice(&self.field_id.to_be_bytes()); + level_0_prefix.extend_from_slice(&field_id.to_be_bytes()); level_0_prefix.push(0); let level_0_iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, level_0_prefix.as_slice())? + .prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, level_0_prefix.as_slice())? .remap_types::, FacetGroupValueCodec>(); let mut left_bound: &[u8] = &[]; @@ -235,7 +241,7 @@ impl<'t> ComputeHigherLevels<'t> { } bitmaps.push(docids); - if bitmaps.len() == self.level_group_size as usize { + if bitmaps.len() == self.group_size as usize { handle_group(&bitmaps, left_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); @@ -254,13 +260,15 @@ impl<'t> ComputeHigherLevels<'t> { /// ## Returns: /// A vector of grenad::Reader. 
The reader at index `i` corresponds to the elements of level `i + 1` /// that must be inserted into the database. - fn compute_higher_levels( + fn compute_higher_levels<'t>( &self, + rtxn: &'t RoTxn, + field_id: u16, level: u8, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result>> { if level == 0 { - self.read_level_0(handle_group)?; + self.read_level_0(rtxn, field_id, handle_group)?; // Level 0 is already in the database return Ok(vec![]); } @@ -270,7 +278,7 @@ impl<'t> ComputeHigherLevels<'t> { // of those elements, and their bitmaps, to the level above let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); - let mut cur_writer_len = 0; + let mut cur_writer_len: usize = 0; let mut group_sizes = vec![]; let mut left_bounds = vec![]; @@ -278,8 +286,13 @@ impl<'t> ComputeHigherLevels<'t> { // compute the levels below // in the callback, we fill `cur_writer` with the correct elements for this level - let mut sub_writers = - self.compute_higher_levels(level - 1, &mut |sub_bitmaps, left_bound| { + let mut sub_writers = self.compute_higher_levels( + rtxn, + field_id, + level - 1, + &mut |sub_bitmaps, left_bound| { + // TODO: is this done unnecessarily for all 32 levels? + println!("level: {level}"); let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; @@ -288,7 +301,7 @@ impl<'t> ComputeHigherLevels<'t> { left_bounds.push(left_bound); bitmaps.push(combined_bitmap); - if bitmaps.len() != self.level_group_size as usize { + if bitmaps.len() != self.group_size as usize { return Ok(()); } let left_bound = left_bounds.first().unwrap(); @@ -297,7 +310,7 @@ impl<'t> ComputeHigherLevels<'t> { for ((bitmap, left_bound), group_size) in bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { - let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; + let key = FacetGroupKey { field_id, level, left_bound }; let key = FacetGroupKeyCodec::::bytes_encode(&key) .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; @@ -307,15 +320,26 @@ impl<'t> ComputeHigherLevels<'t> { cur_writer_len += 1; } Ok(()) - })?; + }, + )?; // don't forget to insert the leftover elements into the writer as well - if !bitmaps.is_empty() && cur_writer_len >= self.min_level_size { + + // but only do so if the current number of elements to be inserted into this + // levelcould grow to the minimum level size + + if !bitmaps.is_empty() && (cur_writer_len >= self.min_level_size as usize - 1) { + // the length of bitmaps is between 0 and group_size + assert!(bitmaps.len() < self.group_size as usize); + assert!(cur_writer_len > 0); + let left_bound = left_bounds.first().unwrap(); handle_group(&bitmaps, left_bound)?; + + // Note: how many bitmaps are there here? 
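+ // Between one and group_size - 1: every time `bitmaps` reached `group_size` it was
+ // written out and drained in the closure above, and the `!bitmaps.is_empty()` guard
+ // rules out zero, so only the final, partially filled group is flushed here.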
for ((bitmap, left_bound), group_size) in bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { - let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; + let key = FacetGroupKey { field_id, level, left_bound }; let key = FacetGroupKeyCodec::::bytes_encode(&key) .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; @@ -324,9 +348,12 @@ impl<'t> ComputeHigherLevels<'t> { cur_writer_len += 1; } } - if cur_writer_len > self.min_level_size { + // if we inserted enough elements to reach the minimum level size, then we push the writer + if cur_writer_len as u8 >= self.min_level_size { sub_writers.push(writer_into_reader(cur_writer)?); } else { + // otherwise, if there are still leftover elements, we give them to the level above + // this is necessary in order to get the union of all docids if !bitmaps.is_empty() { handle_group(&bitmaps, left_bounds.first().unwrap())?; } @@ -337,184 +364,90 @@ impl<'t> ComputeHigherLevels<'t> { #[cfg(test)] mod tests { - use std::num::NonZeroUsize; - - use crate::db_snap; - use crate::documents::documents_batch_reader_from_objects; - use crate::index::tests::TempIndex; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; + use roaring::RoaringBitmap; + use std::iter::once; #[test] - fn test_facets_number() { - let test = - |name: &str, group_size: Option, min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; + fn insert() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = + FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1_000 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 0..100 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - db_snap!(index, facet_id_f64_docids, name); - }; - - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); - test("small_groups_small_levels", NonZeroUsize::new(2), NonZeroUsize::new(2)); - test("small_groups_large_levels", NonZeroUsize::new(2), NonZeroUsize::new(128)); - test("large_groups_small_levels", NonZeroUsize::new(16), NonZeroUsize::new(2)); - test("large_groups_large_levels", NonZeroUsize::new(16), NonZeroUsize::new(256)); - } - - #[test] - fn test_facets_string() { - let test = |name: &str, - group_size: Option, - min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; - - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..100 { - 
documents.push( - serde_json::json!({ "facet": format!("s{i:X}") }).as_object().unwrap().clone(), - ); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..1_000u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); } - for i in 0..10 { - documents.push( - serde_json::json!({ "facet2": format!("s{i:X}") }).as_object().unwrap().clone(), - ); + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); } - let documents = documents_batch_reader_from_objects(documents); + let mut wtxn = index.env.write_txn().unwrap(); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); - index.add_documents(documents).unwrap(); + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); - db_snap!(index, facet_id_string_docids, name); + wtxn.commit().unwrap(); + + milli_snap!(format!("{index}"), name); }; - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); } - #[test] - fn test_facets_number_incremental_update() { - let test = - |name: &str, group_size: Option, min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; + fn insert_delete_field_insert() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = + FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); + let mut wtxn = index.env.write_txn().unwrap(); - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..100u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); - let mut documents = vec![]; - for i in 0..1000 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 0..100 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + // delete all the elements for the facet id 0 + for i in 0..100u32 { + index.delete(&mut wtxn, 0, &(i as f64), i); + } + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); - index.add_documents(documents_batch).unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + // then add some elements again for the facet id 1 + for i in 0..110u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); - let mut 
documents = vec![]; - for i in 1000..1010 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 100..110 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); + wtxn.commit().unwrap(); - index.add_documents(documents_batch).unwrap(); + milli_snap!(format!("{index}"), name); + }; - db_snap!(index, facet_id_f64_docids, name); - }; - - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); - } - - #[test] - fn test_facets_number_delete_facet_id_then_bulk_update() { - let test = - |name: &str, group_size: Option, min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; - - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 0..100 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); - - index.add_documents(documents_batch).unwrap(); - - // 1100 facets -> how long is the DB? - - let mut documents = vec![]; - for i in 1000..1010 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 100..110 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); - - index.add_documents(documents_batch).unwrap(); - - db_snap!(index, facet_id_f64_docids, name); - }; - - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); } } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 75ca5d55b..14b421242 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -84,15 +84,10 @@ impl<'i> FacetsUpdateIncremental<'i> { } pub struct FacetsUpdateIncrementalInner { - db: heed::Database, FacetGroupValueCodec>, - group_size: u8, - min_level_size: u8, - max_group_size: u8, -} -impl FacetsUpdateIncrementalInner { - pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { - Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } - } + pub db: heed::Database, FacetGroupValueCodec>, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, } impl FacetsUpdateIncrementalInner { fn find_insertion_key_value( @@ -528,82 +523,13 @@ impl FacetsUpdateIncrementalInner { #[cfg(test)] mod tests { - use heed::types::ByteSlice; - use heed::{BytesDecode, BytesEncode}; + use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec}; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; use rand::seq::SliceRandom; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use 
crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, StrRefCodec, - }; - use crate::milli_snap; - use crate::search::facet::get_highest_level; - use crate::search::facet::test::FacetIndex; - - pub fn verify_structure_validity(index: &FacetIndex, field_id: u16) - where - for<'a> C: BytesDecode<'a> + BytesEncode<'a, EItem = >::DItem>, - { - let FacetIndex { env, db, .. } = index; - - let txn = env.write_txn().unwrap(); - let mut field_id_prefix = vec![]; - field_id_prefix.extend_from_slice(&field_id.to_be_bytes()); - - let highest_level = get_highest_level(&txn, index.db.content, field_id).unwrap(); - txn.commit().unwrap(); - - let txn = env.read_txn().unwrap(); - for level_no in (1..=highest_level).rev() { - let mut level_no_prefix = vec![]; - level_no_prefix.extend_from_slice(&field_id.to_be_bytes()); - level_no_prefix.push(level_no); - - let mut iter = db - .content - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level_no_prefix) - .unwrap(); - while let Some(el) = iter.next() { - let (key, value) = el.unwrap(); - let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); - - let mut prefix_start_below = vec![]; - prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); - prefix_start_below.push(level_no - 1); - prefix_start_below.extend_from_slice(&key.left_bound); - - let start_below = { - let mut start_below_iter = db - .content - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( - &txn, - &prefix_start_below, - ) - .unwrap(); - let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); - FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() - }; - - assert!(value.size > 0 && value.size < db.max_group_size); - - let mut actual_size = 0; - let mut values_below = RoaringBitmap::new(); - let mut iter_below = - db.content.range(&txn, &(start_below..)).unwrap().take(value.size as usize); - while let Some(el) = iter_below.next() { - let (_, value) = el.unwrap(); - actual_size += 1; - values_below |= value.bitmap; - } - assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}"); - - assert_eq!(value.bitmap, values_below); - } - } - } #[test] fn append() { let index = FacetIndex::::new(4, 8, 5); @@ -614,7 +540,9 @@ mod tests { index.insert(&mut txn, 0, &(i as f64), &bitmap); txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] @@ -641,9 +569,11 @@ mod tests { index.insert(&mut txn, 1, &(i as f64), &bitmap); txn.commit().unwrap(); } - verify_structure_validity(&index, 0); - verify_structure_validity(&index, 1); - verify_structure_validity(&index, 2); + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.verify_structure_validity(&txn, 1); + index.verify_structure_validity(&txn, 2); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] @@ -670,9 +600,11 @@ mod tests { index.insert(&mut txn, 1, &(i as f64), &bitmap); txn.commit().unwrap(); } - verify_structure_validity(&index, 0); - verify_structure_validity(&index, 1); - verify_structure_validity(&index, 2); + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.verify_structure_validity(&txn, 1); + index.verify_structure_validity(&txn, 2); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } @@ -686,8 +618,9 @@ mod tests { bitmap.insert(i); 
index.insert(&mut txn, 0, &(i as f64), &bitmap); } + + index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); - verify_structure_validity(&index, 0); milli_snap!(format!("{index}")); } @@ -705,146 +638,138 @@ mod tests { bitmap.insert(key); index.insert(&mut txn, 0, &(key as f64), &bitmap); } + index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); - verify_structure_validity(&index, 0); milli_snap!(format!("{index}")); } #[test] fn merge_values() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); let mut keys = (0..256).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); + for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(rng.gen_range(256..512)); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(key as f64), &bitmap); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] fn delete_from_end() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(&(i as f64)), &bitmap); - txn.commit().unwrap(); } for i in (200..256).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 200); + let mut txn = index.env.write_txn().unwrap(); for i in (150..200).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 150); - + let mut txn = index.env.write_txn().unwrap(); for i in (100..150).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 100); - + let mut txn = index.env.write_txn().unwrap(); for i in (17..100).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 17); - let mut txn = index.env.write_txn().unwrap(); for i in (15..17).into_iter().rev() { index.delete(&mut txn, 0, &(i as f64), i as u32); } + index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); - 
verify_structure_validity(&index, 0); milli_snap!(format!("{index}"), 15); + let mut txn = index.env.write_txn().unwrap(); for i in (0..15).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 0); } #[test] fn delete_from_start() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(i as f64), &bitmap); - txn.commit().unwrap(); } for i in 0..128 { - let mut txn = index.env.write_txn().unwrap(); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 127); + let mut txn = index.env.write_txn().unwrap(); for i in 128..216 { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 215); + let mut txn = index.env.write_txn().unwrap(); for i in 216..256 { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 255); } #[test] fn delete_shuffled() { let index = FacetIndex::::new(4, 8, 5); - + let mut txn = index.env.write_txn().unwrap(); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(i as f64), &bitmap); - txn.commit().unwrap(); } let mut keys = (0..256).into_iter().collect::>(); @@ -853,36 +778,37 @@ mod tests { for i in 0..128 { let key = keys[i]; - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 127); + let mut txn = index.env.write_txn().unwrap(); for i in 128..216 { let key = keys[i]; - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + let mut txn = index.env.write_txn().unwrap(); milli_snap!(format!("{index}"), 215); for i in 216..256 { let key = keys[i]; - verify_structure_validity(&index, 0); - let mut txn = 
index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 255); } #[test] fn in_place_level0_insert() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + let mut keys = (0..16).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); @@ -890,19 +816,19 @@ mod tests { for &key in keys.iter() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(rng.gen_range(i * 256..(i + 1) * 256)); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(key as f64), &bitmap); - txn.commit().unwrap(); } } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] fn in_place_level0_delete() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); let mut keys = (0..64).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -912,27 +838,29 @@ mod tests { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(key + 100); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(key as f64), &bitmap); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "before_delete"); + let mut txn = index.env.write_txn().unwrap(); + for &key in keys.iter() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key + 100); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "after_delete"); } #[test] fn shuffle_merge_string_and_delete() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); let mut keys = (1000..1064).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -942,21 +870,21 @@ mod tests { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(key + 100); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &format!("{key:x}").as_str(), &bitmap); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "before_delete"); + let mut txn = index.env.write_txn().unwrap(); + for &key in keys.iter() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &format!("{key:x}").as_str(), key + 100); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "after_delete"); } @@ -1083,7 +1011,7 @@ mod tests { // assert_eq!(key, &group_key.left_bound); // assert_eq!(values, 
&group_values.bitmap); // } -// verify_structure_validity(&index, *field_id); +// index.verify_structure_validity(*field_id); // } // index.db.content.clear(&mut txn).unwrap(); diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 3b46bb421..7298fecc5 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -68,10 +68,244 @@ impl<'i> FacetsUpdate<'i> { } #[cfg(test)] -mod tests { - // here I want to create a benchmark - // to find out at which point it is faster to do it incrementally +pub(crate) mod tests { + use super::bulk::FacetsUpdateBulkInner; + use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + }; + use crate::search::facet::get_highest_level; + use crate::snapshot_tests::display_bitmap; + use crate::update::FacetsUpdateIncrementalInner; + use crate::CboRoaringBitmapCodec; + use heed::types::ByteSlice; + use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; + use roaring::RoaringBitmap; + use std::fmt::Display; + use std::marker::PhantomData; + use std::rc::Rc; - #[test] - fn update() {} + // A dummy index that only contains the facet database, used for testing + pub struct FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + pub env: Env, + pub content: heed::Database, FacetGroupValueCodec>, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, + _tempdir: Rc, + _phantom: PhantomData, + } + + impl FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + #[cfg(all(test, fuzzing))] + pub fn open_from_tempdir( + tempdir: Rc, + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 10 * 100); + unsafe { + options.flag(heed::flags::Flags::MdbAlwaysFreePages); + } + let env = options.open(tempdir.path()).unwrap(); + let content = env.open_database(None).unwrap().unwrap(); + + FacetIndex { + db: Database { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: tempdir, + }, + env, + _phantom: PhantomData, + } + } + pub fn new( + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 100); + let tempdir = tempfile::TempDir::new().unwrap(); + let env = options.open(tempdir.path()).unwrap(); + let content = env.create_database(None).unwrap(); + + FacetIndex { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: Rc::new(tempdir), + env, + _phantom: PhantomData, + } + } + pub fn insert<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size, + min_level_size: self.min_level_size, + 
max_group_size: self.max_group_size, + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); + } + pub fn delete<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + value: u32, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size, + min_level_size: self.min_level_size, + max_group_size: self.max_group_size, + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.delete(wtxn, field_id, &key_bytes, value).unwrap(); + } + + pub fn bulk_insert<'a, 'b>( + &self, + wtxn: &'a mut RwTxn, + field_ids: &[u16], + els: impl IntoIterator< + Item = &'a ((u16, >::EItem), RoaringBitmap), + >, + ) where + for<'c> >::EItem: Sized, + { + let mut new_data = vec![]; + let mut writer = grenad::Writer::new(&mut new_data); + for ((field_id, left_bound), docids) in els { + let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned(); + let key: FacetGroupKey<&[u8]> = + FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; + let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let value = CboRoaringBitmapCodec::bytes_encode(&docids).unwrap(); + writer.insert(&key, &value).unwrap(); + } + writer.finish().unwrap(); + let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); + + let update = FacetsUpdateBulkInner { + db: self.content, + new_data: Some(reader), + group_size: self.group_size, + min_level_size: self.min_level_size, + }; + + update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); + } + + pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { + let mut field_id_prefix = vec![]; + field_id_prefix.extend_from_slice(&field_id.to_be_bytes()); + + let highest_level = get_highest_level(txn, self.content, field_id).unwrap(); + + for level_no in (1..=highest_level).rev() { + let mut level_no_prefix = vec![]; + level_no_prefix.extend_from_slice(&field_id.to_be_bytes()); + level_no_prefix.push(level_no); + + let mut iter = self + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &level_no_prefix) + .unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); + + let mut prefix_start_below = vec![]; + prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); + prefix_start_below.push(level_no - 1); + prefix_start_below.extend_from_slice(&key.left_bound); + + let start_below = { + let mut start_below_iter = self + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + txn, + &prefix_start_below, + ) + .unwrap(); + let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); + FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() + }; + + assert!(value.size > 0 && value.size < self.max_group_size); + + let mut actual_size = 0; + let mut values_below = RoaringBitmap::new(); + let mut iter_below = self + .content + .range(txn, &(start_below..)) + .unwrap() + .take(value.size as usize); + while let Some(el) = iter_below.next() { + let (_, value) = el.unwrap(); + actual_size += 1; + values_below |= value.bitmap; + } + assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}"); + + assert_eq!(value.bitmap, values_below); + } + } + } + } + + impl Display for FacetIndex + where + for<'a> >::EItem: Sized + Display, + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + fn 
fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let txn = self.env.read_txn().unwrap(); + let mut iter = self.content.iter(&txn).unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let FacetGroupKey { field_id, level, left_bound: bound } = key; + let bound = BoundCodec::bytes_decode(bound).unwrap(); + let FacetGroupValue { size, bitmap } = value; + writeln!( + f, + "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}", + values = display_bitmap(&bitmap) + )?; + } + Ok(()) + } + } } diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap index 960843592..bef20823c 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +b40dd31a65e033ffc6b35c027ce19506 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap index 960843592..74c40e6a3 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +7ee22d8e9387e72758f00918eb67e4c6 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap index 960843592..6fb086d35 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +60f567359382507afdaf45fb075740c3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap index 960843592..0271a6c6b 100644 --- 
a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +b986d6e6cbf425685f409a8b417010e1 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..d801ef19f --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +ee10dd2ae2b5c6621a89a5d0a9aa8ccc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap new file mode 100644 index 000000000..e9988f527 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +fa877559eef78b383b496c15a364a2dc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..64f5012a4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +be1b08073b9d9788d18080c1320151d7 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..bb0e9aa69 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +32a45d555df2e001420fea149818d376 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 960843592..000000000 --- 
a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 960843592..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap deleted file mode 100644 index c2b3896c4..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index c2b3896c4..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap deleted file mode 100644 index 574a3c393..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -3e6a91b3c54c614a4787224ac4278ed3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap deleted file mode 100644 index 574a3c393..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -3e6a91b3c54c614a4787224ac4278ed3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap deleted file mode 100644 index c9f8951ac..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap 
b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap deleted file mode 100644 index c9f8951ac..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index b13118e09..952720725 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -17,7 +17,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; mod delete_documents; -mod facet; +pub(crate) mod facet; mod index_documents; mod indexer_config; mod prefix_word_pairs; diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap deleted file mode 100644 index e50e50347..000000000 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/word_prefix_pair_proximity_docids.rs ---- -6873ff1f78d08f2b1a13bb9e37349c01 From bee3c23b45c0a1a9212ba7269c82a4eca5ad6e45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 6 Sep 2022 13:39:08 +0200 Subject: [PATCH 29/58] Add comparison benchmark between bulk and incremental facet indexing --- milli/src/update/facet/bulk.rs | 2 - milli/src/update/facet/mod.rs | 85 ++++++++++++++++++- .../src/update/index_documents/typed_chunk.rs | 2 + 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 321ae52d4..90e287f23 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -291,8 +291,6 @@ impl FacetsUpdateBulkInner { field_id, level - 1, &mut |sub_bitmaps, left_bound| { - // TODO: is this done unnecessarily for all 32 levels? - println!("level: {level}"); let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 7298fecc5..caf88671e 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -13,8 +13,6 @@ pub struct FacetsUpdate<'i> { database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, new_data: grenad::Reader, - // Options: - // there's no way to change these for now level_group_size: u8, max_level_group_size: u8, min_level_size: u8, @@ -40,6 +38,28 @@ impl<'i> FacetsUpdate<'i> { } } + // TODO: use the options below? + // but I don't actually see why they should be configurable + // /// The minimum number of elements that a level is allowed to have. + // pub fn level_max_group_size(mut self, value: u8) -> Self { + // self.max_level_group_size = std::cmp::max(value, 4); + // self + // } + + // /// The number of elements from the level below that are represented by a single element in the level above + // /// + // /// This setting is always greater than or equal to 2. + // pub fn level_group_size(mut self, value: u8) -> Self { + // self.level_group_size = std::cmp::max(value, 2); + // self + // } + + // /// The minimum number of elements that a level is allowed to have. 
+ // pub fn min_level_size(mut self, value: u8) -> Self { + // self.min_level_size = std::cmp::max(value, 2); + // self + // } + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { if self.new_data.is_empty() { return Ok(()); @@ -144,7 +164,7 @@ pub(crate) mod tests { let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 100); + let options = options.map_size(4096 * 4 * 1000); let tempdir = tempfile::TempDir::new().unwrap(); let env = options.open(tempdir.path()).unwrap(); let content = env.create_database(None).unwrap(); @@ -309,3 +329,62 @@ pub(crate) mod tests { } } } + +#[allow(unused)] +#[cfg(test)] +mod comparison_bench { + use std::iter::once; + + use rand::Rng; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + + use super::tests::FacetIndex; + + // This is a simple test to get an intuition on the relative speed + // of the incremental vs. bulk indexer. + // It appears that the incremental indexer is about 50 times slower than the + // bulk indexer. + #[test] + fn benchmark_facet_indexing() { + // then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it + + let mut facet_value = 0; + + let mut r = rand::thread_rng(); + + for i in 1..=20 { + let size = 50_000 * i; + let index = FacetIndex::::new(4, 8, 5); + + let mut txn = index.env.write_txn().unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..size { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, facet_value as f64), once(i).collect())); + facet_value += 1; + } + let timer = std::time::Instant::now(); + index.bulk_insert(&mut txn, &[0], elements.iter()); + let time_spent = timer.elapsed().as_millis(); + println!("bulk {size} : {time_spent}ms"); + + txn.commit().unwrap(); + + for nbr_doc in [1, 100, 1000, 10_000] { + let mut txn = index.env.write_txn().unwrap(); + let timer = std::time::Instant::now(); + // + // insert one document + // + for _ in 0..nbr_doc { + index.insert(&mut txn, 0, &r.gen(), &once(1).collect()); + } + let time_spent = timer.elapsed().as_millis(); + println!(" add {nbr_doc} : {time_spent}ms"); + txn.abort().unwrap(); + } + } + } +} diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 16784bd92..f11414f20 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -138,11 +138,13 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { + // TODO indexer options for the facet level database let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?; is_merged_database = true; } TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => { + // TODO indexer options for the facet level database let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter); indexer.execute(wtxn)?; is_merged_database = true; From 27454e9828ef76d85bb530a63a73e4948b902809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Sep 2022 16:44:08 +0200 Subject: [PATCH 30/58] Document and refine facet indexing algorithms --- milli/src/heed_codec/facet/mod.rs | 
21 +- milli/src/update/facet/bulk.rs | 88 ++--- milli/src/update/facet/incremental.rs | 440 ++++++++++++++---------- milli/src/update/facet/mod.rs | 125 ++++--- milli/src/update/index_documents/mod.rs | 4 +- 5 files changed, 387 insertions(+), 291 deletions(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 40e395881..2e9f0b212 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -29,31 +29,14 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { } } +/// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`] +/// databases. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct FacetGroupKey { pub field_id: u16, pub level: u8, pub left_bound: T, } -impl<'a> FacetGroupKey<&'a [u8]> { - pub fn into_owned(self) -> FacetGroupKey> { - FacetGroupKey { - field_id: self.field_id, - level: self.level, - left_bound: self.left_bound.to_vec(), - } - } -} - -impl<'a> FacetGroupKey> { - pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { - FacetGroupKey { - field_id: self.field_id, - level: self.level, - left_bound: self.left_bound.as_slice(), - } - } -} #[derive(Debug)] pub struct FacetGroupValue { diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 90e287f23..83fa51003 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,24 +1,30 @@ -use std::borrow::Cow; -use std::cmp; -use std::fs::File; - -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; -use roaring::RoaringBitmap; -use time::OffsetDateTime; - use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use log::debug; +use roaring::RoaringBitmap; +use std::borrow::Cow; +use std::fs::File; +use time::OffsetDateTime; +use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; + +/// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases +/// by rebuilding the database "from scratch". +/// +/// First, the new elements are inserted into the level 0 of the database. Then, the +/// higher levels are cleared and recomputed from the content of level 0. +/// +/// Finally, the `faceted_documents_ids` value in the main database of `Index` +/// is updated to contain the new set of faceted documents. 
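To make the rebuild step concrete, here is a small self-contained sketch of the level-building loop described above, written against illustrative std-only types (a hypothetical `Node` with plain `Vec<u32>` document ids) rather than milli's actual `FacetGroupKey`/`FacetGroupValue` and LMDB handles: each level is formed by chunking the level below into groups of `group_size`, and a new level is only built while the current highest level holds at least `min_level_size * group_size` entries.

```rust
#[derive(Clone, Debug)]
struct Node {
    left_bound: Vec<u8>,
    docids: Vec<u32>, // stand-in for a RoaringBitmap
}

// One node per chunk of `group_size` children: its left bound is the first child's
// bound, its docids are the union of the children's docids.
fn build_level_above(level_below: &[Node], group_size: usize) -> Vec<Node> {
    level_below
        .chunks(group_size)
        .map(|group| {
            let mut docids: Vec<u32> =
                group.iter().flat_map(|n| n.docids.iter().copied()).collect();
            docids.sort_unstable();
            docids.dedup();
            Node { left_bound: group[0].left_bound.clone(), docids }
        })
        .collect()
}

fn build_all_levels(level0: Vec<Node>, group_size: usize, min_level_size: usize) -> Vec<Vec<Node>> {
    let mut levels = vec![level0];
    // keep adding levels while the highest one is still large enough to be grouped
    while levels.last().unwrap().len() >= min_level_size * group_size {
        let above = build_level_above(levels.last().unwrap(), group_size);
        levels.push(above);
    }
    levels
}

fn main() {
    let level0: Vec<Node> = (0u32..25)
        .map(|i| Node { left_bound: vec![i as u8], docids: vec![i] })
        .collect();
    let levels = build_all_levels(level0, 4, 5);
    // 25 leaves with group_size = 4 and min_level_size = 5: level 0 plus one level of 7 groups
    assert_eq!(levels.len(), 2);
    assert_eq!(levels[1].len(), 7);
}
```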
pub struct FacetsUpdateBulk<'i> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, group_size: u8, min_level_size: u8, facet_type: FacetType, @@ -31,22 +37,10 @@ impl<'i> FacetsUpdateBulk<'i> { index: &'i Index, facet_type: FacetType, new_data: grenad::Reader, + group_size: u8, + min_level_size: u8, ) -> FacetsUpdateBulk<'i> { - FacetsUpdateBulk { - index, - database: match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::>(), - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }, - group_size: 4, - min_level_size: 5, - facet_type, - new_data: Some(new_data), - } + FacetsUpdateBulk { index, group_size, min_level_size, facet_type, new_data: Some(new_data) } } pub fn new_not_updating_level_0( @@ -55,44 +49,31 @@ impl<'i> FacetsUpdateBulk<'i> { ) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, - database: match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::>(), - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }, - group_size: 4, - min_level_size: 5, + group_size: FACET_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, new_data: None, } } - /// The number of elements from the level below that are represented by a single element in the level above - /// - /// This setting is always greater than or equal to 2. - pub fn level_group_size(mut self, value: u8) -> Self { - self.group_size = cmp::max(value, 2); - self - } - - /// The minimum number of elements that a level is allowed to have. - pub fn min_level_size(mut self, value: u8) -> Self { - self.min_level_size = cmp::max(value, 2); - self - } - #[logging_timer::time("FacetsUpdateBulk::{}")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - let Self { index, database, group_size, min_level_size, facet_type, new_data } = self; + let Self { index, group_size, min_level_size, facet_type, new_data } = self; + + let db = match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - let inner = FacetsUpdateBulkInner { db: database, new_data, group_size, min_level_size }; + let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); @@ -105,6 +86,7 @@ impl<'i> FacetsUpdateBulk<'i> { } } +/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type pub(crate) struct FacetsUpdateBulkInner { pub db: heed::Database, FacetGroupValueCodec>, pub new_data: Option>, diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 14b421242..6be2dbf03 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,16 +1,14 @@ -use std::collections::HashMap; -use std::fs::File; - -use heed::types::ByteSlice; -use heed::{BytesDecode, Error, RoTxn, RwTxn}; -use roaring::RoaringBitmap; - use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::search::facet::get_highest_level; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use 
roaring::RoaringBitmap; +use std::collections::HashMap; +use std::fs::File; enum InsertionResult { InPlace, @@ -18,10 +16,15 @@ enum InsertionResult { } enum DeletionResult { InPlace, - Reduce { prev: Option>, next: Option> }, - Remove { prev: Option>, next: Option> }, + Reduce { next: Option> }, + Remove { next: Option> }, } +/// Algorithm to incrementally insert and delete elememts into the +/// `facet_id_(string/f64)_docids` databases. +/// +/// Rhe `faceted_documents_ids` value in the main database of `Index` +/// is also updated to contain the new set of faceted documents. pub struct FacetsUpdateIncremental<'i> { index: &'i Index, inner: FacetsUpdateIncrementalInner, @@ -30,7 +33,14 @@ pub struct FacetsUpdateIncremental<'i> { } impl<'i> FacetsUpdateIncremental<'i> { - pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { + pub fn new( + index: &'i Index, + facet_type: FacetType, + new_data: grenad::Reader, + group_size: u8, + min_level_size: u8, + max_group_size: u8, + ) -> Self { FacetsUpdateIncremental { index, inner: FacetsUpdateIncrementalInner { @@ -42,26 +52,15 @@ impl<'i> FacetsUpdateIncremental<'i> { .facet_id_f64_docids .remap_key_type::>(), }, - group_size: 4, - max_group_size: 8, - min_level_size: 5, + group_size, + max_group_size, + min_level_size, }, facet_type, new_data, } } - pub fn group_size(mut self, size: u8) -> Self { - self.inner.group_size = size; - self - } - pub fn min_level_size(mut self, size: u8) -> Self { - self.inner.min_level_size = size; - self - } - pub fn max_group_size(mut self, size: u8) -> Self { - self.inner.max_group_size = size; - self - } + pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { let mut new_faceted_docids = HashMap::::default(); @@ -83,6 +82,7 @@ impl<'i> FacetsUpdateIncremental<'i> { } } +/// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type pub struct FacetsUpdateIncrementalInner { pub db: heed::Database, FacetGroupValueCodec>, pub group_size: u8, @@ -90,22 +90,36 @@ pub struct FacetsUpdateIncrementalInner { pub max_group_size: u8, } impl FacetsUpdateIncrementalInner { + /// Find the `FacetGroupKey`/`FacetGroupValue` in the database that + /// should be used to insert the new `facet_value` for the given `field_id` and `level` + /// where `level` must be strictly greater than 0. + /// + /// For example, when inserting the facet value `4`, there are two possibilities: + /// + /// 1. We find a key whose lower bound is 3 followed by a key whose lower bound is 6. Therefore, + /// we know that the implicit range of the first key is 3..6, which contains 4. + /// So the new facet value belongs in that first key/value pair. + /// + /// 2. The first key of the level has a lower bound of `5`. We return this key/value pair + /// but will need to change the lowerbound of this key to `4` in order to insert this facet value. 
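The two cases above can be illustrated with a tiny standalone function working on a sorted list of left bounds (this is only a model of the search, not the LMDB-backed `find_insertion_key_value` itself): it returns the index of the group that should receive the value, plus a flag saying whether that group's left bound has to be lowered.

```rust
// Given the sorted left bounds of one level, find the group a new facet value belongs to.
// If the value sorts before every existing bound (case 2), the first group is chosen and
// its left bound will have to be lowered to the new value.
fn find_insertion_group(left_bounds: &[u32], facet_value: u32) -> (usize, bool) {
    // index of the last bound that is <= facet_value, if any
    match left_bounds.iter().rposition(|&bound| bound <= facet_value) {
        Some(i) => (i, false), // case 1: the value falls in an existing implicit range
        None => (0, true),     // case 2: the first group's left bound must become `facet_value`
    }
}

fn main() {
    let bounds = vec![3, 6, 10];
    assert_eq!(find_insertion_group(&bounds, 4), (0, false)); // the range 3..6 contains 4
    assert_eq!(find_insertion_group(&bounds, 1), (0, true));  // left bound 3 must be lowered to 1
}
```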
fn find_insertion_key_value( &self, field_id: u16, level: u8, - search_key: &[u8], + facet_value: &[u8], txn: &RoTxn, ) -> Result<(FacetGroupKey>, FacetGroupValue)> { + assert!(level > 0); + let mut prefix = vec![]; prefix.extend_from_slice(&field_id.to_be_bytes()); prefix.push(level); - prefix.extend_from_slice(search_key); + prefix.extend_from_slice(facet_value); let mut prefix_iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(txn, &prefix.as_slice())?; + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, prefix.as_slice())?; if let Some(e) = prefix_iter.next() { let (key_bytes, value) = e?; Ok(( @@ -115,10 +129,10 @@ impl FacetsUpdateIncrementalInner { value, )) } else { - let key = FacetGroupKey { field_id, level, left_bound: search_key }; + let key = FacetGroupKey { field_id, level, left_bound: facet_value }; match self.db.get_lower_than(txn, &key)? { Some((key, value)) => { - if key.level != level || key.field_id != field_id { + if key.level != level { let mut prefix = vec![]; prefix.extend_from_slice(&field_id.to_be_bytes()); prefix.push(level); @@ -126,7 +140,7 @@ impl FacetsUpdateIncrementalInner { let mut iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>( + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( txn, &prefix.as_slice(), )?; @@ -146,15 +160,19 @@ impl FacetsUpdateIncrementalInner { } } + /// Insert the given facet value and corresponding document ids in the level 0 of the database + /// + /// ## Return + /// See documentation of `insert_in_level` fn insert_in_level_0<'t>( &self, txn: &'t mut RwTxn, field_id: u16, - new_key: &[u8], - new_values: &RoaringBitmap, + facet_value: &[u8], + docids: &RoaringBitmap, ) -> Result { - let key = FacetGroupKey { field_id, level: 0, left_bound: new_key }; - let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 }; + let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; + let value = FacetGroupValue { bitmap: docids.clone(), size: 1 }; let mut level0_prefix = vec![]; level0_prefix.extend_from_slice(&field_id.to_be_bytes()); @@ -163,7 +181,7 @@ impl FacetsUpdateIncrementalInner { let mut iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level0_prefix)?; + .prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, &level0_prefix)?; if iter.next().is_none() { drop(iter); @@ -186,143 +204,158 @@ impl FacetsUpdateIncrementalInner { } } } + + /// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`. + /// This function works recursively. + /// + /// ## Return + /// Returns the effect of adding the facet value to the database on the given `level`. + /// + /// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have + /// an effect on the number of keys in that level. Therefore, it did not increase the number of children + /// of the parent node. + /// + /// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted + /// in the addition of a new key in that level, and that therefore the number of children + /// of the parent node should be incremented. 
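The contract between a level and its parent can be sketched with plain types (a hypothetical `ParentNode`, `Vec<u32>` in place of a `RoaringBitmap`, no database accesses): the parent always merges the new document ids, leaves its child count alone on `InPlace`, and on `Insert` increments it and only reports a split of its own once `max_group_size` is reached.

```rust
enum InsertionResult {
    InPlace,
    Insert,
}

struct ParentNode {
    size: u8,         // number of children in the level below
    docids: Vec<u32>, // stand-in for the node's RoaringBitmap
}

fn apply_child_insertion(
    parent: &mut ParentNode,
    child_result: InsertionResult,
    new_docids: &[u32],
    max_group_size: u8,
) -> InsertionResult {
    // in every case the inserted document ids end up in the parent's bitmap
    parent.docids.extend_from_slice(new_docids);
    match child_result {
        // the level below reused an existing key: the parent's child count is unchanged
        InsertionResult::InPlace => InsertionResult::InPlace,
        // the level below created a new key: the parent gains a child...
        InsertionResult::Insert => {
            parent.size += 1;
            if parent.size < max_group_size {
                InsertionResult::InPlace
            } else {
                // ...and must be split in two once it reaches `max_group_size`,
                // which in turn creates a new key in *this* level
                InsertionResult::Insert
            }
        }
    }
}

fn main() {
    let mut parent = ParentNode { size: 7, docids: vec![1, 2, 3] };
    // with max_group_size = 8, gaining an eighth child triggers a split
    assert!(matches!(
        apply_child_insertion(&mut parent, InsertionResult::Insert, &[4], 8),
        InsertionResult::Insert
    ));
}
```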
fn insert_in_level<'t>( &self, txn: &'t mut RwTxn, field_id: u16, level: u8, - new_key: &[u8], - new_values: &RoaringBitmap, + facet_value: &[u8], + docids: &RoaringBitmap, ) -> Result { if level == 0 { - return self.insert_in_level_0(txn, field_id, new_key, new_values); + return self.insert_in_level_0(txn, field_id, facet_value, docids); } let max_group_size = self.max_group_size; - let (insertion_key, insertion_value) = - self.find_insertion_key_value(field_id, level, new_key, txn)?; - - let result = self.insert_in_level(txn, field_id, level - 1, new_key.clone(), new_values)?; + let result = self.insert_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?; // level below inserted an element - let insertion_key = { - let mut new_insertion_key = insertion_key.clone(); - let mut modified = false; - - if new_key < insertion_key.left_bound.as_slice() { - new_insertion_key.left_bound = new_key.to_vec(); - modified = true; - } - if modified { - let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; - assert!(is_deleted); - self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; - } - new_insertion_key - }; + let (insertion_key, insertion_value) = + self.find_insertion_key_value(field_id, level, facet_value, txn)?; match result { - // TODO: this could go above the block recomputing insertion key - // because we know that if we inserted in place, the key is not a new one - // thus it doesn't extend a group + // because we know that we inserted in place, the facet_value is not a new one + // thus it doesn't extend a group, and thus the insertion key computed above is + // still correct InsertionResult::InPlace => { - let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); - updated_value.bitmap |= new_values; + let mut updated_value = insertion_value; + updated_value.bitmap |= docids; self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; return Ok(InsertionResult::InPlace); } InsertionResult::Insert => {} } - let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); + + // Here we know that inserting the facet value in the level below resulted in the creation + // of a new key. Therefore, it may be the case that we need to modify the left bound of the + // insertion key (see documentation of `find_insertion_key_value` for an example of when that + // could happen). + let insertion_key = { + let mut new_insertion_key = insertion_key.clone(); + let mut key_should_be_modified = false; + + if facet_value < insertion_key.left_bound.as_slice() { + new_insertion_key.left_bound = facet_value.to_vec(); + key_should_be_modified = true; + } + if key_should_be_modified { + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; + } + new_insertion_key + }; + // Now we know that the insertion key contains the `facet_value`. + + // We still need to update the insertion value by: + // 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`) + // 2. 
Merge the previous docids with the new one + let mut updated_value = insertion_value; updated_value.size += 1; - if updated_value.size == max_group_size { - let size_left = max_group_size / 2; - let size_right = max_group_size - size_left; - let level_below = level - 1; + if updated_value.size < max_group_size { + updated_value.bitmap |= docids; + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - let (start_key, _) = self - .db - .get_greater_than_or_equal_to( - &txn, - &FacetGroupKey { - field_id, - level: level_below, - left_bound: insertion_key.left_bound.as_slice(), - }, - )? - .unwrap(); - - let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize); - - let group_left = { - let mut values_left = RoaringBitmap::new(); - - let mut i = 0; - while let Some(next) = iter.next() { - let (_key, value) = next?; - i += 1; - values_left |= &value.bitmap; - if i == size_left { - break; - } - } - - let key = - FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; - let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; - (key, value) - }; - - let group_right = { - let mut values_right = RoaringBitmap::new(); - let mut right_start_key = None; - - while let Some(next) = iter.next() { - let (key, value) = next?; - if right_start_key.is_none() { - right_start_key = Some(key.left_bound); - } - values_right |= &value.bitmap; - } - - let key = FacetGroupKey { - field_id, - level, - left_bound: right_start_key.unwrap().to_vec(), - }; - let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; - (key, value) - }; - drop(iter); - - let _ = self.db.delete(txn, &insertion_key.as_ref())?; - - self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; - self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; - - Ok(InsertionResult::Insert) - } else { - let mut value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); - value.bitmap |= new_values; - value.size += 1; - self.db.put(txn, &insertion_key.as_ref(), &value).unwrap(); - - Ok(InsertionResult::InPlace) + return Ok(InsertionResult::InPlace); } + + // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` + // Therefore it must be split into two nodes. + + let size_left = max_group_size / 2; + let size_right = max_group_size - size_left; + + let level_below = level - 1; + + let start_key = FacetGroupKey { + field_id, + level: level_below, + left_bound: insertion_key.left_bound.as_slice(), + }; + + let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize); + + let group_left = { + let mut values_left = RoaringBitmap::new(); + + let mut i = 0; + while let Some(next) = iter.next() { + let (_key, value) = next?; + i += 1; + values_left |= &value.bitmap; + if i == size_left { + break; + } + } + + let key = + FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; + let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; + (key, value) + }; + + let group_right = { + let ( + FacetGroupKey { left_bound: right_left_bound, .. }, + FacetGroupValue { bitmap: mut values_right, .. 
}, + ) = iter.next().unwrap()?; + + while let Some(next) = iter.next() { + let (_, value) = next?; + values_right |= &value.bitmap; + } + + let key = FacetGroupKey { field_id, level, left_bound: right_left_bound.to_vec() }; + let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; + (key, value) + }; + drop(iter); + + let _ = self.db.delete(txn, &insertion_key.as_ref())?; + + self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; + self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; + + Ok(InsertionResult::Insert) } + /// Insert the given facet value and corresponding document ids in the database. pub fn insert<'a, 't>( &self, txn: &'t mut RwTxn, field_id: u16, - new_key: &[u8], - new_values: &RoaringBitmap, + facet_value: &[u8], + docids: &RoaringBitmap, ) -> Result<()> { - if new_values.is_empty() { + if docids.is_empty() { return Ok(()); } let group_size = self.group_size; @@ -330,12 +363,15 @@ impl FacetsUpdateIncrementalInner { let highest_level = get_highest_level(&txn, self.db, field_id)?; let result = - self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?; + self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; match result { InsertionResult::InPlace => return Ok(()), InsertionResult::Insert => {} } + // Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`. + // If it has, we must build an addition level above it. + let mut highest_level_prefix = vec![]; highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); highest_level_prefix.push(highest_level); @@ -384,36 +420,61 @@ impl FacetsUpdateIncrementalInner { Ok(()) } + /// Delete the given document id from the given facet value in the database, from level 0 to the + /// the given level. + /// + /// ## Return + /// Returns the effect of removing the document id from the database on the given `level`. + /// + /// - `DeletionResult::InPlace` means that deleting the document id did not have + /// an effect on the keys in that level. + /// + /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the + /// number of keys in the level. For example, removing a document id from the facet value `3` could + /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted + /// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must + /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. + /// + /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the + /// bounds of the keys of the level. For example, removing a document id from the facet value + /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, + /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). + /// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust + /// its left bound as well. 
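A similar standalone model (again with a hypothetical `ParentNode` and without the database accesses) shows how a parent reacts to each `DeletionResult` coming from the level below: `Remove` decrements its child count and may delete the parent itself, and in the remaining cases the parent's left bound moves to the next key only when the deleted value was its own left bound.

```rust
enum DeletionResult {
    InPlace,
    Reduce { next: Option<Vec<u8>> },
    Remove { next: Option<Vec<u8>> },
}

struct ParentNode {
    left_bound: Vec<u8>,
    size: u8, // number of children in the level below
}

fn apply_child_deletion(
    parent: &mut ParentNode,
    deleted_bound: &[u8],
    child_result: DeletionResult,
) -> DeletionResult {
    let next = match child_result {
        // nothing structural changed below: only the parent's bitmap needs updating
        DeletionResult::InPlace => return DeletionResult::InPlace,
        DeletionResult::Reduce { next } => next,
        DeletionResult::Remove { next } => {
            // the key below disappeared: the parent loses one child
            parent.size -= 1;
            if parent.size == 0 {
                // no children left: this key must be removed as well
                return DeletionResult::Remove { next };
            }
            next
        }
    };
    if parent.left_bound.as_slice() == deleted_bound {
        // the deleted value was this node's left bound: it moves to the next key below
        if let Some(next_bound) = &next {
            parent.left_bound = next_bound.clone();
        }
        DeletionResult::Reduce { next }
    } else {
        DeletionResult::InPlace
    }
}

fn main() {
    let mut parent = ParentNode { left_bound: b"ab".to_vec(), size: 2 };
    let result = apply_child_deletion(
        &mut parent,
        b"ab",
        DeletionResult::Remove { next: Some(b"ac".to_vec()) },
    );
    assert_eq!(parent.left_bound, b"ac".to_vec());
    assert!(matches!(result, DeletionResult::Reduce { .. }));
}
```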
fn delete_in_level<'t>( &self, txn: &'t mut RwTxn, field_id: u16, level: u8, - key: &[u8], - value: u32, + facet_value: &[u8], + docid: u32, ) -> Result { if level == 0 { - return self.delete_in_level_0(txn, field_id, key, value); + return self.delete_in_level_0(txn, field_id, facet_value, docid); } let (deletion_key, mut bitmap) = - self.find_insertion_key_value(field_id, level, key, txn)?; + self.find_insertion_key_value(field_id, level, facet_value, txn)?; - let result = self.delete_in_level(txn, field_id, level - 1, key.clone(), value)?; + let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docid)?; let mut decrease_size = false; - let (prev_key, next_key) = match result { + let next_key = match result { DeletionResult::InPlace => { - bitmap.bitmap.remove(value); + bitmap.bitmap.remove(docid); self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; return Ok(DeletionResult::InPlace); } - DeletionResult::Reduce { prev, next } => (prev, next), - DeletionResult::Remove { prev, next } => { + DeletionResult::Reduce { next } => next, + DeletionResult::Remove { next } => { decrease_size = true; - (prev, next) + next } }; + // If either DeletionResult::Reduce or DeletionResult::Remove was returned, + // then we may need to adjust the left_bound of the deletion key. + // If DeletionResult::Remove was returned, then we need to decrease the group + // size of the deletion key. let mut updated_value = bitmap; if decrease_size { updated_value.size -= 1; @@ -421,17 +482,21 @@ impl FacetsUpdateIncrementalInner { if updated_value.size == 0 { self.db.delete(txn, &deletion_key.as_ref())?; - Ok(DeletionResult::Remove { prev: prev_key, next: next_key }) + Ok(DeletionResult::Remove { next: next_key }) } else { let mut updated_deletion_key = deletion_key.clone(); - if key == deletion_key.left_bound { + let reduced_range = facet_value == deletion_key.left_bound; + if reduced_range { updated_deletion_key.left_bound = next_key.clone().unwrap(); } - updated_value.bitmap.remove(value); + updated_value.bitmap.remove(docid); let _ = self.db.delete(txn, &deletion_key.as_ref())?; self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; - - Ok(DeletionResult::Reduce { prev: prev_key, next: next_key }) + if reduced_range { + Ok(DeletionResult::Reduce { next: next_key }) + } else { + Ok(DeletionResult::InPlace) + } } } @@ -439,27 +504,24 @@ impl FacetsUpdateIncrementalInner { &self, txn: &'t mut RwTxn, field_id: u16, - key: &[u8], - value: u32, + facet_value: &[u8], + docid: u32, ) -> Result { - let key = FacetGroupKey { field_id, level: 0, left_bound: key }; + let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; - bitmap.remove(value); + bitmap.remove(docid); if bitmap.is_empty() { - let mut prev_key = None; let mut next_key = None; - - if let Some(prev) = self.db.get_lower_than(&txn, &key)? { - prev_key = Some(prev.0.left_bound.to_vec()); - } - if let Some(next) = self.db.get_greater_than(&txn, &key)? { - if next.0.level == 0 { - next_key = Some(next.0.left_bound.to_vec()); + if let Some((next, _)) = + self.db.remap_data_type::().get_greater_than(&txn, &key)? 
+ { + if next.field_id == field_id && next.level == 0 { + next_key = Some(next.left_bound.to_vec()); } } self.db.delete(txn, &key)?; - Ok(DeletionResult::Remove { prev: prev_key, next: next_key }) + Ok(DeletionResult::Remove { next: next_key }) } else { self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?; Ok(DeletionResult::InPlace) @@ -470,22 +532,30 @@ impl FacetsUpdateIncrementalInner { &self, txn: &'t mut RwTxn, field_id: u16, - key: &[u8], - value: u32, + facet_value: &[u8], + docid: u32, ) -> Result<()> { - if self.db.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: key })?.is_none() { + if self + .db + .remap_data_type::() + .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })? + .is_none() + { return Ok(()); } let highest_level = get_highest_level(&txn, self.db, field_id)?; - // let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - - let result = self.delete_in_level(txn, field_id, highest_level as u8, key, value)?; + let result = + self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docid)?; match result { DeletionResult::InPlace => return Ok(()), - DeletionResult::Reduce { .. } => {} + DeletionResult::Reduce { .. } => return Ok(()), DeletionResult::Remove { .. } => {} } + + // if we either removed a key from the highest level, its size may have fallen + // below `min_level_size`, in which case we need to remove the entire level + let mut highest_level_prefix = vec![]; highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); highest_level_prefix.push(highest_level); @@ -521,6 +591,26 @@ impl FacetsUpdateIncrementalInner { } } +impl<'a> FacetGroupKey<&'a [u8]> { + pub fn into_owned(self) -> FacetGroupKey> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.to_vec(), + } + } +} + +impl<'a> FacetGroupKey> { + pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.as_slice(), + } + } +} + #[cfg(test)] mod tests { use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec}; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index caf88671e..ea6468538 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -1,3 +1,79 @@ +/*! +This module implements two different algorithms for updating the `facet_id_string_docids` +and `facet_id_f64_docids` databases. The first algorithm is a "bulk" algorithm, meaning that +it recreates the database from scratch when new elements are added to it. The second algorithm +is incremental: it modifies the database as little as possible. + +The databases must be able to return results for queries such as: +1. Filter : find all the document ids that have a facet value greater than X and/or smaller than Y +2. Min/Max : find the minimum/maximum facet value among these document ids +3. Sort : sort these document ids by increasing/decreasing facet values +4. Distribution : given some document ids, make a list of each facet value + found in these documents along with the number of documents that contain it + +The algorithms that implement these queries are found in the `src/search/facet` folder. 
+ +To make these queries fast to compute, the database adopts a tree structure: +```ignore + ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ +┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │ +│Level 2│ │ │ │ │ +└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ + ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ +┌───────┐ │ "ab" (2) │ "ba" (2) │ "gaf" (2) │ "form" (2) │ "woz" (2) │ +│Level 1│ │ │ │ │ │ │ +└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ + ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ +┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │ +│Level 0│ │ │ │ │ │ │ │ │ │ │ │ +└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ + └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ +``` +In the diagram above, each cell corresponds to a node in the tree. The first line of the cell +contains the left bound of the range of facet values as well as the number of children of the node. +The second line contains the document ids which have a facet value within the range of the node. +The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range. + +In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because +`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`. +These documents all contain a facet value that is contained within `ab .. gaf`. + +In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a +[`FacetGroupValue`], which have the following format: + +```ignore +FacetGroupKey: +- field id : u16 +- level : u8 +- left bound: [u8] // the facet value encoded using either OrderedF64Codec or Str + +FacetGroupValue: +- #children : u8 +- docids : RoaringBitmap +``` + +When the database is first created using the "bulk" method, each node has a fixed number of children +(except for possibly the last one) given by the `group_size` parameter (default to `FACET_GROUP_SIZE`). +The tree is also built such that the highest level has more than `min_level_size` +(default to `FACET_MIN_LEVEL_SIZE`) elements in it. + +When the database is incrementally updated, the number of children of a node can vary between +1 and `max_group_size`. This is done so that most incremental operations do not need to change +the structure of the tree. When the number of children of a node reaches `max_group_size`, +we split the node in two and update the number of children of its parent. + +When adding documents to the databases, it is important to determine which method to use to +minimise indexing time. The incremental method is faster when adding few new facet values, but the +bulk method is faster when a large part of the database is modified. Empirically, it seems that +it takes 50x more time to incrementally add N facet values to an existing database than it is to +construct a database of N facet values. This is the heuristic that is used to choose between the +two methods. 
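As a minimal sketch of the key layout above, assuming only std (this is not the real `FacetGroupKeyCodec`), a key can be thought of as the concatenation below; because the field id is big-endian and the level comes right after it, all the nodes of a given `(field_id, level)` pair are contiguous and reachable with a prefix iterator:

```rust
// field id (u16, big-endian) ++ level (u8) ++ raw left-bound bytes
fn encode_facet_group_key(field_id: u16, level: u8, left_bound: &[u8]) -> Vec<u8> {
    let mut key = Vec::with_capacity(2 + 1 + left_bound.len());
    key.extend_from_slice(&field_id.to_be_bytes());
    key.push(level);
    key.extend_from_slice(left_bound);
    key
}

fn main() {
    // field 3, level 0, string facet value "ab"
    assert_eq!(encode_facet_group_key(3, 0, b"ab"), vec![0, 3, 0, b'a', b'b']);
    // keys sort by field id first, then level, then left bound, so every node of a
    // given (field_id, level) pair can be visited with a single prefix iteration
    assert!(encode_facet_group_key(3, 0, b"zz") < encode_facet_group_key(3, 1, b"ab"));
}
```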
+*/ + +pub const FACET_MAX_GROUP_SIZE: u8 = 8; +pub const FACET_GROUP_SIZE: u8 = 4; +pub const FACET_MIN_LEVEL_SIZE: u8 = 5; + use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; @@ -13,8 +89,8 @@ pub struct FacetsUpdate<'i> { database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, new_data: grenad::Reader, - level_group_size: u8, - max_level_group_size: u8, + group_size: u8, + max_group_size: u8, min_level_size: u8, } impl<'i> FacetsUpdate<'i> { @@ -30,57 +106,24 @@ impl<'i> FacetsUpdate<'i> { Self { index, database, - level_group_size: 4, - max_level_group_size: 8, - min_level_size: 5, + group_size: FACET_GROUP_SIZE, + max_group_size: FACET_MAX_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, new_data, } } - // TODO: use the options below? - // but I don't actually see why they should be configurable - // /// The minimum number of elements that a level is allowed to have. - // pub fn level_max_group_size(mut self, value: u8) -> Self { - // self.max_level_group_size = std::cmp::max(value, 4); - // self - // } - - // /// The number of elements from the level below that are represented by a single element in the level above - // /// - // /// This setting is always greater than or equal to 2. - // pub fn level_group_size(mut self, value: u8) -> Self { - // self.level_group_size = std::cmp::max(value, 2); - // self - // } - - // /// The minimum number of elements that a level is allowed to have. - // pub fn min_level_size(mut self, value: u8) -> Self { - // self.min_level_size = std::cmp::max(value, 2); - // self - // } - pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { if self.new_data.is_empty() { return Ok(()); } - // here, come up with a better condition! - // ideally we'd choose which method to use for each field id individually - // but I dont' think it's worth the effort yet - // As a first requirement, we ask that the length of the new data is less - // than a 1/50th of the length of the database in order to use the incremental - // method. if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { - let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data) - .level_group_size(self.level_group_size) - .min_level_size(self.min_level_size); + let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size); bulk_update.execute(wtxn)?; } else { let incremental_update = - FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data) - .group_size(self.level_group_size) - .max_group_size(self.max_level_group_size) - .min_level_size(self.min_level_size); + FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size, self.max_group_size); incremental_update.execute(wtxn)?; } Ok(()) @@ -346,7 +389,7 @@ mod comparison_bench { // of the incremental vs. bulk indexer. // It appears that the incremental indexer is about 50 times slower than the // bulk indexer. 
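The 50x figure measured here is what motivates the `new_data.len() >= database.len() / 50` threshold in `FacetsUpdate::execute`. A toy cost model (the per-entry constants are assumptions made only for this illustration) gives the same decision boundary:

```rust
// Prefer the bulk rebuild when ~50 cost units per incrementally added entry would
// exceed the ~1 unit per existing entry that a full rebuild touches.
fn prefer_bulk(new_entries: u64, existing_entries: u64) -> bool {
    let incremental_cost = 50 * new_entries; // incremental indexing is about 50x slower per entry
    let bulk_cost = existing_entries;        // rebuilding touches every existing entry once
    incremental_cost >= bulk_cost
}

fn main() {
    // matches the `new_data.len() >= database.len() / 50` check, up to integer rounding
    assert!(prefer_bulk(2_000, 100_000));
    assert!(!prefer_bulk(500, 100_000));
}
```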
- #[test] + // #[test] fn benchmark_facet_indexing() { // then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 96bea9589..7b02fd1af 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -7,7 +7,7 @@ mod typed_chunk; use std::collections::HashSet; use std::io::{Cursor, Read, Seek}; use std::iter::FromIterator; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroU32; use std::result::Result as StdResult; use crossbeam_channel::{Receiver, Sender}; @@ -82,8 +82,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a, F> { #[derive(Default, Debug, Clone)] pub struct IndexDocumentsConfig { - pub facet_level_group_size: Option, - pub facet_min_level_size: Option, pub words_prefix_threshold: Option, pub max_prefix_length: Option, pub words_positions_level_group_size: Option, From fca4577e233d943990757c7b4e1408f8bec7840f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Sep 2022 17:56:38 +0200 Subject: [PATCH 31/58] Return original string in facet distributions, work on facet tests --- milli/src/search/facet/facet_distribution.rs | 249 +++++++++++++++++- .../search/facet/facet_distribution_iter.rs | 104 +++----- milli/src/search/facet/facet_range_search.rs | 72 ++--- .../src/search/facet/facet_sort_ascending.rs | 41 +-- .../src/search/facet/facet_sort_descending.rs | 42 +-- milli/src/search/facet/filter.rs | 6 +- milli/src/search/facet/mod.rs | 37 +++ .../random_looking_index_snap.hash.snap | 4 - .../random_looking_index_snap.hash.snap | 4 - .../random_looking_index_snap.hash.snap | 4 - 10 files changed, 350 insertions(+), 213 deletions(-) delete mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap delete mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap delete mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 7c554d368..7eb438a03 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -140,13 +140,13 @@ impl<'a> FacetDistribution<'a> { self.index.facet_id_f64_docids.remap_key_type::>(), field_id, candidates, - |facet_key, nbr_docids| { + |facet_key, nbr_docids, _| { let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap(); distribution.insert(facet_key.to_string(), nbr_docids); if distribution.len() == self.max_values_per_facet { - ControlFlow::Break(()) + Ok(ControlFlow::Break(())) } else { - ControlFlow::Continue(()) + Ok(ControlFlow::Continue(())) } }, ) @@ -163,13 +163,22 @@ impl<'a> FacetDistribution<'a> { self.index.facet_id_string_docids.remap_key_type::>(), field_id, candidates, - |facet_key, nbr_docids| { + |facet_key, nbr_docids, any_docid| { let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); - distribution.insert(facet_key.to_string(), nbr_docids); + + let key: (FieldId, _, &str) = (field_id, any_docid, facet_key); + let original_string = self + .index + .field_id_docid_facet_strings + .get(self.rtxn, &key)? 
+ .unwrap() + .to_owned(); + + distribution.insert(original_string, nbr_docids); if distribution.len() == self.max_values_per_facet { - ControlFlow::Break(()) + Ok(ControlFlow::Break(())) } else { - ControlFlow::Continue(()) + Ok(ControlFlow::Continue(())) } }, ) @@ -186,7 +195,8 @@ impl<'a> FacetDistribution<'a> { let db = self.index.facet_id_f64_docids; let mut prefix = vec![]; prefix.extend_from_slice(&field_id.to_be_bytes()); - prefix.push(0); + prefix.push(0); // read values from level 0 only + let iter = db .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? @@ -207,10 +217,15 @@ impl<'a> FacetDistribution<'a> { .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? .remap_types::, FacetGroupValueCodec>(); - // TODO: get the original value of the facet somewhere (in the documents DB?) for result in iter { let (key, value) = result?; - distribution.insert(key.left_bound.to_owned(), value.bitmap.len()); + + let docid = value.bitmap.iter().next().unwrap(); + let key: (FieldId, _, &'a str) = (field_id, docid, key.left_bound); + let original_string = + self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned(); + + distribution.insert(original_string, value.bitmap.len()); if distribution.len() == self.max_values_per_facet { break; } @@ -304,3 +319,217 @@ impl fmt::Debug for FacetDistribution<'_> { .finish() } } + +#[cfg(test)] +mod tests { + use big_s::S; + use maplit::hashset; + + use crate::{ + documents::documents_batch_reader_from_objects, index::tests::TempIndex, milli_snap, + FacetDistribution, + }; + + #[test] + fn few_candidates_few_facet_values() { + // All the tests here avoid using the code in `facet_distribution_iter` because there aren't + // enough candidates. + + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .unwrap(); + + let documents = documents!([ + { "colour": "Blue" }, + { "colour": " blue" }, + { "colour": "RED" } + ]); + + index.add_documents(documents).unwrap(); + + let txn = index.read_txn().unwrap(); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([0, 1, 2].iter().copied().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([1, 2].iter().copied().collect()) + .execute() + .unwrap(); + + // I think it would be fine if " blue" was "Blue" instead. 
+ // We just need to get any non-normalised string I think, even if it's not in + // the candidates + milli_snap!(format!("{map:?}"), @r###"{"colour": {" blue": 1, "RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([2].iter().copied().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([0, 1, 2].iter().copied().collect()) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); + } + + #[test] + fn many_candidates_few_facet_values() { + let mut index = TempIndex::new_with_map_size(4096 * 10_000); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .unwrap(); + + let facet_values = ["Red", "RED", " red ", "Blue", "BLUE"]; + + let mut documents = vec![]; + for i in 0..10_000 { + let document = serde_json::json!({ + "colour": facet_values[i % 5], + }) + .as_object() + .unwrap() + .clone(); + documents.push(document); + } + + let documents = documents_batch_reader_from_objects(documents); + + index.add_documents(documents).unwrap(); + + let txn = index.read_txn().unwrap(); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..10_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000}}"###); + } + + #[test] + fn many_candidates_many_facet_values() { + let mut index = TempIndex::new_with_map_size(4096 * 10_000); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| settings.set_filterable_fields(hashset! 
{ S("colour") })) + .unwrap(); + + let facet_values = (0..1000).into_iter().map(|x| format!("{x:x}")).collect::>(); + + let mut documents = vec![]; + for i in 0..10_000 { + let document = serde_json::json!({ + "colour": facet_values[i % 1000], + }) + .as_object() + .unwrap() + .clone(); + documents.push(document); + } + + let documents = documents_batch_reader_from_objects(documents); + + index.add_documents(documents).unwrap(); + + let txn = index.read_txn().unwrap(); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "no_candidates", @"ac9229ed5964d893af96a7076e2f8af5"); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .max_values_per_facet(2) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "no_candidates_with_max_2", @r###"{"colour": {"0": 10, "1": 10}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..10_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "candidates_0_10_000", @"ac9229ed5964d893af96a7076e2f8af5"); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "candidates_0_5_000", @"825f23a4090d05756f46176987b7d992"); + } +} diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 3379d1abe..ad330b8db 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -4,8 +4,9 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, +use crate::{ + heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, + DocumentId, }; pub fn iterate_over_facet_distribution<'t, CB>( @@ -16,7 +17,7 @@ pub fn iterate_over_facet_distribution<'t, CB>( callback: CB, ) -> Result<()> where - CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { let mut fd = FacetDistribution { rtxn, db, field_id, callback }; let highest_level = @@ -32,7 +33,7 @@ where struct FacetDistribution<'t, CB> where - CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { rtxn: &'t heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, @@ -42,7 +43,7 @@ where impl<'t, CB> FacetDistribution<'t, CB> where - CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { fn iterate_level_0( &mut self, @@ -62,7 +63,8 @@ where } let docids_in_common = value.bitmap.intersection_len(candidates); if docids_in_common > 0 { - match (self.callback)(key.left_bound, docids_in_common) { + let any_docid = value.bitmap.iter().next().unwrap(); + match (self.callback)(key.left_bound, docids_in_common, any_docid)? 
{ ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } @@ -112,50 +114,14 @@ where #[cfg(test)] mod tests { + use super::iterate_over_facet_distribution; + use crate::milli_snap; + use crate::search::facet::tests::get_random_looking_index; + use crate::{heed_codec::facet::OrderedF64Codec, search::facet::tests::get_simple_index}; + use heed::BytesDecode; + use roaring::RoaringBitmap; use std::ops::ControlFlow; - use super::iterate_over_facet_distribution; - use crate::heed_codec::facet::OrderedF64Codec; - use crate::milli_snap; - use crate::update::facet::tests::FacetIndex; - use heed::BytesDecode; - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - - fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - for i in 0..256u16 { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(i as u32); - index.insert(&mut txn, 0, &(i as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - let keys = - std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); - - for (_i, key) in keys.into_iter().enumerate() { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - - #[test] - fn random_looking_index_snap() { - let index = get_random_looking_index(); - milli_snap!(format!("{index}")); - } #[test] fn filter_distribution_all() { let indexes = [get_simple_index(), get_random_looking_index()]; @@ -163,11 +129,17 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); - iterate_over_facet_distribution(&txn, index.content, 0, &candidates, |facet, count| { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - results.push_str(&format!("{facet}: {count}\n")); - ControlFlow::Continue(()) - }) + iterate_over_facet_distribution( + &txn, + index.content, + 0, + &candidates, + |facet, count, _| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {count}\n")); + Ok(ControlFlow::Continue(())) + }, + ) .unwrap(); milli_snap!(results, i); @@ -182,17 +154,23 @@ mod tests { let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); let mut nbr_facets = 0; - iterate_over_facet_distribution(&txn, index.content, 0, &candidates, |facet, count| { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - if nbr_facets == 100 { - return ControlFlow::Break(()); - } else { - nbr_facets += 1; - results.push_str(&format!("{facet}: {count}\n")); + iterate_over_facet_distribution( + &txn, + index.content, + 0, + &candidates, + |facet, count, _| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + if nbr_facets == 100 { + return Ok(ControlFlow::Break(())); + } else { + nbr_facets += 1; + results.push_str(&format!("{facet}: {count}\n")); - ControlFlow::Continue(()) - } - }) + Ok(ControlFlow::Continue(())) + } + }, + ) .unwrap(); milli_snap!(results, i); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index cb5fd14d2..c99ac8e92 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ 
b/milli/src/search/facet/facet_range_search.rs @@ -15,7 +15,8 @@ pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, -) -> Result + docids: &mut RoaringBitmap, +) -> Result<()> where BoundCodec: for<'a> BytesEncode<'a>, for<'a> >::EItem: Sized, @@ -45,16 +46,15 @@ where Bound::Unbounded => Bound::Unbounded, }; let db = db.remap_key_type::>(); - let mut docids = RoaringBitmap::new(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; - Ok(docids) + Ok(()) } else { - return Ok(RoaringBitmap::new()); + return Ok(()); } } @@ -255,45 +255,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { #[cfg(test)] mod tests { - use std::ops::Bound; - - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - use super::find_docids_of_facet_within_bounds; use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; use crate::snapshot_tests::display_bitmap; - use crate::update::facet::tests::FacetIndex; - - fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - for i in 0..256u16 { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(i as u32); - index.insert(&mut txn, 0, &(i as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - let keys = - std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); - - for (_i, key) in keys.into_iter().enumerate() { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as f64), &bitmap); - } - txn.commit().unwrap(); - index - } + use roaring::RoaringBitmap; + use std::ops::Bound; #[test] fn random_looking_index_snap() { @@ -310,12 +278,14 @@ mod tests { let i = i as f64; let start = Bound::Included(0.); let end = Bound::Included(i); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); @@ -326,12 +296,14 @@ mod tests { let i = i as f64; let start = Bound::Excluded(0.); let end = Bound::Excluded(i); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); @@ -352,12 +324,14 @@ mod tests { let i = i as f64; let start = Bound::Included(i); let end = Bound::Included(255.); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) 
.unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); @@ -371,12 +345,14 @@ mod tests { let i = i as f64; let start = Bound::Excluded(i); let end = Bound::Excluded(255.); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); @@ -399,12 +375,14 @@ mod tests { let i = i as f64; let start = Bound::Included(i); let end = Bound::Included(255. - i); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); @@ -418,12 +396,14 @@ mod tests { let i = i as f64; let start = Bound::Excluded(i); let end = Bound::Excluded(255. - i); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index f320f9e77..33ca7d1ce 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -83,49 +83,12 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - - use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; use crate::snapshot_tests::display_bitmap; - use crate::update::facet::tests::FacetIndex; + use roaring::RoaringBitmap; - fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - for i in 0..256u16 { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(i as u32); - index.insert(&mut txn, 0, &(i as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - let keys = - std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); - - for (_i, key) in keys.into_iter().enumerate() { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - - #[test] - fn random_looking_index_snap() { - let index = get_random_looking_index(); - milli_snap!(format!("{index}")); - } #[test] fn filter_sort() { let indexes = [get_simple_index(), get_random_looking_index()]; diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index be5fe7841..69f286886 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -116,49 +116,13 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - - use crate::heed_codec::facet::{ByteSliceRef, 
FacetGroupKeyCodec, OrderedF64Codec}; + use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; use crate::snapshot_tests::display_bitmap; - use crate::update::facet::tests::FacetIndex; + use roaring::RoaringBitmap; - fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - for i in 0..256u16 { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(i as u32); - index.insert(&mut txn, 0, &(i as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - let keys = - std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); - - for (_i, key) in keys.into_iter().enumerate() { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - - #[test] - fn random_looking_index_snap() { - let index = get_random_looking_index(); - milli_snap!(format!("{index}")); - } #[test] fn filter_sort_descending() { let indexes = [get_simple_index(), get_random_looking_index()]; diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 15edafb03..4263eea7b 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -278,11 +278,9 @@ impl<'a> Filter<'a> { (Excluded(l), Included(r)) if l >= r => return Ok(()), (_, _) => (), } - let x = facet_range_search::find_docids_of_facet_within_bounds::( - rtxn, db, field_id, &left, &right, + facet_range_search::find_docids_of_facet_within_bounds::( + rtxn, db, field_id, &left, &right, output, )?; - // TODO: the facet range search should take a mutable roaring bitmap as argument - *output = x; Ok(()) } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index fc71acf37..415c2b51a 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -74,3 +74,40 @@ pub(crate) fn get_highest_level<'t>( }) .unwrap_or(0)) } + +#[cfg(test)] +pub(crate) mod tests { + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + + use crate::{heed_codec::facet::OrderedF64Codec, update::facet::tests::FacetIndex}; + + pub fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + txn.commit().unwrap(); + index + } +} diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap 
b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap deleted file mode 100644 index 661e1a35b..000000000 --- a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/search/facet/facet_distribution_iter.rs ---- -3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap deleted file mode 100644 index 64ff762db..000000000 --- a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/search/facet/facet_sort_ascending.rs ---- -3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap deleted file mode 100644 index 0649e3c5d..000000000 --- a/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/search/facet/facet_sort_descending.rs ---- -3256c76a7c1b768a013e78d5fa6e9ff9 From 3d7ed3263f3cfb4eed14b446de62f04dc9ef6efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Sep 2022 18:00:00 +0200 Subject: [PATCH 32/58] Fix bug in string facet distribution with few candidates --- milli/src/search/facet/facet_distribution.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 7eb438a03..0eaeec399 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -96,7 +96,7 @@ impl<'a> FacetDistribution<'a> { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); let db = self.index.field_id_docid_facet_strings; - for docid in candidates.into_iter() { + 'outer: for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(&docid.to_be_bytes()); let iter = db @@ -112,7 +112,7 @@ impl<'a> FacetDistribution<'a> { *count += 1; if normalized_distribution.len() == self.max_values_per_facet { - break; + break 'outer; } } } @@ -393,7 +393,7 @@ mod tests { .execute() .unwrap(); - milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 1}}"###); } #[test] From b1ab09196cdd97549a5e960fb33fd2f2018244d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Sep 2022 18:03:52 +0200 Subject: [PATCH 33/58] Remove outdated TODOs --- milli/src/update/facet/bulk.rs | 2 -- .../index_documents/extract/extract_facet_string_docids.rs | 2 +- milli/src/update/index_documents/extract/mod.rs | 2 +- milli/src/update/index_documents/typed_chunk.rs | 2 -- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 83fa51003..3a371995e 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -177,8 +177,6 @@ 
impl FacetsUpdateBulkInner { field_id: FieldId, txn: &RoTxn, ) -> Result<(Vec>, RoaringBitmap)> { - // TODO: first check whether there is anything in level 0? - let mut all_docids = RoaringBitmap::new(); let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { for bitmap in bitmaps { diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index e6a41067b..a7b027ce3 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -21,7 +21,7 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_cbo_roaring_bitmaps, // TODO: check that it is correct + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 208dfc74d..5f557d812 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -142,7 +142,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, - merge_cbo_roaring_bitmaps, // TODO: check (cbo?) + merge_cbo_roaring_bitmaps, TypedChunk::FieldIdFacetStringDocids, "field-id-facet-string-docids", ); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index f11414f20..16784bd92 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -138,13 +138,11 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { - // TODO indexer options for the facet level database let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?; is_merged_database = true; } TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => { - // TODO indexer options for the facet level database let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter); indexer.execute(wtxn)?; is_merged_database = true; From 985a94adfc6fbe4f333f39572b6b7e6f1f1a46b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Sep 2022 18:04:07 +0200 Subject: [PATCH 34/58] cargo fmt --- milli/src/search/facet/facet_distribution.rs | 7 ++- .../search/facet/facet_distribution_iter.rs | 18 ++++---- milli/src/search/facet/facet_range_search.rs | 6 ++- .../src/search/facet/facet_sort_ascending.rs | 3 +- .../src/search/facet/facet_sort_descending.rs | 3 +- milli/src/search/facet/mod.rs | 3 +- milli/src/update/facet/bulk.rs | 27 +++++++----- milli/src/update/facet/incremental.rs | 19 ++++---- milli/src/update/facet/mod.rs | 44 ++++++++++++------- 9 files changed, 78 insertions(+), 52 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 0eaeec399..2e2e448c2 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -325,10 +325,9 @@ mod tests { use big_s::S; use maplit::hashset; - use crate::{ - documents::documents_batch_reader_from_objects, index::tests::TempIndex, milli_snap, - FacetDistribution, - }; + use 
crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + use crate::{milli_snap, FacetDistribution}; #[test] fn few_candidates_few_facet_values() { diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index ad330b8db..01266187a 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -4,10 +4,10 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::{ - heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, - DocumentId, +use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, }; +use crate::DocumentId; pub fn iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, @@ -114,13 +114,15 @@ where #[cfg(test)] mod tests { - use super::iterate_over_facet_distribution; - use crate::milli_snap; - use crate::search::facet::tests::get_random_looking_index; - use crate::{heed_codec::facet::OrderedF64Codec, search::facet::tests::get_simple_index}; + use std::ops::ControlFlow; + use heed::BytesDecode; use roaring::RoaringBitmap; - use std::ops::ControlFlow; + + use super::iterate_over_facet_distribution; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::milli_snap; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; #[test] fn filter_distribution_all() { diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index c99ac8e92..8934873b7 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -255,13 +255,15 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { #[cfg(test)] mod tests { + use std::ops::Bound; + + use roaring::RoaringBitmap; + use super::find_docids_of_facet_within_bounds; use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; use crate::snapshot_tests::display_bitmap; - use roaring::RoaringBitmap; - use std::ops::Bound; #[test] fn random_looking_index_snap() { diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 33ca7d1ce..6567fe95e 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -83,11 +83,12 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { + use roaring::RoaringBitmap; + use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; use crate::snapshot_tests::display_bitmap; - use roaring::RoaringBitmap; #[test] fn filter_sort() { diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 69f286886..2eab9fca1 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -116,12 +116,13 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { + use roaring::RoaringBitmap; + use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::tests::{get_random_looking_index, 
get_simple_index}; use crate::snapshot_tests::display_bitmap; - use roaring::RoaringBitmap; #[test] fn filter_sort_descending() { diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 415c2b51a..18c3a652b 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -80,7 +80,8 @@ pub(crate) mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::{heed_codec::facet::OrderedF64Codec, update::facet::tests::FacetIndex}; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::update::facet::tests::FacetIndex; pub fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 3a371995e..e82af5d66 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,19 +1,20 @@ +use std::borrow::Cow; +use std::fs::File; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use log::debug; +use roaring::RoaringBitmap; +use time::OffsetDateTime; + +use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; -use roaring::RoaringBitmap; -use std::borrow::Cow; -use std::fs::File; -use time::OffsetDateTime; - -use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases /// by rebuilding the database "from scratch". 
@@ -342,11 +343,13 @@ impl FacetsUpdateBulkInner { #[cfg(test)] mod tests { + use std::iter::once; + + use roaring::RoaringBitmap; + use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::update::facet::tests::FacetIndex; - use roaring::RoaringBitmap; - use std::iter::once; #[test] fn insert() { diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 6be2dbf03..a06c8e1c2 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,14 +1,16 @@ +use std::collections::HashMap; +use std::fs::File; + +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use roaring::RoaringBitmap; + use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::search::facet::get_highest_level; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{BytesDecode, Error, RoTxn, RwTxn}; -use roaring::RoaringBitmap; -use std::collections::HashMap; -use std::fs::File; enum InsertionResult { InPlace, @@ -613,13 +615,14 @@ impl<'a> FacetGroupKey> { #[cfg(test)] mod tests { - use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec}; - use crate::milli_snap; - use crate::update::facet::tests::FacetIndex; use rand::seq::SliceRandom; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; + use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec}; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; + #[test] fn append() { let index = FacetIndex::::new(4, 8, 5); diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index ea6468538..9263d3a6a 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -53,8 +53,8 @@ FacetGroupValue: ``` When the database is first created using the "bulk" method, each node has a fixed number of children -(except for possibly the last one) given by the `group_size` parameter (default to `FACET_GROUP_SIZE`). -The tree is also built such that the highest level has more than `min_level_size` +(except for possibly the last one) given by the `group_size` parameter (default to `FACET_GROUP_SIZE`). +The tree is also built such that the highest level has more than `min_level_size` (default to `FACET_MIN_LEVEL_SIZE`) elements in it. When the database is incrementally updated, the number of children of a node can vary between @@ -66,7 +66,7 @@ When adding documents to the databases, it is important to determine which metho minimise indexing time. The incremental method is faster when adding few new facet values, but the bulk method is faster when a large part of the database is modified. Empirically, it seems that it takes 50x more time to incrementally add N facet values to an existing database than it is to -construct a database of N facet values. This is the heuristic that is used to choose between the +construct a database of N facet values. This is the heuristic that is used to choose between the two methods. 
*/ @@ -74,12 +74,13 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8; pub const FACET_GROUP_SIZE: u8 = 4; pub const FACET_MIN_LEVEL_SIZE: u8 = 5; +use std::fs::File; + use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::{Index, Result}; -use std::fs::File; pub mod bulk; pub mod incremental; @@ -119,11 +120,23 @@ impl<'i> FacetsUpdate<'i> { return Ok(()); } if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { - let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size); + let bulk_update = FacetsUpdateBulk::new( + self.index, + self.facet_type, + self.new_data, + self.group_size, + self.min_level_size, + ); bulk_update.execute(wtxn)?; } else { - let incremental_update = - FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size, self.max_group_size); + let incremental_update = FacetsUpdateIncremental::new( + self.index, + self.facet_type, + self.new_data, + self.group_size, + self.min_level_size, + self.max_group_size, + ); incremental_update.execute(wtxn)?; } Ok(()) @@ -132,6 +145,14 @@ impl<'i> FacetsUpdate<'i> { #[cfg(test)] pub(crate) mod tests { + use std::fmt::Display; + use std::marker::PhantomData; + use std::rc::Rc; + + use heed::types::ByteSlice; + use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; + use roaring::RoaringBitmap; + use super::bulk::FacetsUpdateBulkInner; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, @@ -140,12 +161,6 @@ pub(crate) mod tests { use crate::snapshot_tests::display_bitmap; use crate::update::FacetsUpdateIncrementalInner; use crate::CboRoaringBitmapCodec; - use heed::types::ByteSlice; - use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; - use roaring::RoaringBitmap; - use std::fmt::Display; - use std::marker::PhantomData; - use std::rc::Rc; // A dummy index that only contains the facet database, used for testing pub struct FacetIndex @@ -381,9 +396,8 @@ mod comparison_bench { use rand::Rng; use roaring::RoaringBitmap; - use crate::heed_codec::facet::OrderedF64Codec; - use super::tests::FacetIndex; + use crate::heed_codec::facet::OrderedF64Codec; // This is a simple test to get an intuition on the relative speed // of the incremental vs. bulk indexer. 
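[Editor's aside, not part of the patch series: the 50x heuristic documented in `milli/src/update/facet/mod.rs` above can be summarised by the following minimal sketch. The `FacetsUpdateBulk`/`FacetsUpdateIncremental` names and the `database.len() / 50` threshold are taken from the diff above; the free function, its parameters, and the example numbers are made up for illustration.]

```rust
/// Illustrative sketch only: pick the indexing strategy for a batch of new facet values.
/// Incrementally inserting N values is assumed to be ~50x slower than rebuilding a
/// database of N values from scratch, so a rebuild pays off once the batch is large enough.
fn should_use_bulk_update(new_data_len: u64, database_len: u64) -> bool {
    // Rebuild from scratch (FacetsUpdateBulk) when the new data is at least 1/50th
    // of the existing database; otherwise patch the tree in place (FacetsUpdateIncremental).
    new_data_len >= database_len / 50
}

fn main() {
    assert!(should_use_bulk_update(2_000, 100_000)); // large batch: bulk rebuild
    assert!(!should_use_bulk_update(10, 100_000)); // small batch: incremental insertion
}
```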
From de52a9bf75e3fc9b2c8a7f86511daef356504711 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Sep 2022 08:47:40 +0200 Subject: [PATCH 35/58] Improve documentation of some facet-related algorithms --- milli/src/search/criteria/asc_desc.rs | 5 ++-- .../search/facet/facet_distribution_iter.rs | 19 ++++++++++---- .../src/search/facet/facet_sort_ascending.rs | 22 ++++++++++++++++ .../src/search/facet/facet_sort_descending.rs | 3 +++ milli/src/search/facet/mod.rs | 25 ++++++++++++------- 5 files changed, 57 insertions(+), 17 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index bb2788cc8..76dd3db29 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -9,9 +9,8 @@ use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::facet_sort_ascending::ascending_facet_sort; -use crate::search::facet::facet_sort_descending::descending_facet_sort; -// use crate::search::facet::FacetStringIter; +use crate::search::facet::ascending_facet_sort; +use crate::search::facet::descending_facet_sort; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 01266187a..ab546f7a9 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,14 +1,23 @@ -use std::ops::ControlFlow; - -use heed::Result; -use roaring::RoaringBitmap; - use super::{get_first_facet_value, get_highest_level}; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, }; use crate::DocumentId; +use heed::Result; +use roaring::RoaringBitmap; +use std::ops::ControlFlow; +/// Call the given closure on the facet distribution of the candidate documents. +/// +/// The arguments to the closure are: +/// - the facet value, as a byte slice +/// - the number of documents among the candidates that contain this facet value +/// - the id of a document which contains the facet value. Note that this document +/// is not necessarily from the list of candidates, it is simply *any* document which +/// contains this facet value. +/// +/// The return value of the closure is a `ControlFlow<()>` which indicates whether we should +/// keep iterating over the different facet values or stop. pub fn iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 6567fe95e..2b0a45e15 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -6,6 +6,28 @@ use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; +/// Return an iterator which iterates over the given candidate documents in +/// ascending order of their facet value for the given field id. +/// +/// The documents returned by the iterator are grouped by the facet values that +/// determined their rank. 
For example, given the documents: +/// +/// ```ignore +/// 0: { "colour": ["blue", "green"] } +/// 1: { "colour": ["blue", "red"] } +/// 2: { "colour": ["orange", "red"] } +/// 3: { "colour": ["green", "red"] } +/// 4: { "colour": ["blue", "orange", "red"] } +/// ``` +/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator +/// over the following elements: +/// ```ignore +/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue" +/// [3] // same for "green" +/// [2] // same for "orange" +/// END +/// ``` +/// Note that once a document id is returned by the iterator, it is never returned again. pub fn ascending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 2eab9fca1..47d0f145b 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -8,6 +8,9 @@ use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; +/// See documentationg for [`ascending_facet_sort`](super::ascending_facet_sort). +/// +/// This function does the same thing, but in the opposite order. pub fn descending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 18c3a652b..b880c2e01 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,17 +1,19 @@ -use heed::types::ByteSlice; -use heed::{BytesDecode, RoTxn}; - pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::Filter; use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; +pub use facet_sort_ascending::ascending_facet_sort; +pub use facet_sort_descending::descending_facet_sort; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, RoTxn}; mod facet_distribution; mod facet_distribution_iter; mod facet_range_search; -pub mod facet_sort_ascending; -pub mod facet_sort_descending; +mod facet_sort_ascending; +mod facet_sort_descending; mod filter; +/// Get the first facet value in the facet database pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: heed::Database, FacetGroupValueCodec>, @@ -23,8 +25,9 @@ where let mut level0prefix = vec![]; level0prefix.extend_from_slice(&field_id.to_be_bytes()); level0prefix.push(0); - let mut level0_iter_forward = - db.as_polymorph().prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; + let mut level0_iter_forward = db + .as_polymorph() + .prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?; if let Some(first) = level0_iter_forward.next() { let (first_key, _) = first?; let first_key = FacetGroupKeyCodec::::bytes_decode(first_key) @@ -34,6 +37,8 @@ where Ok(None) } } + +/// Get the last facet value in the facet database pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: heed::Database, FacetGroupValueCodec>, @@ -47,7 +52,7 @@ where level0prefix.push(0); let mut level0_iter_backward = db .as_polymorph() - .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; + .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?; if let Some(last) = level0_iter_backward.next() { let (last_key, _) = last?; let last_key = 
FacetGroupKeyCodec::::bytes_decode(last_key) @@ -57,6 +62,8 @@ where Ok(None) } } + +/// Get the height of the highest level in the facet database pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, @@ -65,7 +72,7 @@ pub(crate) fn get_highest_level<'t>( let field_id_prefix = &field_id.to_be_bytes(); Ok(db .as_polymorph() - .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix)? + .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, field_id_prefix)? .next() .map(|el| { let (key, _) = el.unwrap(); From 86d9f50b9c3d9456f1ba738a2b35fcfabbc688ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Sep 2022 11:53:01 +0200 Subject: [PATCH 36/58] Fix bugs in incremental facet indexing with variable parameters e.g. add one facet value incrementally with a group_size = X and then add another one with group_size = Y It is not actually possible to do so with the public API of milli, but I wanted to make sure the algorithm worked well in those cases anyway. The bugs were found by fuzzing the code with fuzzcheck, which I've added to milli as a conditional dev-dependency. But it can be removed later. --- .gitignore | 2 + milli/Cargo.toml | 3 + milli/src/lib.rs | 2 + milli/src/search/criteria/asc_desc.rs | 3 +- .../search/facet/facet_distribution_iter.rs | 8 +- milli/src/search/facet/mod.rs | 7 +- milli/src/update/facet/incremental.rs | 614 +++++++++++------- milli/src/update/facet/mod.rs | 67 +- 8 files changed, 435 insertions(+), 271 deletions(-) diff --git a/.gitignore b/.gitignore index cef7b7b4c..39623a232 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ /target /Cargo.lock +milli/target/ + # datasets *.csv *.mmdb diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 658ef0d24..2f881fccb 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -56,6 +56,9 @@ maplit = "1.0.2" md5 = "0.7.0" rand = {version = "0.8.5", features = ["small_rng"] } +[target.'cfg(fuzzing)'.dev-dependencies] +fuzzcheck = { path = "../../fuzzcheck-rs/fuzzcheck" } + [features] default = [ "charabia/default" ] diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ffbe8f38f..630d13125 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,3 +1,5 @@ +#![cfg_attr(all(test, fuzzing), feature(no_coverage))] + #[macro_use] pub mod documents; diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 76dd3db29..586605116 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -9,8 +9,7 @@ use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::ascending_facet_sort; -use crate::search::facet::descending_facet_sort; +use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index ab546f7a9..4c6dc75fa 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,11 +1,13 @@ +use std::ops::ControlFlow; + +use heed::Result; +use roaring::RoaringBitmap; + use super::{get_first_facet_value, get_highest_level}; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, 
FacetGroupValueCodec, }; use crate::DocumentId; -use heed::Result; -use roaring::RoaringBitmap; -use std::ops::ControlFlow; /// Call the given closure on the facet distribution of the candidate documents. /// diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index b880c2e01..be04fbd7f 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,11 +1,12 @@ -pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; -pub use self::filter::Filter; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; pub use facet_sort_ascending::ascending_facet_sort; pub use facet_sort_descending::descending_facet_sort; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, RoTxn}; +pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; +pub use self::filter::Filter; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; + mod facet_distribution; mod facet_distribution_iter; mod facet_range_search; diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index a06c8e1c2..c2115aee5 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -14,6 +14,7 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; enum InsertionResult { InPlace, + Expand, Insert, } enum DeletionResult { @@ -251,6 +252,7 @@ impl FacetsUpdateIncrementalInner { return Ok(InsertionResult::InPlace); } + InsertionResult::Expand => {} InsertionResult::Insert => {} } @@ -258,7 +260,7 @@ impl FacetsUpdateIncrementalInner { // of a new key. Therefore, it may be the case that we need to modify the left bound of the // insertion key (see documentation of `find_insertion_key_value` for an example of when that // could happen). - let insertion_key = { + let (insertion_key, insertion_key_was_modified) = { let mut new_insertion_key = insertion_key.clone(); let mut key_should_be_modified = false; @@ -271,7 +273,7 @@ impl FacetsUpdateIncrementalInner { assert!(is_deleted); self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; } - new_insertion_key + (new_insertion_key, key_should_be_modified) }; // Now we know that the insertion key contains the `facet_value`. @@ -280,20 +282,25 @@ impl FacetsUpdateIncrementalInner { // 2. Merge the previous docids with the new one let mut updated_value = insertion_value; - updated_value.size += 1; + if matches!(result, InsertionResult::Insert) { + updated_value.size += 1; + } if updated_value.size < max_group_size { updated_value.bitmap |= docids; self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - - return Ok(InsertionResult::InPlace); + if insertion_key_was_modified { + return Ok(InsertionResult::Expand); + } else { + return Ok(InsertionResult::InPlace); + } } // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` // Therefore it must be split into two nodes. 
- let size_left = max_group_size / 2; - let size_right = max_group_size - size_left; + let size_left = updated_value.size / 2; + let size_right = updated_value.size - size_left; let level_below = level - 1; @@ -303,7 +310,8 @@ impl FacetsUpdateIncrementalInner { left_bound: insertion_key.left_bound.as_slice(), }; - let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize); + let mut iter = + self.db.range(&txn, &(start_key..))?.take((size_left as usize) + (size_right as usize)); let group_left = { let mut values_left = RoaringBitmap::new(); @@ -368,6 +376,7 @@ impl FacetsUpdateIncrementalInner { self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; match result { InsertionResult::InPlace => return Ok(()), + InsertionResult::Expand => return Ok(()), InsertionResult::Insert => {} } @@ -393,8 +402,11 @@ impl FacetsUpdateIncrementalInner { .as_polymorph() .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &highest_level_prefix)?; + let nbr_new_groups = size_highest_level / self.group_size as usize; + let nbr_leftover_elements = size_highest_level % self.group_size as usize; + let mut to_add = vec![]; - for _ in 0..self.min_level_size { + for _ in 0..nbr_new_groups { let mut first_key = None; let mut values = RoaringBitmap::new(); for _ in 0..group_size { @@ -415,6 +427,30 @@ impl FacetsUpdateIncrementalInner { let value = FacetGroupValue { size: group_size as u8, bitmap: values }; to_add.push((key.into_owned(), value)); } + // now we add the rest of the level, in case its size is > group_size * min_level_size + // this can indeed happen if the min_level_size parameter changes between two calls to `insert` + if nbr_leftover_elements > 0 { + let mut first_key = None; + let mut values = RoaringBitmap::new(); + for _ in 0..nbr_leftover_elements { + let (key_bytes, value_i) = groups_iter.next().unwrap()?; + let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)?; + + if first_key.is_none() { + first_key = Some(key_i); + } + values |= value_i.bitmap; + } + let key = FacetGroupKey { + field_id, + level: highest_level + 1, + left_bound: first_key.unwrap().left_bound, + }; + let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values }; + to_add.push((key.into_owned(), value)); + } + drop(groups_iter); for (key, value) in to_add { self.db.put(txn, &key.as_ref(), &value)?; @@ -983,243 +1019,345 @@ mod tests { // fuzz tests } -// #[cfg(all(test, fuzzing))] -// mod fuzz { -// use crate::codec::U16Codec; +#[cfg(all(test, fuzzing))] +mod fuzz { + use std::borrow::Cow; + use std::collections::{BTreeMap, HashMap}; + use std::convert::TryFrom; + use std::rc::Rc; -// use super::tests::verify_structure_validity; -// use super::*; -// use fuzzcheck::mutators::integer_within_range::U16WithinRangeMutator; -// use fuzzcheck::DefaultMutator; -// use roaring::RoaringBitmap; -// use std::collections::BTreeMap; -// use std::collections::HashMap; + use fuzzcheck::mutators::integer_within_range::{U16WithinRangeMutator, U8WithinRangeMutator}; + use fuzzcheck::DefaultMutator; + use heed::BytesEncode; + use roaring::RoaringBitmap; + use tempfile::TempDir; -// #[derive(Default)] -// pub struct TrivialDatabase { -// pub elements: BTreeMap>, -// } -// impl TrivialDatabase -// where -// T: Ord + Clone + Copy + Eq + std::fmt::Debug, -// { -// pub fn insert(&mut self, field_id: u16, new_key: T, new_values: &RoaringBitmap) { -// if new_values.is_empty() { -// return; -// } -// let values_field_id = 
self.elements.entry(field_id).or_default(); -// let values = values_field_id.entry(new_key).or_default(); -// *values |= new_values; -// } -// pub fn delete(&mut self, field_id: u16, key: T, value: u32) { -// if let Some(values_field_id) = self.elements.get_mut(&field_id) { -// if let Some(values) = values_field_id.get_mut(&key) { -// values.remove(value); -// if values.is_empty() { -// values_field_id.remove(&key); -// } -// } -// if values_field_id.is_empty() { -// self.elements.remove(&field_id); -// } -// } -// } -// } -// #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] -// struct Operation { -// key: Key, -// #[field_mutator(U16WithinRangeMutator = { U16WithinRangeMutator::new(..=3) })] -// field_id: u16, -// kind: OperationKind, -// } -// #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] -// enum OperationKind { -// Insert(Vec), -// Delete(u8), -// } + use super::*; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; -// fn compare_with_trivial_database( -// tempdir: Rc, -// group_size: u8, -// max_group_size: u8, -// operations: &[Operation], -// ) { -// let index = FacetIndex::::open_from_tempdir(tempdir, group_size, max_group_size); -// let mut trivial_db = TrivialDatabase::::default(); -// let mut value_to_keys = HashMap::>::new(); -// let mut txn = index.env.write_txn().unwrap(); -// for Operation { key, field_id, kind } in operations { -// match kind { -// OperationKind::Insert(values) => { -// let mut bitmap = RoaringBitmap::new(); -// for value in values { -// bitmap.insert(*value as u32); -// value_to_keys.entry(*value).or_default().push(*key); -// } -// index.insert(&mut txn, *field_id, key, &bitmap); -// trivial_db.insert(*field_id, *key, &bitmap); -// } -// OperationKind::Delete(value) => { -// if let Some(keys) = value_to_keys.get(value) { -// for key in keys { -// index.delete(&mut txn, *field_id, key, *value as u32); -// trivial_db.delete(*field_id, *key, *value as u32); -// } -// } -// } -// } -// } -// for (field_id, values_field_id) in trivial_db.elements.iter() { -// let level0iter = index -// .db -// .content -// .as_polymorph() -// .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( -// &mut txn, -// &field_id.to_be_bytes(), -// ) -// .unwrap(); + struct NEU16Codec; + impl<'a> BytesEncode<'a> for NEU16Codec { + type EItem = u16; + #[no_coverage] + fn bytes_encode(item: &'a Self::EItem) -> Option> { + Some(Cow::Owned(item.to_be_bytes().to_vec())) + } + } + impl<'a> BytesDecode<'a> for NEU16Codec { + type DItem = u16; + #[no_coverage] + fn bytes_decode(bytes: &'a [u8]) -> Option { + let bytes = <[u8; 2]>::try_from(&bytes[0..=1]).unwrap(); + Some(u16::from_be_bytes(bytes)) + } + } -// for ((key, values), group) in values_field_id.iter().zip(level0iter) { -// let (group_key, group_values) = group.unwrap(); -// let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); -// assert_eq!(key, &group_key.left_bound); -// assert_eq!(values, &group_values.bitmap); -// } -// } + #[derive(Default)] + pub struct TrivialDatabase { + pub elements: BTreeMap>, + } + impl TrivialDatabase + where + T: Ord + Clone + Copy + Eq + std::fmt::Debug, + { + #[no_coverage] + pub fn insert(&mut self, field_id: u16, new_key: T, new_values: &RoaringBitmap) { + if new_values.is_empty() { + return; + } + let values_field_id = self.elements.entry(field_id).or_default(); + let values = values_field_id.entry(new_key).or_default(); + *values |= new_values; + } + #[no_coverage] + pub fn delete(&mut self, field_id: u16, key: T, 
value: u32) { + if let Some(values_field_id) = self.elements.get_mut(&field_id) { + if let Some(values) = values_field_id.get_mut(&key) { + values.remove(value); + if values.is_empty() { + values_field_id.remove(&key); + } + } + if values_field_id.is_empty() { + self.elements.remove(&field_id); + } + } + } + } + #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] + struct Operation { + key: Key, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + group_size: u8, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + max_group_size: u8, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + min_level_size: u8, + #[field_mutator(U16WithinRangeMutator = { U16WithinRangeMutator::new(..=3) })] + field_id: u16, + kind: OperationKind, + } + #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] + enum OperationKind { + Insert(Vec), + Delete(u8), + } -// txn.commit().unwrap(); -// let mut txn = index.env.write_txn().unwrap(); -// for (field_id, values_field_id) in trivial_db.elements.iter() { -// let level0iter = index -// .db -// .content -// .as_polymorph() -// .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) -// .unwrap(); + #[no_coverage] + fn compare_with_trivial_database(tempdir: Rc, operations: &[Operation]) { + let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten + // let mut txn = index.env.write_txn().unwrap(); + let mut txn = index.env.write_txn().unwrap(); -// for ((key, values), group) in values_field_id.iter().zip(level0iter) { -// let (group_key, group_values) = group.unwrap(); -// let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); -// assert_eq!(key, &group_key.left_bound); -// assert_eq!(values, &group_values.bitmap); -// } -// index.verify_structure_validity(*field_id); -// } + let mut trivial_db = TrivialDatabase::::default(); + let mut value_to_keys = HashMap::>::new(); + for Operation { key, group_size, max_group_size, min_level_size, field_id, kind } in + operations + { + index.set_group_size(*group_size); + index.set_max_group_size(*max_group_size); + index.set_min_level_size(*min_level_size); + match kind { + OperationKind::Insert(values) => { + let mut bitmap = RoaringBitmap::new(); + for value in values { + bitmap.insert(*value as u32); + value_to_keys.entry(*value).or_default().push(*key); + } + index.insert(&mut txn, *field_id, key, &bitmap); + trivial_db.insert(*field_id, *key, &bitmap); + } + OperationKind::Delete(value) => { + if let Some(keys) = value_to_keys.get(value) { + for key in keys { + index.delete(&mut txn, *field_id, key, *value as u32); + trivial_db.delete(*field_id, *key, *value as u32); + } + } + } + } + } -// index.db.content.clear(&mut txn).unwrap(); -// txn.commit().unwrap(); -// } + for (field_id, values_field_id) in trivial_db.elements.iter() { + let level0iter = index + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + &mut txn, + &field_id.to_be_bytes(), + ) + .unwrap(); -// #[test] -// fn fuzz() { -// let tempdir = Rc::new(TempDir::new().unwrap()); -// let tempdir_cloned = tempdir.clone(); -// let result = fuzzcheck::fuzz_test(move |x: &(u8, u8, Vec>)| { -// compare_with_trivial_database(tempdir_cloned.clone(), x.0, x.1, &x.2) -// }) -// .default_mutator() -// .serde_serializer() -// .default_sensor_and_pool_with_custom_filter(|file, function| { -// if file.is_relative() -// && 
!function.contains("serde") -// && !function.contains("tests::") -// && !function.contains("fuzz::") -// && !function.contains("display_bitmap") -// { -// true -// } else { -// false -// } -// }) -// .arguments_from_cargo_fuzzcheck() -// .launch(); -// assert!(!result.found_test_failure); -// } + for ((key, values), group) in values_field_id.iter().zip(level0iter) { + let (group_key, group_values) = group.unwrap(); + let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + assert_eq!(key, &group_key.left_bound); + assert_eq!(values, &group_values.bitmap); + } + } -// #[test] -// fn reproduce_bug() { -// let operations = r#" -// [ -// {"key":0, "field_id": 0, "kind":{"Insert":[109]}}, -// {"key":143, "field_id": 0, "kind":{"Insert":[243]}}, -// {"key":90, "field_id": 0, "kind":{"Insert":[217]}}, -// {"key":172, "field_id": 0, "kind":{"Insert":[94]}}, -// {"key":27, "field_id": 0, "kind":{"Insert":[4]}}, -// {"key":124, "field_id": 0, "kind":{"Insert":[0]}}, -// {"key":123, "field_id": 0, "kind":{"Insert":[0]}}, -// {"key":67, "field_id": 0, "kind":{"Insert":[109]}}, -// {"key":13, "field_id": 0, "kind":{"Insert":[0]}}, -// {"key":162, "field_id": 0, "kind":{"Insert":[213]}}, -// {"key":235, "field_id": 0, "kind":{"Insert":[67]}}, -// {"key":251, "field_id": 0, "kind":{"Insert":[50]}}, -// {"key":218, "field_id": 0, "kind":{"Insert":[164]}}, -// {"key":166, "field_id": 0, "kind":{"Insert":[67]}}, -// {"key":64, "field_id": 0, "kind":{"Insert":[61]}}, -// {"key":183, "field_id": 0, "kind":{"Insert":[210]}}, -// {"key":250, "field_id": 0, "kind":{"Delete":50}} -// ] -// "#; -// let operations: Vec> = serde_json::from_str(operations).unwrap(); -// let tempdir = TempDir::new().unwrap(); -// compare_with_trivial_database(Rc::new(tempdir), 4, 8, &operations); -// } + for (field_id, values_field_id) in trivial_db.elements.iter() { + let level0iter = index + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) + .unwrap(); -// #[test] -// fn reproduce_bug2() { -// let operations = r#" -// [ -// {"key":102, "field_id": 0, "kind":{"Insert":[122]}}, -// {"key":73, "field_id": 0, "kind":{"Insert":[132]}}, -// {"key":20, "field_id": 0, "kind":{"Insert":[215]}}, -// {"key":39, "field_id": 0, "kind":{"Insert":[152]}}, -// {"key":151, "field_id": 0, "kind":{"Insert":[226]}}, -// {"key":17, "field_id": 0, "kind":{"Insert":[101]}}, -// {"key":74, "field_id": 0, "kind":{"Insert":[210]}}, -// {"key":2, "field_id": 0, "kind":{"Insert":[130]}}, -// {"key":64, "field_id": 0, "kind":{"Insert":[180]}}, -// {"key":83, "field_id": 0, "kind":{"Insert":[250]}}, -// {"key":80, "field_id": 0, "kind":{"Insert":[210]}}, -// {"key":113, "field_id": 0, "kind":{"Insert":[63]}}, -// {"key":201, "field_id": 0, "kind":{"Insert":[210]}}, -// {"key":200, "field_id": 0, "kind":{"Insert":[5]}}, -// {"key":93, "field_id": 0, "kind":{"Insert":[98]}}, -// {"key":162, "field_id": 0, "kind":{"Insert":[5]}}, -// {"key":80, "field_id": 0, "kind":{"Delete":210}} -// ] -// "#; -// let operations: Vec> = serde_json::from_str(operations).unwrap(); -// let tempdir = TempDir::new().unwrap(); -// compare_with_trivial_database(Rc::new(tempdir), 4, 8, &operations); -// } -// #[test] -// fn reproduce_bug3() { -// let operations = r#" -// [ -// {"key":27488, "field_id": 0, "kind":{"Insert":[206]}}, -// {"key":64716, "field_id": 0, "kind":{"Insert":[216]}}, -// {"key":60886, "field_id": 0, "kind":{"Insert":[206]}}, -// {"key":59509, "field_id": 0, "kind":{"Insert":[187,231]}}, 
-// {"key":55057, "field_id": 0, "kind":{"Insert":[37]}}, -// {"key":45200, "field_id": 0, "kind":{"Insert":[206]}}, -// {"key":55056, "field_id": 0, "kind":{"Insert":[37]}}, -// {"key":63679, "field_id": 0, "kind":{"Insert":[206]}}, -// {"key":52155, "field_id": 0, "kind":{"Insert":[74]}}, -// {"key":20648, "field_id": 0, "kind":{"Insert":[47,138,157]}} -// ] -// "#; -// let operations: Vec> = serde_json::from_str(operations).unwrap(); -// let tempdir = TempDir::new().unwrap(); -// compare_with_trivial_database(Rc::new(tempdir), 0, 7, &operations); -// } + for ((key, values), group) in values_field_id.iter().zip(level0iter) { + let (group_key, group_values) = group.unwrap(); + let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + assert_eq!(key, &group_key.left_bound); + assert_eq!(values, &group_values.bitmap); + } + index.verify_structure_validity(&txn, *field_id); + } + txn.abort().unwrap(); + } -// #[test] -// fn reproduce_bug4() { -// let operations = r#" -// [{"key":63499, "field_id": 0, "kind":{"Insert":[87]}},{"key":25374, "field_id": 0, "kind":{"Insert":[14]}},{"key":64481, "field_id": 0, "kind":{"Delete":87}},{"key":23038, "field_id": 0, "kind":{"Insert":[173]}},{"key":14862, "field_id": 0, "kind":{"Insert":[8]}},{"key":13145, "field_id": 0, "kind":{"Insert":[5,64]}},{"key":23446, "field_id": 0, "kind":{"Insert":[86,59]}},{"key":17972, "field_id": 0, "kind":{"Insert":[58,137]}},{"key":21273, "field_id": 0, "kind":{"Insert":[121,132,81,147]}},{"key":28264, "field_id": 0, "kind":{"Insert":[36]}},{"key":46659, "field_id": 0, "kind":{"Insert":[]}}] -// "#; -// let operations: Vec> = serde_json::from_str(operations).unwrap(); -// let tempdir = TempDir::new().unwrap(); -// compare_with_trivial_database(Rc::new(tempdir), 2, 1, &operations); -// } -// } + #[test] + #[no_coverage] + fn fuzz() { + let tempdir = Rc::new(TempDir::new().unwrap()); + let tempdir_cloned = tempdir.clone(); + let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| { + compare_with_trivial_database(tempdir_cloned.clone(), operations) + }) + .default_mutator() + .serde_serializer() + .default_sensor_and_pool_with_custom_filter(|file, function| { + file == std::path::Path::new("milli/src/update/facet/incremental.rs") + && !function.contains("serde") + && !function.contains("tests::") + && !function.contains("fuzz::") + && !function.contains("display_bitmap") + }) + .arguments_from_cargo_fuzzcheck() + .launch(); + assert!(!result.found_test_failure); + } + + #[test] + #[no_coverage] + fn reproduce_bug1() { + let operations = r#" + [ + {"key":0, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[109]}}, + {"key":143, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[243]}}, + {"key":90, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[217]}}, + {"key":172, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[94]}}, + {"key":27, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[4]}}, + {"key":124, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, + {"key":123, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, + {"key":67, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[109]}}, + {"key":13, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, 
"kind":{"Insert":[0]}}, + {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[213]}}, + {"key":235, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, + {"key":251, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[50]}}, + {"key":218, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[164]}}, + {"key":166, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, + {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[61]}}, + {"key":183, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":250, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":50}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug2() { + let operations = r#" + [ + {"key":102, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[122]}}, + {"key":73, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[132]}}, + {"key":20, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[215]}}, + {"key":39, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[152]}}, + {"key":151, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[226]}}, + {"key":17, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[101]}}, + {"key":74, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":2, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[130]}}, + {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[180]}}, + {"key":83, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[250]}}, + {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":113, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[63]}}, + {"key":201, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":200, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, + {"key":93, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[98]}}, + {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, + {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":210}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + #[test] + #[no_coverage] + fn reproduce_bug3() { + let operations = r#" + [ + {"key":27488, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":64716, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[216]}}, + {"key":60886, "field_id": 
0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":59509, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[187,231]}}, + {"key":55057, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[37]}}, + {"key":45200, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":55056, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[37]}}, + {"key":63679, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":52155, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[74]}}, + {"key":20648, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[47,138,157]}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug4() { + let operations = r#"[ + {"key":63499, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[87]}}, + {"key":25374, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[14]}}, + {"key":64481, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Delete":87}}, + {"key":23038, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[173]}}, + {"key":14862, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[8]}}, + {"key":13145, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[5,64]}}, + {"key":23446, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[86,59]}}, + {"key":17972, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[58,137]}}, + {"key":21273, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[121,132,81,147]}}, + {"key":28264, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[36]}}, + {"key":46659, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[]}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug5() { + let input = r#" + [ + { + "key":3438, + "group_size":11, + "max_group_size":0, + "min_level_size":17, + "field_id":3, + "kind":{"Insert":[198]} + }, + + { + "key":47098, + "group_size":0, + "max_group_size":8, + "min_level_size":0, + "field_id":3, + "kind":{"Insert":[11]} + }, + { + "key":22453, + "group_size":0, + "max_group_size":0, + "min_level_size":0, + "field_id":3, + "kind":{"Insert":[145]} + }, + { + "key":14105, + "group_size":14, + "max_group_size":4, + "min_level_size":25, + "field_id":3, + "kind":{"Delete":11} + } + ] + "#; + let operations: Vec> = serde_json::from_str(input).unwrap(); + let tmpdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tmpdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug6() { + let input = r#" + [ + {"key":45720,"group_size":1,"max_group_size":4,"min_level_size":0,"field_id":0,"kind":{"Insert":[120]}}, + 
{"key":37463,"group_size":1,"max_group_size":4,"min_level_size":0,"field_id":0,"kind":{"Insert":[187]}}, + {"key":21512,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}}, + {"key":21511,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}}, + {"key":37737,"group_size":12,"max_group_size":0,"min_level_size":6,"field_id":0,"kind":{"Insert":[181]}}, + {"key":53042,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}} + ] + "#; + let operations: Vec> = serde_json::from_str(input).unwrap(); + let tmpdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tmpdir), &operations); + } +} diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 9263d3a6a..e7d14c788 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -145,6 +145,7 @@ impl<'i> FacetsUpdate<'i> { #[cfg(test)] pub(crate) mod tests { + use std::cell::Cell; use std::fmt::Display; use std::marker::PhantomData; use std::rc::Rc; @@ -170,9 +171,9 @@ pub(crate) mod tests { { pub env: Env, pub content: heed::Database, FacetGroupValueCodec>, - pub group_size: u8, - pub min_level_size: u8, - pub max_group_size: u8, + pub group_size: Cell, + pub min_level_size: Cell, + pub max_group_size: Cell, _tempdir: Rc, _phantom: PhantomData, } @@ -189,9 +190,9 @@ pub(crate) mod tests { max_group_size: u8, min_level_size: u8, ) -> FacetIndex { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 - let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 - let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + let group_size = std::cmp::min(16, std::cmp::max(group_size, 2)); // 2 <= x <= 16 + let max_group_size = std::cmp::min(16, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 16 + let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17 let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 10 * 100); @@ -202,13 +203,11 @@ pub(crate) mod tests { let content = env.open_database(None).unwrap().unwrap(); FacetIndex { - db: Database { - content, - group_size, - max_group_size, - min_level_size, - _tempdir: tempdir, - }, + content, + group_size: Cell::new(group_size), + max_group_size: Cell::new(max_group_size), + min_level_size: Cell::new(min_level_size), + _tempdir: tempdir, env, _phantom: PhantomData, } @@ -229,14 +228,32 @@ pub(crate) mod tests { FacetIndex { content, - group_size, - max_group_size, - min_level_size, + group_size: Cell::new(group_size), + max_group_size: Cell::new(max_group_size), + min_level_size: Cell::new(min_level_size), _tempdir: Rc::new(tempdir), env, _phantom: PhantomData, } } + + pub fn set_group_size(&self, group_size: u8) { + // 2 <= x <= 64 + self.group_size.set(std::cmp::min(64, std::cmp::max(group_size, 2))); + } + pub fn set_max_group_size(&self, max_group_size: u8) { + // 2*group_size <= x <= 128 + let max_group_size = std::cmp::max(4, std::cmp::min(128, max_group_size)); + self.max_group_size.set(max_group_size); + if self.group_size.get() < max_group_size / 2 { + self.group_size.set(max_group_size / 2); + } + } + pub fn set_min_level_size(&self, min_level_size: u8) { + // 1 <= x <= inf + self.min_level_size.set(std::cmp::max(1, min_level_size)); + } + pub fn insert<'a>( &self, wtxn: &'a mut RwTxn, @@ -246,9 +263,9 @@ pub(crate) mod tests 
{ ) { let update = FacetsUpdateIncrementalInner { db: self.content, - group_size: self.group_size, - min_level_size: self.min_level_size, - max_group_size: self.max_group_size, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); @@ -262,9 +279,9 @@ pub(crate) mod tests { ) { let update = FacetsUpdateIncrementalInner { db: self.content, - group_size: self.group_size, - min_level_size: self.min_level_size, - max_group_size: self.max_group_size, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.delete(wtxn, field_id, &key_bytes, value).unwrap(); @@ -296,8 +313,8 @@ pub(crate) mod tests { let update = FacetsUpdateBulkInner { db: self.content, new_data: Some(reader), - group_size: self.group_size, - min_level_size: self.min_level_size, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), }; update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); @@ -341,7 +358,7 @@ pub(crate) mod tests { FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() }; - assert!(value.size > 0 && value.size < self.max_group_size); + assert!(value.size > 0); let mut actual_size = 0; let mut values_below = RoaringBitmap::new(); From 3baa34d84214924854e297b8fbf9028d25822454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Sep 2022 13:10:45 +0200 Subject: [PATCH 37/58] Fix compiler errors/warnings --- milli/src/update/facet/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index e7d14c788..c5046784f 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -237,10 +237,12 @@ pub(crate) mod tests { } } + #[cfg(all(test, fuzzing))] pub fn set_group_size(&self, group_size: u8) { // 2 <= x <= 64 self.group_size.set(std::cmp::min(64, std::cmp::max(group_size, 2))); } + #[cfg(all(test, fuzzing))] pub fn set_max_group_size(&self, max_group_size: u8) { // 2*group_size <= x <= 128 let max_group_size = std::cmp::max(4, std::cmp::min(128, max_group_size)); @@ -249,6 +251,7 @@ pub(crate) mod tests { self.group_size.set(max_group_size / 2); } } + #[cfg(all(test, fuzzing))] pub fn set_min_level_size(&self, min_level_size: u8) { // 1 <= x <= inf self.min_level_size.set(std::cmp::max(1, min_level_size)); From cb8442a119c7bb8e7acaeeb433cf7124597d8b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Sep 2022 13:28:17 +0200 Subject: [PATCH 38/58] Further unify facet databases of f64s and strings --- ...4_codec.rs => field_doc_id_facet_codec.rs} | 30 +++--- .../facet/field_doc_id_facet_string_codec.rs | 50 ---------- milli/src/heed_codec/facet/mod.rs | 12 ++- milli/src/search/mod.rs | 2 +- milli/src/update/delete_documents.rs | 98 +++++++------------ 5 files changed, 63 insertions(+), 129 deletions(-) rename milli/src/heed_codec/facet/{field_doc_id_facet_f64_codec.rs => field_doc_id_facet_codec.rs} (54%) delete mode 100644 milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs similarity index 54% rename from milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs rename to 
milli/src/heed_codec/facet/field_doc_id_facet_codec.rs index 22159601c..7c636e98a 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs @@ -1,13 +1,15 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::f64_into_bytes; use crate::{try_split_array_at, DocumentId, FieldId}; +use heed::{BytesDecode, BytesEncode}; +use std::borrow::Cow; +use std::marker::PhantomData; -pub struct FieldDocIdFacetF64Codec; +pub struct FieldDocIdFacetCodec(PhantomData); -impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { - type DItem = (FieldId, DocumentId, f64); +impl<'a, C> BytesDecode<'a> for FieldDocIdFacetCodec +where + C: BytesDecode<'a>, +{ + type DItem = (FieldId, DocumentId, C::DItem); fn bytes_decode(bytes: &'a [u8]) -> Option { let (field_id_bytes, bytes) = try_split_array_at(bytes)?; @@ -16,22 +18,24 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { let (document_id_bytes, bytes) = try_split_array_at(bytes)?; let document_id = u32::from_be_bytes(document_id_bytes); - let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?; + let value = C::bytes_decode(&bytes[8..])?; Some((field_id, document_id, value)) } } -impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec { - type EItem = (FieldId, DocumentId, f64); +impl<'a, C> BytesEncode<'a> for FieldDocIdFacetCodec +where + C: BytesEncode<'a>, +{ + type EItem = (FieldId, DocumentId, C::EItem); - fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option> { let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8); bytes.extend_from_slice(&field_id.to_be_bytes()); bytes.extend_from_slice(&document_id.to_be_bytes()); - let value_bytes = f64_into_bytes(*value)?; + let value_bytes = C::bytes_encode(value)?; bytes.extend_from_slice(&value_bytes); - bytes.extend_from_slice(&value.to_be_bytes()); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs deleted file mode 100644 index 178bb21c1..000000000 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, DocumentId, FieldId}; - -pub struct FieldDocIdFacetStringCodec; - -impl FieldDocIdFacetStringCodec { - pub fn serialize_into( - field_id: FieldId, - document_id: DocumentId, - normalized_value: &str, - out: &mut Vec, - ) { - out.reserve(2 + 4 + normalized_value.len()); - out.extend_from_slice(&field_id.to_be_bytes()); - out.extend_from_slice(&document_id.to_be_bytes()); - out.extend_from_slice(normalized_value.as_bytes()); - } -} - -impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { - type DItem = (FieldId, DocumentId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - - let (document_id_bytes, bytes) = try_split_array_at(bytes)?; - let document_id = u32::from_be_bytes(document_id_bytes); - - let normalized_value = str::from_utf8(bytes).ok()?; - Some((field_id, document_id, normalized_value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { - type EItem = (FieldId, DocumentId, &'a str); - - fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option> { - let mut bytes = 
Vec::new(); - FieldDocIdFacetStringCodec::serialize_into( - *field_id, - *document_id, - normalized_value, - &mut bytes, - ); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 2e9f0b212..8db8b7df1 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,5 +1,4 @@ -mod field_doc_id_facet_f64_codec; -mod field_doc_id_facet_string_codec; +mod field_doc_id_facet_codec; mod ordered_f64_codec; mod str_ref; @@ -7,16 +6,19 @@ use std::borrow::Cow; use std::convert::TryFrom; use std::marker::PhantomData; -use heed::types::OwnedType; +use heed::types::{DecodeIgnore, OwnedType}; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; -pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; -pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec; pub use self::ordered_f64_codec::OrderedF64Codec; pub use self::str_ref::StrRefCodec; use crate::{CboRoaringBitmapCodec, BEU16}; +pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec; +pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec; +pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec; + pub type FieldIdCodec = OwnedType; /// Tries to split a slice in half at the given middle point, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index e6651737c..f62a37c1b 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -15,7 +15,7 @@ use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, /* FacetNumberIter,*/ Filter, DEFAULT_VALUES_PER_FACET,}; +pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 14ef5fd6a..a56a61026 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,7 +1,7 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; -use heed::types::{ByteSlice, Str}; +use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -11,11 +11,13 @@ use time::OffsetDateTime; use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetIgnoreCodec, +}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, + ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, }; @@ -187,10 +189,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_position_docids, word_prefix_position_docids, facet_id_f64_docids: _, - facet_id_exists_docids, facet_id_string_docids: _, - field_id_docid_facet_f64s, - field_id_docid_facet_strings, + field_id_docid_facet_f64s: _, + field_id_docid_facet_strings: _, + facet_id_exists_docids, documents, } = self.index; @@ -449,6 +451,21 @@ impl<'t, 'u, 'i> 
DeleteDocuments<'t, 'u, 'i> { fields_ids_map.clone(), facet_type, )?; + for field_id in self.index.faceted_fields_ids(self.wtxn)? { + // Remove docids from the number faceted documents ids + let mut docids = + self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?; + docids -= &self.to_delete_docids; + self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; + + remove_docids_from_field_id_docid_facet_value( + &self.index, + self.wtxn, + facet_type, + field_id, + &self.to_delete_docids, + )?; + } } // We delete the documents ids that are under the facet field id values. @@ -458,47 +475,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { &self.to_delete_docids, )?; - // Remove the documents ids from the faceted documents ids. - for field_id in self.index.faceted_fields_ids(self.wtxn)? { - // Remove docids from the number faceted documents ids - let mut docids = - self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::Number)?; - docids -= &self.to_delete_docids; - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::Number, - &docids, - )?; - - remove_docids_from_field_id_docid_facet_value( - self.wtxn, - field_id_docid_facet_f64s, - field_id, - &self.to_delete_docids, - |(_fid, docid, _value)| docid, - )?; - - // Remove docids from the string faceted documents ids - let mut docids = - self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::String)?; - docids -= &self.to_delete_docids; - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::String, - &docids, - )?; - - remove_docids_from_field_id_docid_facet_value( - self.wtxn, - field_id_docid_facet_strings, - field_id, - &self.to_delete_docids, - |(_fid, docid, _value)| docid, - )?; - } - Ok(DocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), @@ -564,26 +540,28 @@ fn remove_from_word_docids( Ok(()) } -fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( +fn remove_docids_from_field_id_docid_facet_value<'i, 'a>( + index: &'i Index, wtxn: &'a mut heed::RwTxn, - db: &heed::Database, + facet_type: FacetType, field_id: FieldId, to_remove: &RoaringBitmap, - convert: F, -) -> heed::Result<()> -where - C: heed::BytesDecode<'a, DItem = K>, - DC: heed::BytesDecode<'a, DItem = V>, - F: Fn(K) -> DocumentId, -{ +) -> heed::Result<()> { + let db = match facet_type { + FacetType::String => { + index.field_id_docid_facet_strings.remap_types::() + } + FacetType::Number => { + index.field_id_docid_facet_f64s.remap_types::() + } + }; let mut iter = db - .remap_key_type::() .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + .remap_key_type::(); while let Some(result) = iter.next() { - let (key, _) = result?; - if to_remove.contains(convert(key)) { + let ((_, docid, _), _) = result?; + if to_remove.contains(docid) { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? 
}; } From 51961e10645135d0f7cfc76db9bc98d8ec9a1dc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Sep 2022 13:41:01 +0200 Subject: [PATCH 39/58] Polish some details --- .../facet/field_doc_id_facet_codec.rs | 6 ++-- milli/src/heed_codec/facet/mod.rs | 4 +++ milli/src/heed_codec/facet/str_ref.rs | 2 ++ .../extract/extract_facet_string_docids.rs | 5 +--- .../helpers/merge_functions.rs | 28 ------------------- 5 files changed, 11 insertions(+), 34 deletions(-) diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs index 7c636e98a..4e18a0145 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs @@ -1,8 +1,10 @@ -use crate::{try_split_array_at, DocumentId, FieldId}; -use heed::{BytesDecode, BytesEncode}; use std::borrow::Cow; use std::marker::PhantomData; +use heed::{BytesDecode, BytesEncode}; + +use crate::{try_split_array_at, DocumentId, FieldId}; + pub struct FieldDocIdFacetCodec(PhantomData); impl<'a, C> BytesDecode<'a> for FieldDocIdFacetCodec diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 8db8b7df1..35ec925dc 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -40,6 +40,8 @@ pub struct FacetGroupKey { pub left_bound: T, } +/// The value in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`] +/// databases. #[derive(Debug)] pub struct FacetGroupValue { pub size: u8, @@ -102,6 +104,8 @@ impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { } } +/// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated +/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure. pub struct ByteSliceRef; impl<'a> BytesEncode<'a> for ByteSliceRef { diff --git a/milli/src/heed_codec/facet/str_ref.rs b/milli/src/heed_codec/facet/str_ref.rs index 80a51c803..36e702627 100644 --- a/milli/src/heed_codec/facet/str_ref.rs +++ b/milli/src/heed_codec/facet/str_ref.rs @@ -2,6 +2,8 @@ use std::borrow::Cow; use heed::{BytesDecode, BytesEncode}; +/// A codec for values of type `&str`. Unlike `Str`, its `EItem` and `DItem` associated +/// types are equivalent (= `&'a str`) and these values can reside within another structure. 
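+///
+/// As a minimal illustrative sketch (hypothetical values; `FacetGroupKey` and
+/// `FacetGroupKeyCodec` come from the parent `facet` module), the facet-string
+/// extraction code nests this codec inside the group-key codec, which delegates the
+/// encoding of the `left_bound` field to `StrRefCodec`:
+///
+/// ```ignore
+/// let key = FacetGroupKey { field_id: 0, level: 0, left_bound: "hello" };
+/// // `left_bound` is a `&str`, so its bytes are produced by `StrRefCodec`.
+/// let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
+/// ```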
pub struct StrRefCodec; impl<'a> BytesEncode<'a> for StrRefCodec { type EItem = &'a str; diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index a7b027ce3..bf523cbb3 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -33,10 +33,6 @@ pub fn extract_facet_string_docids( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); - // document_id_bytes is a big-endian u32 - // merge_cbo_roaring_bitmap works with native endian u32s - // that is a problem, I think - let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); @@ -45,6 +41,7 @@ pub fn extract_facet_string_docids( let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + // document id is encoded in native-endian because of the CBO roaring bitmap codec facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index cef27ab30..37af7ab6a 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -5,7 +5,6 @@ use std::result::Result as StdResult; use roaring::RoaringBitmap; use super::read_u32_ne_bytes; -// use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::Result; @@ -49,33 +48,6 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul } } -// pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( -// _key: &[u8], -// values: &[Cow<'a, [u8]>], -// ) -> Result> { -// if values.len() == 1 { -// Ok(values[0].clone()) -// } else { -// let original = decode_prefix_string(&values[0]).unwrap().0; -// let merged_bitmaps = values -// .iter() -// .map(AsRef::as_ref) -// .map(decode_prefix_string) -// .map(Option::unwrap) -// .map(|(_, bitmap_bytes)| bitmap_bytes) -// .map(RoaringBitmap::deserialize_from) -// .map(StdResult::unwrap) -// .reduce(|a, b| a | b) -// .unwrap(); - -// let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); -// let mut buffer = Vec::with_capacity(cap); -// encode_prefix_string(original, &mut buffer)?; -// merged_bitmaps.serialize_into(&mut buffer)?; -// Ok(Cow::Owned(buffer)) -// } -// } - pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { Ok(values[0].clone()) } From 1ecd3bb8227b1d389e8f71d2d7140ee6c54fac8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 11:02:30 +0200 Subject: [PATCH 40/58] Fix bug in FieldDocIdFacetCodec --- milli/src/heed_codec/facet/field_doc_id_facet_codec.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs index 4e18a0145..cc9919ad2 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs @@ -20,7 +20,7 @@ where let (document_id_bytes, bytes) = try_split_array_at(bytes)?; let document_id = 
u32::from_be_bytes(document_id_bytes); - let value = C::bytes_decode(&bytes[8..])?; + let value = C::bytes_decode(bytes)?; Some((field_id, document_id, value)) } @@ -33,10 +33,11 @@ where type EItem = (FieldId, DocumentId, C::EItem); fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.extend_from_slice(&document_id.to_be_bytes()); + let mut bytes = Vec::with_capacity(32); + bytes.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes + bytes.extend_from_slice(&document_id.to_be_bytes()); // 4 bytes let value_bytes = C::bytes_encode(value)?; + // variable length, if f64 -> 16 bytes, if string -> large, potentially bytes.extend_from_slice(&value_bytes); Some(Cow::Owned(bytes)) } From a2270b7432d2921603df502f6befc88d58f75118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 11:07:05 +0200 Subject: [PATCH 41/58] Change fuzzcheck dependency to point to git repository --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 2f881fccb..49988da0b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -57,7 +57,7 @@ md5 = "0.7.0" rand = {version = "0.8.5", features = ["small_rng"] } [target.'cfg(fuzzing)'.dev-dependencies] -fuzzcheck = { path = "../../fuzzcheck-rs/fuzzcheck" } +fuzzcheck = { git = "https://github.com/loiclec/fuzzcheck-rs", branch = "main" } [features] default = [ "charabia/default" ] From d0109627b901178182f0ec0102d365080c683618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 14:39:11 +0200 Subject: [PATCH 42/58] Fix a bug in facet_range_search and add documentation --- milli/src/search/facet/facet_range_search.rs | 122 ++++++++++++++---- milli/src/search/facet/mod.rs | 31 +++++ .../excluded_2.hash.snap | 4 + .../excluded_3.hash.snap | 4 + .../included_2.hash.snap | 4 + .../included_3.hash.snap | 4 + .../excluded_2.hash.snap | 4 + .../excluded_3.hash.snap | 4 + .../included_2.hash.snap | 4 + .../included_3.hash.snap | 4 + .../filter_range_pinch/excluded_2.hash.snap | 4 + .../filter_range_pinch/excluded_3.hash.snap | 4 + .../filter_range_pinch/included_2.hash.snap | 4 + .../filter_range_pinch/included_3.hash.snap | 4 + 14 files changed, 173 insertions(+), 28 deletions(-) create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap create mode 100644 
milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 8934873b7..a7b4674f1 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -9,6 +9,8 @@ use crate::heed_codec::facet::{ }; use crate::Result; +/// Find all the document ids for which the given field contains a value contained within +/// the two bounds. pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( rtxn: &'t heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, @@ -24,11 +26,11 @@ where let inner; let left = match left { Bound::Included(left) => { - inner = BoundCodec::bytes_encode(left).unwrap(); + inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; Bound::Included(inner.as_ref()) } Bound::Excluded(left) => { - inner = BoundCodec::bytes_encode(left).unwrap(); + inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; Bound::Excluded(inner.as_ref()) } Bound::Unbounded => Bound::Unbounded, @@ -36,11 +38,11 @@ where let inner; let right = match right { Bound::Included(right) => { - inner = BoundCodec::bytes_encode(right).unwrap(); + inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; Bound::Included(inner.as_ref()) } Bound::Excluded(right) => { - inner = BoundCodec::bytes_encode(right).unwrap(); + inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; Bound::Excluded(inner.as_ref()) } Bound::Unbounded => Bound::Unbounded, @@ -49,9 +51,11 @@ where let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); - f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; + if let Some(starting_left_bound) = get_first_facet_value::(rtxn, db, field_id)? { + let rightmost_bound = + Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded + let group_size = usize::MAX; + f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) } else { return Ok(()); @@ -107,7 +111,25 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { Ok(()) } - /// Recursive part of the algorithm for level > 0 + /// Recursive part of the algorithm for level > 0. + /// + /// It works by visiting a slice of a level and checking whether the range asscociated + /// with each visited element is contained within the bounds. + /// + /// 1. So long as the element's range is less than the left bound, we do nothing and keep iterating + /// 2. If the element's range is fully contained by the bounds, then all of its docids are added to + /// the roaring bitmap. + /// 3. If the element's range merely intersects the bounds, then we call the algorithm recursively + /// on the children of the element from the level below. + /// 4. If the element's range is greater than the right bound, we do nothing and stop iterating. 
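+    ///
+    /// As a small worked example (with made-up bounds): suppose the visited slice of the
+    /// level contains four elements covering the ranges 0..2, 2..4, 4..6 and 6..8, and the
+    /// query bounds are `3..=7`. The first element is skipped (case 1), the second is visited
+    /// recursively because it only intersects the bounds (case 3), the docids of the third are
+    /// added directly because it is fully contained (case 2), and the last one is visited
+    /// recursively as well since its range extends past the right bound (case 3), after which
+    /// iteration stops.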
+ /// Note that the right bound is found through either the `left_bound` of the *next* element, + /// or from the `rightmost_bound` argument + /// + /// ## Arguments + /// - `level`: the level being visited + /// - `starting_left_bound`: the left_bound of the first element to visit + /// - `rightmost_bound`: the right bound of the last element that should be visited + /// - `group_size`: the number of elements that should be visited fn run( &mut self, level: u8, @@ -123,13 +145,14 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound }; let mut iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); + // We iterate over the range while keeping in memory the previous value let (mut previous_key, mut previous_value) = iter.next().unwrap()?; for el in iter { let (next_key, next_value) = el?; - // the right of the iter range is unbounded, so we need to make sure that we are not iterating - // on the next field id + // the right of the iter range is potentially unbounded (e.g. if `group_size` is usize::MAX), + // so we need to make sure that we are not iterating on the next field id if next_key.field_id != self.field_id { - return Ok(()); + break; } // now, do we skip, stop, or visit? let should_skip = { @@ -176,6 +199,8 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { previous_value = next_value; continue; } + // from here, we should visit the children of the previous element and + // call the function recursively let level = level - 1; let starting_left_bound = previous_key.left_bound; @@ -187,7 +212,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { previous_key = next_key; previous_value = next_value; } - // previous_key/previous_value are the last element + // previous_key/previous_value are the last element's key/value // now, do we skip, stop, or visit? let should_skip = { @@ -224,18 +249,41 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { Bound::Unbounded => true, }; let right_condition = match (self.right, rightmost_bound) { - (Bound::Included(right), Bound::Included(rightmost)) => rightmost <= right, - (Bound::Included(right), Bound::Excluded(rightmost)) => rightmost < right, - // e.g. x < 8 and rightmost is <= y - // condition met if rightmost < 8 - (Bound::Excluded(right), Bound::Included(rightmost)) => rightmost < right, - // e.g. x < 8 and rightmost is < y - // condition met only if y <= 8? - (Bound::Excluded(right), Bound::Excluded(rightmost)) => rightmost <= right, - // e.g. x < inf. , so yes we take the whole thing - (Bound::Unbounded, _) => true, - // e.g. x < 7 , righmost is inf - (_, Bound::Unbounded) => false, // panic? 
+ (Bound::Included(right), Bound::Included(rightmost)) => { + // we need to stay within the bound ..=right + // the element's range goes to ..=righmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Included(right), Bound::Excluded(rightmost)) => { + // we need to stay within the bound ..=right + // the element's range goes to ..righmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Excluded(right), Bound::Included(rightmost)) => { + // we need to stay within the bound ..right + // the element's range goes to ..=righmost + // so the element fits entirely within the bound if rightmost < right + rightmost < right + } + (Bound::Excluded(right), Bound::Excluded(rightmost)) => { + // we need to stay within the bound ..right + // the element's range goes to ..righmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Unbounded, _) => { + // we need to stay within the bound ..inf + // so the element always fits entirely within the bound + true + } + (_, Bound::Unbounded) => { + // we need to stay within a finite bound + // but the element's range goes to ..inf + // so the element never fits entirely within the bound + false + } }; left_condition && right_condition }; @@ -262,7 +310,10 @@ mod tests { use super::find_docids_of_facet_within_bounds; use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; - use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; + use crate::search::facet::tests::{ + get_random_looking_index, get_random_looking_index_with_multiple_field_ids, + get_simple_index, get_simple_index_with_multiple_field_ids, + }; use crate::snapshot_tests::display_bitmap; #[test] @@ -272,7 +323,12 @@ mod tests { } #[test] fn filter_range_increasing() { - let indexes = [get_simple_index(), get_random_looking_index()]; + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let mut results = String::new(); @@ -316,7 +372,12 @@ mod tests { } #[test] fn filter_range_decreasing() { - let indexes = [get_simple_index(), get_random_looking_index()]; + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); @@ -367,7 +428,12 @@ mod tests { } #[test] fn filter_range_pinch() { - let indexes = [get_simple_index(), get_random_looking_index()]; + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index be04fbd7f..c854b546d 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -119,4 +119,35 @@ pub(crate) mod tests { txn.commit().unwrap(); index } + pub fn get_simple_index_with_multiple_field_ids() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for fid in 0..2 { + for i in 0..256u16 { + let mut 
bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, fid, &(i as f64), &bitmap); + } + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_index_with_multiple_field_ids() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); + for fid in 0..2 { + for (_i, &key) in keys.iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, fid, &(key as f64), &bitmap); + } + } + txn.commit().unwrap(); + index + } } diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap new file mode 100644 index 000000000..7bf13e05c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +fcedc563a82c1c61f50174a5f3f982b6 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap new file mode 100644 index 000000000..100b928d7 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +6cc26e77fc6bd9145deedf14cf422b03 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap new file mode 100644 index 000000000..be0b06ded --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +57d35cfa419a19a1a1f8d7c8ef096e0f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap new file mode 100644 index 000000000..93fe17b0c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3dbe0547b42759795e9b16989df72cee diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap new file mode 100644 index 000000000..db11ce952 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +c1c7a0bb91d53d33724583b6d4a99f16 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap new file mode 100644 index 000000000..f5a81c121 --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +12213d3f1047a0c3d08e4670a7d688e7 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap new file mode 100644 index 000000000..fa7242056 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ca59f20e043a4d52c49e15b10adf96bb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap new file mode 100644 index 000000000..a7611d8c1 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +cb69e0fe10fb299bafe77514204379cb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap new file mode 100644 index 000000000..07664807e --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3456db9a1bb94c33c1e9f656184ee711 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap new file mode 100644 index 000000000..ef530faa1 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2127cd818b457e0611e0c8e1a871602a diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap new file mode 100644 index 000000000..db8a314b0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +b976551ceff412bfb2ec9bfbda320bbb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap new file mode 100644 index 000000000..2b82e07e8 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +7620ca1a96882c7147d3fd996570f9b3 From 0ade6998735e943dcaba9844814556d858a93319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 14:39:43 +0200 Subject: [PATCH 43/58] Don't crash when failing to decode using StrRef codec --- milli/src/heed_codec/facet/str_ref.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/heed_codec/facet/str_ref.rs b/milli/src/heed_codec/facet/str_ref.rs 
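// A minimal standalone sketch of the behaviour this one-line change guards against,
// assuming only the standard library; the real codec implements heed's `BytesDecode`
// for `StrRefCodec`, and the helper name below is illustrative, not the milli API.
fn decode_str_fallible(bytes: &[u8]) -> Option<&str> {
    // `from_utf8` fails on invalid UTF-8; `.ok()` turns that failure into `None`,
    // whereas `.unwrap()` would abort the whole read with a panic.
    std::str::from_utf8(bytes).ok()
}

fn main() {
    assert_eq!(decode_str_fallible(b"abc"), Some("abc"));
    assert_eq!(decode_str_fallible(&[0xff, 0xfe]), None); // previously: a crash
}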
index 36e702627..ced5cc65e 100644 --- a/milli/src/heed_codec/facet/str_ref.rs +++ b/milli/src/heed_codec/facet/str_ref.rs @@ -16,7 +16,7 @@ impl<'a> BytesDecode<'a> for StrRefCodec { type DItem = &'a str; fn bytes_decode(bytes: &'a [u8]) -> Option { - let s = std::str::from_utf8(bytes).unwrap(); + let s = std::str::from_utf8(bytes).ok()?; Some(s) } } From 1165ba217197f7abae6ee4e9d9b159bc09cdf275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 15:53:39 +0200 Subject: [PATCH 44/58] Make facet deletion incremental --- milli/src/update/delete_documents.rs | 92 ++++++++------------------- milli/src/update/facet/bulk.rs | 19 ++++-- milli/src/update/facet/delete.rs | 92 +++++++++++++++++++++++++++ milli/src/update/facet/incremental.rs | 48 +++++++------- milli/src/update/facet/mod.rs | 31 +++++++-- 5 files changed, 182 insertions(+), 100 deletions(-) create mode 100644 milli/src/update/facet/delete.rs diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index a56a61026..de2f4480c 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,4 +1,5 @@ use std::collections::btree_map::Entry; +use std::collections::{HashMap, HashSet}; use fst::IntoStreamer; use heed::types::{ByteSlice, DecodeIgnore, Str}; @@ -8,17 +9,16 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; -use super::{ClearDocuments, FacetsUpdateBulk}; +use super::facet::delete::FacetsDelete; +use super::ClearDocuments; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetIgnoreCodec, -}; +use crate::heed_codec::facet::FieldDocIdFacetCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, - RoaringBitmapCodec, SmallString32, BEU32, + ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, + SmallString32, BEU32, }; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -444,13 +444,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } for facet_type in [FacetType::Number, FacetType::String] { - remove_docids_from_facet_id_docids( - self.wtxn, - self.index, - &self.to_delete_docids, - fields_ids_map.clone(), - facet_type, - )?; + let mut affected_facet_values = HashMap::new(); for field_id in self.index.faceted_fields_ids(self.wtxn)? { // Remove docids from the number faceted documents ids let mut docids = @@ -458,14 +452,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { docids -= &self.to_delete_docids; self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; - remove_docids_from_field_id_docid_facet_value( + let facet_values = remove_docids_from_field_id_docid_facet_value( &self.index, self.wtxn, facet_type, field_id, &self.to_delete_docids, )?; + if !facet_values.is_empty() { + affected_facet_values.insert(field_id, facet_values); + } } + FacetsDelete::new( + self.index, + facet_type, + affected_facet_values, + &self.to_delete_docids, + ) + .execute(self.wtxn)?; } // We delete the documents ids that are under the facet field id values. 
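// A minimal sketch of the strategy this patch introduces for repairing the facet
// databases after a document deletion, assuming the `database.len() / 50` threshold
// used by `FacetsDelete::execute` further below; the names and types here are
// illustrative, not the actual milli API.
#[derive(Debug, PartialEq)]
enum DeletionStrategy {
    // Rewrite the upper levels of the facet tree from level 0 in one bulk pass.
    Bulk,
    // Remove the docids key by key, rebalancing groups as needed.
    Incremental,
}

fn choose_deletion_strategy(affected_facet_values: u64, database_len: u64) -> DeletionStrategy {
    // When a large share (roughly 2% or more) of the level-0 entries are touched,
    // rebuilding the levels in bulk is cheaper than many incremental deletions.
    if affected_facet_values >= database_len / 50 {
        DeletionStrategy::Bulk
    } else {
        DeletionStrategy::Incremental
    }
}

fn main() {
    assert_eq!(choose_deletion_strategy(100, 1_000), DeletionStrategy::Bulk);
    assert_eq!(choose_deletion_strategy(3, 1_000), DeletionStrategy::Incremental);
}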
@@ -546,7 +550,7 @@ fn remove_docids_from_field_id_docid_facet_value<'i, 'a>( facet_type: FacetType, field_id: FieldId, to_remove: &RoaringBitmap, -) -> heed::Result<()> { +) -> heed::Result>> { let db = match facet_type { FacetType::String => { index.field_id_docid_facet_strings.remap_types::() @@ -555,19 +559,23 @@ fn remove_docids_from_field_id_docid_facet_value<'i, 'a>( index.field_id_docid_facet_f64s.remap_types::() } }; + let mut all_affected_facet_values = HashSet::default(); let mut iter = db .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + .remap_key_type::>(); while let Some(result) = iter.next() { - let ((_, docid, _), _) = result?; + let ((_, docid, facet_value), _) = result?; if to_remove.contains(docid) { + if !all_affected_facet_values.contains(facet_value) { + all_affected_facet_values.insert(facet_value.to_owned()); + } // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; } } - Ok(()) + Ok(all_affected_facet_values) } fn remove_docids_from_facet_id_exists_docids<'a, C>( @@ -595,54 +603,6 @@ where Ok(()) } -fn remove_docids_from_facet_id_docids<'a>( - wtxn: &'a mut heed::RwTxn, - index: &Index, - to_remove: &RoaringBitmap, - fields_ids_map: FieldsIdsMap, - facet_type: FacetType, -) -> Result<()> { - let db = match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }; - let mut modified = false; - for field_id in fields_ids_map.ids() { - let mut level0_prefix = vec![]; - level0_prefix.extend_from_slice(&field_id.to_be_bytes()); - level0_prefix.push(0); - let mut iter = db - .as_polymorph() - .prefix_iter_mut::<_, ByteSlice, FacetGroupValueCodec>(wtxn, &level0_prefix)?; - - while let Some(result) = iter.next() { - let (bytes, mut value) = result?; - let previous_len = value.bitmap.len(); - value.bitmap -= to_remove; - if value.bitmap.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - modified = true; - } else if value.bitmap.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &value)? 
}; - modified = true; - } - } - } - if !modified { - return Ok(()); - } - let builder = FacetsUpdateBulk::new_not_updating_level_0(index, facet_type); - builder.execute(wtxn)?; - - Ok(()) -} #[cfg(test)] mod tests { diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index e82af5d66..d3db0a0fa 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -29,6 +29,7 @@ pub struct FacetsUpdateBulk<'i> { group_size: u8, min_level_size: u8, facet_type: FacetType, + field_ids: Vec, // None if level 0 does not need to be updated new_data: Option>, } @@ -36,20 +37,30 @@ pub struct FacetsUpdateBulk<'i> { impl<'i> FacetsUpdateBulk<'i> { pub fn new( index: &'i Index, + field_ids: Vec, facet_type: FacetType, new_data: grenad::Reader, group_size: u8, min_level_size: u8, ) -> FacetsUpdateBulk<'i> { - FacetsUpdateBulk { index, group_size, min_level_size, facet_type, new_data: Some(new_data) } + FacetsUpdateBulk { + index, + field_ids, + group_size, + min_level_size, + facet_type, + new_data: Some(new_data), + } } pub fn new_not_updating_level_0( index: &'i Index, + field_ids: Vec, facet_type: FacetType, ) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, + field_ids, group_size: FACET_GROUP_SIZE, min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, @@ -61,7 +72,7 @@ impl<'i> FacetsUpdateBulk<'i> { pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - let Self { index, group_size, min_level_size, facet_type, new_data } = self; + let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; let db = match facet_type { FacetType::String => { @@ -76,8 +87,6 @@ impl<'i> FacetsUpdateBulk<'i> { let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; - let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); - inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; Ok(()) @@ -405,7 +414,7 @@ mod tests { index.verify_structure_validity(&wtxn, 1); // delete all the elements for the facet id 0 for i in 0..100u32 { - index.delete(&mut wtxn, 0, &(i as f64), i); + index.delete_single_docid(&mut wtxn, 0, &(i as f64), i); } index.verify_structure_validity(&wtxn, 0); index.verify_structure_validity(&wtxn, 1); diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs new file mode 100644 index 000000000..efe1d800a --- /dev/null +++ b/milli/src/update/facet/delete.rs @@ -0,0 +1,92 @@ +use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use crate::{ + facet::FacetType, + heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, + update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}, + FieldId, Index, Result, +}; +use heed::RwTxn; +use roaring::RoaringBitmap; +use std::collections::{HashMap, HashSet}; + +pub struct FacetsDelete<'i, 'b> { + index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, + facet_type: FacetType, + affected_facet_values: HashMap>>, + docids_to_delete: &'b RoaringBitmap, + group_size: u8, + max_group_size: u8, + min_level_size: u8, +} +impl<'i, 'b> FacetsDelete<'i, 'b> { + pub fn new( + index: &'i Index, + facet_type: FacetType, + affected_facet_values: HashMap>>, + docids_to_delete: &'b RoaringBitmap, + ) -> Self { + let database = match facet_type { + FacetType::String => { + 
index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; + Self { + index, + database, + facet_type, + affected_facet_values, + docids_to_delete, + group_size: FACET_GROUP_SIZE, + max_group_size: FACET_MAX_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, + } + } + + pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { + for (field_id, affected_facet_values) in self.affected_facet_values { + if affected_facet_values.len() >= (self.database.len(wtxn)? / 50) { + // Bulk delete + let mut modified = false; + + for facet_value in affected_facet_values { + let key = + FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() }; + let mut old = self.database.get(wtxn, &key)?.unwrap(); + let previous_len = old.bitmap.len(); + old.bitmap -= self.docids_to_delete; + if old.bitmap.is_empty() { + modified = true; + self.database.delete(wtxn, &key)?; + } else if old.bitmap.len() != previous_len { + modified = true; + self.database.put(wtxn, &key, &old)?; + } + } + if modified { + let builder = FacetsUpdateBulk::new_not_updating_level_0( + self.index, + vec![field_id], + self.facet_type, + ); + builder.execute(wtxn)?; + } + } else { + // Incremental + let inc = FacetsUpdateIncrementalInner { + db: self.database, + group_size: self.group_size, + min_level_size: self.min_level_size, + max_group_size: self.max_group_size, + }; + for facet_value in affected_facet_values { + inc.delete(wtxn, field_id, facet_value.as_slice(), &self.docids_to_delete)?; + } + } + } + Ok(()) + } +} diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index c2115aee5..895713d43 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -485,20 +485,20 @@ impl FacetsUpdateIncrementalInner { field_id: u16, level: u8, facet_value: &[u8], - docid: u32, + docids: &RoaringBitmap, ) -> Result { if level == 0 { - return self.delete_in_level_0(txn, field_id, facet_value, docid); + return self.delete_in_level_0(txn, field_id, facet_value, docids); } let (deletion_key, mut bitmap) = self.find_insertion_key_value(field_id, level, facet_value, txn)?; - let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docid)?; + let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?; let mut decrease_size = false; let next_key = match result { DeletionResult::InPlace => { - bitmap.bitmap.remove(docid); + bitmap.bitmap -= docids; self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; return Ok(DeletionResult::InPlace); } @@ -527,7 +527,7 @@ impl FacetsUpdateIncrementalInner { if reduced_range { updated_deletion_key.left_bound = next_key.clone().unwrap(); } - updated_value.bitmap.remove(docid); + updated_value.bitmap -= docids; let _ = self.db.delete(txn, &deletion_key.as_ref())?; self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; if reduced_range { @@ -543,11 +543,11 @@ impl FacetsUpdateIncrementalInner { txn: &'t mut RwTxn, field_id: u16, facet_value: &[u8], - docid: u32, + docids: &RoaringBitmap, ) -> Result { let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; - bitmap.remove(docid); + bitmap -= docids; if bitmap.is_empty() { let mut next_key = None; @@ -571,7 +571,7 @@ impl FacetsUpdateIncrementalInner { txn: &'t mut RwTxn, field_id: u16, facet_value: &[u8], - docid: u32, + docids: &RoaringBitmap, ) -> Result<()> { if self .db @@ 
-584,7 +584,7 @@ impl FacetsUpdateIncrementalInner { let highest_level = get_highest_level(&txn, self.db, field_id)?; let result = - self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docid)?; + self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; match result { DeletionResult::InPlace => return Ok(()), DeletionResult::Reduce { .. } => return Ok(()), @@ -807,7 +807,7 @@ mod tests { for i in (200..256).into_iter().rev() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -816,7 +816,7 @@ mod tests { for i in (150..200).into_iter().rev() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -824,7 +824,7 @@ mod tests { let mut txn = index.env.write_txn().unwrap(); for i in (100..150).into_iter().rev() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -832,14 +832,14 @@ mod tests { let mut txn = index.env.write_txn().unwrap(); for i in (17..100).into_iter().rev() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); milli_snap!(format!("{index}"), 17); let mut txn = index.env.write_txn().unwrap(); for i in (15..17).into_iter().rev() { - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -847,7 +847,7 @@ mod tests { let mut txn = index.env.write_txn().unwrap(); for i in (0..15).into_iter().rev() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -867,7 +867,7 @@ mod tests { } for i in 0..128 { - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -875,7 +875,7 @@ mod tests { let mut txn = index.env.write_txn().unwrap(); for i in 128..216 { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -883,7 +883,7 @@ mod tests { let mut txn = index.env.write_txn().unwrap(); for i in 216..256 { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -908,7 +908,7 @@ mod tests { for i in 0..128 { let key = keys[i]; index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(key as f64), key as u32); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -917,7 +917,7 @@ mod tests { for i 
in 128..216 { let key = keys[i]; index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(key as f64), key as u32); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -926,7 +926,7 @@ mod tests { for i in 216..256 { let key = keys[i]; index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(key as f64), key as u32); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -979,7 +979,7 @@ mod tests { for &key in keys.iter() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(key as f64), key + 100); + index.delete_single_docid(&mut txn, 0, &(key as f64), key + 100); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -1010,7 +1010,7 @@ mod tests { for &key in keys.iter() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &format!("{key:x}").as_str(), key + 100); + index.delete_single_docid(&mut txn, 0, &format!("{key:x}").as_str(), key + 100); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -1131,7 +1131,7 @@ mod fuzz { OperationKind::Delete(value) => { if let Some(keys) = value_to_keys.get(value) { for key in keys { - index.delete(&mut txn, *field_id, key, *value as u32); + index.delete_single_docid(&mut txn, *field_id, key, *value as u32); trivial_db.delete(*field_id, *key, *value as u32); } } diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index c5046784f..c75713158 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -74,15 +74,15 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8; pub const FACET_GROUP_SIZE: u8 = 4; pub const FACET_MIN_LEVEL_SIZE: u8 = 5; -use std::fs::File; - use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::{Index, Result}; +use std::fs::File; pub mod bulk; +pub mod delete; pub mod incremental; pub struct FacetsUpdate<'i> { @@ -120,8 +120,11 @@ impl<'i> FacetsUpdate<'i> { return Ok(()); } if self.new_data.len() >= (self.database.len(wtxn)? 
as u64 / 50) { + let field_ids = + self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); let bulk_update = FacetsUpdateBulk::new( self.index, + field_ids, self.facet_type, self.new_data, self.group_size, @@ -273,12 +276,12 @@ pub(crate) mod tests { let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); } - pub fn delete<'a>( + pub fn delete_single_docid<'a>( &self, wtxn: &'a mut RwTxn, field_id: u16, key: &'a >::EItem, - value: u32, + docid: u32, ) { let update = FacetsUpdateIncrementalInner { db: self.content, @@ -287,7 +290,25 @@ pub(crate) mod tests { max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - update.delete(wtxn, field_id, &key_bytes, value).unwrap(); + let mut docids = RoaringBitmap::new(); + docids.insert(docid); + update.delete(wtxn, field_id, &key_bytes, &docids).unwrap(); + } + pub fn delete<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + max_group_size: self.max_group_size.get(), + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.delete(wtxn, field_id, &key_bytes, docids).unwrap(); } pub fn bulk_insert<'a, 'b>( From a034a1e628175fcc046741037670bf030bda056c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Oct 2022 09:42:55 +0200 Subject: [PATCH 45/58] Move StrRefCodec and ByteSliceRefCodec to their own files --- milli/Cargo.toml | 2 +- milli/src/heed_codec/byte_slice_ref.rs | 23 ++++++++++++++++ milli/src/heed_codec/facet/mod.rs | 26 +++---------------- milli/src/heed_codec/mod.rs | 4 +++ milli/src/heed_codec/{facet => }/str_ref.rs | 0 milli/src/index.rs | 3 ++- milli/src/search/criteria/asc_desc.rs | 7 ++--- milli/src/search/facet/facet_distribution.rs | 13 +++++++--- .../search/facet/facet_distribution_iter.rs | 22 +++++++++------- milli/src/search/facet/facet_range_search.rs | 18 +++++++------ .../src/search/facet/facet_sort_ascending.rs | 13 ++++++---- .../src/search/facet/facet_sort_descending.rs | 20 +++++++------- milli/src/search/facet/mod.rs | 12 ++++----- milli/src/update/facet/bulk.rs | 19 +++++++------- milli/src/update/facet/delete.rs | 13 +++++----- milli/src/update/facet/incremental.rs | 25 ++++++++++-------- milli/src/update/facet/mod.rs | 24 +++++++++-------- .../extract/extract_facet_string_docids.rs | 3 ++- 18 files changed, 140 insertions(+), 107 deletions(-) create mode 100644 milli/src/heed_codec/byte_slice_ref.rs rename milli/src/heed_codec/{facet => }/str_ref.rs (100%) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 49988da0b..b768476e3 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -57,7 +57,7 @@ md5 = "0.7.0" rand = {version = "0.8.5", features = ["small_rng"] } [target.'cfg(fuzzing)'.dev-dependencies] -fuzzcheck = { git = "https://github.com/loiclec/fuzzcheck-rs", branch = "main" } +fuzzcheck = { git = "https://github.com/loiclec/fuzzcheck-rs", branch = "main" } # TODO: use released version [features] default = [ "charabia/default" ] diff --git a/milli/src/heed_codec/byte_slice_ref.rs b/milli/src/heed_codec/byte_slice_ref.rs new file mode 100644 index 000000000..48eda63c5 --- /dev/null +++ b/milli/src/heed_codec/byte_slice_ref.rs @@ -0,0 +1,23 @@ +use std::borrow::Cow; + +use heed::{BytesDecode, BytesEncode}; + +/// A codec for values of type 
`&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated +/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure. +pub struct ByteSliceRefCodec; + +impl<'a> BytesEncode<'a> for ByteSliceRefCodec { + type EItem = &'a [u8]; + + fn bytes_encode(item: &'a Self::EItem) -> Option> { + Some(Cow::Borrowed(item)) + } +} + +impl<'a> BytesDecode<'a> for ByteSliceRefCodec { + type DItem = &'a [u8]; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + Some(bytes) + } +} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 35ec925dc..a727b148f 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,6 +1,5 @@ mod field_doc_id_facet_codec; mod ordered_f64_codec; -mod str_ref; use std::borrow::Cow; use std::convert::TryFrom; @@ -12,9 +11,10 @@ use roaring::RoaringBitmap; pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec; pub use self::ordered_f64_codec::OrderedF64Codec; -pub use self::str_ref::StrRefCodec; use crate::{CboRoaringBitmapCodec, BEU16}; +use super::StrRefCodec; + pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec; pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec; pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec; @@ -33,7 +33,7 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { /// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`] /// databases. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] // TODO: try removing PartialOrd and Ord pub struct FacetGroupKey { pub field_id: u16, pub level: u8, @@ -103,23 +103,3 @@ impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { Some(FacetGroupValue { size, bitmap }) } } - -/// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated -/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure. 
-pub struct ByteSliceRef; - -impl<'a> BytesEncode<'a> for ByteSliceRef { - type EItem = &'a [u8]; - - fn bytes_encode(item: &'a Self::EItem) -> Option> { - Some(Cow::Borrowed(item)) - } -} - -impl<'a> BytesDecode<'a> for ByteSliceRef { - type DItem = &'a [u8]; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - Some(bytes) - } -} diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index e07e47c79..6a058f95f 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -1,10 +1,12 @@ mod beu32_str_codec; +mod byte_slice_ref; pub mod facet; mod field_id_word_count_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; mod str_beu32_codec; +mod str_ref; mod str_str_u8_codec; pub use self::beu32_str_codec::BEU32StrCodec; @@ -16,3 +18,5 @@ pub use self::roaring_bitmap_length::{ }; pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; +pub use byte_slice_ref::ByteSliceRefCodec; +pub use str_ref::StrRefCodec; diff --git a/milli/src/heed_codec/facet/str_ref.rs b/milli/src/heed_codec/str_ref.rs similarity index 100% rename from milli/src/heed_codec/facet/str_ref.rs rename to milli/src/heed_codec/str_ref.rs diff --git a/milli/src/index.rs b/milli/src/index.rs index 893817d59..7c5e92d05 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -16,8 +16,9 @@ use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, - FieldIdCodec, OrderedF64Codec, StrRefCodec, + FieldIdCodec, OrderedF64Codec, }; +use crate::heed_codec::StrRefCodec; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 586605116..fd03b1b60 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,8 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; +use crate::heed_codec::facet::FacetGroupKeyCodec; +use crate::heed_codec::ByteSliceRefCodec; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; use crate::search::query_tree::Operation; @@ -194,14 +195,14 @@ fn facet_ordered<'t>( let number_iter = make_iter( rtxn, - index.facet_id_f64_docids.remap_key_type::>(), + index.facet_id_f64_docids.remap_key_type::>(), field_id, candidates.clone(), )?; let string_iter = make_iter( rtxn, - index.facet_id_string_docids.remap_key_type::>(), + index.facet_id_string_docids.remap_key_type::>(), field_id, candidates, )?; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 2e2e448c2..f6a53dbd4 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -9,9 +9,10 @@ use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, OrderedF64Codec, StrRefCodec, + FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, 
FieldDocIdFacetStringCodec, + OrderedF64Codec, }; +use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec}; use crate::search::facet::facet_distribution_iter; use crate::{FieldId, Index, Result}; @@ -137,7 +138,9 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { facet_distribution_iter::iterate_over_facet_distribution( self.rtxn, - self.index.facet_id_f64_docids.remap_key_type::>(), + self.index + .facet_id_f64_docids + .remap_key_type::>(), field_id, candidates, |facet_key, nbr_docids, _| { @@ -160,7 +163,9 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { facet_distribution_iter::iterate_over_facet_distribution( self.rtxn, - self.index.facet_id_string_docids.remap_key_type::>(), + self.index + .facet_id_string_docids + .remap_key_type::>(), field_id, candidates, |facet_key, nbr_docids, any_docid| { diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 4c6dc75fa..0fdca4118 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -4,9 +4,8 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, -}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; use crate::DocumentId; /// Call the given closure on the facet distribution of the candidate documents. @@ -22,7 +21,7 @@ use crate::DocumentId; /// keep iterating over the different facet values or stop. pub fn iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: &RoaringBitmap, callback: CB, @@ -31,10 +30,13 @@ where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { let mut fd = FacetDistribution { rtxn, db, field_id, callback }; - let highest_level = - get_highest_level(rtxn, db.remap_key_type::>(), field_id)?; + let highest_level = get_highest_level( + rtxn, + db.remap_key_type::>(), + field_id, + )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; return Ok(()); } else { @@ -47,7 +49,7 @@ where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, callback: CB, } @@ -72,11 +74,13 @@ where if key.field_id != self.field_id { return Ok(ControlFlow::Break(())); } + // TODO: use real intersection and then take min()? let docids_in_common = value.bitmap.intersection_len(candidates); if docids_in_common > 0 { + // TODO: use min() let any_docid = value.bitmap.iter().next().unwrap(); match (self.callback)(key.left_bound, docids_in_common, any_docid)? 
{ - ControlFlow::Continue(_) => {} + ControlFlow::Continue(_) => (), // TODO use unit instead of empty scope ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } } diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index a7b4674f1..07300e920 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -4,9 +4,8 @@ use heed::BytesEncode; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, -}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; use crate::Result; /// Find all the document ids for which the given field contains a value contained within @@ -47,13 +46,16 @@ where } Bound::Unbounded => Bound::Unbounded, }; - let db = db.remap_key_type::>(); + let db = db.remap_key_type::>(); let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(starting_left_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let rightmost_bound = - Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded + if let Some(starting_left_bound) = + get_first_facet_value::(rtxn, db, field_id)? + { + let rightmost_bound = Bound::Included( + get_last_facet_value::(rtxn, db, field_id)?.unwrap(), + ); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) @@ -65,7 +67,7 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 2b0a45e15..2f1f73db3 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -3,8 +3,9 @@ use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; +use crate::heed_codec::ByteSliceRefCodec; /// Return an iterator which iterates over the given candidate documents in /// ascending order of their facet value for the given field id. @@ -30,12 +31,12 @@ use crate::heed_codec::facet::{ /// Note that once a document id is returned by the iterator, it is never returned again. pub fn ascending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, ) -> Result> + 't>> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); @@ -47,11 +48,13 @@ pub fn ascending_facet_sort<'t>( struct AscendingFacetSort<'t, 'e> { rtxn: &'t heed::RoTxn<'e>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, - std::iter::Take, FacetGroupValueCodec>>, + std::iter::Take< + heed::RoRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + >, )>, } diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 47d0f145b..5f09d708b 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -5,22 +5,23 @@ use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; +use crate::heed_codec::ByteSliceRefCodec; /// See documentationg for [`ascending_facet_sort`](super::ascending_facet_sort). /// /// This function does the same thing, but in the opposite order. pub fn descending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, ) -> Result> + 't>> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); Ok(Box::new(DescendingFacetSort { @@ -36,12 +37,12 @@ pub fn descending_facet_sort<'t>( struct DescendingFacetSort<'t> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, std::iter::Take< - heed::RoRevRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + heed::RoRevRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, >, Bound<&'t [u8]>, )>, @@ -97,7 +98,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { *right_bound = Bound::Excluded(left_bound); let iter = match self .db - .remap_key_type::>() + .remap_key_type::>() .rev_range( &self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow), @@ -121,7 +122,8 @@ impl<'t> Iterator for DescendingFacetSort<'t> { mod tests { use roaring::RoaringBitmap; - use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; + use crate::heed_codec::facet::FacetGroupKeyCodec; + use crate::heed_codec::ByteSliceRefCodec; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; @@ -134,7 +136,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); - let db = index.content.remap_key_type::>(); + let db = 
index.content.remap_key_type::>(); let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); for el in iter { let docids = el.unwrap(); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index c854b546d..ccf40d6aa 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -5,8 +5,8 @@ use heed::{BytesDecode, RoTxn}; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::Filter; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; - +use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; mod facet_distribution; mod facet_distribution_iter; mod facet_range_search; @@ -17,7 +17,7 @@ mod filter; /// Get the first facet value in the facet database pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -42,7 +42,7 @@ where /// Get the last facet value in the facet database pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -67,7 +67,7 @@ where /// Get the height of the highest level in the facet database pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); @@ -77,7 +77,7 @@ pub(crate) fn get_highest_level<'t>( .next() .map(|el| { let (key, _) = el.unwrap(); - let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); key.level }) .unwrap_or(0)) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index d3db0a0fa..4e10c22dd 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -11,8 +11,9 @@ use time::OffsetDateTime; use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::facet::FacetType; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; +use crate::heed_codec::ByteSliceRefCodec; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; @@ -75,11 +76,11 @@ impl<'i> FacetsUpdateBulk<'i> { let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; let db = match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; @@ -98,7 +99,7 @@ impl<'i> FacetsUpdateBulk<'i> { /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type pub(crate) struct FacetsUpdateBulkInner { - pub db: heed::Database, FacetGroupValueCodec>, + pub db: heed::Database, FacetGroupValueCodec>, pub new_data: Option>, pub group_size: u8, pub min_level_size: u8, @@ -216,7 +217,7 @@ impl FacetsUpdateBulkInner { .db .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, level_0_prefix.as_slice())? 
- .remap_types::, FacetGroupValueCodec>(); + .remap_types::, FacetGroupValueCodec>(); let mut left_bound: &[u8] = &[]; let mut first_iteration_for_new_group = true; @@ -299,7 +300,7 @@ impl FacetsUpdateBulkInner { bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { let key = FacetGroupKey { field_id, level, left_bound }; - let key = FacetGroupKeyCodec::::bytes_encode(&key) + let key = FacetGroupKeyCodec::::bytes_encode(&key) .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = @@ -328,7 +329,7 @@ impl FacetsUpdateBulkInner { bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { let key = FacetGroupKey { field_id, level, left_bound }; - let key = FacetGroupKeyCodec::::bytes_encode(&key) + let key = FacetGroupKeyCodec::::bytes_encode(&key) .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs index efe1d800a..74c17e8f2 100644 --- a/milli/src/update/facet/delete.rs +++ b/milli/src/update/facet/delete.rs @@ -1,7 +1,8 @@ use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::{ facet::FacetType, - heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, + heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, + heed_codec::ByteSliceRefCodec, update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}, FieldId, Index, Result, }; @@ -11,7 +12,7 @@ use std::collections::{HashMap, HashSet}; pub struct FacetsDelete<'i, 'b> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, + database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, affected_facet_values: HashMap>>, docids_to_delete: &'b RoaringBitmap, @@ -27,11 +28,11 @@ impl<'i, 'b> FacetsDelete<'i, 'b> { docids_to_delete: &'b RoaringBitmap, ) -> Self { let database = match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; Self { diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 895713d43..9dda86a46 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -7,8 +7,9 @@ use roaring::RoaringBitmap; use crate::facet::FacetType; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; +use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; @@ -50,10 +51,10 @@ impl<'i> FacetsUpdateIncremental<'i> { db: match facet_type { FacetType::String => index .facet_id_string_docids - .remap_key_type::>(), + .remap_key_type::>(), FacetType::Number => index .facet_id_f64_docids - .remap_key_type::>(), + .remap_key_type::>(), }, group_size, max_group_size, @@ -69,7 +70,7 @@ impl<'i> FacetsUpdateIncremental<'i> { let mut cursor = self.new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ - let key = FacetGroupKeyCodec::::bytes_decode(key) + let key = FacetGroupKeyCodec::::bytes_decode(key) .ok_or(heed::Error::Encoding)?; let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; @@ -87,7 +88,7 @@ impl<'i> FacetsUpdateIncremental<'i> { /// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type pub struct FacetsUpdateIncrementalInner { - pub db: heed::Database, FacetGroupValueCodec>, + pub db: heed::Database, FacetGroupValueCodec>, pub group_size: u8, pub min_level_size: u8, pub max_group_size: u8, @@ -126,7 +127,7 @@ impl FacetsUpdateIncrementalInner { if let Some(e) = prefix_iter.next() { let (key_bytes, value) = e?; Ok(( - FacetGroupKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)? .into_owned(), value, @@ -149,7 +150,7 @@ impl FacetsUpdateIncrementalInner { )?; let (key_bytes, value) = iter.next().unwrap()?; Ok(( - FacetGroupKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)? .into_owned(), value, @@ -411,7 +412,7 @@ impl FacetsUpdateIncrementalInner { let mut values = RoaringBitmap::new(); for _ in 0..group_size { let (key_bytes, value_i) = groups_iter.next().unwrap()?; - let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) + let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)?; if first_key.is_none() { @@ -434,7 +435,7 @@ impl FacetsUpdateIncrementalInner { let mut values = RoaringBitmap::new(); for _ in 0..nbr_leftover_elements { let (key_bytes, value_i) = groups_iter.next().unwrap()?; - let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) + let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)?; if first_key.is_none() { @@ -616,7 +617,7 @@ impl FacetsUpdateIncrementalInner { while let Some(el) = iter.next() { let (k, _) = el?; to_delete.push( - FacetGroupKeyCodec::::bytes_decode(k) + FacetGroupKeyCodec::::bytes_decode(k) .ok_or(Error::Encoding)? 
.into_owned(), ); @@ -655,7 +656,8 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec}; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::StrRefCodec; use crate::milli_snap; use crate::update::facet::tests::FacetIndex; @@ -1019,6 +1021,7 @@ mod tests { // fuzz tests } + #[cfg(all(test, fuzzing))] mod fuzz { use std::borrow::Cow; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index c75713158..a6d8c3d60 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -77,7 +77,8 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5; use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; use crate::{Index, Result}; use std::fs::File; @@ -87,7 +88,7 @@ pub mod incremental; pub struct FacetsUpdate<'i> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, + database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, new_data: grenad::Reader, group_size: u8, @@ -97,11 +98,11 @@ pub struct FacetsUpdate<'i> { impl<'i> FacetsUpdate<'i> { pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { let database = match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; Self { @@ -159,8 +160,9 @@ pub(crate) mod tests { use super::bulk::FacetsUpdateBulkInner; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; + use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; use crate::snapshot_tests::display_bitmap; use crate::update::FacetsUpdateIncrementalInner; @@ -173,7 +175,7 @@ pub(crate) mod tests { BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, { pub env: Env, - pub content: heed::Database, FacetGroupValueCodec>, + pub content: heed::Database, FacetGroupValueCodec>, pub group_size: Cell, pub min_level_size: Cell, pub max_group_size: Cell, @@ -327,7 +329,7 @@ pub(crate) mod tests { let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned(); let key: FacetGroupKey<&[u8]> = FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; - let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); let value = CboRoaringBitmapCodec::bytes_encode(&docids).unwrap(); writer.insert(&key, &value).unwrap(); } @@ -362,7 +364,7 @@ pub(crate) mod tests { .unwrap(); while let Some(el) = iter.next() { let (key, value) = el.unwrap(); - let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); let mut prefix_start_below = vec![]; prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); @@ -379,7 +381,7 @@ pub(crate) mod tests { ) .unwrap(); let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); - FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() + 
FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() }; assert!(value.size > 0); diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index bf523cbb3..221356ba0 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -4,7 +4,8 @@ use std::io; use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, StrRefCodec}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; +use crate::heed_codec::StrRefCodec; use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{FieldId, Result}; From acc8caebe62f758794e25cdeb71ac88dd380ee3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Oct 2022 09:46:31 +0200 Subject: [PATCH 46/58] Add link to GitHub PR to document of update/facet module --- milli/src/update/facet/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index a6d8c3d60..a5d527282 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -68,6 +68,8 @@ bulk method is faster when a large part of the database is modified. Empirically it takes 50x more time to incrementally add N facet values to an existing database than it is to construct a database of N facet values. This is the heuristic that is used to choose between the two methods. + +Related PR: https://github.com/meilisearch/milli/pull/619 */ pub const FACET_MAX_GROUP_SIZE: u8 = 8; From 2295e0e3ce32d72c9960d1ebfc04b637d07b5047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Oct 2022 10:23:40 +0200 Subject: [PATCH 47/58] Use real delete function in facet indexing fuzz tests By deleting multiple docids at once instead of one-by-one --- milli/src/update/facet/incremental.rs | 49 ++++++++++++++++++--------- milli/src/update/facet/mod.rs | 18 ++++------ 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 9dda86a46..a4c756aec 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1018,25 +1018,26 @@ mod tests { txn.commit().unwrap(); milli_snap!(format!("{index}"), "after_delete"); } - - // fuzz tests } +// fuzz tests #[cfg(all(test, fuzzing))] mod fuzz { use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::convert::TryFrom; + use std::iter::FromIterator; use std::rc::Rc; + use fuzzcheck::mutators::integer::U8Mutator; use fuzzcheck::mutators::integer_within_range::{U16WithinRangeMutator, U8WithinRangeMutator}; + use fuzzcheck::mutators::vector::VecMutator; use fuzzcheck::DefaultMutator; use heed::BytesEncode; use roaring::RoaringBitmap; use tempfile::TempDir; use super::*; - use crate::milli_snap; use crate::update::facet::tests::FacetIndex; struct NEU16Codec; @@ -1074,10 +1075,10 @@ mod fuzz { *values |= new_values; } #[no_coverage] - pub fn delete(&mut self, field_id: u16, key: T, value: u32) { + pub fn delete(&mut self, field_id: u16, key: T, values_to_remove: &RoaringBitmap) { if let Some(values_field_id) = self.elements.get_mut(&field_id) { if let Some(values) = values_field_id.get_mut(&key) { - values.remove(value); + *values -= values_to_remove; if 
values.is_empty() { values_field_id.remove(&key); } @@ -1103,8 +1104,14 @@ mod fuzz { } #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] enum OperationKind { - Insert(Vec), - Delete(u8), + Insert( + #[field_mutator(VecMutator = { VecMutator::new(U8Mutator::default(), 0 ..= 10) })] + Vec, + ), + Delete( + #[field_mutator(VecMutator = { VecMutator::new(U8Mutator::default(), 0 ..= 10) })] + Vec, + ), } #[no_coverage] @@ -1131,13 +1138,23 @@ mod fuzz { index.insert(&mut txn, *field_id, key, &bitmap); trivial_db.insert(*field_id, *key, &bitmap); } - OperationKind::Delete(value) => { - if let Some(keys) = value_to_keys.get(value) { - for key in keys { - index.delete_single_docid(&mut txn, *field_id, key, *value as u32); - trivial_db.delete(*field_id, *key, *value as u32); + OperationKind::Delete(values) => { + let values = RoaringBitmap::from_iter(values.iter().copied().map(|x| x as u32)); + let mut values_per_key = HashMap::new(); + + for value in values { + if let Some(keys) = value_to_keys.get(&(value as u8)) { + for key in keys { + let values: &mut RoaringBitmap = + values_per_key.entry(key).or_default(); + values.insert(value); + } } } + for (key, values) in values_per_key { + index.delete(&mut txn, *field_id, &key, &values); + trivial_db.delete(*field_id, *key, &values); + } } } } @@ -1221,7 +1238,7 @@ mod fuzz { {"key":166, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[61]}}, {"key":183, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, - {"key":250, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":50}} + {"key":250, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":[50]}} ] "#; let operations: Vec> = serde_json::from_str(operations).unwrap(); @@ -1250,7 +1267,7 @@ mod fuzz { {"key":200, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, {"key":93, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[98]}}, {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, - {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":210}} + {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":[210]}} ] "#; let operations: Vec> = serde_json::from_str(operations).unwrap(); @@ -1285,7 +1302,7 @@ mod fuzz { let operations = r#"[ {"key":63499, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[87]}}, {"key":25374, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[14]}}, - {"key":64481, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Delete":87}}, + {"key":64481, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Delete":[87]}}, {"key":23038, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[173]}}, {"key":14862, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[8]}}, {"key":13145, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[5,64]}}, @@ -1337,7 +1354,7 @@ mod fuzz { "max_group_size":4, "min_level_size":25, "field_id":3, - "kind":{"Delete":11} + 
"kind":{"Delete":[11]} } ] "#; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index a5d527282..5fb5c9e48 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -76,13 +76,14 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8; pub const FACET_GROUP_SIZE: u8 = 4; pub const FACET_MIN_LEVEL_SIZE: u8 = 5; +use std::fs::File; + use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::ByteSliceRefCodec; use crate::{Index, Result}; -use std::fs::File; pub mod bulk; pub mod delete; @@ -153,6 +154,7 @@ impl<'i> FacetsUpdate<'i> { pub(crate) mod tests { use std::cell::Cell; use std::fmt::Display; + use std::iter::FromIterator; use std::marker::PhantomData; use std::rc::Rc; @@ -170,7 +172,7 @@ pub(crate) mod tests { use crate::update::FacetsUpdateIncrementalInner; use crate::CboRoaringBitmapCodec; - // A dummy index that only contains the facet database, used for testing + /// A dummy index that only contains the facet database, used for testing pub struct FacetIndex where for<'a> BoundCodec: @@ -287,17 +289,9 @@ pub(crate) mod tests { key: &'a >::EItem, docid: u32, ) { - let update = FacetsUpdateIncrementalInner { - db: self.content, - group_size: self.group_size.get(), - min_level_size: self.min_level_size.get(), - max_group_size: self.max_group_size.get(), - }; - let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - let mut docids = RoaringBitmap::new(); - docids.insert(docid); - update.delete(wtxn, field_id, &key_bytes, &docids).unwrap(); + self.delete(wtxn, field_id, key, &RoaringBitmap::from_iter(std::iter::once(docid))) } + pub fn delete<'a>( &self, wtxn: &'a mut RwTxn, From ee1abfd1c18291a5bf7d9513c36ddb76663e4135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Oct 2022 10:25:56 +0200 Subject: [PATCH 48/58] Ignore files generated by fuzzcheck --- .gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 39623a232..edd3e675c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ milli/target/ # Snapshots ## ... large *.full.snap - -# ... unreviewed +## ... 
unreviewed *.snap.new + +# Fuzzcheck data for the facet indexing fuzz test +milli/fuzz/update::facet::incremental::fuzz::fuzz/ From d885de16002e4c8aaf58602078cf8f88240b4d5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 17:16:11 +0200 Subject: [PATCH 49/58] Add option to avoid soft deletion of documents --- milli/src/update/delete_documents.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index de2f4480c..2626c1555 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -26,6 +26,8 @@ pub struct DeleteDocuments<'t, 'u, 'i> { index: &'i Index, external_documents_ids: ExternalDocumentsIds<'static>, to_delete_docids: RoaringBitmap, + #[cfg(test)] + disable_soft_delete: bool, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -46,9 +48,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { index, external_documents_ids, to_delete_docids: RoaringBitmap::new(), + #[cfg(test)] + disable_soft_delete: false, }) } + #[cfg(test)] + fn disable_soft_delete(&mut self, disable: bool) { + self.disable_soft_delete = disable; + } + pub fn delete_document(&mut self, docid: u32) { self.to_delete_docids.insert(docid); } @@ -147,7 +156,20 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We run the deletion. // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents // We run the deletion. - if percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { + let disable_soft_delete = { + #[cfg(not(test))] + { + false + } + #[cfg(test)] + { + self.disable_soft_delete + } + }; + if !disable_soft_delete + && percentage_available > 10 + && percentage_used_by_soft_deleted_documents < 10 + { self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; return Ok(DocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), From ab5e56fd169dacaddc99b8abf39610d932222d0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 25 Aug 2022 14:51:50 +0200 Subject: [PATCH 50/58] Add document deletion snapshot tests and tests for hard-deletion --- milli/src/update/delete_documents.rs | 179 ++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 29 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 2626c1555..cece56f4d 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -27,7 +27,7 @@ pub struct DeleteDocuments<'t, 'u, 'i> { external_documents_ids: ExternalDocumentsIds<'static>, to_delete_docids: RoaringBitmap, #[cfg(test)] - disable_soft_delete: bool, + disable_soft_deletion: bool, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -49,13 +49,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { external_documents_ids, to_delete_docids: RoaringBitmap::new(), #[cfg(test)] - disable_soft_delete: false, + disable_soft_deletion: false, }) } #[cfg(test)] - fn disable_soft_delete(&mut self, disable: bool) { - self.disable_soft_delete = disable; + fn disable_soft_deletion(&mut self, disable: bool) { + self.disable_soft_deletion = disable; } pub fn delete_document(&mut self, docid: u32) { @@ -156,17 +156,17 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We run the deletion. // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents // We run the deletion. 
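// A minimal restatement of the rule encoded just below (a sketch only; the helper
// name and plain u64 percentages are assumptions, not part of this patch):
//
//     fn can_keep_soft_deleting(percentage_available: u64, percentage_used_by_soft_deleted: u64) -> bool {
//         // enough free disk space, and the soft-deleted documents are still cheap to keep around
//         percentage_available > 10 && percentage_used_by_soft_deleted < 10
//     }
//
// When this returns false, or when a test explicitly disables soft deletion, the code
// below falls through to the real, immediate removal of the documents.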
- let disable_soft_delete = { + let disable_soft_deletion = { #[cfg(not(test))] { false } #[cfg(test)] { - self.disable_soft_delete + self.disable_soft_deletion } }; - if !disable_soft_delete + if !disable_soft_deletion && percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { @@ -634,7 +634,7 @@ mod tests { use super::*; use crate::index::tests::TempIndex; - use crate::Filter; + use crate::{db_snap, Filter}; fn delete_documents<'t>( wtxn: &mut RwTxn<'t, '_>, @@ -680,6 +680,10 @@ mod tests { wtxn.commit().unwrap(); + db_snap!(index, documents_ids, @"[]"); + db_snap!(index, word_docids, @""); + db_snap!(index, soft_deleted_documents_ids, @"[]"); + let rtxn = index.read_txn().unwrap(); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); @@ -689,6 +693,10 @@ mod tests { fn delete_documents_with_strange_primary_key() { let index = TempIndex::new(); + index + .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) + .unwrap(); + let mut wtxn = index.write_txn().unwrap(); index .add_documents_using_wtxn( @@ -700,14 +708,32 @@ mod tests { ]), ) .unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); + db_snap!(index, word_docids, @r###" + benoit [2, ] + kevin [0, ] + kevina [1, ] + "###); + db_snap!(index, soft_deleted_documents_ids, @"[]"); + + let mut wtxn = index.write_txn().unwrap(); // Delete not all of the documents but some of them. let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); builder.delete_external_id("0"); builder.delete_external_id("1"); builder.execute().unwrap(); - wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[2, ]"); + db_snap!(index, word_docids, @r###" + benoit [2, ] + kevin [0, ] + kevina [1, ] + "###); + db_snap!(index, soft_deleted_documents_ids, @"[0, 1, ]"); } #[test] @@ -727,26 +753,29 @@ mod tests { .add_documents_using_wtxn( &mut wtxn, documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": "abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": "1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] 
}, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } ]), ) .unwrap(); @@ -759,6 +788,86 @@ mod tests { assert!(results.documents_ids.is_empty()); wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[0, ]"); + db_snap!(index, word_docids, @"e89cd44832e960519823e12b1e7e28af"); + db_snap!(index, facet_id_f64_docids, @""); + db_snap!(index, facet_id_string_docids, @"720ee1ba8c18342f3714c5863bc6c1f5"); + } + #[test] + fn facet_hard_deletion() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + settings.set_filterable_fields(hashset! { S("label") }); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + // Delete not all of the documents but some of them. 
+ let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_external_id("1_4"); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, word_docids, 1, @"999733c2461093d4873321902fc8dcd7"); + db_snap!(index, facet_id_f64_docids, 1, @""); + db_snap!(index, facet_id_string_docids, 1, @"a12e80655ed5f0f8e869bb9c32af61e9"); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete more than one document + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_external_id("1_5"); + builder.delete_external_id("1_7"); + builder.delete_external_id("1_70"); + builder.delete_external_id("1_72"); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); + db_snap!(index, word_docids, 2, @"b892636eaff43c917d5aa8b09c107a02"); + db_snap!(index, facet_id_f64_docids, 2, @""); + db_snap!(index, facet_id_string_docids, 2, @"b9946a9cb0ed2df40352e98d6836c8d0"); } #[test] @@ -814,6 +923,8 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[0, ]"); } #[test] @@ -869,6 +980,8 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); } #[test] @@ -923,6 +1036,10 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[4, 5, 6, 11, 16, 18, ]"); + db_snap!(index, facet_id_f64_docids, @"20727a38c0b1e1a20a44526b85cf2cbc"); + db_snap!(index, facet_id_string_docids, @""); } #[test] @@ -995,6 +1112,8 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); } #[test] @@ -1045,5 +1164,7 @@ mod tests { assert_eq!(Some(&2), results.get("number")); wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); } } From e3ba1fc88383da4eb265bdca07e277e869c33772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 22 Sep 2022 14:01:13 +0200 Subject: [PATCH 51/58] Make deletion tests for both soft-deletion and hard-deletion --- milli/src/snapshot_tests.rs | 9 + milli/src/update/delete_documents.rs | 336 ++++++++---------- .../false/documents_ids.snap | 4 + .../false/facet_id_exists_docids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../false/word_docids.snap | 4 + .../false/word_pair_proximity_docids.snap | 4 + .../true/documents_ids.snap | 4 + .../true/facet_id_exists_docids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + .../true/word_docids.snap | 4 + .../true/word_pair_proximity_docids.snap | 4 + .../false/documents_ids.snap | 4 + .../false/facet_id_exists_docids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../false/word_docids.snap | 7 + .../false/word_pair_proximity_docids.snap | 4 + .../true/documents_ids.snap | 4 + .../true/facet_id_exists_docids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + .../true/word_docids.snap | 5 + .../true/word_pair_proximity_docids.snap | 4 + .../false/facet_id_exists_docids.snap | 6 + .../false/facet_id_f64_docids.snap | 6 + .../false/facet_id_string_docids.snap | 19 + .../false/soft_deleted_documents_ids.snap | 4 + .../false/word_docids.snap | 42 +++ .../false/word_pair_proximity_docids.snap | 4 + .../true/facet_id_exists_docids.snap | 6 + .../true/facet_id_f64_docids.snap | 6 + .../true/facet_id_string_docids.snap | 18 + 
.../true/soft_deleted_documents_ids.snap | 4 + .../true/word_docids.snap | 40 +++ .../true/word_pair_proximity_docids.snap | 4 + .../false/facet_id_f64_docids.snap | 48 +++ .../false/facet_id_string_docids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../true/facet_id_f64_docids.snap | 36 ++ .../true/facet_id_string_docids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + 46 files changed, 533 insertions(+), 179 deletions(-) create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap create 
mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap 
create mode 100644 milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 9bc39d882..389d7b7a2 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -287,6 +287,12 @@ pub fn snap_facet_id_f64_docids(index: &Index) -> String { }); snap } +pub fn snap_facet_id_exists_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, facet_id_exists_docids, |(facet_id, docids)| { + &format!("{facet_id:<3} {}", display_bitmap(&docids)) + }); + snap +} pub fn snap_facet_id_string_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, facet_id_string_docids, |( FacetGroupKey { field_id, level, left_bound }, @@ -488,6 +494,9 @@ macro_rules! full_snap_of_db { }}; ($index:ident, field_id_docid_facet_strings) => {{ $crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index) + }}; + ($index:ident, facet_id_exists_docids) => {{ + $crate::snapshot_tests::snap_facet_id_exists_docids(&$index) }}; ($index:ident, documents_ids) => {{ $crate::snapshot_tests::snap_documents_ids(&$index) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index cece56f4d..432e1497f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -640,6 +640,7 @@ mod tests { wtxn: &mut RwTxn<'t, '_>, index: &'t Index, external_ids: &[&str], + disable_soft_deletion: bool, ) -> Vec { let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); let ids_to_delete: Vec = external_ids @@ -649,14 +650,14 @@ mod tests { // Delete some documents. 
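// How the parameterised helper and the snapshots are meant to be used together, as seen
// in the tests below (the on-disk layout is inferred from the `.snap` files created by
// this patch, not from the `db_snap!` macro itself):
//
//     let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"], disable_soft_deletion);
//     db_snap!(index, word_docids, disable_soft_deletion);
//     // -> milli/src/update/snapshots/delete_documents.rs/<test_name>/{true,false}/word_docids.snap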
let mut builder = DeleteDocuments::new(wtxn, index).unwrap(); + builder.disable_soft_deletion(disable_soft_deletion); external_ids.iter().for_each(|id| drop(builder.delete_external_id(id))); builder.execute().unwrap(); ids_to_delete } - #[test] - fn delete_documents_with_numbers_as_primary_key() { + fn delete_documents_with_numbers_as_primary_key_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -676,13 +677,17 @@ mod tests { builder.delete_document(0); builder.delete_document(1); builder.delete_document(2); + builder.disable_soft_deletion(disable_soft_deletion); builder.execute().unwrap(); wtxn.commit().unwrap(); - db_snap!(index, documents_ids, @"[]"); - db_snap!(index, word_docids, @""); - db_snap!(index, soft_deleted_documents_ids, @"[]"); + // All these snapshots should be empty since the database was cleared + db_snap!(index, documents_ids, disable_soft_deletion); + db_snap!(index, word_docids, disable_soft_deletion); + db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); + db_snap!(index, facet_id_exists_docids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); let rtxn = index.read_txn().unwrap(); @@ -690,7 +695,12 @@ mod tests { } #[test] - fn delete_documents_with_strange_primary_key() { + fn delete_documents_with_numbers_as_primary_key() { + delete_documents_with_numbers_as_primary_key_(true); + delete_documents_with_numbers_as_primary_key_(false); + } + + fn delete_documents_with_strange_primary_key_(disable_soft_deletion: bool) { let index = TempIndex::new(); index @@ -710,34 +720,31 @@ mod tests { .unwrap(); wtxn.commit().unwrap(); - db_snap!(index, documents_ids, @"[0, 1, 2, ]"); - db_snap!(index, word_docids, @r###" - benoit [2, ] - kevin [0, ] - kevina [1, ] - "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); - let mut wtxn = index.write_txn().unwrap(); // Delete not all of the documents but some of them. let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); builder.delete_external_id("0"); builder.delete_external_id("1"); + builder.disable_soft_deletion(disable_soft_deletion); builder.execute().unwrap(); wtxn.commit().unwrap(); - db_snap!(index, documents_ids, @"[2, ]"); - db_snap!(index, word_docids, @r###" - benoit [2, ] - kevin [0, ] - kevina [1, ] - "###); - db_snap!(index, soft_deleted_documents_ids, @"[0, 1, ]"); + db_snap!(index, documents_ids, disable_soft_deletion); + db_snap!(index, word_docids, disable_soft_deletion); + db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); } #[test] - fn filtered_placeholder_search_should_not_return_deleted_documents() { + fn delete_documents_with_strange_primary_key() { + delete_documents_with_strange_primary_key_(true); + delete_documents_with_strange_primary_key_(false); + } + + fn filtered_placeholder_search_should_not_return_deleted_documents_( + disable_soft_deletion: bool, + ) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -745,7 +752,7 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label") }); + settings.set_filterable_fields(hashset! 
{ S("label"), S("label2") }); }) .unwrap(); @@ -780,7 +787,7 @@ mod tests { ) .unwrap(); - delete_documents(&mut wtxn, &index, &["1_4"]); + delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"], disable_soft_deletion); // Placeholder search with filter let filter = Filter::from_str("label = sign").unwrap().unwrap(); @@ -789,21 +796,27 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, @"[0, ]"); - db_snap!(index, word_docids, @"e89cd44832e960519823e12b1e7e28af"); - db_snap!(index, facet_id_f64_docids, @""); - db_snap!(index, facet_id_string_docids, @"720ee1ba8c18342f3714c5863bc6c1f5"); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, word_docids, disable_soft_deletion); + db_snap!(index, facet_id_f64_docids, disable_soft_deletion); + db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); + db_snap!(index, facet_id_exists_docids, disable_soft_deletion); + db_snap!(index, facet_id_string_docids, disable_soft_deletion); } + #[test] - fn facet_hard_deletion() { + fn filtered_placeholder_search_should_not_return_deleted_documents() { + filtered_placeholder_search_should_not_return_deleted_documents_(true); + filtered_placeholder_search_should_not_return_deleted_documents_(false); + } + + fn placeholder_search_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label") }); }) .unwrap(); @@ -838,78 +851,8 @@ mod tests { ) .unwrap(); - // Delete not all of the documents but some of them. - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.disable_soft_deletion(true); - builder.delete_external_id("1_4"); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); - db_snap!(index, word_docids, 1, @"999733c2461093d4873321902fc8dcd7"); - db_snap!(index, facet_id_f64_docids, 1, @""); - db_snap!(index, facet_id_string_docids, 1, @"a12e80655ed5f0f8e869bb9c32af61e9"); - - let mut wtxn = index.write_txn().unwrap(); - - // Delete more than one document - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.disable_soft_deletion(true); - builder.delete_external_id("1_5"); - builder.delete_external_id("1_7"); - builder.delete_external_id("1_70"); - builder.delete_external_id("1_72"); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); - db_snap!(index, word_docids, 2, @"b892636eaff43c917d5aa8b09c107a02"); - db_snap!(index, facet_id_f64_docids, 2, @""); - db_snap!(index, facet_id_string_docids, 2, @"b9946a9cb0ed2df40352e98d6836c8d0"); - } - - #[test] - fn placeholder_search_should_not_return_deleted_documents() { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": 
"abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": "1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } - ]), - ) - .unwrap(); - - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &["1_4"], disable_soft_deletion); // Placeholder search let results = index.search(&wtxn).execute().unwrap(); @@ -923,12 +866,15 @@ mod tests { } wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, @"[0, ]"); } #[test] - fn search_should_not_return_deleted_documents() { + fn placeholder_search_should_not_return_deleted_documents() { + placeholder_search_should_not_return_deleted_documents_(true); + placeholder_search_should_not_return_deleted_documents_(false); + } + + fn search_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -942,31 +888,35 @@ mod tests { .add_documents_using_wtxn( &mut wtxn, documents!([ - {"docid": "1_4", "label": "sign"}, - {"docid": "1_5", "label": "letter"}, - {"docid": "1_7", "label": "abstract,cartoon,design,pattern"}, - {"docid": "1_36","label": "drawing,painting,pattern"}, - {"docid": "1_37","label": "art,drawing,outdoor"}, - {"docid": "1_38","label": "aquarium,art,drawing"}, - {"docid": "1_39","label": "abstract"}, - {"docid": "1_40","label": "cartoon"}, - {"docid": "1_41","label": "art,drawing"}, - {"docid": "1_42","label": "art,pattern"}, - {"docid": "1_43","label": "abstract,art,drawing,pattern"}, - {"docid": "1_44","label": "drawing"}, - {"docid": "1_45","label": "art"}, - {"docid": "1_46","label": "abstract,colorfulness,pattern"}, - {"docid": "1_47","label": "abstract,pattern"}, - {"docid": "1_52","label": "abstract,cartoon"}, - {"docid": "1_57","label": "abstract,drawing,pattern"}, - {"docid": "1_58","label": "abstract,art,cartoon"}, - {"docid": "1_68","label": "design"}, - {"docid": "1_69","label": "geometry"} + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", 
"label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } ]), ) .unwrap(); - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &["1_7", "1_52"], disable_soft_deletion); // search for abstract let results = index.search(&wtxn).query("abstract").execute().unwrap(); @@ -981,11 +931,18 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); } #[test] - fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { + fn search_should_not_return_deleted_documents() { + search_should_not_return_deleted_documents_(true); + search_should_not_return_deleted_documents_(false); + } + + fn geo_filtered_placeholder_search_should_not_return_deleted_documents_( + disable_soft_deletion: bool, + ) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -1021,7 +978,8 @@ mod tests { ])).unwrap(); let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &external_ids_to_delete, disable_soft_deletion); // Placeholder search with geo filter let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); @@ -1037,13 +995,18 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, @"[4, 5, 6, 11, 16, 18, ]"); - db_snap!(index, facet_id_f64_docids, @"20727a38c0b1e1a20a44526b85cf2cbc"); - db_snap!(index, facet_id_string_docids, @""); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, facet_id_f64_docids, disable_soft_deletion); + db_snap!(index, facet_id_string_docids, disable_soft_deletion); } #[test] - fn get_documents_should_not_return_deleted_documents() { + fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { + geo_filtered_placeholder_search_should_not_return_deleted_documents_(true); + geo_filtered_placeholder_search_should_not_return_deleted_documents_(false); + } + + fn get_documents_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -1057,32 +1020,36 @@ mod tests { .add_documents_using_wtxn( &mut wtxn, documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": "abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": 
"1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } ]), ) .unwrap(); let deleted_external_ids = ["1_7", "1_52"]; - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &deleted_external_ids, disable_soft_deletion); // list all documents let results = index.all_documents(&wtxn).unwrap(); @@ -1113,11 +1080,16 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); } #[test] - fn stats_should_not_return_deleted_documents() { + fn get_documents_should_not_return_deleted_documents() { + get_documents_should_not_return_deleted_documents_(true); + get_documents_should_not_return_deleted_documents_(false); + } + + fn stats_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -1129,29 +1101,29 @@ mod tests { .unwrap(); index.add_documents_using_wtxn(&mut wtxn, documents!([ - { "docid": "1_4", "label": "sign"}, - { "docid": "1_5", "label": "letter"}, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern", "title": "Mickey Mouse"}, - { "docid": "1_36", "label": "drawing,painting,pattern"}, - { "docid": "1_37", "label": "art,drawing,outdoor"}, - { "docid": "1_38", "label": "aquarium,art,drawing", "title": "Nemo"}, - { "docid": "1_39", "label": "abstract"}, - { "docid": "1_40", "label": "cartoon"}, - { "docid": "1_41", "label": "art,drawing"}, - { "docid": "1_42", "label": "art,pattern"}, - { "docid": "1_43", "label": "abstract,art,drawing,pattern", "number": 32i32}, - { "docid": "1_44", "label": "drawing", "number": 44i32}, - { "docid": "1_45", "label": "art"}, - { "docid": "1_46", "label": "abstract,colorfulness,pattern"}, - { "docid": "1_47", "label": "abstract,pattern"}, - { "docid": "1_52", "label": "abstract,cartoon"}, - { "docid": "1_57", "label": "abstract,drawing,pattern"}, - { "docid": "1_58", "label": "abstract,art,cartoon"}, - { "docid": "1_68", "label": "design"}, - { "docid": "1_69", "label": "geometry"} + { "docid": "1_4", "label": ["sign"]}, + { "docid": "1_5", "label": ["letter"]}, + { "docid": "1_7", "label": 
["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, + { "docid": "1_36", "label": ["drawing","painting","pattern"]}, + { "docid": "1_37", "label": ["art","drawing","outdoor"]}, + { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"}, + { "docid": "1_39", "label": ["abstract"]}, + { "docid": "1_40", "label": ["cartoon"]}, + { "docid": "1_41", "label": ["art","drawing"]}, + { "docid": "1_42", "label": ["art","pattern"]}, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, + { "docid": "1_44", "label": ["drawing"], "number": 44i32}, + { "docid": "1_45", "label": ["art"]}, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, + { "docid": "1_47", "label": ["abstract","pattern"]}, + { "docid": "1_52", "label": ["abstract","cartoon"]}, + { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, + { "docid": "1_58", "label": ["abstract","art","cartoon"]}, + { "docid": "1_68", "label": ["design"]}, + { "docid": "1_69", "label": ["geometry"]} ])).unwrap(); - delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + delete_documents(&mut wtxn, &index, &["1_7", "1_52"], disable_soft_deletion); // count internal documents let results = index.number_of_documents(&wtxn).unwrap(); @@ -1165,6 +1137,12 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + } + + #[test] + fn stats_should_not_return_deleted_documents() { + stats_should_not_return_deleted_documents_(true); + stats_should_not_return_deleted_documents_(false); } } diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap new file mode 100644 index 000000000..6d69b2ffb --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..9139b7a05 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[0, 1, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap new file mode 100644 index 000000000..15c881e87 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/delete_documents.rs +--- +benoit [2, ] +kevin [0, ] +kevina [1, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap new file mode 100644 index 000000000..6d69b2ffb --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff 
--git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap new file mode 100644 index 000000000..88d3a98aa --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/delete_documents.rs +--- +benoit [2, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap new file mode 100644 index 000000000..a7ee4348d --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] +2 [20, 21, 22, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap new file mode 100644 index 000000000..565fadcb3 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +2 0 1.2 1.2 [20, 22, ] +2 0 2.2 2.2 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap new file mode 100644 index 000000000..019836089 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap @@ -0,0 +1,19 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 abstract abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +1 aquarium aquarium [5, 
] +1 art art [4, 5, 8, 9, 10, 12, 17, ] +1 cartoon cartoon [2, 7, 15, 17, ] +1 colorfulness colorfulness [13, ] +1 design design [2, 18, ] +1 drawing drawing [3, 4, 5, 8, 10, 11, 16, ] +1 geometry geometry [19, ] +1 letter letter [1, ] +1 outdoor outdoor [4, ] +1 painting painting [3, ] +1 pattern pattern [2, 3, 9, 10, 13, 14, 16, ] +1 sign sign [0, ] +2 design design [21, ] +2 geometry geometry [20, 22, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..1145cbd56 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[0, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap new file mode 100644 index 000000000..7909d9b06 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap @@ -0,0 +1,42 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1.2 [20, 22, ] +1_36 [3, ] +1_37 [4, ] +1_38 [5, ] +1_39 [6, ] +1_4 [0, ] +1_40 [7, ] +1_41 [8, ] +1_42 [9, ] +1_43 [10, ] +1_44 [11, ] +1_45 [12, ] +1_46 [13, ] +1_47 [14, ] +1_5 [1, ] +1_52 [15, ] +1_57 [16, ] +1_58 [17, ] +1_68 [18, ] +1_69 [19, ] +1_7 [2, ] +1_70 [20, ] +1_71 [21, ] +1_72 [22, ] +2.2 [21, ] +abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +aquarium [5, ] +art [4, 5, 8, 9, 10, 12, 17, ] +cartoon [2, 7, 15, 17, ] +colorfulness [13, ] +design [2, 18, 21, ] +drawing [3, 4, 5, 8, 10, 11, 16, ] +geometry [19, 20, 22, ] +letter [1, ] +outdoor [4, ] +painting [3, ] +pattern [2, 3, 9, 10, 13, 14, 16, ] +sign [0, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap new file mode 100644 index 000000000..7299bc214 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] +2 [20, 21, 22, ] + diff --git 
a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap new file mode 100644 index 000000000..565fadcb3 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +2 0 1.2 1.2 [20, 22, ] +2 0 2.2 2.2 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap new file mode 100644 index 000000000..9f8541607 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap @@ -0,0 +1,18 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 abstract abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +1 aquarium aquarium [5, ] +1 art art [4, 5, 8, 9, 10, 12, 17, ] +1 cartoon cartoon [2, 7, 15, 17, ] +1 colorfulness colorfulness [13, ] +1 design design [2, 18, ] +1 drawing drawing [3, 4, 5, 8, 10, 11, 16, ] +1 geometry geometry [19, ] +1 letter letter [1, ] +1 outdoor outdoor [4, ] +1 painting painting [3, ] +1 pattern pattern [2, 3, 9, 10, 13, 14, 16, ] +2 design design [21, ] +2 geometry geometry [20, 22, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap new file mode 100644 index 000000000..c7e0c2d7a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap @@ -0,0 +1,40 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1.2 [20, 22, ] +1_36 [3, ] +1_37 [4, ] +1_38 [5, ] +1_39 [6, ] +1_40 [7, ] +1_41 [8, ] +1_42 [9, ] +1_43 [10, ] +1_44 [11, ] +1_45 [12, ] +1_46 [13, ] +1_47 [14, ] +1_5 [1, ] +1_52 [15, ] +1_57 [16, ] +1_58 [17, ] +1_68 [18, ] +1_69 [19, ] +1_7 [2, ] +1_70 [20, ] +1_71 [21, ] +1_72 [22, ] +2.2 [21, ] +abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +aquarium [5, ] +art [4, 5, 8, 9, 10, 12, 17, ] +cartoon [2, 7, 15, 17, ] +colorfulness [13, ] +design [2, 18, 21, ] +drawing [3, 4, 5, 8, 10, 11, 16, ] +geometry [19, 20, 22, ] +letter [1, ] +outdoor [4, ] +painting [3, ] +pattern [2, 3, 9, 10, 13, 14, 16, ] + diff --git 
a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap new file mode 100644 index 000000000..4d3786e09 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap @@ -0,0 +1,48 @@ +--- +source: milli/src/update/delete_documents.rs +--- +3 0 48.9021 48.9021 [19, ] +3 0 49.4449 49.4449 [18, ] +3 0 49.9314 49.9314 [17, ] +3 0 50.1112 50.1112 [16, ] +3 0 50.1793 50.1793 [15, ] +3 0 50.2844 50.2844 [14, ] +3 0 50.3518 50.3518 [13, ] +3 0 50.4095 50.4095 [11, ] +3 0 50.4502 50.4502 [12, ] +3 0 50.6053 50.6053 [8, ] +3 0 50.6224 50.6224 [3, ] +3 0 50.6299 50.6299 [0, ] +3 0 50.6312 50.6312 [2, ] +3 0 50.6415 50.6415 [1, ] +3 0 50.6552 50.6552 [4, ] +3 0 50.6924 50.6924 [5, ] +3 0 50.7263 50.7263 [6, ] +3 0 50.7453 50.7453 [7, ] +3 0 50.8466 50.8466 [10, ] +3 0 51.0537 51.0537 [9, ] +3 1 48.9021 50.1112 [16, 17, 18, 19, ] +3 1 50.1793 50.4095 [11, 13, 14, 15, ] +3 1 50.4502 50.6299 [0, 3, 8, 12, ] +3 1 50.6312 50.6924 [1, 2, 4, 5, ] +3 1 50.7263 51.0537 [6, 7, 9, 10, ] +4 0 2.271 2.271 [17, ] +4 0 2.3708 2.3708 [19, ] +4 0 2.7637 2.7637 [14, ] +4 0 2.7913 2.7913 [18, ] +4 0 2.8547 2.8547 [16, ] +4 0 3.0569 3.0569 [0, ] +4 0 3.1106 3.1106 [1, 2, ] +4 0 3.1476 3.1476 [3, ] +4 0 3.1541 3.1541 [6, ] +4 0 3.1763 3.1763 [5, ] +4 0 3.1897 3.1897 [4, ] +4 0 3.2189 3.2189 [15, ] +4 0 3.2206 3.2206 [7, ] +4 0 3.3758 3.3758 [8, ] +4 0 3.5326 3.5326 [13, ] +4 0 3.6957 3.6957 [9, ] +4 0 3.9623 3.9623 [12, ] +4 0 4.337 4.337 [10, ] +4 0 4.4347 4.4347 [11, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..1260b12de --- /dev/null +++ 
b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[4, 5, 6, 11, 16, 18, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap new file mode 100644 index 000000000..d380cf29c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap @@ -0,0 +1,36 @@ +--- +source: milli/src/update/delete_documents.rs +--- +3 0 48.9021 48.9021 [19, ] +3 0 49.9314 49.9314 [17, ] +3 0 50.1793 50.1793 [15, ] +3 0 50.2844 50.2844 [14, ] +3 0 50.3518 50.3518 [13, ] +3 0 50.4502 50.4502 [12, ] +3 0 50.6053 50.6053 [8, ] +3 0 50.6224 50.6224 [3, ] +3 0 50.6299 50.6299 [0, ] +3 0 50.6312 50.6312 [2, ] +3 0 50.6415 50.6415 [1, ] +3 0 50.7453 50.7453 [7, ] +3 0 50.8466 50.8466 [10, ] +3 0 51.0537 51.0537 [9, ] +3 1 48.9021 50.1112 [17, 19, ] +3 1 50.1793 50.4095 [13, 14, 15, ] +3 1 50.4502 50.6299 [0, 3, 8, 12, ] +3 1 50.6312 50.6924 [1, 2, ] +3 1 50.7263 51.0537 [7, 9, 10, ] +4 0 2.271 2.271 [17, ] +4 0 2.3708 2.3708 [19, ] +4 0 2.7637 2.7637 [14, ] +4 0 3.0569 3.0569 [0, ] +4 0 3.1106 3.1106 [1, 2, ] +4 0 3.1476 3.1476 [3, ] +4 0 3.2189 3.2189 [15, ] +4 0 3.2206 3.2206 [7, ] +4 0 3.3758 3.3758 [8, ] +4 0 3.5326 3.5326 [13, ] +4 0 3.6957 3.6957 [9, ] +4 0 3.9623 3.9623 [12, ] +4 0 4.337 4.337 [10, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git 
a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] From f198b20c4280b9414a8fd069c9efbd50b20767c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Oct 2022 12:32:33 +0200 Subject: [PATCH 52/58] Add facet deletion tests that use both the incremental and bulk methods + update deletion snapshots to the new database format --- milli/src/update/delete_documents.rs | 2 +- milli/src/update/facet/bulk.rs | 6 - milli/src/update/facet/delete.rs | 168 ++++++++++++++++-- milli/src/update/facet/mod.rs | 23 ++- .../1/facet_id_f64_docids.hash.snap | 4 + .../1/number_faceted_documents_ids.hash.snap | 4 + .../2/facet_id_f64_docids.hash.snap | 4 + .../2/number_faceted_documents_ids.hash.snap | 4 + .../false/facet_id_exists_docids.snap | 4 - .../true/facet_id_exists_docids.snap | 4 - .../false/facet_id_f64_docids.snap | 4 +- 
.../false/facet_id_string_docids.snap | 30 ++-- .../false/soft_deleted_documents_ids.snap | 2 +- .../true/facet_id_exists_docids.snap | 2 +- .../true/facet_id_f64_docids.snap | 3 +- .../true/facet_id_string_docids.snap | 27 ++- .../true/word_docids.snap | 5 +- .../false/facet_id_f64_docids.snap | 93 +++++----- .../true/facet_id_f64_docids.snap | 59 +++--- 19 files changed, 302 insertions(+), 146 deletions(-) create mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap create mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 432e1497f..6ff41ccbb 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -54,7 +54,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } #[cfg(test)] - fn disable_soft_deletion(&mut self, disable: bool) { + pub fn disable_soft_deletion(&mut self, disable: bool) { self.disable_soft_deletion = disable; } diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 4e10c22dd..ea0a7d3d7 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -4,9 +4,7 @@ use std::fs::File; use grenad::CompressionType; use heed::types::ByteSlice; use heed::{BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; use roaring::RoaringBitmap; -use time::OffsetDateTime; use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::facet::FacetType; @@ -71,8 +69,6 @@ impl<'i> FacetsUpdateBulk<'i> { #[logging_timer::time("FacetsUpdateBulk::{}")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; let db = match facet_type { @@ -84,8 +80,6 @@ impl<'i> FacetsUpdateBulk<'i> { } }; - index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs index 74c17e8f2..2bc54c7c1 100644 --- a/milli/src/update/facet/delete.rs +++ b/milli/src/update/facet/delete.rs @@ -1,15 +1,21 @@ -use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; -use crate::{ - facet::FacetType, - heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, - heed_codec::ByteSliceRefCodec, - update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}, - FieldId, Index, Result, -}; -use heed::RwTxn; -use roaring::RoaringBitmap; use std::collections::{HashMap, HashSet}; +use heed::RwTxn; +use log::debug; +use roaring::RoaringBitmap; +use time::OffsetDateTime; + +use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use 
crate::facet::FacetType; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}; +use crate::{FieldId, Index, Result}; + +/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases. +/// +/// Depending on the number of removed elements and the existing size of the database, we use either +/// a bulk delete method or an incremental delete method. pub struct FacetsDelete<'i, 'b> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, @@ -48,8 +54,18 @@ impl<'i, 'b> FacetsDelete<'i, 'b> { } pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + for (field_id, affected_facet_values) in self.affected_facet_values { - if affected_facet_values.len() >= (self.database.len(wtxn)? / 50) { + // This is an incorrect condition, since we assume that the length of the database is equal + // to the number of facet values for the given field_id. It means that in some cases, we might + // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could + // really be a performance problem is when we fully delete a large ratio of all facet values for + // each field id. This would almost never happen. Still, to be overly cautious, I have added a + // 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance + // penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead. + if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) { // Bulk delete let mut modified = false; @@ -91,3 +107,133 @@ impl<'i, 'b> FacetsDelete<'i, 'b> { Ok(()) } } + +#[cfg(test)] +mod tests { + use std::iter::FromIterator; + + use big_s::S; + use maplit::hashset; + use roaring::RoaringBitmap; + + use crate::db_snap; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + use crate::update::DeleteDocuments; + + #[test] + fn delete_mixed_incremental_and_bulk() { + // The point of this test is to create an index populated with documents + // containing different filterable attributes. Then, we delete a bunch of documents + // such that a mix of the incremental and bulk indexer is used (depending on the field id) + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_filterable_fields( + hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! 
{ + { + "id": i, + "label": i / 10, + "colour": i / 100, + "timestamp": i / 2, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, 1); + db_snap!(index, number_faceted_documents_ids, 1); + + let mut wtxn = index.env.write_txn().unwrap(); + + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_documents(&RoaringBitmap::from_iter(0..100)); + // by deleting the first 100 documents, we expect that: + // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) + // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 + // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 + // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 + // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + db_snap!(index, facet_id_f64_docids, 2); + db_snap!(index, number_faceted_documents_ids, 2); + } +} + +#[allow(unused)] +#[cfg(test)] +mod comparison_bench { + use std::iter::once; + + use rand::Rng; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + use crate::update::facet::tests::FacetIndex; + + // This is a simple test to get an intuition on the relative speed + // of the incremental vs. bulk indexer. + // + // The benchmark shows the worst-case scenario for the incremental indexer, since + // each facet value contains only one document ID. + // + // In that scenario, it appears that the incremental indexer is about 70 times slower than the + // bulk indexer. 
+ // #[test] + fn benchmark_facet_indexing_delete() { + let mut r = rand::thread_rng(); + + for i in 1..=20 { + let size = 50_000 * i; + let index = FacetIndex::::new(4, 8, 5); + + let mut txn = index.env.write_txn().unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..size { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + let timer = std::time::Instant::now(); + index.bulk_insert(&mut txn, &[0], elements.iter()); + let time_spent = timer.elapsed().as_millis(); + println!("bulk {size} : {time_spent}ms"); + + txn.commit().unwrap(); + + for nbr_doc in [1, 100, 1000, 10_000] { + let mut txn = index.env.write_txn().unwrap(); + let timer = std::time::Instant::now(); + // + // delete one document + // + for _ in 0..nbr_doc { + let deleted_u32 = r.gen::() % size; + let deleted_f64 = deleted_u32 as f64; + index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32) + } + let time_spent = timer.elapsed().as_millis(); + println!(" delete {nbr_doc} : {time_spent}ms"); + txn.abort().unwrap(); + } + } + } +} diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 5fb5c9e48..76e5514a1 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -78,6 +78,9 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5; use std::fs::File; +use log::debug; +use time::OffsetDateTime; + use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; @@ -89,6 +92,10 @@ pub mod bulk; pub mod delete; pub mod incremental; +/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. +/// +/// Depending on the number of new elements and the existing size of the database, we use either +/// a bulk update method or an incremental update method. pub struct FacetsUpdate<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, @@ -123,6 +130,10 @@ impl<'i> FacetsUpdate<'i> { if self.new_data.is_empty() { return Ok(()); } + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + + // See self::comparison_bench::benchmark_facet_indexing if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { let field_ids = self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); @@ -204,7 +215,7 @@ pub(crate) mod tests { let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17 let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 10 * 100); + let options = options.map_size(4096 * 4 * 10 * 1000); unsafe { options.flag(heed::flags::Flags::MdbAlwaysFreePages); } @@ -230,7 +241,7 @@ pub(crate) mod tests { let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 1000); + let options = options.map_size(4096 * 4 * 1000 * 100); let tempdir = tempfile::TempDir::new().unwrap(); let env = options.open(tempdir.path()).unwrap(); let content = env.create_database(None).unwrap(); @@ -440,12 +451,14 @@ mod comparison_bench { // This is a simple test to get an intuition on the relative speed // of the incremental vs. bulk indexer. 
- // It appears that the incremental indexer is about 50 times slower than the + // + // The benchmark shows the worst-case scenario for the incremental indexer, since + // each facet value contains only one document ID. + // + // In that scenario, it appears that the incremental indexer is about 50 times slower than the // bulk indexer. // #[test] fn benchmark_facet_indexing() { - // then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it - let mut facet_value = 0; let mut r = rand::thread_rng(); diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..fee486bab --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +550cd138d6fe31ccdd42cd5392fbd576 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap new file mode 100644 index 000000000..fcf957004 --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +9a0ea88e7c9dcf6dc0ef0b601736ffcf diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..29ceb250e --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +d4d5f14e7f1e1f09b86821a0b6defcc6 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap new file mode 100644 index 000000000..bbaf6d2a2 --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +3570e0ac0fdb21be9ebe433f59264b56 diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap index 565fadcb3..cfa649653 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap @@ -1,6 +1,6 @@ --- source: milli/src/update/delete_documents.rs --- -2 0 1.2 1.2 [20, 22, ] -2 0 2.2 2.2 [21, ] +2 0 1.2 1 [20, 22, ] +2 0 2.2 1 [21, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap index 019836089..8336bd712 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap @@ -1,19 +1,19 @@ --- source: milli/src/update/delete_documents.rs --- -1 abstract abstract [2, 6, 10, 13, 14, 15, 16, 17, ] -1 aquarium aquarium [5, ] -1 art art [4, 5, 8, 9, 10, 12, 17, ] -1 cartoon cartoon [2, 7, 15, 17, ] -1 colorfulness colorfulness [13, ] -1 design design [2, 18, ] -1 drawing drawing [3, 4, 5, 8, 10, 11, 16, ] -1 geometry geometry [19, ] -1 letter letter [1, ] -1 outdoor outdoor [4, ] -1 painting painting [3, ] -1 pattern pattern [2, 3, 9, 10, 13, 14, 16, ] -1 sign sign [0, ] -2 design design [21, ] -2 geometry geometry [20, 22, ] +1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] +1 0 aquarium 1 [5, ] +1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] +1 0 cartoon 1 [2, 7, 15, 17, ] +1 0 colorfulness 1 [13, ] +1 0 design 1 [2, 18, ] +1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] +1 0 geometry 1 [19, ] +1 0 letter 1 [1, ] +1 0 outdoor 1 [4, ] +1 0 painting 1 [3, ] +1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] +1 0 sign 1 [0, ] +2 0 design 1 [21, ] +2 0 geometry 1 [20, 22, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap index 1145cbd56..dfac98e59 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/delete_documents.rs --- -[0, ] +[0, 20, 22, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap index 7299bc214..7481b11c4 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap @@ -2,5 +2,5 @@ source: milli/src/update/delete_documents.rs --- 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] -2 [20, 21, 22, ] +2 [21, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap index 565fadcb3..87856f6dc 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap @@ -1,6 +1,5 @@ --- source: milli/src/update/delete_documents.rs --- -2 0 1.2 1.2 [20, 22, ] -2 0 2.2 2.2 [21, ] +2 0 2.2 1 [21, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap index 9f8541607..ab1d2175f 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap @@ -1,18 +1,17 @@ --- source: milli/src/update/delete_documents.rs --- -1 abstract abstract [2, 6, 10, 13, 14, 15, 16, 17, ] -1 aquarium aquarium [5, ] -1 art art [4, 5, 8, 9, 10, 12, 17, ] -1 cartoon cartoon [2, 7, 15, 17, ] -1 colorfulness colorfulness [13, ] -1 design design [2, 18, ] -1 drawing drawing [3, 4, 5, 8, 10, 11, 16, ] -1 geometry geometry [19, ] -1 letter letter [1, ] -1 outdoor outdoor [4, ] -1 painting painting [3, ] -1 pattern pattern [2, 3, 9, 10, 13, 14, 16, ] -2 design design [21, ] -2 geometry geometry [20, 22, ] +1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] +1 0 aquarium 1 [5, ] +1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] +1 0 cartoon 1 [2, 7, 15, 17, ] +1 0 colorfulness 1 [13, ] +1 0 design 1 [2, 18, ] +1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] +1 0 geometry 1 [19, ] +1 0 letter 1 [1, ] +1 0 outdoor 1 [4, ] +1 0 painting 1 [3, ] +1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] +2 0 design 1 [21, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap index c7e0c2d7a..d8125dfcf 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap +++ 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap @@ -1,7 +1,6 @@ --- source: milli/src/update/delete_documents.rs --- -1.2 [20, 22, ] 1_36 [3, ] 1_37 [4, ] 1_38 [5, ] @@ -21,9 +20,7 @@ source: milli/src/update/delete_documents.rs 1_68 [18, ] 1_69 [19, ] 1_7 [2, ] -1_70 [20, ] 1_71 [21, ] -1_72 [22, ] 2.2 [21, ] abstract [2, 6, 10, 13, 14, 15, 16, 17, ] aquarium [5, ] @@ -32,7 +29,7 @@ cartoon [2, 7, 15, 17, ] colorfulness [13, ] design [2, 18, 21, ] drawing [3, 4, 5, 8, 10, 11, 16, ] -geometry [19, 20, 22, ] +geometry [19, ] letter [1, ] outdoor [4, ] painting [3, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap index 4d3786e09..c909a3cd8 100644 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap @@ -1,48 +1,53 @@ --- source: milli/src/update/delete_documents.rs --- -3 0 48.9021 48.9021 [19, ] -3 0 49.4449 49.4449 [18, ] -3 0 49.9314 49.9314 [17, ] -3 0 50.1112 50.1112 [16, ] -3 0 50.1793 50.1793 [15, ] -3 0 50.2844 50.2844 [14, ] -3 0 50.3518 50.3518 [13, ] -3 0 50.4095 50.4095 [11, ] -3 0 50.4502 50.4502 [12, ] -3 0 50.6053 50.6053 [8, ] -3 0 50.6224 50.6224 [3, ] -3 0 50.6299 50.6299 [0, ] -3 0 50.6312 50.6312 [2, ] -3 0 50.6415 50.6415 [1, ] -3 0 50.6552 50.6552 [4, ] -3 0 50.6924 50.6924 [5, ] -3 0 50.7263 50.7263 [6, ] -3 0 50.7453 50.7453 [7, ] -3 0 50.8466 50.8466 [10, ] -3 0 51.0537 51.0537 [9, ] -3 1 48.9021 50.1112 [16, 17, 18, 19, ] -3 1 50.1793 50.4095 [11, 13, 14, 15, ] -3 1 50.4502 50.6299 [0, 3, 8, 12, ] -3 1 50.6312 50.6924 [1, 2, 4, 5, ] -3 1 50.7263 51.0537 [6, 7, 9, 10, ] -4 0 2.271 2.271 [17, ] -4 0 2.3708 2.3708 [19, ] -4 0 2.7637 2.7637 [14, ] -4 0 2.7913 2.7913 [18, ] -4 0 2.8547 2.8547 [16, ] -4 0 3.0569 3.0569 [0, ] -4 0 3.1106 3.1106 [1, 2, ] -4 0 3.1476 3.1476 [3, ] -4 0 3.1541 3.1541 [6, ] -4 0 3.1763 3.1763 [5, ] -4 0 3.1897 3.1897 [4, ] -4 0 3.2189 3.2189 [15, ] -4 0 3.2206 3.2206 [7, ] -4 0 3.3758 3.3758 [8, ] -4 0 3.5326 3.5326 [13, ] -4 0 3.6957 3.6957 [9, ] -4 0 3.9623 3.9623 [12, ] -4 0 4.337 4.337 [10, ] -4 0 4.4347 4.4347 [11, ] +3 0 48.9021 1 [19, ] +3 0 49.4449 1 [18, ] +3 0 49.9314 1 [17, ] +3 0 50.1112 1 [16, ] +3 0 50.1793 1 [15, ] +3 0 50.2844 1 [14, ] +3 0 50.3518 1 [13, ] +3 0 50.4095 1 [11, ] +3 0 50.4502 1 [12, ] +3 0 50.6053 1 [8, ] +3 0 50.6224 1 [3, ] +3 0 50.6299 1 [0, ] +3 0 50.6312 1 [2, ] +3 0 50.6415 1 [1, ] +3 0 50.6552 1 [4, ] +3 0 50.6924 1 [5, ] +3 0 50.7263 1 [6, ] +3 0 50.7453 1 [7, ] +3 0 50.8466 1 [10, ] +3 0 51.0537 1 [9, ] +3 1 48.9021 4 [16, 17, 18, 19, ] +3 1 50.1793 4 [11, 13, 14, 15, ] +3 1 50.4502 4 [0, 3, 8, 12, ] +3 1 50.6312 4 [1, 2, 4, 5, ] +3 1 50.7263 4 [6, 7, 9, 10, ] +4 0 2.271 1 [17, ] +4 0 2.3708 1 [19, ] +4 0 2.7637 1 [14, ] +4 0 2.7913 1 [18, ] +4 0 2.8547 1 [16, ] +4 0 3.0569 1 [0, ] +4 0 3.1106 1 [1, 2, ] +4 0 3.1476 1 [3, ] +4 0 3.1541 1 [6, ] +4 0 3.1763 1 [5, ] +4 0 3.1897 1 [4, ] +4 0 3.2189 1 [15, ] +4 0 3.2206 1 [7, ] +4 0 3.3758 1 [8, ] +4 0 3.5326 1 [13, ] +4 0 3.6957 1 [9, ] +4 0 3.9623 1 [12, ] +4 0 4.337 1 
[10, ] +4 0 4.4347 1 [11, ] +4 1 2.271 4 [14, 17, 18, 19, ] +4 1 2.8547 4 [0, 1, 2, 3, 16, ] +4 1 3.1541 4 [4, 5, 6, 15, ] +4 1 3.2206 4 [7, 8, 9, 13, ] +4 1 3.9623 3 [10, 11, 12, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap index d380cf29c..18a9d9309 100644 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap @@ -1,36 +1,31 @@ --- source: milli/src/update/delete_documents.rs --- -3 0 48.9021 48.9021 [19, ] -3 0 49.9314 49.9314 [17, ] -3 0 50.1793 50.1793 [15, ] -3 0 50.2844 50.2844 [14, ] -3 0 50.3518 50.3518 [13, ] -3 0 50.4502 50.4502 [12, ] -3 0 50.6053 50.6053 [8, ] -3 0 50.6224 50.6224 [3, ] -3 0 50.6299 50.6299 [0, ] -3 0 50.6312 50.6312 [2, ] -3 0 50.6415 50.6415 [1, ] -3 0 50.7453 50.7453 [7, ] -3 0 50.8466 50.8466 [10, ] -3 0 51.0537 51.0537 [9, ] -3 1 48.9021 50.1112 [17, 19, ] -3 1 50.1793 50.4095 [13, 14, 15, ] -3 1 50.4502 50.6299 [0, 3, 8, 12, ] -3 1 50.6312 50.6924 [1, 2, ] -3 1 50.7263 51.0537 [7, 9, 10, ] -4 0 2.271 2.271 [17, ] -4 0 2.3708 2.3708 [19, ] -4 0 2.7637 2.7637 [14, ] -4 0 3.0569 3.0569 [0, ] -4 0 3.1106 3.1106 [1, 2, ] -4 0 3.1476 3.1476 [3, ] -4 0 3.2189 3.2189 [15, ] -4 0 3.2206 3.2206 [7, ] -4 0 3.3758 3.3758 [8, ] -4 0 3.5326 3.5326 [13, ] -4 0 3.6957 3.6957 [9, ] -4 0 3.9623 3.9623 [12, ] -4 0 4.337 4.337 [10, ] +3 0 48.9021 1 [19, ] +3 0 49.9314 1 [17, ] +3 0 50.1793 1 [15, ] +3 0 50.2844 1 [14, ] +3 0 50.3518 1 [13, ] +3 0 50.4502 1 [12, ] +3 0 50.6053 1 [8, ] +3 0 50.6224 1 [3, ] +3 0 50.6299 1 [0, ] +3 0 50.6312 1 [2, ] +3 0 50.6415 1 [1, ] +3 0 50.7453 1 [7, ] +3 0 50.8466 1 [10, ] +3 0 51.0537 1 [9, ] +4 0 2.271 1 [17, ] +4 0 2.3708 1 [19, ] +4 0 2.7637 1 [14, ] +4 0 3.0569 1 [0, ] +4 0 3.1106 1 [1, 2, ] +4 0 3.1476 1 [3, ] +4 0 3.2189 1 [15, ] +4 0 3.2206 1 [7, ] +4 0 3.3758 1 [8, ] +4 0 3.5326 1 [13, ] +4 0 3.6957 1 [9, ] +4 0 3.9623 1 [12, ] +4 0 4.337 1 [10, ] From 206a3e00e5ca68075581c64fe8d4d50aaad8b695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Oct 2022 12:35:44 +0200 Subject: [PATCH 53/58] cargo fmt --- milli/src/heed_codec/facet/mod.rs | 3 +-- milli/src/snapshot_tests.rs | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index a727b148f..4609bfe7f 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -11,9 +11,8 @@ use roaring::RoaringBitmap; pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec; pub use self::ordered_f64_codec::OrderedF64Codec; -use crate::{CboRoaringBitmapCodec, BEU16}; - use super::StrRefCodec; +use crate::{CboRoaringBitmapCodec, BEU16}; pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec; pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec; diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 389d7b7a2..bcb9805ea 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -495,7 +495,7 @@ macro_rules! 
full_snap_of_db { ($index:ident, field_id_docid_facet_strings) => {{ $crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index) }}; - ($index:ident, facet_id_exists_docids) => {{ + ($index:ident, facet_id_exists_docids) => {{ $crate::snapshot_tests::snap_facet_id_exists_docids(&$index) }}; ($index:ident, documents_ids) => {{ From 14ca8048a8539d2b658f1d5ed0269ea19a980d28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 17 Oct 2022 12:42:12 +0200 Subject: [PATCH 54/58] Add some documentation on how to run the facet db fuzzer --- milli/src/update/facet/incremental.rs | 29 +++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index a4c756aec..2558c81a3 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1022,6 +1022,35 @@ mod tests { // fuzz tests #[cfg(all(test, fuzzing))] +/** +Fuzz test for the incremental indexer. + +The fuzz test uses fuzzcheck, a coverage-guided fuzzer. +See https://github.com/loiclec/fuzzcheck-rs and https://fuzzcheck.neocities.org +for more information. + +It is only run when using the `cargo fuzzcheck` command line tool, which can be installed with: +```sh +cargo install cargo-fuzzcheck +``` +To start the fuzz test, run (from the base folder or from milli/): +```sh +cargo fuzzcheck update::facet::incremental::fuzz::fuzz +``` +and wait a couple of minutes to make sure the code was thoroughly tested, then +hit `Ctrl-C` to stop the fuzzer. The corpus generated by the fuzzer is located in milli/fuzz. + +To work on this module with rust-analyzer working properly, add the following to your .cargo/config.toml file: +```toml +[build] +rustflags = ["--cfg", "fuzzing"] +``` + +The fuzz test generates sequences of additions and deletions to the facet database and +ensures that: +1. its structure is still internally valid +2. its content is the same as a trivially correct implementation of the same database +*/ mod fuzz { use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; From 3b1f908e5e80335d517c8eb1dd5e35952d45b49a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 17 Oct 2022 12:48:10 +0200 Subject: [PATCH 55/58] Revert behaviour of facet distribution to what it was before Where the docid that is used to get the original facet string value definitely belongs to the candidates --- milli/src/search/facet/facet_distribution_iter.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 0fdca4118..9cd85b667 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -74,13 +74,12 @@ where if key.field_id != self.field_id { return Ok(ControlFlow::Break(())); } - // TODO: use real intersection and then take min()? - let docids_in_common = value.bitmap.intersection_len(candidates); - if docids_in_common > 0 { - // TODO: use min() - let any_docid = value.bitmap.iter().next().unwrap(); - match (self.callback)(key.left_bound, docids_in_common, any_docid)? { - ControlFlow::Continue(_) => (), // TODO use unit instead of empty scope + let docids_in_common = value.bitmap & candidates; + if !docids_in_common.is_empty() { + let any_docid_in_common = docids_in_common.min().unwrap(); + match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
+ { + ControlFlow::Continue(_) => (), ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } } From b7f2428961198cfaee5f601d94925099723d070c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 26 Oct 2022 13:49:33 +0200 Subject: [PATCH 56/58] Fix formatting and warning after rebasing from main --- milli/src/heed_codec/mod.rs | 5 +++-- milli/src/update/index_documents/mod.rs | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 6a058f95f..702dcf661 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -9,6 +9,9 @@ mod str_beu32_codec; mod str_ref; mod str_str_u8_codec; +pub use byte_slice_ref::ByteSliceRefCodec; +pub use str_ref::StrRefCodec; + pub use self::beu32_str_codec::BEU32StrCodec; pub use self::field_id_word_count_codec::FieldIdWordCountCodec; pub use self::obkv_codec::ObkvCodec; @@ -18,5 +21,3 @@ pub use self::roaring_bitmap_length::{ }; pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; -pub use byte_slice_ref::ByteSliceRefCodec; -pub use str_ref::StrRefCodec; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7b02fd1af..10a831ddf 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -35,8 +35,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, FacetsUpdateBulk, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, - PrefixWordPairsProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, + self, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, + WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; From 631e9910da878ec410ecb215d853617c93524ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 26 Oct 2022 14:06:59 +0200 Subject: [PATCH 57/58] Depend on released version of fuzzcheck from crates.io --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b768476e3..52fdf2374 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -57,7 +57,7 @@ md5 = "0.7.0" rand = {version = "0.8.5", features = ["small_rng"] } [target.'cfg(fuzzing)'.dev-dependencies] -fuzzcheck = { git = "https://github.com/loiclec/fuzzcheck-rs", branch = "main" } # TODO: use released version +fuzzcheck = "0.12.1" [features] default = [ "charabia/default" ] From 2fa85a24ec700f4c6823e283fae20c8c79833a8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 26 Oct 2022 14:09:35 +0200 Subject: [PATCH 58/58] Remove outdated files from http-ui/ and infos/ ... that were reintroduced after a rebase --- http-ui/src/main.rs | 1 - infos/src/main.rs | 1 - 2 files changed, 2 deletions(-) delete mode 100644 http-ui/src/main.rs delete mode 100644 infos/src/main.rs diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs deleted file mode 100644 index 8b1378917..000000000 --- a/http-ui/src/main.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/infos/src/main.rs b/infos/src/main.rs deleted file mode 100644 index 8b1378917..000000000 --- a/infos/src/main.rs +++ /dev/null @@ -1 +0,0 @@ -