From 9026867d17744a0a95ea4086d0efff48ddd323af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 17:31:26 +0200 Subject: [PATCH] Give same interface to bulk and incremental facet indexing types + cargo fmt, oops, sorry for the bad history :( --- milli/src/heed_codec/facet/mod.rs | 14 ++- milli/src/index.rs | 8 +- milli/src/search/criteria/asc_desc.rs | 2 +- milli/src/search/distinct/facet_distinct.rs | 3 +- milli/src/search/facet/facet_distribution.rs | 8 +- .../search/facet/facet_distribution_iter.rs | 14 ++- milli/src/search/facet/facet_range_search.rs | 7 +- .../src/search/facet/facet_sort_ascending.rs | 4 +- .../src/search/facet/facet_sort_descending.rs | 7 +- milli/src/search/facet/filter.rs | 5 +- milli/src/search/facet/mod.rs | 59 ++++++--- milli/src/snapshot_tests.rs | 2 +- milli/src/update/clear_documents.rs | 3 +- milli/src/update/delete_documents.rs | 2 +- milli/src/update/facet/bulk.rs | 118 ++++++++++++++++-- milli/src/update/facet/incremental.rs | 117 ++++++++++++----- milli/src/update/facet/mod.rs | 68 +++++----- .../default/facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../default/facet_id_string_docids.hash.snap | 4 + .../facet_id_string_docids.hash.snap | 4 + .../extract/extract_facet_number_docids.rs | 6 +- .../extract/extract_facet_string_docids.rs | 3 +- .../index_documents/helpers/grenad_helpers.rs | 32 +---- .../src/update/index_documents/helpers/mod.rs | 4 +- milli/src/update/index_documents/mod.rs | 3 +- milli/src/update/mod.rs | 2 +- 27 files changed, 333 insertions(+), 174 deletions(-) create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 299aeceb4..40e395881 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -3,17 +3,19 @@ mod field_doc_id_facet_string_codec; mod ordered_f64_codec; mod str_ref; +use std::borrow::Cow; +use std::convert::TryFrom; +use std::marker::PhantomData; + +use heed::types::OwnedType; +use heed::{BytesDecode, BytesEncode}; +use roaring::RoaringBitmap; + pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; pub use self::ordered_f64_codec::OrderedF64Codec; pub use self::str_ref::StrRefCodec; use crate::{CboRoaringBitmapCodec, BEU16}; -use heed::types::OwnedType; -use heed::{BytesDecode, BytesEncode}; -use roaring::RoaringBitmap; -use std::borrow::Cow; -use std::convert::TryFrom; -use std::marker::PhantomData; pub type FieldIdCodec = OwnedType; diff --git a/milli/src/index.rs b/milli/src/index.rs index 66a53d98c..893817d59 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -14,10 +14,10 @@ use time::OffsetDateTime; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::StrRefCodec; -use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec}; -use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec}; +use crate::heed_codec::facet::{ + FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FieldIdCodec, OrderedF64Codec, StrRefCodec, +}; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 2908f0e78..bb2788cc8 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupKeyCodec, ByteSliceRef}; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; use crate::search::facet::facet_sort_descending::descending_facet_sort; diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index b9d584eb6..1725346be 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,8 +6,7 @@ use roaring::RoaringBitmap; use super::{Distinct, DocIter}; use crate::error::InternalError; -use crate::heed_codec::facet::FacetGroupKey; -use crate::heed_codec::facet::*; +use crate::heed_codec::facet::{FacetGroupKey, *}; use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 10b995d97..7c554d368 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -8,10 +8,10 @@ use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::StrRefCodec; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; -use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, + FieldDocIdFacetStringCodec, OrderedF64Codec, StrRefCodec, +}; use crate::search::facet::facet_distribution_iter; use crate::{FieldId, Index, Result}; diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 6eec64b25..2eebffbcd 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -112,17 +112,19 @@ where #[cfg(test)] mod tests { + use std::ops::ControlFlow; + + use heed::BytesDecode; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + use super::iterate_over_facet_distribution; use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::test::FacetIndex; - use heed::BytesDecode; - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - use std::ops::ControlFlow; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -133,7 +135,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index d9a6c5fd4..bb555e1ab 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -261,14 +261,13 @@ mod tests { use roaring::RoaringBitmap; use super::find_docids_of_facet_within_bounds; - use crate::heed_codec::facet::FacetGroupKeyCodec; - use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -279,7 +278,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index e620f9f1d..fc5fd3d04 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -93,7 +93,7 @@ mod tests { use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -104,7 +104,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 5425a5051..42bae42a6 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -119,15 +119,14 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::OrderedF64Codec; - use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; + use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -138,7 +137,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 1b40f6db1..15edafb03 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -9,8 +9,9 @@ use roaring::RoaringBitmap; use super::facet_range_search; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, +}; use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; /// The maximum number of filters the filter AST can process. diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ec5caa2a8..ef72658ec 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -3,7 +3,7 @@ use heed::{BytesDecode, RoTxn}; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::Filter; -use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; mod facet_distribution; mod facet_distribution_iter; @@ -27,8 +27,8 @@ where db.as_polymorph().prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; if let Some(first) = level0_iter_forward.next() { let (first_key, _) = first?; - let first_key = - FacetGroupKeyCodec::::bytes_decode(first_key).ok_or(heed::Error::Encoding)?; + let first_key = FacetGroupKeyCodec::::bytes_decode(first_key) + .ok_or(heed::Error::Encoding)?; Ok(Some(first_key.left_bound)) } else { Ok(None) @@ -50,8 +50,8 @@ where .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; if let Some(last) = level0_iter_backward.next() { let (last_key, _) = last?; - let last_key = - FacetGroupKeyCodec::::bytes_decode(last_key).ok_or(heed::Error::Encoding)?; + let last_key = FacetGroupKeyCodec::::bytes_decode(last_key) + .ok_or(heed::Error::Encoding)?; Ok(Some(last_key.left_bound)) } else { Ok(None) @@ -85,11 +85,12 @@ pub mod test { use roaring::RoaringBitmap; use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::snapshot_tests::display_bitmap; - use crate::update::FacetsUpdateIncremental; + use crate::update::FacetsUpdateIncrementalInner; + // A dummy index that only contains the facet database, used for testing pub struct FacetIndex where for<'a> BoundCodec: @@ -100,10 +101,12 @@ pub mod test { _phantom: PhantomData, } + // The faecet database and its settings pub struct Database { pub content: heed::Database, FacetGroupValueCodec>, - pub group_size: usize, - pub max_group_size: usize, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, _tempdir: Rc, } @@ -117,9 +120,12 @@ pub mod test { tempdir: Rc, group_size: u8, max_group_size: u8, + min_level_size: u8, ) -> FacetIndex { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)) as usize; - let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 10 * 100); unsafe { @@ -129,14 +135,25 @@ pub mod test { let content = env.open_database(None).unwrap().unwrap(); FacetIndex { - db: Database { content, group_size, max_group_size, _tempdir: tempdir }, + db: Database { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: tempdir, + }, env, _phantom: PhantomData, } } - pub fn new(group_size: u8, max_group_size: u8) -> FacetIndex { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)) as usize; - let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); + pub fn new( + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 100); let tempdir = tempfile::TempDir::new().unwrap(); @@ -144,7 +161,13 @@ pub mod test { let content = env.create_database(None).unwrap(); FacetIndex { - db: Database { content, group_size, max_group_size, _tempdir: Rc::new(tempdir) }, + db: Database { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: Rc::new(tempdir), + }, env, _phantom: PhantomData, } @@ -156,7 +179,7 @@ pub mod test { key: &'a >::EItem, docids: &RoaringBitmap, ) { - let update = FacetsUpdateIncremental::new(self.db.content); + let update = FacetsUpdateIncrementalInner::new(self.db.content); let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.insert(rwtxn, field_id, &key_bytes, docids).unwrap(); } @@ -167,7 +190,7 @@ pub mod test { key: &'a >::EItem, value: u32, ) { - let update = FacetsUpdateIncremental::new(self.db.content); + let update = FacetsUpdateIncrementalInner::new(self.db.content); let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.delete(rwtxn, field_id, &key_bytes, value).unwrap(); } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index ab9dddaf2..9bc39d882 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -5,7 +5,7 @@ use std::path::Path; use roaring::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupValue, FacetGroupKey}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; #[track_caller] diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 7d89ca89a..adeea11fa 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,7 +1,8 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::{facet::FacetType, ExternalDocumentsIds, FieldDistribution, Index, Result}; +use crate::facet::FacetType; +use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 5b9e99d77..14ef5fd6a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -11,7 +11,7 @@ use time::OffsetDateTime; use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 70392b7db..ad97ed2de 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,18 +1,20 @@ +use std::borrow::Cow; +use std::cmp; +use std::fs::File; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use log::debug; +use roaring::RoaringBitmap; +use time::OffsetDateTime; + use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; -use roaring::RoaringBitmap; -use std::borrow::Cow; -use std::cmp; -use std::fs::File; -use time::OffsetDateTime; pub struct FacetsUpdateBulk<'i> { index: &'i Index, @@ -367,9 +369,7 @@ mod tests { documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); } let documents = documents_batch_reader_from_objects(documents); - dbg!(); index.add_documents(documents).unwrap(); - dbg!(); db_snap!(index, facet_id_f64_docids, name); }; @@ -421,4 +421,100 @@ mod tests { test("default", None, None); test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); } + + #[test] + fn test_facets_number_incremental_update() { + let test = + |name: &str, group_size: Option, min_level_size: Option| { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB + index.index_documents_config.autogenerate_docids = true; + index.index_documents_config.facet_level_group_size = group_size; + index.index_documents_config.facet_min_level_size = min_level_size; + + index + .update_settings(|settings| { + settings.set_filterable_fields( + IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) + .collect(), + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 0..100 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + let mut documents = vec![]; + for i in 1000..1010 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 100..110 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + db_snap!(index, facet_id_f64_docids, name); + }; + + test("default", None, None); + test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + } + + #[test] + fn test_facets_number_delete_facet_id_then_bulk_update() { + let test = + |name: &str, group_size: Option, min_level_size: Option| { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB + index.index_documents_config.autogenerate_docids = true; + index.index_documents_config.facet_level_group_size = group_size; + index.index_documents_config.facet_min_level_size = min_level_size; + + index + .update_settings(|settings| { + settings.set_filterable_fields( + IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) + .collect(), + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 0..100 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + // 1100 facets -> how long is the DB? + + let mut documents = vec![]; + for i in 1000..1010 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 100..110 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + db_snap!(index, facet_id_f64_docids, name); + }; + + test("default", None, None); + test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + } } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index bcde3bc53..75ca5d55b 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,12 +1,16 @@ +use std::collections::HashMap; +use std::fs::File; + use heed::types::ByteSlice; use heed::{BytesDecode, Error, RoTxn, RwTxn}; use roaring::RoaringBitmap; +use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::search::facet::get_highest_level; -use crate::Result; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; enum InsertionResult { InPlace, @@ -18,30 +22,79 @@ enum DeletionResult { Remove { prev: Option>, next: Option> }, } -pub struct FacetsUpdateIncremental { +pub struct FacetsUpdateIncremental<'i> { + index: &'i Index, + inner: FacetsUpdateIncrementalInner, + facet_type: FacetType, + new_data: grenad::Reader, +} + +impl<'i> FacetsUpdateIncremental<'i> { + pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { + FacetsUpdateIncremental { + index, + inner: FacetsUpdateIncrementalInner { + db: match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => index + .facet_id_f64_docids + .remap_key_type::>(), + }, + group_size: 4, + max_group_size: 8, + min_level_size: 5, + }, + facet_type, + new_data, + } + } + pub fn group_size(mut self, size: u8) -> Self { + self.inner.group_size = size; + self + } + pub fn min_level_size(mut self, size: u8) -> Self { + self.inner.min_level_size = size; + self + } + pub fn max_group_size(mut self, size: u8) -> Self { + self.inner.max_group_size = size; + self + } + pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { + let mut new_faceted_docids = HashMap::::default(); + + let mut cursor = self.new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let key = FacetGroupKeyCodec::::bytes_decode(key) + .ok_or(heed::Error::Encoding)?; + let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + + for (field_id, new_docids) in new_faceted_docids { + let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; + docids |= new_docids; + self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; + } + Ok(()) + } +} + +pub struct FacetsUpdateIncrementalInner { db: heed::Database, FacetGroupValueCodec>, group_size: u8, min_level_size: u8, max_group_size: u8, } -impl FacetsUpdateIncremental { +impl FacetsUpdateIncrementalInner { pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } } - pub fn group_size(mut self, size: u8) -> Self { - self.group_size = size; - self - } - pub fn min_level_size(mut self, size: u8) -> Self { - self.min_level_size = size; - self - } - pub fn max_group_size(mut self, size: u8) -> Self { - self.max_group_size = size; - self - } } -impl FacetsUpdateIncremental { +impl FacetsUpdateIncrementalInner { fn find_insertion_key_value( &self, field_id: u16, @@ -481,9 +534,9 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::OrderedF64Codec; - use crate::heed_codec::facet::StrRefCodec; - use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; + use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, StrRefCodec, + }; use crate::milli_snap; use crate::search::facet::get_highest_level; use crate::search::facet::test::FacetIndex; @@ -534,7 +587,7 @@ mod tests { FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() }; - assert!(value.size > 0 && (value.size as usize) < db.max_group_size); + assert!(value.size > 0 && value.size < db.max_group_size); let mut actual_size = 0; let mut values_below = RoaringBitmap::new(); @@ -553,7 +606,7 @@ mod tests { } #[test] fn append() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); @@ -566,7 +619,7 @@ mod tests { } #[test] fn many_field_ids_append() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); @@ -595,7 +648,7 @@ mod tests { } #[test] fn many_field_ids_prepend() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in (0..256).into_iter().rev() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); @@ -625,7 +678,7 @@ mod tests { #[test] fn prepend() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in (0..256).into_iter().rev() { @@ -640,7 +693,7 @@ mod tests { #[test] fn shuffled() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut keys = (0..256).into_iter().collect::>(); @@ -659,7 +712,7 @@ mod tests { #[test] fn merge_values() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut keys = (0..256).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -680,7 +733,7 @@ mod tests { #[test] fn delete_from_end() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); @@ -745,7 +798,7 @@ mod tests { #[test] fn delete_from_start() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); @@ -783,7 +836,7 @@ mod tests { #[test] fn delete_shuffled() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); @@ -829,7 +882,7 @@ mod tests { #[test] fn in_place_level0_insert() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut keys = (0..16).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); @@ -849,7 +902,7 @@ mod tests { #[test] fn in_place_level0_delete() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut keys = (0..64).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -879,7 +932,7 @@ mod tests { #[test] fn shuffle_merge_string_and_delete() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut keys = (1000..1064).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 04810cb48..3b46bb421 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -1,12 +1,9 @@ -use super::{FacetsUpdateBulk, FacetsUpdateIncremental}; -use crate::{ - facet::FacetType, - heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}, - CboRoaringBitmapCodec, FieldId, Index, Result, -}; -use heed::BytesDecode; -use roaring::RoaringBitmap; -use std::{collections::HashMap, fs::File}; +use self::incremental::FacetsUpdateIncremental; +use super::FacetsUpdateBulk; +use crate::facet::FacetType; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::{Index, Result}; +use std::fs::File; pub mod bulk; pub mod incremental; @@ -14,11 +11,13 @@ pub mod incremental; pub struct FacetsUpdate<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, + facet_type: FacetType, + new_data: grenad::Reader, + // Options: + // there's no way to change these for now level_group_size: u8, max_level_group_size: u8, min_level_size: u8, - facet_type: FacetType, - new_data: grenad::Reader, } impl<'i> FacetsUpdate<'i> { pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { @@ -42,36 +41,37 @@ impl<'i> FacetsUpdate<'i> { } pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + if self.new_data.is_empty() { + return Ok(()); + } // here, come up with a better condition! - if self.database.is_empty(wtxn)? { + // ideally we'd choose which method to use for each field id individually + // but I dont' think it's worth the effort yet + // As a first requirement, we ask that the length of the new data is less + // than a 1/50th of the length of the database in order to use the incremental + // method. + if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data) .level_group_size(self.level_group_size) .min_level_size(self.min_level_size); bulk_update.execute(wtxn)?; } else { - let indexer = FacetsUpdateIncremental::new(self.database) - .max_group_size(self.max_level_group_size) - .min_level_size(self.min_level_size); - - let mut new_faceted_docids = HashMap::::default(); - - let mut cursor = self.new_data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let key = FacetGroupKeyCodec::::bytes_decode(key) - .ok_or(heed::Error::Encoding)?; - let docids = - CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; - *new_faceted_docids.entry(key.field_id).or_default() |= docids; - } - - for (field_id, new_docids) in new_faceted_docids { - let mut docids = - self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; - docids |= new_docids; - self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; - } + let incremental_update = + FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data) + .group_size(self.level_group_size) + .max_group_size(self.max_level_group_size) + .min_level_size(self.min_level_size); + incremental_update.execute(wtxn)?; } Ok(()) } } + +#[cfg(test)] +mod tests { + // here I want to create a benchmark + // to find out at which point it is faster to do it incrementally + + #[test] + fn update() {} +} diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..c2b3896c4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..c2b3896c4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..c9f8951ac --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..c9f8951ac --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 9a89691b1..1d415166d 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,9 +6,9 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; -use crate::heed_codec::facet::FieldDocIdFacetF64Codec; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, +}; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 078a82335..e6a41067b 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -4,8 +4,7 @@ use std::io; use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; -use crate::heed_codec::facet::StrRefCodec; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, StrRefCodec}; use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{FieldId, Result}; diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index e18cb4e16..03f15945a 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::time::Instant; -use grenad::{CompressionType, Reader, Sorter}; +use grenad::{CompressionType, Sorter}; use heed::types::ByteSlice; use log::debug; @@ -208,36 +208,6 @@ pub fn grenad_obkv_into_chunks( Ok(std::iter::from_fn(move || transposer().transpose())) } -pub fn write_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - reader: Reader, - merge: MergeFn, -) -> Result<()> { - debug!("Writing MTBL stores..."); - let before = Instant::now(); - - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; - let val = merge(k, vals)?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - } - - debug!("MTBL stores merged in {:.02?}!", before.elapsed()); - Ok(()) -} - pub fn sorter_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 7e2ebd2d3..8fb629cae 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -9,8 +9,8 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, - merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, - writer_into_reader, GrenadParameters, MergeableReader, + merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader, + GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2a2511362..96bea9589 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -27,8 +27,7 @@ pub use self::enrich::{ pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, - ClonableMmap, MergeFn, + sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 8fba16d3d..b13118e09 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -2,7 +2,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facet::bulk::FacetsUpdateBulk; -pub use self::facet::incremental::FacetsUpdateIncremental; +pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, };