From bee3c23b45c0a1a9212ba7269c82a4eca5ad6e45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 6 Sep 2022 13:39:08 +0200 Subject: [PATCH] Add comparison benchmark between bulk and incremental facet indexing --- milli/src/update/facet/bulk.rs | 2 - milli/src/update/facet/mod.rs | 85 ++++++++++++++++++- .../src/update/index_documents/typed_chunk.rs | 2 + 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 321ae52d4..90e287f23 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -291,8 +291,6 @@ impl FacetsUpdateBulkInner { field_id, level - 1, &mut |sub_bitmaps, left_bound| { - // TODO: is this done unnecessarily for all 32 levels? - println!("level: {level}"); let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 7298fecc5..caf88671e 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -13,8 +13,6 @@ pub struct FacetsUpdate<'i> { database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, new_data: grenad::Reader, - // Options: - // there's no way to change these for now level_group_size: u8, max_level_group_size: u8, min_level_size: u8, @@ -40,6 +38,28 @@ impl<'i> FacetsUpdate<'i> { } } + // TODO: use the options below? + // but I don't actually see why they should be configurable + // /// The minimum number of elements that a level is allowed to have. + // pub fn level_max_group_size(mut self, value: u8) -> Self { + // self.max_level_group_size = std::cmp::max(value, 4); + // self + // } + + // /// The number of elements from the level below that are represented by a single element in the level above + // /// + // /// This setting is always greater than or equal to 2. + // pub fn level_group_size(mut self, value: u8) -> Self { + // self.level_group_size = std::cmp::max(value, 2); + // self + // } + + // /// The minimum number of elements that a level is allowed to have. + // pub fn min_level_size(mut self, value: u8) -> Self { + // self.min_level_size = std::cmp::max(value, 2); + // self + // } + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { if self.new_data.is_empty() { return Ok(()); @@ -144,7 +164,7 @@ pub(crate) mod tests { let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 100); + let options = options.map_size(4096 * 4 * 1000); let tempdir = tempfile::TempDir::new().unwrap(); let env = options.open(tempdir.path()).unwrap(); let content = env.create_database(None).unwrap(); @@ -309,3 +329,62 @@ pub(crate) mod tests { } } } + +#[allow(unused)] +#[cfg(test)] +mod comparison_bench { + use std::iter::once; + + use rand::Rng; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + + use super::tests::FacetIndex; + + // This is a simple test to get an intuition on the relative speed + // of the incremental vs. bulk indexer. + // It appears that the incremental indexer is about 50 times slower than the + // bulk indexer. + #[test] + fn benchmark_facet_indexing() { + // then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it + + let mut facet_value = 0; + + let mut r = rand::thread_rng(); + + for i in 1..=20 { + let size = 50_000 * i; + let index = FacetIndex::::new(4, 8, 5); + + let mut txn = index.env.write_txn().unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..size { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, facet_value as f64), once(i).collect())); + facet_value += 1; + } + let timer = std::time::Instant::now(); + index.bulk_insert(&mut txn, &[0], elements.iter()); + let time_spent = timer.elapsed().as_millis(); + println!("bulk {size} : {time_spent}ms"); + + txn.commit().unwrap(); + + for nbr_doc in [1, 100, 1000, 10_000] { + let mut txn = index.env.write_txn().unwrap(); + let timer = std::time::Instant::now(); + // + // insert one document + // + for _ in 0..nbr_doc { + index.insert(&mut txn, 0, &r.gen(), &once(1).collect()); + } + let time_spent = timer.elapsed().as_millis(); + println!(" add {nbr_doc} : {time_spent}ms"); + txn.abort().unwrap(); + } + } + } +} diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 16784bd92..f11414f20 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -138,11 +138,13 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { + // TODO indexer options for the facet level database let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?; is_merged_database = true; } TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => { + // TODO indexer options for the facet level database let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter); indexer.execute(wtxn)?; is_merged_database = true;