From 8c86348119a431cdffa784f8769b3448c77e00de Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 17:20:04 +0200 Subject: [PATCH] Indexing the facet strings levels --- milli/src/search/facet/facet_string.rs | 9 +- milli/src/update/facets.rs | 152 ++++++++++++++++++++++--- 2 files changed, 142 insertions(+), 19 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 559bd41b6..509bb4f0c 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -210,7 +210,6 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { /// It yields the facet string and the roaring bitmap associated with it. pub struct FacetStringLevelZeroRange<'t> { iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, - field_id: FieldId, } impl<'t> FacetStringLevelZeroRange<'t> { @@ -228,19 +227,19 @@ impl<'t> FacetStringLevelZeroRange<'t> { ) -> Bound<&'a [u8]> { match bound { Included(value) => { - buffer.push(field_id); + buffer.extend_from_slice(&field_id.to_be_bytes()); buffer.push(0); buffer.extend_from_slice(value.as_bytes()); Included(&buffer[..]) } Excluded(value) => { - buffer.push(field_id); + buffer.extend_from_slice(&field_id.to_be_bytes()); buffer.push(0); buffer.extend_from_slice(value.as_bytes()); Excluded(&buffer[..]) } Unbounded => { - buffer.push(field_id); + buffer.extend_from_slice(&field_id.to_be_bytes()); buffer.push(1); // we must only get the level 0 Excluded(&buffer[..]) } @@ -257,7 +256,7 @@ impl<'t> FacetStringLevelZeroRange<'t> { .range(rtxn, &(left_bound, right_bound))? .remap_types::(); - Ok(FacetStringLevelZeroRange { iter, field_id }) + Ok(FacetStringLevelZeroRange { iter }) } } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 5fabbc504..d3bba6d6e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,6 +1,6 @@ -use std::cmp; use std::fs::File; -use std::num::NonZeroUsize; +use std::num::{NonZeroU8, NonZeroUsize}; +use std::{cmp, mem}; use chrono::Utc; use grenad::{CompressionType, FileFuse, Reader, Writer}; @@ -10,7 +10,10 @@ use log::debug; use roaring::RoaringBitmap; use crate::error::InternalError; -use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, + FacetStringZeroBoundsValueCodec, +}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{ create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, @@ -64,6 +67,13 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); for field_id in faceted_fields { + // Clear the facet string levels. + clear_field_string_levels( + self.wtxn, + self.index.facet_id_string_docids.remap_types::(), + field_id, + )?; + // Compute and store the faceted strings documents ids. let string_documents_ids = compute_faceted_documents_ids( self.wtxn, @@ -71,6 +81,17 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; + let facet_string_levels = compute_facet_string_levels( + self.wtxn, + self.index.facet_id_string_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + field_id, + )?; + // Clear the facet number levels. clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; @@ -81,7 +102,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - let content = compute_facet_number_levels( + let facet_number_levels = compute_facet_number_levels( self.wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, @@ -106,8 +127,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { write_into_lmdb_database( self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), - content, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number level" }), + facet_number_levels, + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" }), + WriteMethod::GetMergePut, + )?; + + write_into_lmdb_database( + self.wtxn, + *self.index.facet_id_string_docids.as_polymorph(), + facet_string_levels, + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" }), WriteMethod::GetMergePut, )?; } @@ -193,6 +222,21 @@ fn compute_facet_number_levels<'t>( writer_into_reader(writer, shrink_size) } +fn write_number_entry( + writer: &mut Writer, + field_id: FieldId, + level: u8, + left: f64, + right: f64, + ids: &RoaringBitmap, +) -> Result<()> { + let key = (field_id, level, left, right); + let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) +} + fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, @@ -208,17 +252,97 @@ fn compute_faceted_documents_ids( Ok(documents_ids) } -fn write_number_entry( +fn clear_field_string_levels<'t>( + wtxn: &'t mut heed::RwTxn, + db: heed::Database, + field_id: FieldId, +) -> heed::Result<()> { + let left = (field_id, NonZeroU8::new(1).unwrap(), u32::MIN, u32::MIN); + let right = (field_id, NonZeroU8::new(u8::MAX).unwrap(), u32::MAX, u32::MAX); + let range = left..=right; + db.remap_key_type::().delete_range(wtxn, &range).map(drop) +} + +fn compute_facet_string_levels<'t>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + shrink_size: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + field_id: FieldId, +) -> Result> { + let first_level_size = db + .remap_key_type::() + .prefix_iter(rtxn, &field_id.to_be_bytes())? + .remap_types::() + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // therefore we write the facet levels entries into a grenad file before transfering them. + let mut writer = tempfile::tempfile() + .and_then(|file| create_writer(compression_type, compression_level, file))?; + + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) + .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + + for (level, group_size) in group_size_iter { + let level = NonZeroU8::new(level).unwrap(); + let mut left = (0, ""); + let mut right = (0, ""); + let mut group_docids = RoaringBitmap::new(); + + // Because we know the size of the level 0 we can use a range iterator that starts + // at the first value of the level and goes to the last by simply counting. + for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() { + let ((_field_id, value), docids) = result?; + + if i == 0 { + left = (i as u32, value); + } else if i % group_size == 0 { + // we found the first bound of the next group, we must store the left + // and right bounds associated with the docids. We also reset the docids. + let docids = mem::take(&mut group_docids); + write_string_entry(&mut writer, field_id, level, left, right, docids)?; + + // We save the left bound for the new group. + left = (i as u32, value); + } + + // The right bound is always the bound we run through. + group_docids |= docids; + right = (i as u32, value); + } + + if !group_docids.is_empty() { + let docids = mem::take(&mut group_docids); + write_string_entry(&mut writer, field_id, level, left, right, docids)?; + } + } + + writer_into_reader(writer, shrink_size) +} + +fn write_string_entry( writer: &mut Writer, field_id: FieldId, - level: u8, - left: f64, - right: f64, - ids: &RoaringBitmap, + level: NonZeroU8, + (left_id, left_value): (u32, &str), + (right_id, right_value): (u32, &str), + docids: RoaringBitmap, ) -> Result<()> { - let key = (field_id, level, left, right); - let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + let key = (field_id, level, left_id, right_id); + let key = FacetLevelValueU32Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = match level.get() { + 1 => (Some((left_value, right_value)), docids), + _ => (None, docids), + }; + let data = FacetStringZeroBoundsValueCodec::::bytes_encode(&data) + .ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) }