diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index f1ac07deb..fd5949fd9 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -12,6 +12,7 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::{BEU16StrCodec, StrRefCodec}; +use crate::localized_attributes_rules::LocalizedFieldIds; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::{ merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, @@ -28,6 +29,116 @@ pub fn extract_facet_string_docids( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, settings_diff: &InnerIndexSettingsDiff, +) -> Result<(grenad::Reader>, grenad::Reader>)> { + if settings_diff.settings_update_only() { + extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff) + } else { + let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids; + extract_facet_string_docids_document_update( + docid_fid_facet_string, + indexer, + localized_field_ids, + ) + } +} + +/// Extracts the facet string and the documents ids where this facet string appear. +/// +/// Returns a grenad reader with the list of extracted facet strings and +/// documents ids from the given chunk of docid facet string positions. 
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
+// Document-update variant: each facet string is normalized once, with the locales that
+// `localized_field_ids` reports for its field.
+//
+// The returned tuple is `(facet string docids reader, normalized facet string reader)`.
+//
+// NOTE(review): the generic parameter `R` and the `BufReader<File>` reader types were restored
+// after being stripped from this patch; confirm that `io`, `BufReader` and `File` are imported
+// at the top of the file.
+fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
+    docid_fid_facet_string: grenad::Reader<R>,
+    indexer: GrenadParameters,
+    localized_field_ids: &LocalizedFieldIds,
+) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    // Two sorters are built, so each one is given half of the per-thread memory budget.
+    let mut facet_string_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Stable,
+        merge_deladd_cbo_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|m| m / 2),
+    );
+
+    let mut normalized_facet_string_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Stable,
+        merge_deladd_btreeset_string,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|m| m / 2),
+    );
+
+    // Scratch buffer reused for every del/add obkv written below.
+    let mut buffer = Vec::new();
+    let mut cursor = docid_fid_facet_string.into_cursor()?;
+    while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
+        let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
+
+        // An entry carrying both a deletion and an addition is treated as unchanged and skipped.
+        let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
+            && deladd_reader.get(DelAdd::Addition).is_some();
+
+        if is_same_value {
+            continue;
+        }
+
+        // Key layout: field id (big-endian) | document id (4 bytes, big-endian) | UTF-8 value.
+        // NOTE(review): the `unwrap`s presumably panic on a key shorter than that; verify that
+        // the producer of this reader always writes the full prefix.
+        let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
+        let field_id = FieldId::from_be_bytes(field_id_bytes);
+
+        let (document_id_bytes, normalized_value_bytes) =
+            try_split_array_at::<_, 4>(bytes).unwrap();
+        let document_id = u32::from_be_bytes(document_id_bytes);
+
+        let normalized_value = str::from_utf8(normalized_value_bytes)?;
+
+        // Facet search normalization
+        {
+            let locales = localized_field_ids.locales(field_id);
+            let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
+
+            let set = BTreeSet::from_iter(std::iter::once(normalized_value));
+
+            // entries holding both sides were skipped above, so this mirrors whichever side is present.
+            buffer.clear();
+            let mut obkv = KvWriterDelAdd::new(&mut buffer);
+            for (deladd_key, _) in deladd_reader.iter() {
+                let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
+                obkv.insert(deladd_key, val)?;
+            }
+            obkv.finish()?;
+
+            let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
+            let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
+            normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
+        }
+
+        let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
+        // Report an encoding failure as an error, like the other `bytes_encode` calls above, instead of panicking.
+        let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).map_err(heed::Error::Encoding)?;
+
+        buffer.clear();
+        let mut obkv = KvWriterDelAdd::new(&mut buffer);
+        for (deladd_key, _) in deladd_reader.iter() {
+            // NOTE(review): native-endian on purpose? presumably the layout the docids merge function expects; confirm.
+            obkv.insert(deladd_key, document_id.to_ne_bytes())?;
+        }
+        obkv.finish()?;
+        facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
+    }
+
+    let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
+    sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
+}
+
+/// Extracts the facet string and the documents ids where this facet string appear.
+///
+/// Returns a grenad reader with the list of extracted facet strings and
+/// documents ids from the given chunk of docid facet string positions.
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] +fn extract_facet_string_docids_settings( + docid_fid_facet_string: grenad::Reader, + indexer: GrenadParameters, + settings_diff: &InnerIndexSettingsDiff, ) -> Result<(grenad::Reader>, grenad::Reader>)> { let max_memory = indexer.max_memory_by_thread(); @@ -60,6 +171,15 @@ pub fn extract_facet_string_docids( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); + let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); + let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); + + let are_same_locales = old_locales == new_locales; + + if is_same_value && are_same_locales { + continue; + } + let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); @@ -68,23 +188,17 @@ pub fn extract_facet_string_docids( // Facet search normalization { - let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); - let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); - - if is_same_value && old_locales == new_locales { - // optimization: skip costly normalizations if the values and locales stayed the same - // TODO: splitting the cases between a settings diff and a document update would possibly allow for more optimizations, - // such as skipping the locales check when doing a documents update. 
- continue; - } - let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); - let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales); + let new_hyper_normalized_value = if are_same_locales { + &old_hyper_normalized_value + } else { + &normalize_facet_string(normalized_value, new_locales) + }; let set = BTreeSet::from_iter(std::iter::once(normalized_value)); // if the facet string is the same, we can put the deletion and addition in the same obkv. - if old_hyper_normalized_value == new_hyper_normalized_value { + if old_hyper_normalized_value == new_hyper_normalized_value.as_str() { // nothing to do if we delete and re-add the value. if is_same_value { continue;