From 8acd3f50bb224d2804616db975fd5b55f66ed5da Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 31 Jul 2024 09:53:00 +0200 Subject: [PATCH 1/5] skip normalization when the locales and values are the same --- .../extract/extract_facet_string_docids.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 6452a67a1..f1ac07deb 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -68,10 +68,18 @@ pub fn extract_facet_string_docids( // Facet search normalization { - let locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); - let old_hyper_normalized_value = normalize_facet_string(normalized_value, locales); - let locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); - let new_hyper_normalized_value = normalize_facet_string(normalized_value, locales); + let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); + let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); + + if is_same_value && old_locales == new_locales { + // optimization: skip costly normalizations if the values and locales stayed the same + // TODO: splitting the cases between a settings diff and a document update would possibly allow for more optimizations, + // such as skipping the locales check when doing a documents update. + continue; + } + + let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); + let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales); let set = BTreeSet::from_iter(std::iter::once(normalized_value)); From 7c3fc8c6555df60199026fa6873caf705320909b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 31 Jul 2024 10:57:46 +0200 Subject: [PATCH 2/5] Split settings and document facet string extractions --- .../extract/extract_facet_string_docids.rs | 138 ++++++++++++++++-- 1 file changed, 126 insertions(+), 12 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index f1ac07deb..fd5949fd9 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -12,6 +12,7 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::{BEU16StrCodec, StrRefCodec}; +use crate::localized_attributes_rules::LocalizedFieldIds; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::{ merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, @@ -28,6 +29,116 @@ pub fn extract_facet_string_docids( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, settings_diff: &InnerIndexSettingsDiff, +) -> Result<(grenad::Reader>, grenad::Reader>)> { + if settings_diff.settings_update_only() { + extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff) + } else { + let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids; + extract_facet_string_docids_document_update( + docid_fid_facet_string, + indexer, + localized_field_ids, + ) + } +} + +/// Extracts the facet string and the documents ids where this facet string appear. +/// +/// Returns a grenad reader with the list of extracted facet strings and +/// documents ids from the given chunk of docid facet string positions. +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] +fn extract_facet_string_docids_document_update( + docid_fid_facet_string: grenad::Reader, + indexer: GrenadParameters, + localized_field_ids: &LocalizedFieldIds, +) -> Result<(grenad::Reader>, grenad::Reader>)> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_string_docids_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_deladd_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), + ); + + let mut normalized_facet_string_docids_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_deladd_btreeset_string, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), + ); + + let mut buffer = Vec::new(); + let mut cursor = docid_fid_facet_string.into_cursor()?; + while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { + let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes); + + let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some() + && deladd_reader.get(DelAdd::Addition).is_some(); + + if is_same_value { + continue; + } + + let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); + let field_id = FieldId::from_be_bytes(field_id_bytes); + + let (document_id_bytes, normalized_value_bytes) = + try_split_array_at::<_, 4>(bytes).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); + + let normalized_value = str::from_utf8(normalized_value_bytes)?; + + // Facet search normalization + { + let locales = localized_field_ids.locales(field_id); + let hyper_normalized_value = normalize_facet_string(normalized_value, locales); + + let set = BTreeSet::from_iter(std::iter::once(normalized_value)); + + // as the facet string is the same, we can put the deletion and addition in the same obkv. + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + for (deladd_key, _) in deladd_reader.iter() { + let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; + obkv.insert(deladd_key, val)?; + } + obkv.finish()?; + + let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref()); + let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; + normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?; + } + + let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; + let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + for (deladd_key, _) in deladd_reader.iter() { + obkv.insert(deladd_key, document_id.to_ne_bytes())?; + } + obkv.finish()?; + facet_string_docids_sorter.insert(&key_bytes, &buffer)?; + } + + let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?; + sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized)) +} + +/// Extracts the facet string and the documents ids where this facet string appear. +/// +/// Returns a grenad reader with the list of extracted facet strings and +/// documents ids from the given chunk of docid facet string positions. +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] +fn extract_facet_string_docids_settings( + docid_fid_facet_string: grenad::Reader, + indexer: GrenadParameters, + settings_diff: &InnerIndexSettingsDiff, ) -> Result<(grenad::Reader>, grenad::Reader>)> { let max_memory = indexer.max_memory_by_thread(); @@ -60,6 +171,15 @@ pub fn extract_facet_string_docids( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); + let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); + let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); + + let are_same_locales = old_locales == new_locales; + + if is_same_value && are_same_locales { + continue; + } + let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); @@ -68,23 +188,17 @@ pub fn extract_facet_string_docids( // Facet search normalization { - let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); - let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); - - if is_same_value && old_locales == new_locales { - // optimization: skip costly normalizations if the values and locales stayed the same - // TODO: splitting the cases between a settings diff and a document update would possibly allow for more optimizations, - // such as skipping the locales check when doing a documents update. - continue; - } - let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); - let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales); + let new_hyper_normalized_value = if are_same_locales { + &old_hyper_normalized_value + } else { + &normalize_facet_string(normalized_value, new_locales) + }; let set = BTreeSet::from_iter(std::iter::once(normalized_value)); // if the facet string is the same, we can put the deletion and addition in the same obkv. - if old_hyper_normalized_value == new_hyper_normalized_value { + if old_hyper_normalized_value == new_hyper_normalized_value.as_str() { // nothing to do if we delete and re-add the value. if is_same_value { continue; From 0e68718027dca692e6399d9a8a3350fb74d17bae Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 31 Jul 2024 13:05:47 +0200 Subject: [PATCH 3/5] Add detailed spans --- .../extract/extract_facet_string_docids.rs | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index fd5949fd9..45a7696ac 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -269,16 +269,36 @@ fn extract_facet_string_docids_settings( } /// Normalizes the facet string and truncates it to the max length. +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String { let options = NormalizerOption { lossy: true, ..Default::default() }; let mut detection = StrDetection::new(facet_string, locales); + + let script = { + let span = tracing::trace_span!(target: "indexing::extract::extract_facet_string_docids", "detect_script"); + let _entered = span.enter(); + + detection.script() + }; + + let language = { + let span = tracing::trace_span!(target: "indexing::extract::extract_facet_string_docids", "detect_language"); + let _entered = span.enter(); + + detection.language() + }; + let token = Token { lemma: std::borrow::Cow::Borrowed(facet_string), - script: detection.script(), - language: detection.language(), + script, + language, ..Default::default() }; + let span = + tracing::trace_span!(target: "indexing::extract::extract_facet_string_docids", "normalize"); + let _entered = span.enter(); + // truncate the facet string to the max length token .normalize(&options) From ade54493aba062de21bebc79d6607e026775d655 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 14 Aug 2024 11:33:17 +0200 Subject: [PATCH 4/5] Only detect language for a facet if several locales have been specified by the user in the settings --- meilisearch/src/search/mod.rs | 14 ++++++++++---- milli/src/search/facet/search.rs | 10 +++++++++- .../extract/extract_facet_string_docids.rs | 9 +++++++-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/meilisearch/src/search/mod.rs b/meilisearch/src/search/mod.rs index dada9159b..915505be0 100644 --- a/meilisearch/src/search/mod.rs +++ b/meilisearch/src/search/mod.rs @@ -1369,12 +1369,18 @@ pub fn perform_facet_search( None => TimeBudget::default(), }; + // In the faceted search context, we want to use the intersection between the locales provided by the user + // and the locales of the facet string. + // If the facet string is not localized, we **ignore** the locales provided by the user because the facet data has no locale. + // If the user does not provide locales, we use the locales of the facet string. let localized_attributes = index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); - let locales = locales.or_else(|| { - localized_attributes + let localized_attributes_locales = + localized_attributes.into_iter().find(|attr| attr.match_str(&facet_name)); + let locales = localized_attributes_locales.map(|attr| { + attr.locales .into_iter() - .find(|attr| attr.match_str(&facet_name)) - .map(|attr| attr.locales) + .filter(|locale| locales.as_ref().map_or(true, |locales| locales.contains(locale))) + .collect() }); let (search, _, _, _) = diff --git a/milli/src/search/facet/search.rs b/milli/src/search/facet/search.rs index 39fb7374a..cdba7ee16 100644 --- a/milli/src/search/facet/search.rs +++ b/milli/src/search/facet/search.rs @@ -339,10 +339,18 @@ impl ValuesCollection { fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String { let options = NormalizerOption { lossy: true, ..Default::default() }; let mut detection = StrDetection::new(facet_string, locales); + + // Detect the language of the facet string only if several locales are explicitly provided. + let language = match locales { + Some(&[language]) => Some(language), + Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(), + _ => None, + }; + let token = Token { lemma: std::borrow::Cow::Borrowed(facet_string), script: detection.script(), - language: detection.language(), + language, ..Default::default() }; diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 45a7696ac..69663bb08 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -271,7 +271,7 @@ fn extract_facet_string_docids_settings( /// Normalizes the facet string and truncates it to the max length. #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String { - let options = NormalizerOption { lossy: true, ..Default::default() }; + let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() }; let mut detection = StrDetection::new(facet_string, locales); let script = { @@ -285,7 +285,12 @@ fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> S let span = tracing::trace_span!(target: "indexing::extract::extract_facet_string_docids", "detect_language"); let _entered = span.enter(); - detection.language() + // Detect the language of the facet string only if several locales are explicitly provided. + match locales { + Some(&[language]) => Some(language), + Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(), + _ => None, + } }; let token = Token { From 0f965d35747226e0fc7bf6056a491f4fea3fd48f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 14 Aug 2024 14:33:36 +0200 Subject: [PATCH 5/5] Remove hotloop's spans --- .../extract/extract_facet_string_docids.rs | 28 ++++--------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 69663bb08..36dd20b15 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -269,28 +269,16 @@ fn extract_facet_string_docids_settings( } /// Normalizes the facet string and truncates it to the max length. -#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String { let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() }; let mut detection = StrDetection::new(facet_string, locales); - let script = { - let span = tracing::trace_span!(target: "indexing::extract::extract_facet_string_docids", "detect_script"); - let _entered = span.enter(); - - detection.script() - }; - - let language = { - let span = tracing::trace_span!(target: "indexing::extract::extract_facet_string_docids", "detect_language"); - let _entered = span.enter(); - - // Detect the language of the facet string only if several locales are explicitly provided. - match locales { - Some(&[language]) => Some(language), - Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(), - _ => None, - } + let script = detection.script(); + // Detect the language of the facet string only if several locales are explicitly provided. + let language = match locales { + Some(&[language]) => Some(language), + Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(), + _ => None, }; let token = Token { @@ -300,10 +288,6 @@ fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> S ..Default::default() }; - let span = - tracing::trace_span!(target: "indexing::extract::extract_facet_string_docids", "normalize"); - let _entered = span.enter(); - // truncate the facet string to the max length token .normalize(&options)