From 99211eb375fec29fabb1e55422d8924260388838 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 28 May 2024 11:46:53 +0200 Subject: [PATCH 01/16] Introduce the SettingDiff only_additional_fields method --- milli/src/update/settings.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 24b32b6fa..c3e4ab3fa 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1112,6 +1112,26 @@ impl InnerIndexSettingsDiff { || self.old.proximity_precision != self.new.proximity_precision } + /// Returns only the additional searchable fields if any + /// other searchable field has been modified, returns None. + pub fn only_additional_fields(&self) -> Option> { + match (&self.old.user_defined_searchable_fields, &self.new.user_defined_searchable_fields) { + (None, None) | (Some(_), None) => None, + (None, Some(new)) => Some(new.iter().cloned().collect()), + (Some(old), Some(new)) => { + let old: HashSet<_> = old.iter().cloned().collect(); + let new: HashSet<_> = new.iter().cloned().collect(); + if old.difference(&new).next().is_none() { + // if no field has been removed + // return only the additional ones + Some(&new - &old) + } else { + None + } + } + } + } + pub fn reindex_facets(&self) -> bool { let existing_fields = &self.new.existing_fields; if existing_fields.iter().any(|field| field.contains('.')) { From 9af103a88eb2df6467919e2b1cd9b27280d050e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 28 May 2024 14:53:45 +0200 Subject: [PATCH 02/16] Introducing a new into_del_add_obkv_conditional_operation function --- milli/src/update/del_add.rs | 15 ++++++ milli/src/update/index_documents/transform.rs | 47 +++++++++++++++---- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs index 0288858ed..cb5e448f1 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -40,11 +40,26 @@ pub fn into_del_add_obkv( operation: DelAddOperation, buffer: &mut Vec, ) -> Result<(), std::io::Error> { + into_del_add_obkv_conditional_operation(reader, buffer, |_| operation) +} + +/// Akin to the [into_del_add_obkv] function but lets you +/// conditionally define the `DelAdd` variant based on the obkv key. +pub fn into_del_add_obkv_conditional_operation( + reader: obkv::KvReader, + buffer: &mut Vec, + operation: F, +) -> std::io::Result<()> +where + K: obkv::Key + PartialOrd, + F: Fn(K) -> DelAddOperation, +{ let mut writer = obkv::KvWriter::new(buffer); let mut value_buffer = Vec::new(); for (key, value) in reader.iter() { value_buffer.clear(); let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + let operation = operation(key); if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) { value_writer.insert(DelAdd::Deletion, value)?; } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 41a0a55cf..dc6642b8a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -20,7 +20,10 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; -use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; +use crate::update::del_add::{ + into_del_add_obkv, into_del_add_obkv_conditional_operation, DelAdd, DelAddOperation, + KvReaderDelAdd, +}; use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; @@ -841,13 +844,28 @@ impl<'a, 'i> Transform<'a, 'i> { let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; // If only the `searchableAttributes` has been changed, keep only the searchable fields. + // However, if only new searchable attributes are added, this function will + // return false has fields do not need to be reindexed. let must_reindex_searchables = settings_diff.reindex_searchable(); - let necessary_searchable_field = |id: FieldId| -> bool { - must_reindex_searchables + let must_index_only_additional_searchables = &settings_diff.only_additional_fields(); + let necessary_searchable_field_to_reindex = move |id: FieldId| -> bool { + must_index_only_additional_searchables.is_none() + && must_reindex_searchables && (settings_diff.old.searchable_fields_ids.contains(&id) || settings_diff.new.searchable_fields_ids.contains(&id)) }; + // If only new `searchableAttributes` are present, keep only those ones. + let additional_searchable_field_only = move |id: FieldId| -> bool { + match must_index_only_additional_searchables { + Some(additional_fields) => { + let additional_field = settings_diff.new.fields_ids_map.name(id).unwrap(); + additional_fields.contains(additional_field) + } + None => false, + } + }; + // If only a faceted field has been added, keep only this field. let must_reindex_facets = settings_diff.reindex_facets(); let necessary_faceted_field = |id: FieldId| -> bool { @@ -862,14 +880,21 @@ impl<'a, 'i> Transform<'a, 'i> { // we need the fields for the prompt/templating. let reindex_vectors = settings_diff.reindex_vectors(); + // The set of additional searchable fields only, + // the only purpose of these fields is to be indexed from scratch. + let mut additional_searchables_only = HashSet::new(); + let mut obkv_writer = KvWriter::<_, FieldId>::memory(); for (id, val) in old_obkv.iter() { if is_primary_key(id) - || necessary_searchable_field(id) + || necessary_searchable_field_to_reindex(id) || necessary_faceted_field(id) || reindex_vectors { obkv_writer.insert(id, val)?; + } else if additional_searchable_field_only(id) { + additional_searchables_only.insert(id); + obkv_writer.insert(id, val)?; } } let data = obkv_writer.into_inner()?; @@ -887,11 +912,15 @@ impl<'a, 'i> Transform<'a, 'i> { let flattened = flattened.as_deref().map_or(obkv, KvReader::new); flattened_obkv_buffer.clear(); - into_del_add_obkv( - flattened, - DelAddOperation::DeletionAndAddition, - flattened_obkv_buffer, - )?; + into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| { + // If the field is only required because it is an additional + // searchable field only define it as an DelAdd::Addition only. + if additional_searchables_only.contains(&id) { + DelAddOperation::Addition + } else { + DelAddOperation::DeletionAndAddition + } + })?; } Ok(()) From db3887929f717ee8c2712c838f49adc6ed90a6f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 29 May 2024 14:49:09 +0200 Subject: [PATCH 03/16] Fix an issue with settings diff and * in the searchable attributes --- milli/src/update/settings.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c3e4ab3fa..af37a205c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1112,12 +1112,11 @@ impl InnerIndexSettingsDiff { || self.old.proximity_precision != self.new.proximity_precision } - /// Returns only the additional searchable fields if any - /// other searchable field has been modified, returns None. + /// Returns only the additional searchable fields. + /// If any other searchable field has been modified, returns None. pub fn only_additional_fields(&self) -> Option> { match (&self.old.user_defined_searchable_fields, &self.new.user_defined_searchable_fields) { - (None, None) | (Some(_), None) => None, - (None, Some(new)) => Some(new.iter().cloned().collect()), + (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means * (Some(old), Some(new)) => { let old: HashSet<_> = old.iter().cloned().collect(); let new: HashSet<_> = new.iter().cloned().collect(); From 4bf83f701c08272574f44f6a30daefaf261cbc50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 29 May 2024 14:50:00 +0200 Subject: [PATCH 04/16] Give the settings diff to the write_typed_chunk_into_index function --- milli/src/update/index_documents/mod.rs | 5 +++-- milli/src/update/index_documents/typed_chunk.rs | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index afae8973a..2420463b4 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -369,6 +369,7 @@ where // Run extraction pipeline in parallel. pool.install(|| { + let settings_diff_cloned = settings_diff.clone(); rayon::spawn(move || { let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks"); let _enter = child_span.enter(); @@ -398,7 +399,7 @@ where pool_params, lmdb_writer_sx.clone(), primary_key_id, - settings_diff.clone(), + settings_diff_cloned, max_positions_per_attributes, ) }); @@ -425,7 +426,7 @@ where Err(status) => { if let Some(typed_chunks) = chunk_accumulator.pop_longest() { let (docids, is_merged_database) = - write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?; + write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?; if !docids.is_empty() { final_documents_ids |= docids; let documents_seen_count = final_documents_ids.len(); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2ef7a8990..fcd8cfc17 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -123,8 +123,10 @@ impl TypedChunk { #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] pub(crate) fn write_typed_chunk_into_index( typed_chunks: Vec, - index: &Index, wtxn: &mut RwTxn, + index: &Index, + settings_diff: &InnerIndexSettingsDiff, + typed_chunks: Vec, ) -> Result<(RoaringBitmap, bool)> { let mut is_merged_database = false; match typed_chunks[0] { From bc210bdc006046bd70940a453104bb6256ae43ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 29 May 2024 15:17:51 +0200 Subject: [PATCH 05/16] Introduce a dedicated function to write proximity entries in database --- .../src/update/index_documents/typed_chunk.rs | 79 ++++++++++++++++--- 1 file changed, 69 insertions(+), 10 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index fcd8cfc17..4021dea08 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -7,7 +7,7 @@ use bytemuck::allocation::pod_collect_to_vec; use charabia::{Language, Script}; use grenad::{Merger, MergerBuilder}; use heed::types::Bytes; -use heed::RwTxn; +use heed::{BytesDecode, RwTxn}; use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; @@ -20,13 +20,16 @@ use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; +use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{ as_cloneable_grenad, keep_latest_obkv, try_split_array_at, }; +use crate::update::settings::InnerIndexSettingsDiff; use crate::{ - lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError, + lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError, + Result, SerializationError, U8StrStrCodec, }; /// This struct accumulates and group the TypedChunks @@ -122,7 +125,6 @@ impl TypedChunk { /// Return new documents seen. #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] pub(crate) fn write_typed_chunk_into_index( - typed_chunks: Vec, wtxn: &mut RwTxn, index: &Index, settings_diff: &InnerIndexSettingsDiff, @@ -487,13 +489,22 @@ pub(crate) fn write_typed_chunk_into_index( } let merger = builder.build(); - write_entries_into_database( - merger, - &index.word_pair_proximity_docids, - wtxn, - deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, - )?; + if settings_diff.only_additional_fields().is_some() { + write_proximity_entries_into_database_additional_searchables( + merger, + &index.word_pair_proximity_docids, + wtxn, + )?; + } else { + write_entries_into_database( + merger, + &index.word_pair_proximity_docids, + wtxn, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + )?; + } + is_merged_database = true; } TypedChunk::FieldIdDocidFacetNumbers(_) => { @@ -832,3 +843,51 @@ where } Ok(()) } + +/// Akin to the `write_entries_into_database` function but specialized +/// for the case when we only index additional searchable fields only. +#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] +fn write_proximity_entries_into_database_additional_searchables( + merger: Merger, + database: &heed::Database, + wtxn: &mut RwTxn, +) -> Result<()> +where + R: io::Read + io::Seek, +{ + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { + if valid_lmdb_key(key) { + let (proximity_to_insert, word1, word2) = + U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?; + let data_to_insert = match KvReaderDelAdd::new(value).get(DelAdd::Addition) { + Some(value) => { + CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)? + } + None => continue, + }; + + let mut data_to_remove = RoaringBitmap::new(); + for prox in 1..(MAX_DISTANCE as u8) { + let key = (prox, word1, word2); + let database_value = database.get(wtxn, &key)?.unwrap_or_default(); + let value = if prox == proximity_to_insert { + // Proximity that should be changed. + // Union values and remove lower proximity data + (&database_value | &data_to_insert) - &data_to_remove + } else { + // Remove lower proximity data + &database_value - &data_to_remove + }; + + // add the current data in data_to_remove for the next proximities + data_to_remove |= &value; + + if database_value != value { + database.put(wtxn, &key, &value)?; + } + } + } + } + Ok(()) +} From d9e50741896869631206555e5727b0bd61824c9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 29 May 2024 17:46:28 +0200 Subject: [PATCH 06/16] Introduce a new way to determine the operations to perform on the fields --- milli/src/update/index_documents/transform.rs | 47 +++---------------- milli/src/update/settings.rs | 28 ++++++++++- 2 files changed, 34 insertions(+), 41 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index dc6642b8a..34e40b7f6 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -843,29 +843,6 @@ impl<'a, 'i> Transform<'a, 'i> { // Always keep the primary key. let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; - // If only the `searchableAttributes` has been changed, keep only the searchable fields. - // However, if only new searchable attributes are added, this function will - // return false has fields do not need to be reindexed. - let must_reindex_searchables = settings_diff.reindex_searchable(); - let must_index_only_additional_searchables = &settings_diff.only_additional_fields(); - let necessary_searchable_field_to_reindex = move |id: FieldId| -> bool { - must_index_only_additional_searchables.is_none() - && must_reindex_searchables - && (settings_diff.old.searchable_fields_ids.contains(&id) - || settings_diff.new.searchable_fields_ids.contains(&id)) - }; - - // If only new `searchableAttributes` are present, keep only those ones. - let additional_searchable_field_only = move |id: FieldId| -> bool { - match must_index_only_additional_searchables { - Some(additional_fields) => { - let additional_field = settings_diff.new.fields_ids_map.name(id).unwrap(); - additional_fields.contains(additional_field) - } - None => false, - } - }; - // If only a faceted field has been added, keep only this field. let must_reindex_facets = settings_diff.reindex_facets(); let necessary_faceted_field = |id: FieldId| -> bool { @@ -880,20 +857,16 @@ impl<'a, 'i> Transform<'a, 'i> { // we need the fields for the prompt/templating. let reindex_vectors = settings_diff.reindex_vectors(); - // The set of additional searchable fields only, - // the only purpose of these fields is to be indexed from scratch. - let mut additional_searchables_only = HashSet::new(); + // The operations that we must perform on the different fields. + let mut operations = HashMap::new(); let mut obkv_writer = KvWriter::<_, FieldId>::memory(); for (id, val) in old_obkv.iter() { - if is_primary_key(id) - || necessary_searchable_field_to_reindex(id) - || necessary_faceted_field(id) - || reindex_vectors - { + if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { + operations.insert(id, DelAddOperation::DeletionAndAddition); obkv_writer.insert(id, val)?; - } else if additional_searchable_field_only(id) { - additional_searchables_only.insert(id); + } else if let Some(operation) = settings_diff.reindex_searchable_id(id) { + operations.insert(id, operation); obkv_writer.insert(id, val)?; } } @@ -913,13 +886,7 @@ impl<'a, 'i> Transform<'a, 'i> { flattened_obkv_buffer.clear(); into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| { - // If the field is only required because it is an additional - // searchable field only define it as an DelAdd::Addition only. - if additional_searchables_only.contains(&id) { - DelAddOperation::Addition - } else { - DelAddOperation::DeletionAndAddition - } + operations.get(&id).copied().unwrap_or(DelAddOperation::DeletionAndAddition) })?; } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index af37a205c..b401adff9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -9,6 +9,7 @@ use itertools::{EitherOrBoth, Itertools}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; +use super::del_add::DelAddOperation; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; @@ -1112,6 +1113,31 @@ impl InnerIndexSettingsDiff { || self.old.proximity_precision != self.new.proximity_precision } + pub fn reindex_searchable_id(&self, id: FieldId) -> Option { + if self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + || self.old.allowed_separators != self.new.allowed_separators + || self.old.dictionary != self.new.dictionary + || self.old.exact_attributes != self.new.exact_attributes + // Here we can be much more optimal by just deleting the proximity database + || self.old.proximity_precision != self.new.proximity_precision + { + Some(DelAddOperation::DeletionAndAddition) + } else if let Some(only_additional_fields) = self.only_additional_fields() { + let additional_field = self.new.fields_ids_map.name(id).unwrap(); + if only_additional_fields.contains(additional_field) { + Some(DelAddOperation::Addition) + } else { + None + } + } else if self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields + { + Some(DelAddOperation::DeletionAndAddition) + } else { + None + } + } + /// Returns only the additional searchable fields. /// If any other searchable field has been modified, returns None. pub fn only_additional_fields(&self) -> Option> { @@ -1599,7 +1625,7 @@ mod tests { // When we search for something that is not in // the searchable fields it must not return any document. let result = index.search(&rtxn).query("23").execute().unwrap(); - assert!(result.documents_ids.is_empty()); + assert_eq!(result.documents_ids, Vec::::new()); // When we search for something that is in the searchable fields // we must find the appropriate document. From 3a78e988dad814f0b781e568d349ecceb20be0a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 May 2024 11:17:03 +0200 Subject: [PATCH 07/16] Reduce the number of complex calls to settings diff functions --- milli/src/update/index_documents/transform.rs | 14 ++-- .../src/update/index_documents/typed_chunk.rs | 2 +- milli/src/update/settings.rs | 69 ++++++++++++------- 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 34e40b7f6..59bab36e8 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -808,13 +808,15 @@ impl<'a, 'i> Transform<'a, 'i> { let mut new_inner_settings = old_inner_settings.clone(); new_inner_settings.fields_ids_map = fields_ids_map; - let settings_diff = InnerIndexSettingsDiff { - old: old_inner_settings, - new: new_inner_settings, + let embedding_configs_updated = false; + let settings_update_only = false; + let settings_diff = InnerIndexSettingsDiff::new( + old_inner_settings, + new_inner_settings, primary_key_id, - embedding_configs_updated: false, - settings_update_only: false, - }; + embedding_configs_updated, + settings_update_only, + ); Ok(TransformOutput { primary_key, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 4021dea08..2fbe91685 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -489,7 +489,7 @@ pub(crate) fn write_typed_chunk_into_index( } let merger = builder.build(); - if settings_diff.only_additional_fields().is_some() { + if settings_diff.only_additional_fields.is_some() { write_proximity_entries_into_database_additional_searchables( merger, &index.word_pair_proximity_docids, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index b401adff9..84eccf8e6 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1073,13 +1073,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { .index .primary_key(self.wtxn)? .and_then(|name| new_inner_settings.fields_ids_map.id(name)); - let inner_settings_diff = InnerIndexSettingsDiff { - old: old_inner_settings, - new: new_inner_settings, + let settings_update_only = true; + let inner_settings_diff = InnerIndexSettingsDiff::new( + old_inner_settings, + new_inner_settings, primary_key_id, embedding_configs_updated, - settings_update_only: true, - }; + settings_update_only, + ); if inner_settings_diff.any_reindexing_needed() { self.reindex(&progress_callback, &should_abort, inner_settings_diff)?; @@ -1096,9 +1097,46 @@ pub struct InnerIndexSettingsDiff { // TODO: compare directly the embedders. pub(crate) embedding_configs_updated: bool, pub(crate) settings_update_only: bool, + /// The set of only the additional searchable fields. + /// If any other searchable field has been modified, is set to None. + pub(crate) only_additional_fields: Option>, } impl InnerIndexSettingsDiff { + pub(crate) fn new( + old_settings: InnerIndexSettings, + new_settings: InnerIndexSettings, + primary_key_id: Option, + embedding_configs_updated: bool, + settings_update_only: bool, + ) -> Self { + let only_additional_fields = match ( + &old_settings.user_defined_searchable_fields, + &new_settings.user_defined_searchable_fields, + ) { + (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means * + (Some(old), Some(new)) => { + let old: HashSet<_> = old.iter().cloned().collect(); + let new: HashSet<_> = new.iter().cloned().collect(); + if old.difference(&new).next().is_none() { + // if no field has been removed return only the additional ones + Some(&new - &old) + } else { + None + } + } + }; + + InnerIndexSettingsDiff { + old: old_settings, + new: new_settings, + primary_key_id, + embedding_configs_updated, + settings_update_only, + only_additional_fields, + } + } + pub fn any_reindexing_needed(&self) -> bool { self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors() } @@ -1123,7 +1161,7 @@ impl InnerIndexSettingsDiff { || self.old.proximity_precision != self.new.proximity_precision { Some(DelAddOperation::DeletionAndAddition) - } else if let Some(only_additional_fields) = self.only_additional_fields() { + } else if let Some(only_additional_fields) = &self.only_additional_fields { let additional_field = self.new.fields_ids_map.name(id).unwrap(); if only_additional_fields.contains(additional_field) { Some(DelAddOperation::Addition) @@ -1138,25 +1176,6 @@ impl InnerIndexSettingsDiff { } } - /// Returns only the additional searchable fields. - /// If any other searchable field has been modified, returns None. - pub fn only_additional_fields(&self) -> Option> { - match (&self.old.user_defined_searchable_fields, &self.new.user_defined_searchable_fields) { - (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means * - (Some(old), Some(new)) => { - let old: HashSet<_> = old.iter().cloned().collect(); - let new: HashSet<_> = new.iter().cloned().collect(); - if old.difference(&new).next().is_none() { - // if no field has been removed - // return only the additional ones - Some(&new - &old) - } else { - None - } - } - } - } - pub fn reindex_facets(&self) -> bool { let existing_fields = &self.new.existing_fields; if existing_fields.iter().any(|field| field.contains('.')) { From 0e9eb9eedb04066394dc9e9b5829ed9aaa05339a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 May 2024 12:08:27 +0200 Subject: [PATCH 08/16] Add a span for the settings diff creation --- milli/src/update/settings.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 84eccf8e6..eca250c19 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1103,6 +1103,7 @@ pub struct InnerIndexSettingsDiff { } impl InnerIndexSettingsDiff { + #[tracing::instrument(level = "trace", skip_all, target = "indexing::settings")] pub(crate) fn new( old_settings: InnerIndexSettings, new_settings: InnerIndexSettings, From 75496af985d02fcf1259b7defd6b6d47a0b44864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 May 2024 12:14:22 +0200 Subject: [PATCH 09/16] Add a span for the prepare_for_documents_reindexing --- milli/src/update/index_documents/transform.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 59bab36e8..c34b7876a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -899,6 +899,11 @@ impl<'a, 'i> Transform<'a, 'i> { /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. /// // TODO this can be done in parallel by using the rayon `ThreadPool`. + #[tracing::instrument( + level = "trace" + skip(self, wtxn, settings_diff), + target = "indexing::documents" + )] pub fn prepare_for_documents_reindexing( self, wtxn: &mut heed::RwTxn<'i>, From b9a0ff0dd6c282110bb8f331870ddb77e0ace530 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 May 2024 15:39:05 +0200 Subject: [PATCH 10/16] Cache a lot of operations to know if a field must be indexed --- milli/src/update/settings.rs | 40 +++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index eca250c19..11a249068 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1100,6 +1100,12 @@ pub struct InnerIndexSettingsDiff { /// The set of only the additional searchable fields. /// If any other searchable field has been modified, is set to None. pub(crate) only_additional_fields: Option>, + + // Cache the check to see if all the stop_words, allowed_separators, dictionary, + // exact_attributes, proximity_precision are different. + pub(crate) cache_reindex_searchable_without_user_defined: bool, + // Cache the check to see if all the user_defined_searchables are different. + pub(crate) cache_user_defined_searchables: bool, } impl InnerIndexSettingsDiff { @@ -1128,6 +1134,18 @@ impl InnerIndexSettingsDiff { } }; + let cache_reindex_searchable_without_user_defined = { + old_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + != new_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + || old_settings.allowed_separators != new_settings.allowed_separators + || old_settings.dictionary != new_settings.dictionary + || old_settings.exact_attributes != new_settings.exact_attributes + || old_settings.proximity_precision != new_settings.proximity_precision + }; + + let cache_user_defined_searchables = old_settings.user_defined_searchable_fields + != new_settings.user_defined_searchable_fields; + InnerIndexSettingsDiff { old: old_settings, new: new_settings, @@ -1135,6 +1153,8 @@ impl InnerIndexSettingsDiff { embedding_configs_updated, settings_update_only, only_additional_fields, + cache_reindex_searchable_without_user_defined, + cache_user_defined_searchables, } } @@ -1143,24 +1163,11 @@ impl InnerIndexSettingsDiff { } pub fn reindex_searchable(&self) -> bool { - self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) - != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) - || self.old.allowed_separators != self.new.allowed_separators - || self.old.dictionary != self.new.dictionary - || self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields - || self.old.exact_attributes != self.new.exact_attributes - || self.old.proximity_precision != self.new.proximity_precision + self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables } pub fn reindex_searchable_id(&self, id: FieldId) -> Option { - if self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) - != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) - || self.old.allowed_separators != self.new.allowed_separators - || self.old.dictionary != self.new.dictionary - || self.old.exact_attributes != self.new.exact_attributes - // Here we can be much more optimal by just deleting the proximity database - || self.old.proximity_precision != self.new.proximity_precision - { + if self.cache_reindex_searchable_without_user_defined { Some(DelAddOperation::DeletionAndAddition) } else if let Some(only_additional_fields) = &self.only_additional_fields { let additional_field = self.new.fields_ids_map.name(id).unwrap(); @@ -1169,8 +1176,7 @@ impl InnerIndexSettingsDiff { } else { None } - } else if self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields - { + } else if self.cache_user_defined_searchables { Some(DelAddOperation::DeletionAndAddition) } else { None From c32d746069352b73904018d347689d1b62bf004b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 May 2024 16:46:57 +0200 Subject: [PATCH 11/16] Rename the embeddings workloads --- ...subset-hf-embeddings.json => embeddings-movies-subset-hf.json} | 0 ...{settings-add-embeddings.json => embeddings-settings-add.json} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename workloads/{movies-subset-hf-embeddings.json => embeddings-movies-subset-hf.json} (100%) rename workloads/{settings-add-embeddings.json => embeddings-settings-add.json} (100%) diff --git a/workloads/movies-subset-hf-embeddings.json b/workloads/embeddings-movies-subset-hf.json similarity index 100% rename from workloads/movies-subset-hf-embeddings.json rename to workloads/embeddings-movies-subset-hf.json diff --git a/workloads/settings-add-embeddings.json b/workloads/embeddings-settings-add.json similarity index 100% rename from workloads/settings-add-embeddings.json rename to workloads/embeddings-settings-add.json From 17c5ceeb9dc2f6574beed680b3ade879ab88ff7f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 3 Jun 2024 11:42:57 +0200 Subject: [PATCH 12/16] iterate over the faceted fields instead of over the whole document --- .../extract/extract_fid_docid_facet_values.rs | 64 +++++++++++++------ 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 3cbd7e49e..bcbb87a58 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; @@ -9,7 +9,7 @@ use std::result::Result as StdResult; use bytemuck::bytes_of; use grenad::Sorter; use heed::BytesEncode; -use itertools::EitherOrBoth; +use itertools::{merge_join_by, EitherOrBoth}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::{from_slice, Value}; @@ -18,7 +18,7 @@ use FilterableValues::{Empty, Null, Values}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::del_add::{DelAdd, KvWriterDelAdd}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::update::settings::InnerIndexSettingsDiff; use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH}; @@ -66,6 +66,11 @@ pub fn extract_fid_docid_facet_values( max_memory.map(|m| m / 2), ); + let old_faceted_fids: BTreeSet<_> = + settings_diff.old.faceted_fields_ids.iter().copied().collect(); + let new_faceted_fids: BTreeSet<_> = + settings_diff.new.faceted_fields_ids.iter().copied().collect(); + // The tuples represents the Del and Add side for a bitmap let mut facet_exists_docids = BTreeMap::::new(); let mut facet_is_null_docids = BTreeMap::::new(); @@ -78,11 +83,45 @@ pub fn extract_fid_docid_facet_values( let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? { let obkv = obkv::KvReader::new(value); + let get_document_json_value = move |field_id, side| { + obkv.get(field_id) + .map(KvReaderDelAdd::new) + .and_then(|kv| kv.get(side)) + .map(from_slice) + .transpose() + .map_err(InternalError::SerdeJson) + }; + // iterate over the faceted fields instead of over the whole document. + for eob in + merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| old.cmp(new)) + { + let (field_id, del_value, add_value) = match eob { + EitherOrBoth::Left(&field_id) => { + let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; - for (field_id, field_bytes) in obkv.iter() { - let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id); - let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id); - if delete_faceted || add_faceted { + // deletion only + (field_id, del_value, None) + } + EitherOrBoth::Right(&field_id) => { + let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + + // addition only + (field_id, None, add_value) + } + EitherOrBoth::Both(&field_id, _) => { + // during settings update, recompute the changing settings only. + if settings_diff.settings_update_only { + continue; + } + + let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; + let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + + (field_id, del_value, add_value) + } + }; + + if del_value.is_some() || add_value.is_some() { numbers_key_buffer.clear(); strings_key_buffer.clear(); @@ -98,17 +137,6 @@ pub fn extract_fid_docid_facet_values( numbers_key_buffer.extend_from_slice(docid_bytes); strings_key_buffer.extend_from_slice(docid_bytes); - let del_add_obkv = obkv::KvReader::new(field_bytes); - let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted) - { - Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), - None => None, - }; - let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) { - Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), - None => None, - }; - // We insert the document id on the Del and the Add side if the field exists. let (ref mut del_exists, ref mut add_exists) = facet_exists_docids.entry(field_id).or_default(); From d29d4f88da692f3345830d0236e0782f01b9f7ca Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 4 Jun 2024 15:31:24 +0200 Subject: [PATCH 13/16] Skip iterating over documents when the faceted field list doesn't change --- .../extract/extract_fid_docid_facet_values.rs | 318 +++++++++--------- 1 file changed, 161 insertions(+), 157 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index bcbb87a58..810fa26a9 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -66,11 +66,6 @@ pub fn extract_fid_docid_facet_values( max_memory.map(|m| m / 2), ); - let old_faceted_fids: BTreeSet<_> = - settings_diff.old.faceted_fields_ids.iter().copied().collect(); - let new_faceted_fids: BTreeSet<_> = - settings_diff.new.faceted_fields_ids.iter().copied().collect(); - // The tuples represents the Del and Add side for a bitmap let mut facet_exists_docids = BTreeMap::::new(); let mut facet_is_null_docids = BTreeMap::::new(); @@ -80,172 +75,181 @@ pub fn extract_fid_docid_facet_values( let mut numbers_key_buffer = Vec::new(); let mut strings_key_buffer = Vec::new(); - let mut cursor = obkv_documents.into_cursor()?; - while let Some((docid_bytes, value)) = cursor.move_on_next()? { - let obkv = obkv::KvReader::new(value); - let get_document_json_value = move |field_id, side| { - obkv.get(field_id) - .map(KvReaderDelAdd::new) - .and_then(|kv| kv.get(side)) - .map(from_slice) - .transpose() - .map_err(InternalError::SerdeJson) - }; - // iterate over the faceted fields instead of over the whole document. - for eob in - merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| old.cmp(new)) - { - let (field_id, del_value, add_value) = match eob { - EitherOrBoth::Left(&field_id) => { - let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; + let old_faceted_fids: BTreeSet<_> = + settings_diff.old.faceted_fields_ids.iter().copied().collect(); + let new_faceted_fids: BTreeSet<_> = + settings_diff.new.faceted_fields_ids.iter().copied().collect(); - // deletion only - (field_id, del_value, None) - } - EitherOrBoth::Right(&field_id) => { - let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids { + let mut cursor = obkv_documents.into_cursor()?; + while let Some((docid_bytes, value)) = cursor.move_on_next()? { + let obkv = obkv::KvReader::new(value); + let get_document_json_value = move |field_id, side| { + obkv.get(field_id) + .map(KvReaderDelAdd::new) + .and_then(|kv| kv.get(side)) + .map(from_slice) + .transpose() + .map_err(InternalError::SerdeJson) + }; + // iterate over the faceted fields instead of over the whole document. + for eob in + merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| { + old.cmp(new) + }) + { + let (field_id, del_value, add_value) = match eob { + EitherOrBoth::Left(&field_id) => { + let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; - // addition only - (field_id, None, add_value) - } - EitherOrBoth::Both(&field_id, _) => { - // during settings update, recompute the changing settings only. - if settings_diff.settings_update_only { - continue; + // deletion only + (field_id, del_value, None) + } + EitherOrBoth::Right(&field_id) => { + let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + + // addition only + (field_id, None, add_value) + } + EitherOrBoth::Both(&field_id, _) => { + // during settings update, recompute the changing settings only. + if settings_diff.settings_update_only { + continue; + } + + let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; + let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + + (field_id, del_value, add_value) + } + }; + + if del_value.is_some() || add_value.is_some() { + numbers_key_buffer.clear(); + strings_key_buffer.clear(); + + // Set key to the field_id + // Note: this encoding is consistent with FieldIdCodec + numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes()); + strings_key_buffer.extend_from_slice(&field_id.to_be_bytes()); + + let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); + let document = DocumentId::from_be_bytes(document); + + // For the other extraction tasks, prefix the key with the field_id and the document_id + numbers_key_buffer.extend_from_slice(docid_bytes); + strings_key_buffer.extend_from_slice(docid_bytes); + + // We insert the document id on the Del and the Add side if the field exists. + let (ref mut del_exists, ref mut add_exists) = + facet_exists_docids.entry(field_id).or_default(); + let (ref mut del_is_null, ref mut add_is_null) = + facet_is_null_docids.entry(field_id).or_default(); + let (ref mut del_is_empty, ref mut add_is_empty) = + facet_is_empty_docids.entry(field_id).or_default(); + + if del_value.is_some() { + del_exists.insert(document); + } + if add_value.is_some() { + add_exists.insert(document); } - let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; - let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + let del_geo_support = settings_diff + .old + .geo_fields_ids + .map_or(false, |(lat, lng)| field_id == lat || field_id == lng); + let add_geo_support = settings_diff + .new + .geo_fields_ids + .map_or(false, |(lat, lng)| field_id == lat || field_id == lng); + let del_filterable_values = + del_value.map(|value| extract_facet_values(&value, del_geo_support)); + let add_filterable_values = + add_value.map(|value| extract_facet_values(&value, add_geo_support)); - (field_id, del_value, add_value) - } - }; + // Those closures are just here to simplify things a bit. + let mut insert_numbers_diff = |del_numbers, add_numbers| { + insert_numbers_diff( + &mut fid_docid_facet_numbers_sorter, + &mut numbers_key_buffer, + del_numbers, + add_numbers, + ) + }; + let mut insert_strings_diff = |del_strings, add_strings| { + insert_strings_diff( + &mut fid_docid_facet_strings_sorter, + &mut strings_key_buffer, + del_strings, + add_strings, + ) + }; - if del_value.is_some() || add_value.is_some() { - numbers_key_buffer.clear(); - strings_key_buffer.clear(); - - // Set key to the field_id - // Note: this encoding is consistent with FieldIdCodec - numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes()); - strings_key_buffer.extend_from_slice(&field_id.to_be_bytes()); - - let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); - let document = DocumentId::from_be_bytes(document); - - // For the other extraction tasks, prefix the key with the field_id and the document_id - numbers_key_buffer.extend_from_slice(docid_bytes); - strings_key_buffer.extend_from_slice(docid_bytes); - - // We insert the document id on the Del and the Add side if the field exists. - let (ref mut del_exists, ref mut add_exists) = - facet_exists_docids.entry(field_id).or_default(); - let (ref mut del_is_null, ref mut add_is_null) = - facet_is_null_docids.entry(field_id).or_default(); - let (ref mut del_is_empty, ref mut add_is_empty) = - facet_is_empty_docids.entry(field_id).or_default(); - - if del_value.is_some() { - del_exists.insert(document); - } - if add_value.is_some() { - add_exists.insert(document); - } - - let del_geo_support = settings_diff - .old - .geo_fields_ids - .map_or(false, |(lat, lng)| field_id == lat || field_id == lng); - let add_geo_support = settings_diff - .new - .geo_fields_ids - .map_or(false, |(lat, lng)| field_id == lat || field_id == lng); - let del_filterable_values = - del_value.map(|value| extract_facet_values(&value, del_geo_support)); - let add_filterable_values = - add_value.map(|value| extract_facet_values(&value, add_geo_support)); - - // Those closures are just here to simplify things a bit. - let mut insert_numbers_diff = |del_numbers, add_numbers| { - insert_numbers_diff( - &mut fid_docid_facet_numbers_sorter, - &mut numbers_key_buffer, - del_numbers, - add_numbers, - ) - }; - let mut insert_strings_diff = |del_strings, add_strings| { - insert_strings_diff( - &mut fid_docid_facet_strings_sorter, - &mut strings_key_buffer, - del_strings, - add_strings, - ) - }; - - match (del_filterable_values, add_filterable_values) { - (None, None) => (), - (Some(del_filterable_values), None) => match del_filterable_values { - Null => { - del_is_null.insert(document); - } - Empty => { - del_is_empty.insert(document); - } - Values { numbers, strings } => { - insert_numbers_diff(numbers, vec![])?; - insert_strings_diff(strings, vec![])?; - } - }, - (None, Some(add_filterable_values)) => match add_filterable_values { - Null => { - add_is_null.insert(document); - } - Empty => { - add_is_empty.insert(document); - } - Values { numbers, strings } => { - insert_numbers_diff(vec![], numbers)?; - insert_strings_diff(vec![], strings)?; - } - }, - (Some(del_filterable_values), Some(add_filterable_values)) => { - match (del_filterable_values, add_filterable_values) { - (Null, Null) | (Empty, Empty) => (), - (Null, Empty) => { - del_is_null.insert(document); - add_is_empty.insert(document); - } - (Empty, Null) => { - del_is_empty.insert(document); - add_is_null.insert(document); - } - (Null, Values { numbers, strings }) => { - insert_numbers_diff(vec![], numbers)?; - insert_strings_diff(vec![], strings)?; + match (del_filterable_values, add_filterable_values) { + (None, None) => (), + (Some(del_filterable_values), None) => match del_filterable_values { + Null => { del_is_null.insert(document); } - (Empty, Values { numbers, strings }) => { - insert_numbers_diff(vec![], numbers)?; - insert_strings_diff(vec![], strings)?; + Empty => { del_is_empty.insert(document); } - (Values { numbers, strings }, Null) => { - add_is_null.insert(document); + Values { numbers, strings } => { insert_numbers_diff(numbers, vec![])?; insert_strings_diff(strings, vec![])?; } - (Values { numbers, strings }, Empty) => { - add_is_empty.insert(document); - insert_numbers_diff(numbers, vec![])?; - insert_strings_diff(strings, vec![])?; + }, + (None, Some(add_filterable_values)) => match add_filterable_values { + Null => { + add_is_null.insert(document); } - ( - Values { numbers: del_numbers, strings: del_strings }, - Values { numbers: add_numbers, strings: add_strings }, - ) => { - insert_numbers_diff(del_numbers, add_numbers)?; - insert_strings_diff(del_strings, add_strings)?; + Empty => { + add_is_empty.insert(document); + } + Values { numbers, strings } => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + } + }, + (Some(del_filterable_values), Some(add_filterable_values)) => { + match (del_filterable_values, add_filterable_values) { + (Null, Null) | (Empty, Empty) => (), + (Null, Empty) => { + del_is_null.insert(document); + add_is_empty.insert(document); + } + (Empty, Null) => { + del_is_empty.insert(document); + add_is_null.insert(document); + } + (Null, Values { numbers, strings }) => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + del_is_null.insert(document); + } + (Empty, Values { numbers, strings }) => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + del_is_empty.insert(document); + } + (Values { numbers, strings }, Null) => { + add_is_null.insert(document); + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + (Values { numbers, strings }, Empty) => { + add_is_empty.insert(document); + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + ( + Values { numbers: del_numbers, strings: del_strings }, + Values { numbers: add_numbers, strings: add_strings }, + ) => { + insert_numbers_diff(del_numbers, add_numbers)?; + insert_strings_diff(del_strings, add_strings)?; + } } } } From ba9fadc8f19d9ce32e1e7469f5dd154875fe2a04 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 5 Jun 2024 10:51:16 +0200 Subject: [PATCH 14/16] Put only_additional_fields to None if the difference gives an empty result. --- milli/src/update/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 11a249068..952b017c6 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1127,7 +1127,7 @@ impl InnerIndexSettingsDiff { let new: HashSet<_> = new.iter().cloned().collect(); if old.difference(&new).next().is_none() { // if no field has been removed return only the additional ones - Some(&new - &old) + Some(&new - &old).filter(|x| !x.is_empty()) } else { None } From ff87b4db261f10aeb4a0375d5766dd4ffd085341 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 5 Jun 2024 12:48:44 +0200 Subject: [PATCH 15/16] Avoid running proximity when only the exact attributes changes --- .../extract_word_pair_proximity_docids.rs | 9 ++++---- milli/src/update/settings.rs | 22 +++++++++++++++---- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 617338f9f..5a9363942 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -26,11 +26,8 @@ pub fn extract_word_pair_proximity_docids( indexer: GrenadParameters, settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord; - let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord; - // early return if the data shouldn't be deleted nor created. - if !any_deletion && !any_addition { + if settings_diff.settings_update_only && !settings_diff.reindex_proximities() { let writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -39,8 +36,10 @@ pub fn extract_word_pair_proximity_docids( return writer_into_reader(writer); } - let max_memory = indexer.max_memory_by_thread(); + let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord; + let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord; + let max_memory = indexer.max_memory_by_thread(); let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE) .map(|_| { create_sorter( diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 952b017c6..dc26ac746 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1104,8 +1104,10 @@ pub struct InnerIndexSettingsDiff { // Cache the check to see if all the stop_words, allowed_separators, dictionary, // exact_attributes, proximity_precision are different. pub(crate) cache_reindex_searchable_without_user_defined: bool, - // Cache the check to see if all the user_defined_searchables are different. + // Cache the check to see if the user_defined_searchables are different. pub(crate) cache_user_defined_searchables: bool, + // Cache the check to see if the exact_attributes are different. + pub(crate) cache_exact_attributes: bool, } impl InnerIndexSettingsDiff { @@ -1139,10 +1141,11 @@ impl InnerIndexSettingsDiff { != new_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) || old_settings.allowed_separators != new_settings.allowed_separators || old_settings.dictionary != new_settings.dictionary - || old_settings.exact_attributes != new_settings.exact_attributes || old_settings.proximity_precision != new_settings.proximity_precision }; + let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; + let cache_user_defined_searchables = old_settings.user_defined_searchable_fields != new_settings.user_defined_searchable_fields; @@ -1155,6 +1158,7 @@ impl InnerIndexSettingsDiff { only_additional_fields, cache_reindex_searchable_without_user_defined, cache_user_defined_searchables, + cache_exact_attributes, } } @@ -1163,11 +1167,21 @@ impl InnerIndexSettingsDiff { } pub fn reindex_searchable(&self) -> bool { - self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables + self.cache_reindex_searchable_without_user_defined + || self.cache_exact_attributes + || self.cache_user_defined_searchables + } + + pub fn reindex_proximities(&self) -> bool { + // if any searchable settings force the reindexing + (self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables) + // and if any settings needs the proximity database created + && (self.old.proximity_precision == ProximityPrecision::ByAttribute + || self.old.proximity_precision == ProximityPrecision::ByAttribute) } pub fn reindex_searchable_id(&self, id: FieldId) -> Option { - if self.cache_reindex_searchable_without_user_defined { + if self.cache_reindex_searchable_without_user_defined || self.cache_exact_attributes { Some(DelAddOperation::DeletionAndAddition) } else if let Some(only_additional_fields) = &self.only_additional_fields { let additional_field = self.new.fields_ids_map.name(id).unwrap(); From 33241a6b12566b784a56d158240972dfad05f079 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 5 Jun 2024 16:00:24 +0200 Subject: [PATCH 16/16] Fix condition mistake --- milli/src/update/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index dc26ac746..be9b6b74e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1177,7 +1177,7 @@ impl InnerIndexSettingsDiff { (self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables) // and if any settings needs the proximity database created && (self.old.proximity_precision == ProximityPrecision::ByAttribute - || self.old.proximity_precision == ProximityPrecision::ByAttribute) + || self.new.proximity_precision == ProximityPrecision::ByAttribute) } pub fn reindex_searchable_id(&self, id: FieldId) -> Option {