diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs
index 0288858ed..cb5e448f1 100644
--- a/milli/src/update/del_add.rs
+++ b/milli/src/update/del_add.rs
@@ -40,11 +40,26 @@ pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
     operation: DelAddOperation,
     buffer: &mut Vec<u8>,
 ) -> Result<(), std::io::Error> {
+    into_del_add_obkv_conditional_operation(reader, buffer, |_| operation)
+}
+
+/// Akin to the [into_del_add_obkv] function but lets you
+/// conditionally define the `DelAdd` variant based on the obkv key.
+pub fn into_del_add_obkv_conditional_operation<K, F>(
+    reader: obkv::KvReader<K>,
+    buffer: &mut Vec<u8>,
+    operation: F,
+) -> std::io::Result<()>
+where
+    K: obkv::Key + PartialOrd,
+    F: Fn(K) -> DelAddOperation,
+{
     let mut writer = obkv::KvWriter::new(buffer);
     let mut value_buffer = Vec::new();
     for (key, value) in reader.iter() {
         value_buffer.clear();
         let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+        let operation = operation(key);
         if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) {
             value_writer.insert(DelAdd::Deletion, value)?;
         }
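Reviewer sketch, not part of the patch: a minimal example of how the new conditional entry point can be driven. The `write_del_add` helper and its `operations` map are hypothetical, but the closure mirrors the call site that appears later in this diff (transform.rs); only `into_del_add_obkv_conditional_operation`, `DelAddOperation`, and `FieldId` come from milli itself.

    use std::collections::HashMap;

    use crate::update::del_add::{into_del_add_obkv_conditional_operation, DelAddOperation};
    use crate::FieldId;

    // Hypothetical call site: fields listed in `operations` get the operation
    // decided for them (e.g. Addition for a newly searchable field); every
    // other field falls back to a full DeletionAndAddition.
    fn write_del_add(
        reader: obkv::KvReader<FieldId>,
        operations: &HashMap<FieldId, DelAddOperation>,
        buffer: &mut Vec<u8>,
    ) -> std::io::Result<()> {
        into_del_add_obkv_conditional_operation(reader, buffer, |id| {
            operations.get(&id).copied().unwrap_or(DelAddOperation::DeletionAndAddition)
        })
    }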
diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index 3cbd7e49e..810fa26a9 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, BTreeSet};
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::{self, BufReader};
@@ -9,7 +9,7 @@ use std::result::Result as StdResult;
 use bytemuck::bytes_of;
 use grenad::Sorter;
 use heed::BytesEncode;
-use itertools::EitherOrBoth;
+use itertools::{merge_join_by, EitherOrBoth};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::{from_slice, Value};
@@ -18,7 +18,7 @@ use FilterableValues::{Empty, Null, Values};
 use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
 use crate::error::InternalError;
 use crate::facet::value_encoding::f64_into_bytes;
-use crate::update::del_add::{DelAdd, KvWriterDelAdd};
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{create_writer, writer_into_reader};
 use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
@@ -75,149 +75,181 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     let mut numbers_key_buffer = Vec::new();
     let mut strings_key_buffer = Vec::new();
 
-    let mut cursor = obkv_documents.into_cursor()?;
-    while let Some((docid_bytes, value)) = cursor.move_on_next()? {
-        let obkv = obkv::KvReader::new(value);
-
-        for (field_id, field_bytes) in obkv.iter() {
-            let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id);
-            let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id);
-            if delete_faceted || add_faceted {
-                numbers_key_buffer.clear();
-                strings_key_buffer.clear();
-
-                // Set key to the field_id
-                // Note: this encoding is consistent with FieldIdCodec
-                numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
-                strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
-
-                let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
-                let document = DocumentId::from_be_bytes(document);
-
-                // For the other extraction tasks, prefix the key with the field_id and the document_id
-                numbers_key_buffer.extend_from_slice(docid_bytes);
-                strings_key_buffer.extend_from_slice(docid_bytes);
-
-                let del_add_obkv = obkv::KvReader::new(field_bytes);
-                let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted)
-                {
-                    Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
-                    None => None,
-                };
-                let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) {
-                    Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
-                    None => None,
-                };
-
-                // We insert the document id on the Del and the Add side if the field exists.
-                let (ref mut del_exists, ref mut add_exists) =
-                    facet_exists_docids.entry(field_id).or_default();
-                let (ref mut del_is_null, ref mut add_is_null) =
-                    facet_is_null_docids.entry(field_id).or_default();
-                let (ref mut del_is_empty, ref mut add_is_empty) =
-                    facet_is_empty_docids.entry(field_id).or_default();
-
-                if del_value.is_some() {
-                    del_exists.insert(document);
-                }
-                if add_value.is_some() {
-                    add_exists.insert(document);
-                }
-
-                let del_geo_support = settings_diff
-                    .old
-                    .geo_fields_ids
-                    .map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
-                let add_geo_support = settings_diff
-                    .new
-                    .geo_fields_ids
-                    .map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
-                let del_filterable_values =
-                    del_value.map(|value| extract_facet_values(&value, del_geo_support));
-                let add_filterable_values =
-                    add_value.map(|value| extract_facet_values(&value, add_geo_support));
-
-                // Those closures are just here to simplify things a bit.
-                let mut insert_numbers_diff = |del_numbers, add_numbers| {
-                    insert_numbers_diff(
-                        &mut fid_docid_facet_numbers_sorter,
-                        &mut numbers_key_buffer,
-                        del_numbers,
-                        add_numbers,
-                    )
-                };
-                let mut insert_strings_diff = |del_strings, add_strings| {
-                    insert_strings_diff(
-                        &mut fid_docid_facet_strings_sorter,
-                        &mut strings_key_buffer,
-                        del_strings,
-                        add_strings,
-                    )
-                };
-
-                match (del_filterable_values, add_filterable_values) {
-                    (None, None) => (),
-                    (Some(del_filterable_values), None) => match del_filterable_values {
-                        Null => {
-                            del_is_null.insert(document);
-                        }
-                        Empty => {
-                            del_is_empty.insert(document);
-                        }
-                        Values { numbers, strings } => {
-                            insert_numbers_diff(numbers, vec![])?;
-                            insert_strings_diff(strings, vec![])?;
-                        }
-                    },
-                    (None, Some(add_filterable_values)) => match add_filterable_values {
-                        Null => {
-                            add_is_null.insert(document);
-                        }
-                        Empty => {
-                            add_is_empty.insert(document);
-                        }
-                        Values { numbers, strings } => {
-                            insert_numbers_diff(vec![], numbers)?;
-                            insert_strings_diff(vec![], strings)?;
-                        }
-                    },
-                    (Some(del_filterable_values), Some(add_filterable_values)) => {
-                        match (del_filterable_values, add_filterable_values) {
-                            (Null, Null) | (Empty, Empty) => (),
-                            (Null, Empty) => {
-                                del_is_null.insert(document);
-                                add_is_empty.insert(document);
-                            }
-                            (Empty, Null) => {
-                                del_is_empty.insert(document);
-                                add_is_null.insert(document);
-                            }
-                            (Null, Values { numbers, strings }) => {
-                                insert_numbers_diff(vec![], numbers)?;
-                                insert_strings_diff(vec![], strings)?;
-                                del_is_null.insert(document);
-                            }
-                            (Empty, Values { numbers, strings }) => {
-                                insert_numbers_diff(vec![], numbers)?;
-                                insert_strings_diff(vec![], strings)?;
-                                del_is_empty.insert(document);
-                            }
-                            (Values { numbers, strings }, Null) => {
-                                add_is_null.insert(document);
-                                insert_numbers_diff(numbers, vec![])?;
-                                insert_strings_diff(strings, vec![])?;
-                            }
-                            (Values { numbers, strings }, Empty) => {
-                                add_is_empty.insert(document);
-                                insert_numbers_diff(numbers, vec![])?;
-                                insert_strings_diff(strings, vec![])?;
-                            }
-                            (
-                                Values { numbers: del_numbers, strings: del_strings },
-                                Values { numbers: add_numbers, strings: add_strings },
-                            ) => {
-                                insert_numbers_diff(del_numbers, add_numbers)?;
-                                insert_strings_diff(del_strings, add_strings)?;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
+    let old_faceted_fids: BTreeSet<_> =
+        settings_diff.old.faceted_fields_ids.iter().copied().collect();
+    let new_faceted_fids: BTreeSet<_> =
+        settings_diff.new.faceted_fields_ids.iter().copied().collect();
+
+    if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
+        let mut cursor = obkv_documents.into_cursor()?;
+        while let Some((docid_bytes, value)) = cursor.move_on_next()? {
+            let obkv = obkv::KvReader::new(value);
+            let get_document_json_value = move |field_id, side| {
+                obkv.get(field_id)
+                    .map(KvReaderDelAdd::new)
+                    .and_then(|kv| kv.get(side))
+                    .map(from_slice)
+                    .transpose()
+                    .map_err(InternalError::SerdeJson)
+            };
+            // iterate over the faceted fields instead of over the whole document.
+            for eob in
+                merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| {
+                    old.cmp(new)
+                })
+            {
+                let (field_id, del_value, add_value) = match eob {
+                    EitherOrBoth::Left(&field_id) => {
+                        let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
+
+                        // deletion only
+                        (field_id, del_value, None)
+                    }
+                    EitherOrBoth::Right(&field_id) => {
+                        let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
+
+                        // addition only
+                        (field_id, None, add_value)
+                    }
+                    EitherOrBoth::Both(&field_id, _) => {
+                        // during settings update, recompute the changing settings only.
+                        if settings_diff.settings_update_only {
+                            continue;
+                        }
+
+                        let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
+                        let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
+
+                        (field_id, del_value, add_value)
+                    }
+                };
+
+                if del_value.is_some() || add_value.is_some() {
+                    numbers_key_buffer.clear();
+                    strings_key_buffer.clear();
+
+                    // Set key to the field_id
+                    // Note: this encoding is consistent with FieldIdCodec
+                    numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
+                    strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
+
+                    let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
+                    let document = DocumentId::from_be_bytes(document);
+
+                    // For the other extraction tasks, prefix the key with the field_id and the document_id
+                    numbers_key_buffer.extend_from_slice(docid_bytes);
+                    strings_key_buffer.extend_from_slice(docid_bytes);
+
+                    // We insert the document id on the Del and the Add side if the field exists.
+                    let (ref mut del_exists, ref mut add_exists) =
+                        facet_exists_docids.entry(field_id).or_default();
+                    let (ref mut del_is_null, ref mut add_is_null) =
+                        facet_is_null_docids.entry(field_id).or_default();
+                    let (ref mut del_is_empty, ref mut add_is_empty) =
+                        facet_is_empty_docids.entry(field_id).or_default();
+
+                    if del_value.is_some() {
+                        del_exists.insert(document);
+                    }
+                    if add_value.is_some() {
+                        add_exists.insert(document);
+                    }
+
+                    let del_geo_support = settings_diff
+                        .old
+                        .geo_fields_ids
+                        .map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
+                    let add_geo_support = settings_diff
+                        .new
+                        .geo_fields_ids
+                        .map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
+                    let del_filterable_values =
+                        del_value.map(|value| extract_facet_values(&value, del_geo_support));
+                    let add_filterable_values =
+                        add_value.map(|value| extract_facet_values(&value, add_geo_support));
+
+                    // Those closures are just here to simplify things a bit.
+                    let mut insert_numbers_diff = |del_numbers, add_numbers| {
+                        insert_numbers_diff(
+                            &mut fid_docid_facet_numbers_sorter,
+                            &mut numbers_key_buffer,
+                            del_numbers,
+                            add_numbers,
+                        )
+                    };
+                    let mut insert_strings_diff = |del_strings, add_strings| {
+                        insert_strings_diff(
+                            &mut fid_docid_facet_strings_sorter,
+                            &mut strings_key_buffer,
+                            del_strings,
+                            add_strings,
+                        )
+                    };
+
+                    match (del_filterable_values, add_filterable_values) {
+                        (None, None) => (),
+                        (Some(del_filterable_values), None) => match del_filterable_values {
+                            Null => {
+                                del_is_null.insert(document);
+                            }
+                            Empty => {
+                                del_is_empty.insert(document);
+                            }
+                            Values { numbers, strings } => {
+                                insert_numbers_diff(numbers, vec![])?;
+                                insert_strings_diff(strings, vec![])?;
+                            }
+                        },
+                        (None, Some(add_filterable_values)) => match add_filterable_values {
+                            Null => {
+                                add_is_null.insert(document);
+                            }
+                            Empty => {
+                                add_is_empty.insert(document);
+                            }
+                            Values { numbers, strings } => {
+                                insert_numbers_diff(vec![], numbers)?;
+                                insert_strings_diff(vec![], strings)?;
+                            }
+                        },
+                        (Some(del_filterable_values), Some(add_filterable_values)) => {
+                            match (del_filterable_values, add_filterable_values) {
+                                (Null, Null) | (Empty, Empty) => (),
+                                (Null, Empty) => {
+                                    del_is_null.insert(document);
+                                    add_is_empty.insert(document);
+                                }
+                                (Empty, Null) => {
+                                    del_is_empty.insert(document);
+                                    add_is_null.insert(document);
+                                }
+                                (Null, Values { numbers, strings }) => {
+                                    insert_numbers_diff(vec![], numbers)?;
+                                    insert_strings_diff(vec![], strings)?;
+                                    del_is_null.insert(document);
+                                }
+                                (Empty, Values { numbers, strings }) => {
+                                    insert_numbers_diff(vec![], numbers)?;
+                                    insert_strings_diff(vec![], strings)?;
+                                    del_is_empty.insert(document);
+                                }
+                                (Values { numbers, strings }, Null) => {
+                                    add_is_null.insert(document);
+                                    insert_numbers_diff(numbers, vec![])?;
+                                    insert_strings_diff(strings, vec![])?;
+                                }
+                                (Values { numbers, strings }, Empty) => {
+                                    add_is_empty.insert(document);
+                                    insert_numbers_diff(numbers, vec![])?;
+                                    insert_strings_diff(strings, vec![])?;
+                                }
+                                (
+                                    Values { numbers: del_numbers, strings: del_strings },
+                                    Values { numbers: add_numbers, strings: add_strings },
+                                ) => {
+                                    insert_numbers_diff(del_numbers, add_numbers)?;
+                                    insert_strings_diff(del_strings, add_strings)?;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
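Reviewer sketch, not part of the patch: a self-contained illustration of the `merge_join_by` traversal the rewritten extractor relies on. With both field-id sets sorted, `Left` means the field is faceted only in the old settings (Del side), `Right` only in the new settings (Add side), and `Both` in both. The field ids 1, 2, 3 are made up; `u16` mirrors milli's `FieldId`.

    use std::collections::BTreeSet;

    use itertools::{merge_join_by, EitherOrBoth};

    fn main() {
        let old: BTreeSet<u16> = BTreeSet::from([1, 2]);
        let new: BTreeSet<u16> = BTreeSet::from([2, 3]);

        for eob in merge_join_by(old.iter(), new.iter(), |old, new| old.cmp(new)) {
            match eob {
                // field 1: no longer faceted -> only the Deletion side is extracted
                EitherOrBoth::Left(&fid) => println!("field {fid}: deletion only"),
                // field 3: newly faceted -> only the Addition side is extracted
                EitherOrBoth::Right(&fid) => println!("field {fid}: addition only"),
                // field 2: faceted before and after -> diff the Del side against the Add side
                EitherOrBoth::Both(&fid, _) => println!("field {fid}: del/add diff"),
            }
        }
    }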
diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index 617338f9f..5a9363942 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -26,11 +26,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
     indexer: GrenadParameters,
     settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
-    let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
-    let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
-
     // early return if the data shouldn't be deleted nor created.
-    if !any_deletion && !any_addition {
+    if settings_diff.settings_update_only && !settings_diff.reindex_proximities() {
         let writer = create_writer(
             indexer.chunk_compression_type,
             indexer.chunk_compression_level,
@@ -39,8 +36,10 @@
         return writer_into_reader(writer);
     }
 
-    let max_memory = indexer.max_memory_by_thread();
+    let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
+    let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
 
+    let max_memory = indexer.max_memory_by_thread();
     let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
         .map(|_| {
             create_sorter(
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index afae8973a..2420463b4 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -369,6 +369,7 @@ where
 
         // Run extraction pipeline in parallel.
         pool.install(|| {
+            let settings_diff_cloned = settings_diff.clone();
             rayon::spawn(move || {
                 let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks");
                 let _enter = child_span.enter();
@@ -398,7 +399,7 @@ where
                     pool_params,
                     lmdb_writer_sx.clone(),
                     primary_key_id,
-                    settings_diff.clone(),
+                    settings_diff_cloned,
                     max_positions_per_attributes,
                 )
             });
@@ -425,7 +426,7 @@ where
                         Err(status) => {
                             if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
                                 let (docids, is_merged_database) =
-                                    write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?;
+                                    write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?;
                                 if !docids.is_empty() {
                                     final_documents_ids |= docids;
                                     let documents_seen_count = final_documents_ids.len();
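Reviewer sketch, not part of the patch: the hoisted clone above follows the usual move-closure pattern. `rayon::spawn(move || ...)` takes ownership of everything it captures, so a clone must be taken before the spawn when the surrounding scope still needs the value afterwards (here, for `write_typed_chunk_into_index`). A minimal runnable analogue, with a plain thread and an `Arc<String>` standing in for the settings diff:

    use std::sync::Arc;
    use std::thread;

    fn main() {
        let settings_diff = Arc::new(String::from("settings diff"));

        // Clone first: the `move` closure consumes `settings_diff_cloned`...
        let settings_diff_cloned = settings_diff.clone();
        let handle = thread::spawn(move || {
            println!("extraction side sees: {settings_diff_cloned}");
        });

        // ...while the original stays usable on the writing side.
        println!("write side sees: {settings_diff}");
        handle.join().unwrap();
    }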
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 41a0a55cf..c34b7876a 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -20,7 +20,10 @@ use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::{db_name, main_key};
-use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
+use crate::update::del_add::{
+    into_del_add_obkv, into_del_add_obkv_conditional_operation, DelAdd, DelAddOperation,
+    KvReaderDelAdd,
+};
 use crate::update::index_documents::GrenadParameters;
 use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
@@ -805,13 +808,15 @@ impl<'a, 'i> Transform<'a, 'i> {
         let mut new_inner_settings = old_inner_settings.clone();
         new_inner_settings.fields_ids_map = fields_ids_map;
 
-        let settings_diff = InnerIndexSettingsDiff {
-            old: old_inner_settings,
-            new: new_inner_settings,
+        let embedding_configs_updated = false;
+        let settings_update_only = false;
+        let settings_diff = InnerIndexSettingsDiff::new(
+            old_inner_settings,
+            new_inner_settings,
             primary_key_id,
-            embedding_configs_updated: false,
-            settings_update_only: false,
-        };
+            embedding_configs_updated,
+            settings_update_only,
+        );
 
         Ok(TransformOutput {
             primary_key,
@@ -840,14 +845,6 @@ impl<'a, 'i> Transform<'a, 'i> {
         // Always keep the primary key.
         let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) };
 
-        // If only the `searchableAttributes` has been changed, keep only the searchable fields.
-        let must_reindex_searchables = settings_diff.reindex_searchable();
-        let necessary_searchable_field = |id: FieldId| -> bool {
-            must_reindex_searchables
-                && (settings_diff.old.searchable_fields_ids.contains(&id)
-                    || settings_diff.new.searchable_fields_ids.contains(&id))
-        };
-
         // If only a faceted field has been added, keep only this field.
         let must_reindex_facets = settings_diff.reindex_facets();
         let necessary_faceted_field = |id: FieldId| -> bool {
@@ -862,13 +859,16 @@ impl<'a, 'i> Transform<'a, 'i> {
         // we need the fields for the prompt/templating.
         let reindex_vectors = settings_diff.reindex_vectors();
 
+        // The operations that we must perform on the different fields.
+        let mut operations = HashMap::new();
+
         let mut obkv_writer = KvWriter::<_, FieldId>::memory();
         for (id, val) in old_obkv.iter() {
-            if is_primary_key(id)
-                || necessary_searchable_field(id)
-                || necessary_faceted_field(id)
-                || reindex_vectors
-            {
+            if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors {
+                operations.insert(id, DelAddOperation::DeletionAndAddition);
+                obkv_writer.insert(id, val)?;
+            } else if let Some(operation) = settings_diff.reindex_searchable_id(id) {
+                operations.insert(id, operation);
                 obkv_writer.insert(id, val)?;
             }
         }
@@ -887,11 +887,9 @@ impl<'a, 'i> Transform<'a, 'i> {
             let flattened = flattened.as_deref().map_or(obkv, KvReader::new);
 
             flattened_obkv_buffer.clear();
-            into_del_add_obkv(
-                flattened,
-                DelAddOperation::DeletionAndAddition,
-                flattened_obkv_buffer,
-            )?;
+            into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| {
+                operations.get(&id).copied().unwrap_or(DelAddOperation::DeletionAndAddition)
+            })?;
         }
 
         Ok(())
@@ -901,6 +899,11 @@ impl<'a, 'i> Transform<'a, 'i> {
     /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument.
     ///
     // TODO this can be done in parallel by using the rayon `ThreadPool`.
+    #[tracing::instrument(
+        level = "trace",
+        skip(self, wtxn, settings_diff),
+        target = "indexing::documents"
+    )]
     pub fn prepare_for_documents_reindexing(
         self,
         wtxn: &mut heed::RwTxn<'i>,
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 2ef7a8990..2fbe91685 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -7,7 +7,7 @@ use bytemuck::allocation::pod_collect_to_vec;
 use charabia::{Language, Script};
 use grenad::{Merger, MergerBuilder};
 use heed::types::Bytes;
-use heed::RwTxn;
+use heed::{BytesDecode, RwTxn};
 use obkv::{KvReader, KvWriter};
 use roaring::RoaringBitmap;
 
@@ -20,13 +20,16 @@ use super::MergeFn;
 use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
 use crate::index::db_name::DOCUMENTS;
+use crate::proximity::MAX_DISTANCE;
 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{
     as_cloneable_grenad, keep_latest_obkv, try_split_array_at,
 };
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{
-    lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError,
+    lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
+    Result, SerializationError, U8StrStrCodec,
 };
 
 /// This struct accumulates and group the TypedChunks
@@ -122,9 +125,10 @@ impl TypedChunk {
 /// Return new documents seen.
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
 pub(crate) fn write_typed_chunk_into_index(
-    typed_chunks: Vec<TypedChunk>,
-    index: &Index,
     wtxn: &mut RwTxn,
+    index: &Index,
+    settings_diff: &InnerIndexSettingsDiff,
+    typed_chunks: Vec<TypedChunk>,
 ) -> Result<(RoaringBitmap, bool)> {
     let mut is_merged_database = false;
     match typed_chunks[0] {
@@ -485,13 +489,22 @@ pub(crate) fn write_typed_chunk_into_index(
             }
             let merger = builder.build();
 
-            write_entries_into_database(
-                merger,
-                &index.word_pair_proximity_docids,
-                wtxn,
-                deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
-            )?;
+            if settings_diff.only_additional_fields.is_some() {
+                write_proximity_entries_into_database_additional_searchables(
+                    merger,
+                    &index.word_pair_proximity_docids,
+                    wtxn,
+                )?;
+            } else {
+                write_entries_into_database(
+                    merger,
+                    &index.word_pair_proximity_docids,
+                    wtxn,
+                    deladd_serialize_add_side,
+                    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+                )?;
+            }
+
             is_merged_database = true;
         }
         TypedChunk::FieldIdDocidFacetNumbers(_) => {
@@ -830,3 +843,51 @@ where
     }
     Ok(())
 }
+
+/// Akin to the `write_entries_into_database` function but specialized
+/// for the case when we only index additional searchable fields.
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
+fn write_proximity_entries_into_database_additional_searchables<R>(
+    merger: Merger<R, MergeFn>,
+    database: &heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
+    wtxn: &mut RwTxn,
+) -> Result<()>
+where
+    R: io::Read + io::Seek,
+{
+    let mut iter = merger.into_stream_merger_iter()?;
+    while let Some((key, value)) = iter.next()? {
+        if valid_lmdb_key(key) {
+            let (proximity_to_insert, word1, word2) =
+                U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?;
+            let data_to_insert = match KvReaderDelAdd::new(value).get(DelAdd::Addition) {
+                Some(value) => {
+                    CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
+                }
+                None => continue,
+            };
+
+            let mut data_to_remove = RoaringBitmap::new();
+            for prox in 1..(MAX_DISTANCE as u8) {
+                let key = (prox, word1, word2);
+                let database_value = database.get(wtxn, &key)?.unwrap_or_default();
+                let value = if prox == proximity_to_insert {
+                    // Proximity that should be changed.
+                    // Union values and remove lower proximity data
+                    (&database_value | &data_to_insert) - &data_to_remove
+                } else {
+                    // Remove lower proximity data
+                    &database_value - &data_to_remove
+                };
+
+                // add the current data in data_to_remove for the next proximities
+                data_to_remove |= &value;
+
+                if database_value != value {
+                    database.put(wtxn, &key, &value)?;
+                }
+            }
+        }
+    }
+    Ok(())
+}
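Reviewer sketch, not part of the patch: a runnable toy model of the cascade in `write_proximity_entries_into_database_additional_searchables`. Because only the shortest distance matters per word pair, inserting a pair at proximity `p` must also remove the same documents from every proximity greater than `p`; `data_to_remove` accumulates what a lower proximity has already claimed. The in-memory `Vec` stands in for the LMDB database, and `MAX_DISTANCE = 8` is illustrative rather than milli's actual constant.

    use roaring::RoaringBitmap;

    fn main() {
        const MAX_DISTANCE: u8 = 8; // stand-in for milli's constant
        let proximity_to_insert = 2u8;
        let data_to_insert = RoaringBitmap::from_iter([42u32]);

        // Pretend the database already stores docid 42 for this pair at proximity 5.
        let mut database: Vec<RoaringBitmap> = vec![RoaringBitmap::new(); MAX_DISTANCE as usize];
        database[5].insert(42);

        let mut data_to_remove = RoaringBitmap::new();
        for prox in 1..MAX_DISTANCE {
            let database_value = database[prox as usize].clone();
            let value = if prox == proximity_to_insert {
                // proximity being changed: union the new data in, minus
                // whatever a lower proximity already claimed
                (&database_value | &data_to_insert) - &data_to_remove
            } else {
                // higher proximities only lose what lower ones now own
                &database_value - &data_to_remove
            };
            data_to_remove |= &value;
            database[prox as usize] = value;
        }

        assert!(database[2].contains(42)); // inserted at the new, shorter distance
        assert!(!database[5].contains(42)); // dropped from the stale, longer one
    }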
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 24b32b6fa..be9b6b74e 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -9,6 +9,7 @@ use itertools::{EitherOrBoth, Itertools};
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use time::OffsetDateTime;
 
+use super::del_add::DelAddOperation;
 use super::index_documents::{IndexDocumentsConfig, Transform};
 use super::IndexerConfig;
 use crate::criterion::Criterion;
@@ -1072,13 +1073,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             .index
             .primary_key(self.wtxn)?
             .and_then(|name| new_inner_settings.fields_ids_map.id(name));
 
-        let inner_settings_diff = InnerIndexSettingsDiff {
-            old: old_inner_settings,
-            new: new_inner_settings,
+        let settings_update_only = true;
+        let inner_settings_diff = InnerIndexSettingsDiff::new(
+            old_inner_settings,
+            new_inner_settings,
             primary_key_id,
             embedding_configs_updated,
-            settings_update_only: true,
-        };
+            settings_update_only,
+        );
 
         if inner_settings_diff.any_reindexing_needed() {
             self.reindex(&progress_callback, &should_abort, inner_settings_diff)?;
@@ -1095,21 +1097,104 @@ pub struct InnerIndexSettingsDiff {
     // TODO: compare directly the embedders.
     pub(crate) embedding_configs_updated: bool,
     pub(crate) settings_update_only: bool,
+    /// The set of only the additional searchable fields.
+    /// If any other searchable field has been modified, this is set to None.
+    pub(crate) only_additional_fields: Option<HashSet<String>>,
+
+    // Cache the check to see if all the stop_words, allowed_separators, dictionary,
+    // exact_attributes, proximity_precision are different.
+    pub(crate) cache_reindex_searchable_without_user_defined: bool,
+    // Cache the check to see if the user_defined_searchables are different.
+    pub(crate) cache_user_defined_searchables: bool,
+    // Cache the check to see if the exact_attributes are different.
+    pub(crate) cache_exact_attributes: bool,
 }
 
 impl InnerIndexSettingsDiff {
+    #[tracing::instrument(level = "trace", skip_all, target = "indexing::settings")]
+    pub(crate) fn new(
+        old_settings: InnerIndexSettings,
+        new_settings: InnerIndexSettings,
+        primary_key_id: Option<FieldId>,
+        embedding_configs_updated: bool,
+        settings_update_only: bool,
+    ) -> Self {
+        let only_additional_fields = match (
+            &old_settings.user_defined_searchable_fields,
+            &new_settings.user_defined_searchable_fields,
+        ) {
+            (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means *
+            (Some(old), Some(new)) => {
+                let old: HashSet<_> = old.iter().cloned().collect();
+                let new: HashSet<_> = new.iter().cloned().collect();
+                if old.difference(&new).next().is_none() {
+                    // if no field has been removed return only the additional ones
+                    Some(&new - &old).filter(|x| !x.is_empty())
+                } else {
+                    None
+                }
+            }
+        };
+
+        let cache_reindex_searchable_without_user_defined = {
+            old_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
+                != new_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
+                || old_settings.allowed_separators != new_settings.allowed_separators
+                || old_settings.dictionary != new_settings.dictionary
+                || old_settings.proximity_precision != new_settings.proximity_precision
+        };
+
+        let cache_exact_attributes =
+            old_settings.exact_attributes != new_settings.exact_attributes;
+
+        let cache_user_defined_searchables = old_settings.user_defined_searchable_fields
+            != new_settings.user_defined_searchable_fields;
+
+        InnerIndexSettingsDiff {
+            old: old_settings,
+            new: new_settings,
+            primary_key_id,
+            embedding_configs_updated,
+            settings_update_only,
+            only_additional_fields,
+            cache_reindex_searchable_without_user_defined,
+            cache_user_defined_searchables,
+            cache_exact_attributes,
+        }
+    }
+
     pub fn any_reindexing_needed(&self) -> bool {
         self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
     }
 
     pub fn reindex_searchable(&self) -> bool {
-        self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
-            != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
-            || self.old.allowed_separators != self.new.allowed_separators
-            || self.old.dictionary != self.new.dictionary
-            || self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields
-            || self.old.exact_attributes != self.new.exact_attributes
-            || self.old.proximity_precision != self.new.proximity_precision
+        self.cache_reindex_searchable_without_user_defined
+            || self.cache_exact_attributes
+            || self.cache_user_defined_searchables
+    }
+
+    pub fn reindex_proximities(&self) -> bool {
+        // if any searchable settings force the reindexing
+        (self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables)
+        // and if any settings needs the proximity database created
+            && (self.old.proximity_precision == ProximityPrecision::ByAttribute
+                || self.new.proximity_precision == ProximityPrecision::ByAttribute)
+    }
+
+    pub fn reindex_searchable_id(&self, id: FieldId) -> Option<DelAddOperation> {
+        if self.cache_reindex_searchable_without_user_defined || self.cache_exact_attributes {
+            Some(DelAddOperation::DeletionAndAddition)
+        } else if let Some(only_additional_fields) = &self.only_additional_fields {
+            let additional_field = self.new.fields_ids_map.name(id).unwrap();
+            if only_additional_fields.contains(additional_field) {
+                Some(DelAddOperation::Addition)
+            } else {
+                None
+            }
+        } else if self.cache_user_defined_searchables {
+            Some(DelAddOperation::DeletionAndAddition)
+        } else {
+            None
+        }
     }
 
     pub fn reindex_facets(&self) -> bool {
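Reviewer sketch, not part of the patch: a self-contained version of the `only_additional_fields` rule above. The optimization only kicks in when the new user-defined searchable list is a strict superset of the old one; the field names are made up.

    use std::collections::HashSet;

    // Toy version of the computation in `InnerIndexSettingsDiff::new`.
    fn only_additional(old: &[&str], new: &[&str]) -> Option<HashSet<String>> {
        let old: HashSet<String> = old.iter().map(|s| s.to_string()).collect();
        let new: HashSet<String> = new.iter().map(|s| s.to_string()).collect();
        if old.difference(&new).next().is_none() {
            // nothing was removed: return only the additional fields, if any
            Some(&new - &old).filter(|x| !x.is_empty())
        } else {
            None
        }
    }

    fn main() {
        // A purely additive change yields the added fields.
        assert_eq!(
            only_additional(&["title"], &["title", "overview"]),
            Some(HashSet::from(["overview".to_string()]))
        );
        // A removed field disables the optimization entirely.
        assert_eq!(only_additional(&["title", "overview"], &["title"]), None);
        // No change at all also yields None (nothing additional to index).
        assert_eq!(only_additional(&["title"], &["title"]), None);
    }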
@@ -1580,7 +1665,7 @@ mod tests {
         // When we search for something that is not in
         // the searchable fields it must not return any document.
         let result = index.search(&rtxn).query("23").execute().unwrap();
-        assert!(result.documents_ids.is_empty());
+        assert_eq!(result.documents_ids, Vec::<u32>::new());
 
         // When we search for something that is in the searchable fields
         // we must find the appropriate document.
diff --git a/workloads/movies-subset-hf-embeddings.json b/workloads/embeddings-movies-subset-hf.json
similarity index 100%
rename from workloads/movies-subset-hf-embeddings.json
rename to workloads/embeddings-movies-subset-hf.json
diff --git a/workloads/settings-add-embeddings.json b/workloads/embeddings-settings-add.json
similarity index 100%
rename from workloads/settings-add-embeddings.json
rename to workloads/embeddings-settings-add.json