diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 899655db1..1644b2254 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -1,7 +1,10 @@ use bumpalo::Bump; use heed::RoTxn; -use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions}; +use super::document::{ + Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, +}; +use super::extract::perm_json_p; use super::vector_document::{ MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, }; @@ -164,6 +167,80 @@ impl<'doc> Update<'doc> { } } + /// Returns whether the updated version of the document is different from the current version for the passed subset of fields. + /// + /// `true` if at least one top-level-field that is a exactly a member of field or a parent of a member of field changed. + /// Otherwise `false`. + pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>( + &self, + fields: Option<&[&str]>, + rtxn: &'t RoTxn, + index: &'t Index, + mapper: &'t Mapper, + ) -> Result { + let mut changed = false; + let mut cached_current = None; + let mut updated_selected_field_count = 0; + + for entry in self.updated().iter_top_level_fields() { + let (key, updated_value) = entry?; + + if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + continue; + } + + updated_selected_field_count += 1; + let current = match cached_current { + Some(current) => current, + None => self.current(rtxn, index, mapper)?, + }; + let current_value = current.top_level_field(key)?; + let Some(current_value) = current_value else { + changed = true; + break; + }; + + if current_value.get() != updated_value.get() { + changed = true; + break; + } + cached_current = Some(current); + } + + if !self.has_deletion { + // no field deletion, so fields that don't appear in `updated` cannot have changed + return Ok(changed); + } + + if changed { + return Ok(true); + } + + // we saw all updated fields, and set `changed` if any field wasn't in `current`. + // so if there are as many fields in `current` as in `updated`, then nothing changed. + // If there is any more fields in `current`, then they are missing in `updated`. + let has_deleted_fields = { + let current = match cached_current { + Some(current) => current, + None => self.current(rtxn, index, mapper)?, + }; + + let mut current_selected_field_count = 0; + for entry in current.iter_top_level_fields() { + let (key, _) = entry?; + + if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + continue; + } + current_selected_field_count += 1; + } + + current_selected_field_count != updated_selected_field_count + }; + + Ok(has_deleted_fields) + } + pub fn updated_vectors( &self, doc_alloc: &'doc Bump, diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index f2132ce38..b865d0a35 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -97,6 +97,15 @@ impl FacetedDocidsExtractor { }, ), DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + Some(attributes_to_extract), + rtxn, + index, + context.db_fields_ids_map, + )? { + return Ok(()); + } + extract_document_facets( attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 05e2374dc..06fb747c6 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -351,6 +351,15 @@ impl WordDocidsExtractors { )?; } DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + document_tokenizer.attribute_to_extract, + &context.rtxn, + context.index, + context.db_fields_ids_map, + )? { + return Ok(()); + } + let mut token_fn = |fname: &str, fid, pos, word: &str| { cached_sorter.insert_del_u32( fid, diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index dcd9e3a78..e58c0efd2 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -70,6 +70,15 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { )?; } DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + document_tokenizer.attribute_to_extract, + rtxn, + index, + context.db_fields_ids_map, + )? { + return Ok(()); + } + let document = inner.current(rtxn, index, context.db_fields_ids_map)?; process_document_tokens( document,