From c77073efcca508df72eede587401fd94235cfd4c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Dec 2024 15:50:12 +0100 Subject: [PATCH 1/5] Update::has_changed_for_fields --- .../milli/src/update/new/document_change.rs | 79 ++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 899655db1..1644b2254 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -1,7 +1,10 @@ use bumpalo::Bump; use heed::RoTxn; -use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions}; +use super::document::{ + Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, +}; +use super::extract::perm_json_p; use super::vector_document::{ MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, }; @@ -164,6 +167,80 @@ impl<'doc> Update<'doc> { } } + /// Returns whether the updated version of the document is different from the current version for the passed subset of fields. + /// + /// `true` if at least one top-level-field that is a exactly a member of field or a parent of a member of field changed. + /// Otherwise `false`. + pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>( + &self, + fields: Option<&[&str]>, + rtxn: &'t RoTxn, + index: &'t Index, + mapper: &'t Mapper, + ) -> Result { + let mut changed = false; + let mut cached_current = None; + let mut updated_selected_field_count = 0; + + for entry in self.updated().iter_top_level_fields() { + let (key, updated_value) = entry?; + + if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + continue; + } + + updated_selected_field_count += 1; + let current = match cached_current { + Some(current) => current, + None => self.current(rtxn, index, mapper)?, + }; + let current_value = current.top_level_field(key)?; + let Some(current_value) = current_value else { + changed = true; + break; + }; + + if current_value.get() != updated_value.get() { + changed = true; + break; + } + cached_current = Some(current); + } + + if !self.has_deletion { + // no field deletion, so fields that don't appear in `updated` cannot have changed + return Ok(changed); + } + + if changed { + return Ok(true); + } + + // we saw all updated fields, and set `changed` if any field wasn't in `current`. + // so if there are as many fields in `current` as in `updated`, then nothing changed. + // If there is any more fields in `current`, then they are missing in `updated`. + let has_deleted_fields = { + let current = match cached_current { + Some(current) => current, + None => self.current(rtxn, index, mapper)?, + }; + + let mut current_selected_field_count = 0; + for entry in current.iter_top_level_fields() { + let (key, _) = entry?; + + if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + continue; + } + current_selected_field_count += 1; + } + + current_selected_field_count != updated_selected_field_count + }; + + Ok(has_deleted_fields) + } + pub fn updated_vectors( &self, doc_alloc: &'doc Bump, From c77b00d3ac9c9ed893ae7be940a57eebd3efd338 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Dec 2024 15:51:58 +0100 Subject: [PATCH 2/5] Don't extract word docids when no searchable changed --- .../new/extract/searchable/extract_word_docids.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 05e2374dc..39f67e417 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -8,8 +8,9 @@ use bumpalo::Bump; use heed::RoTxn; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; +use crate::update::new::document::Document as _; use crate::update::new::extract::cache::BalancedCaches; -use crate::update::new::extract::perm_json_p::contained_in; +use crate::update::new::extract::perm_json_p::{self, contained_in}; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, }; @@ -351,6 +352,15 @@ impl WordDocidsExtractors { )?; } DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + document_tokenizer.attribute_to_extract, + &context.rtxn, + context.index, + context.db_fields_ids_map, + )? { + return Ok(()); + } + let mut token_fn = |fname: &str, fid, pos, word: &str| { cached_sorter.insert_del_u32( fid, From 2b74d1824bca13d92757433391fd0f7ad0fabb43 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Dec 2024 15:56:22 +0100 Subject: [PATCH 3/5] Ignore documents that didn't change any field in word pair proximity --- .../searchable/extract_word_pair_proximity_docids.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index dcd9e3a78..e58c0efd2 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -70,6 +70,15 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { )?; } DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + document_tokenizer.attribute_to_extract, + rtxn, + index, + context.db_fields_ids_map, + )? { + return Ok(()); + } + let document = inner.current(rtxn, index, context.db_fields_ids_map)?; process_document_tokens( document, From fa8b9acdf6aa932d9a5421ff4f4347f39b280a5e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Dec 2024 16:12:52 +0100 Subject: [PATCH 4/5] Ignore documents that didn't change in facets --- .../src/update/new/extract/faceted/extract_facets.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index f2132ce38..b865d0a35 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -97,6 +97,15 @@ impl FacetedDocidsExtractor { }, ), DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + Some(attributes_to_extract), + rtxn, + index, + context.db_fields_ids_map, + )? { + return Ok(()); + } + extract_document_facets( attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, From bd5110a2fed4c36f8d57c8eba2de94367e750c96 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 5 Dec 2024 16:13:07 +0100 Subject: [PATCH 5/5] Fix clippy warnings --- .../src/update/new/extract/searchable/extract_word_docids.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 39f67e417..06fb747c6 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -8,9 +8,8 @@ use bumpalo::Bump; use heed::RoTxn; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use crate::update::new::document::Document as _; use crate::update::new::extract::cache::BalancedCaches; -use crate::update::new::extract::perm_json_p::{self, contained_in}; +use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, };