From 0ee69ea07b3b854f18d3da6884e6a20609d812c0 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Mon, 27 Jan 2025 16:40:52 +0100
Subject: [PATCH] refactor searchable extraction

---
 .../extract/searchable/extract_word_docids.rs |  61 ++++---
 .../extract_word_pair_proximity_docids.rs     | 117 ++++++++++++--
 .../src/update/new/extract/searchable/mod.rs  | 149 ++----------------
 .../milli/src/update/new/indexer/extract.rs   |   2 +-
 4 files changed, 149 insertions(+), 180 deletions(-)

diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
index 49259cd64..e6027b204 100644
--- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
@@ -7,6 +7,7 @@ use bumpalo::collections::vec::Vec as BumpVec;
 use bumpalo::Bump;
 use heed::RoTxn;
 
+use super::match_searchable_field;
 use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
 use crate::update::new::extract::cache::BalancedCaches;
 use crate::update::new::extract::perm_json_p::contained_in;
@@ -17,7 +18,6 @@ use crate::update::new::ref_cell_ext::RefCellExt as _;
 use crate::update::new::steps::IndexingStep;
 use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
 use crate::update::new::DocumentChange;
-use crate::update::GrenadParameters;
 use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
 
 const MAX_COUNTED_WORDS: usize = 30;
@@ -207,9 +207,10 @@ impl<'extractor> WordDocidsCaches<'extractor> {
 }
 
 pub struct WordDocidsExtractorData<'a> {
-    tokenizer: &'a DocumentTokenizer<'a>,
-    grenad_parameters: &'a GrenadParameters,
+    tokenizer: DocumentTokenizer<'a>,
+    max_memory_by_thread: Option<usize>,
     buckets: usize,
+    searchable_attributes: Option<Vec<&'a str>>,
 }
 
 impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
@@ -218,7 +219,7 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
     fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
         Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
             self.buckets,
-            self.grenad_parameters.max_memory_by_thread(),
+            self.max_memory_by_thread,
             extractor_alloc,
         ))))
     }
@@ -230,7 +231,12 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
     ) -> Result<()> {
         for change in changes {
             let change = change?;
-            WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?;
+            WordDocidsExtractors::extract_document_change(
+                context,
+                &self.tokenizer,
+                self.searchable_attributes.as_deref(),
+                change,
+            )?;
         }
         Ok(())
     }
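
Note on the struct change above: the extractor data now carries a plain `max_memory_by_thread: Option<usize>` instead of borrowing the whole `GrenadParameters`. A minimal sketch of what that per-thread budget means; the real `GrenadParameters::max_memory_by_thread` lives outside this patch and may compute its figure differently:

// Hypothetical stand-in for GrenadParameters::max_memory_by_thread: split the
// global indexing memory budget evenly across the Rayon worker threads, since
// each thread owns one cache (`buckets = rayon::current_num_threads()`).
fn max_memory_by_thread(max_memory: Option<usize>, threads: usize) -> Option<usize> {
    max_memory.map(|bytes| bytes / threads.max(1))
}

fn main() {
    // e.g. a 2 GiB budget over 8 threads leaves 256 MiB per thread-local cache.
    assert_eq!(max_memory_by_thread(Some(2 << 30), 8), Some(256 << 20));
}
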
@@ -248,52 +254,42 @@ impl WordDocidsExtractors {
     where
         MSP: Fn() -> bool + Sync,
     {
-        let index = indexing_context.index;
-        let rtxn = index.read_txn()?;
-
-        let stop_words = index.stop_words(&rtxn)?;
-        let allowed_separators = index.allowed_separators(&rtxn)?;
+        // Warning: this is duplicated code from extract_word_pair_proximity_docids.rs
+        let rtxn = indexing_context.index.read_txn()?;
+        let stop_words = indexing_context.index.stop_words(&rtxn)?;
+        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
         let allowed_separators: Option<Vec<&str>> =
             allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let dictionary = index.dictionary(&rtxn)?;
+        let dictionary = indexing_context.index.dictionary(&rtxn)?;
         let dictionary: Option<Vec<&str>> =
             dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let builder = tokenizer_builder(
+        let mut builder = tokenizer_builder(
             stop_words.as_ref(),
             allowed_separators.as_deref(),
             dictionary.as_deref(),
         );
-        let tokenizer = builder.into_tokenizer();
-
-        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
-        let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
+        let tokenizer = builder.build();
         let localized_attributes_rules =
-            index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
-
+            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
         let document_tokenizer = DocumentTokenizer {
             tokenizer: &tokenizer,
-            attribute_to_extract: attributes_to_extract.as_deref(),
-            attribute_to_skip: attributes_to_skip.as_slice(),
             localized_attributes_rules: &localized_attributes_rules,
             max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
         };
-
+        let extractor_data = WordDocidsExtractorData {
+            tokenizer: document_tokenizer,
+            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
+            buckets: rayon::current_num_threads(),
+            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
+        };
         let datastore = ThreadLocal::new();
-
         {
             let span =
                 tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
             let _entered = span.enter();
-
-            let extractor = WordDocidsExtractorData {
-                tokenizer: &document_tokenizer,
-                grenad_parameters: indexing_context.grenad_parameters,
-                buckets: rayon::current_num_threads(),
-            };
-
             extract(
                 document_changes,
-                &extractor,
+                &extractor_data,
                 indexing_context,
                 extractor_allocs,
                 &datastore,
@@ -312,6 +308,7 @@ impl WordDocidsExtractors {
     fn extract_document_change(
         context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
         document_tokenizer: &DocumentTokenizer,
+        searchable_attributes: Option<&[&str]>,
         document_change: DocumentChange,
     ) -> Result<()> {
         let index = &context.index;
@@ -345,7 +342,9 @@ impl WordDocidsExtractors {
             }
             DocumentChange::Update(inner) => {
                 if !inner.has_changed_for_fields(
-                    document_tokenizer.attribute_to_extract,
+                    &mut |field_name: &str| {
+                        match_searchable_field(field_name, searchable_attributes)
+                    },
                     &context.rtxn,
                     context.index,
                     context.db_fields_ids_map,
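
With `attributes_to_extract` gone, the `Update` arm above decides whether a changed document needs re-tokenizing by probing each modified field through `match_searchable_field`. A standalone sketch of that fast path, with simplified signatures (the real `has_changed_for_fields` also receives the transaction, index, and fields-ids map):

// Simplified three-state result, mirroring crate::attribute_patterns::PatternMatch.
#[derive(Clone, Copy, PartialEq)]
enum PatternMatch {
    Match,
    Parent,
    NoMatch,
}

// A document is re-extracted only if at least one changed field is searchable,
// or is an ancestor object of a searchable nested field (`Parent` keeps the
// traversal alive instead of pruning it).
fn update_needs_reextraction<'a>(
    changed_fields: impl IntoIterator<Item = &'a str>,
    mut match_searchable_field: impl FnMut(&str) -> PatternMatch,
) -> bool {
    changed_fields
        .into_iter()
        .any(|field| match_searchable_field(field) != PatternMatch::NoMatch)
}

fn main() {
    let only_title = |field: &str| {
        if field == "title" { PatternMatch::Match } else { PatternMatch::NoMatch }
    };
    assert!(update_needs_reextraction(["title", "price"], only_title));
    assert!(!update_needs_reextraction(["price"], only_title));
}
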
diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
index e58c0efd2..0724b0513 100644
--- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
@@ -2,30 +2,114 @@ use std::cell::RefCell;
 use std::collections::VecDeque;
 use std::rc::Rc;
 
-use heed::RoTxn;
+use bumpalo::Bump;
 
-use super::tokenize_document::DocumentTokenizer;
-use super::SearchableExtractor;
+use super::match_searchable_field;
+use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
 use crate::proximity::{index_proximity, MAX_DISTANCE};
 use crate::update::new::document::Document;
 use crate::update::new::extract::cache::BalancedCaches;
-use crate::update::new::indexer::document_changes::DocumentChangeContext;
+use crate::update::new::indexer::document_changes::{
+    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
+};
 use crate::update::new::ref_cell_ext::RefCellExt as _;
+use crate::update::new::steps::IndexingStep;
+use crate::update::new::thread_local::{FullySend, ThreadLocal};
 use crate::update::new::DocumentChange;
-use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
+use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE};
+
+pub struct WordPairProximityDocidsExtractorData<'a> {
+    tokenizer: DocumentTokenizer<'a>,
+    searchable_attributes: Option<Vec<&'a str>>,
+    max_memory_by_thread: Option<usize>,
+    buckets: usize,
+}
+
+impl<'a, 'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractorData<'a> {
+    type Data = RefCell<BalancedCaches<'extractor>>;
+
+    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
+        Ok(RefCell::new(BalancedCaches::new_in(
+            self.buckets,
+            self.max_memory_by_thread,
+            extractor_alloc,
+        )))
+    }
+
+    fn process<'doc>(
+        &self,
+        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
+        context: &DocumentChangeContext<Self::Data>,
+    ) -> Result<()> {
+        for change in changes {
+            let change = change?;
+            WordPairProximityDocidsExtractor::extract_document_change(
+                context,
+                &self.tokenizer,
+                self.searchable_attributes.as_deref(),
+                change,
+            )?;
+        }
+        Ok(())
+    }
+}
 
 pub struct WordPairProximityDocidsExtractor;
 
-impl SearchableExtractor for WordPairProximityDocidsExtractor {
-    fn attributes_to_extract<'a>(
-        rtxn: &'a RoTxn,
-        index: &'a Index,
-    ) -> Result<Option<Vec<&'a str>>> {
-        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
-    }
+impl WordPairProximityDocidsExtractor {
+    pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
+        document_changes: &DC,
+        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
+        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+        step: IndexingStep,
+    ) -> Result<Vec<BalancedCaches<'extractor>>>
+    where
+        MSP: Fn() -> bool + Sync,
+    {
+        // Warning: this is duplicated code from extract_word_docids.rs
+        let rtxn = indexing_context.index.read_txn()?;
+        let stop_words = indexing_context.index.stop_words(&rtxn)?;
+        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
+        let allowed_separators: Option<Vec<&str>> =
+            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let dictionary = indexing_context.index.dictionary(&rtxn)?;
+        let dictionary: Option<Vec<&str>> =
+            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let mut builder = tokenizer_builder(
+            stop_words.as_ref(),
+            allowed_separators.as_deref(),
+            dictionary.as_deref(),
+        );
+        let tokenizer = builder.build();
+        let localized_attributes_rules =
+            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
+        let document_tokenizer = DocumentTokenizer {
+            tokenizer: &tokenizer,
+            localized_attributes_rules: &localized_attributes_rules,
+            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
+        };
+        let extractor_data = WordPairProximityDocidsExtractorData {
+            tokenizer: document_tokenizer,
+            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
+            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
+            buckets: rayon::current_num_threads(),
+        };
+        let datastore = ThreadLocal::new();
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
+            let _entered = span.enter();
+            extract(
+                document_changes,
+                &extractor_data,
+                indexing_context,
+                extractor_allocs,
+                &datastore,
+                step,
+            )?;
+        }
 
-    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
-        Ok(Vec::new())
+        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
     }
 
     // This method is reimplemented to count the number of words in the document in each field
@@ -34,6 +118,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
     fn extract_document_change(
         context: &DocumentChangeContext<RefCell<BalancedCaches>>,
         document_tokenizer: &DocumentTokenizer,
+        searchable_attributes: Option<&[&str]>,
         document_change: DocumentChange,
     ) -> Result<()> {
         let doc_alloc = &context.doc_alloc;
@@ -71,7 +156,9 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
             }
             DocumentChange::Update(inner) => {
                 if !inner.has_changed_for_fields(
-                    document_tokenizer.attribute_to_extract,
+                    &mut |field_name: &str| {
+                        match_searchable_field(field_name, searchable_attributes)
+                    },
                     rtxn,
                     index,
                     context.db_fields_ids_map,
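
Both files now share the same (deliberately duplicated) scaffolding: one `BalancedCaches` per worker thread, created lazily through `init_data` and drained after `extract` returns. A reduced sketch of why the final `RefCell::into_inner` collect is sound; the `Cache` type is a stand-in for `BalancedCaches`:

use std::cell::RefCell;

// Stand-in for one thread's BalancedCaches.
struct Cache {
    entries: Vec<(String, u32)>,
}

// Mirrors `datastore.into_iter().map(RefCell::into_inner).collect()`: once the
// parallel extraction phase has finished, each per-thread RefCell is consumed
// by value, so no borrow can still be alive and the caches move out cheaply
// for the merge phase.
fn drain_datastore(datastore: impl IntoIterator<Item = RefCell<Cache>>) -> Vec<Cache> {
    datastore.into_iter().map(RefCell::into_inner).collect()
}

fn main() {
    let datastore = vec![RefCell::new(Cache { entries: vec![("doggo".into(), 1)] })];
    let caches = drain_datastore(datastore);
    assert_eq!(caches.len(), 1);
}
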
diff --git a/crates/milli/src/update/new/extract/searchable/mod.rs b/crates/milli/src/update/new/extract/searchable/mod.rs
index 7c949a3ce..28cd265ae 100644
--- a/crates/milli/src/update/new/extract/searchable/mod.rs
+++ b/crates/milli/src/update/new/extract/searchable/mod.rs
@@ -2,145 +2,28 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod tokenize_document;
 
-use std::cell::RefCell;
-use std::marker::PhantomData;
-
-use bumpalo::Bump;
 pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
 pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
-use heed::RoTxn;
-use tokenize_document::{tokenizer_builder, DocumentTokenizer};
 
-use super::cache::BalancedCaches;
-use super::DocidsExtractor;
-use crate::update::new::indexer::document_changes::{
-    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
-};
-use crate::update::new::steps::IndexingStep;
-use crate::update::new::thread_local::{FullySend, ThreadLocal};
-use crate::update::new::DocumentChange;
-use crate::update::GrenadParameters;
-use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::attribute_patterns::{match_field_legacy, PatternMatch};
 
-pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
-    tokenizer: &'a DocumentTokenizer<'a>,
-    grenad_parameters: &'a GrenadParameters,
-    buckets: usize,
-    _ex: PhantomData<EX>,
-}
+pub fn match_searchable_field(
+    field_name: &str,
+    searchable_fields: Option<&[&str]>,
+) -> PatternMatch {
+    let Some(searchable_fields) = searchable_fields else {
+        // If no searchable fields are provided, consider all fields as searchable
+        return PatternMatch::Match;
+    };
 
-impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
-    for SearchableExtractorData<'a, EX>
-{
-    type Data = RefCell<BalancedCaches<'extractor>>;
-
-    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
-        Ok(RefCell::new(BalancedCaches::new_in(
-            self.buckets,
-            self.grenad_parameters.max_memory_by_thread(),
-            extractor_alloc,
-        )))
-    }
-
-    fn process<'doc>(
-        &self,
-        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
-        context: &DocumentChangeContext<Self::Data>,
-    ) -> Result<()> {
-        for change in changes {
-            let change = change?;
-            EX::extract_document_change(context, self.tokenizer, change)?;
+    let mut selection = PatternMatch::NoMatch;
+    for pattern in searchable_fields {
+        match match_field_legacy(&pattern, field_name) {
+            PatternMatch::Match => return PatternMatch::Match,
+            PatternMatch::Parent => selection = PatternMatch::Parent,
+            PatternMatch::NoMatch => (),
         }
-        Ok(())
-    }
-}
-
-pub trait SearchableExtractor: Sized + Sync {
-    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
-        document_changes: &DC,
-        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
-        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
-        step: IndexingStep,
-    ) -> Result<Vec<BalancedCaches<'extractor>>>
-    where
-        MSP: Fn() -> bool + Sync,
-    {
-        let rtxn = indexing_context.index.read_txn()?;
-        let stop_words = indexing_context.index.stop_words(&rtxn)?;
-        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
-        let allowed_separators: Option<Vec<&str>> =
-            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let dictionary = indexing_context.index.dictionary(&rtxn)?;
-        let dictionary: Option<Vec<&str>> =
-            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let mut builder = tokenizer_builder(
-            stop_words.as_ref(),
-            allowed_separators.as_deref(),
-            dictionary.as_deref(),
-        );
-        let tokenizer = builder.build();
-
-        let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
-        let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
-        let localized_attributes_rules =
-            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
-
-        let document_tokenizer = DocumentTokenizer {
-            tokenizer: &tokenizer,
-            attribute_to_extract: attributes_to_extract.as_deref(),
-            attribute_to_skip: attributes_to_skip.as_slice(),
-            localized_attributes_rules: &localized_attributes_rules,
-            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
-        };
-
-        let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
-            tokenizer: &document_tokenizer,
-            grenad_parameters: indexing_context.grenad_parameters,
-            buckets: rayon::current_num_threads(),
-            _ex: PhantomData,
-        };
-
-        let datastore = ThreadLocal::new();
-
-        {
-            let span =
-                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
-            let _entered = span.enter();
-            extract(
-                document_changes,
-                &extractor_data,
-                indexing_context,
-                extractor_allocs,
-                &datastore,
-                step,
-            )?;
-        }
-
-        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
     }
 
-    fn extract_document_change(
-        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
-        document_tokenizer: &DocumentTokenizer,
-        document_change: DocumentChange,
-    ) -> Result<()>;
-
-    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
-        -> Result<Option<Vec<&'a str>>>;
-
-    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
-}
-
-impl<T: SearchableExtractor> DocidsExtractor for T {
-    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
-        document_changes: &DC,
-        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
-        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
-        step: IndexingStep,
-    ) -> Result<Vec<BalancedCaches<'extractor>>>
-    where
-        MSP: Fn() -> bool + Sync,
-    {
-        Self::run_extraction(document_changes, indexing_context, extractor_allocs, step)
-    }
+    selection
 }
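
For reference, the selection logic introduced above, as a self-contained sketch. `match_field_legacy` here is a hypothetical stand-in for `crate::attribute_patterns::match_field_legacy`, whose exact pattern rules live outside this patch; the assumed legacy semantics are that a declared attribute matches itself and its nested subfields, while a field that is an ancestor object of a declared attribute reports `Parent` so traversal keeps descending:

// Stand-ins for crate::attribute_patterns items (assumed semantics).
#[derive(Debug, Clone, Copy, PartialEq)]
enum PatternMatch {
    Match,
    Parent,
    NoMatch,
}

fn match_field_legacy(pattern: &str, field_name: &str) -> PatternMatch {
    if field_name == pattern || field_name.starts_with(&format!("{pattern}.")) {
        // `title` matches `title` itself and nested fields like `title.fr`.
        PatternMatch::Match
    } else if pattern.starts_with(&format!("{field_name}.")) {
        // `doggo` is a parent object of the declared `doggo.name`.
        PatternMatch::Parent
    } else {
        PatternMatch::NoMatch
    }
}

// Same shape as the function added in mod.rs: `Match` wins immediately,
// `Parent` is only kept as a fallback so that objects containing searchable
// subfields are still traversed, and `NoMatch` is the default.
fn match_searchable_field(field_name: &str, searchable_fields: Option<&[&str]>) -> PatternMatch {
    let Some(searchable_fields) = searchable_fields else {
        // If no searchable fields are provided, consider all fields as searchable
        return PatternMatch::Match;
    };

    let mut selection = PatternMatch::NoMatch;
    for pattern in searchable_fields {
        match match_field_legacy(pattern, field_name) {
            PatternMatch::Match => return PatternMatch::Match,
            PatternMatch::Parent => selection = PatternMatch::Parent,
            PatternMatch::NoMatch => (),
        }
    }

    selection
}

fn main() {
    let fields = Some(&["title", "doggo.name"][..]);
    assert_eq!(match_searchable_field("title.fr", fields), PatternMatch::Match);
    assert_eq!(match_searchable_field("doggo", fields), PatternMatch::Parent);
    assert_eq!(match_searchable_field("price", fields), PatternMatch::NoMatch);
    assert_eq!(match_searchable_field("anything", None), PatternMatch::Match);
}
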
diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs
index 63536c559..1aaf849e8 100644
--- a/crates/milli/src/update/new/indexer/extract.rs
+++ b/crates/milli/src/update/new/indexer/extract.rs
@@ -198,7 +198,7 @@ where
                 let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
                 let _entered = span.enter();
 
-                <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
+                WordPairProximityDocidsExtractor::run_extraction(
                     document_changes,
                     indexing_context,
                     extractor_allocs,
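
The call-site change above works because `run_extraction` is now an inherent method on `WordPairProximityDocidsExtractor` rather than something only reachable through the removed blanket `impl<T: SearchableExtractor> DocidsExtractor for T`. A minimal illustration of the resolution rule at play, with toy types unrelated to the real signatures:

trait DocidsExtractor {
    fn run_extraction() -> &'static str;
}

struct WordPairProximityDocidsExtractor;

impl DocidsExtractor for WordPairProximityDocidsExtractor {
    fn run_extraction() -> &'static str {
        "through the trait"
    }
}

impl WordPairProximityDocidsExtractor {
    fn run_extraction() -> &'static str {
        "inherent"
    }
}

fn main() {
    // Inherent associated functions take precedence in path resolution, so the
    // plain call no longer needs the `<Type as Trait>::` qualification that the
    // old call site used.
    assert_eq!(WordPairProximityDocidsExtractor::run_extraction(), "inherent");
    assert_eq!(
        <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(),
        "through the trait"
    );
}
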