mirror of https://github.com/meilisearch/MeiliSearch
synced 2025-02-04 17:43:28 +01:00

refactor searchable extraction

This commit is contained in:
parent 4b926b881d
commit 0ee69ea07b
```diff
@@ -7,6 +7,7 @@ use bumpalo::collections::vec::Vec as BumpVec;
 use bumpalo::Bump;
 use heed::RoTxn;
 
+use super::match_searchable_field;
 use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
 use crate::update::new::extract::cache::BalancedCaches;
 use crate::update::new::extract::perm_json_p::contained_in;
```
```diff
@@ -17,7 +18,6 @@ use crate::update::new::ref_cell_ext::RefCellExt as _;
 use crate::update::new::steps::IndexingStep;
 use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
 use crate::update::new::DocumentChange;
-use crate::update::GrenadParameters;
 use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
 
 const MAX_COUNTED_WORDS: usize = 30;
```
```diff
@@ -207,9 +207,10 @@ impl<'extractor> WordDocidsCaches<'extractor> {
 }
 
 pub struct WordDocidsExtractorData<'a> {
-    tokenizer: &'a DocumentTokenizer<'a>,
-    grenad_parameters: &'a GrenadParameters,
+    tokenizer: DocumentTokenizer<'a>,
+    max_memory_by_thread: Option<usize>,
     buckets: usize,
+    searchable_attributes: Option<Vec<&'a str>>,
 }
 
 impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
```
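Note on the struct change: `WordDocidsExtractorData` now owns its `DocumentTokenizer` and carries a precomputed `max_memory_by_thread` value instead of borrowing the whole `GrenadParameters`. A minimal sketch of that ownership move, with stand-in types that are illustrative rather than MeiliSearch's own:

```rust
// Stand-in types for illustration; not the crate's own definitions.
#[allow(dead_code)]
struct Tokenizer {
    lowercase: bool,
}

// Before: two borrowed fields tie the struct to the scope that built them.
#[allow(dead_code)]
struct BorrowedData<'a> {
    tokenizer: &'a Tokenizer,
    grenad_parameters: &'a [u8], // placeholder for a borrowed parameters struct
}

// After: the tokenizer is owned and only the one precomputed value is kept.
struct OwnedData {
    tokenizer: Tokenizer,
    max_memory_by_thread: Option<usize>,
}

fn main() {
    let data = OwnedData {
        tokenizer: Tokenizer { lowercase: true },
        max_memory_by_thread: Some(64 * 1024 * 1024),
    };
    assert!(data.tokenizer.lowercase);
    println!("per-thread budget: {:?}", data.max_memory_by_thread);
}
```

Owning the tokenizer removes one lifetime dependency from every borrow site, which is what lets the `process` hunk below pass `&self.tokenizer` directly.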
```diff
@@ -218,7 +219,7 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
     fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
         Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
             self.buckets,
-            self.grenad_parameters.max_memory_by_thread(),
+            self.max_memory_by_thread,
             extractor_alloc,
         ))))
     }
```
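With the budget precomputed, `init_data` no longer reaches through `GrenadParameters`. How that per-thread figure is obtained is not shown in this diff; a plausible sketch (an assumption, not the crate's actual formula) is an even split of a global budget across rayon's pool:

```rust
// Assumed derivation, for illustration only: the real
// `GrenadParameters::max_memory_by_thread` may compute this differently.
fn max_memory_by_thread(max_memory: Option<usize>) -> Option<usize> {
    max_memory.map(|total| total / rayon::current_num_threads())
}

fn main() {
    // With a 512 MiB global budget and, say, 8 threads, each gets 64 MiB.
    println!("{:?}", max_memory_by_thread(Some(512 * 1024 * 1024)));
}
```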
```diff
@@ -230,7 +231,12 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
     ) -> Result<()> {
         for change in changes {
             let change = change?;
-            WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?;
+            WordDocidsExtractors::extract_document_change(
+                context,
+                &self.tokenizer,
+                self.searchable_attributes.as_deref(),
+                change,
+            )?;
         }
         Ok(())
     }
```
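The call site hands the owned `Option<Vec<&str>>` down as `Option<&[&str]>` via `as_deref`, avoiding a clone per document change. A standalone illustration of that conversion:

```rust
// `Option::as_deref` borrows through the `Option`: `Vec<T>` derefs to `[T]`,
// so `Option<Vec<&str>>` becomes `Option<&[&str]>` without any allocation.
fn count_fields(fields: Option<&[&str]>) -> usize {
    fields.map_or(0, <[&str]>::len)
}

fn main() {
    let owned: Option<Vec<&str>> = Some(vec!["title", "overview"]);
    assert_eq!(count_fields(owned.as_deref()), 2);
    assert_eq!(count_fields(None), 0);
}
```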
```diff
@@ -248,52 +254,42 @@ impl WordDocidsExtractors {
     where
         MSP: Fn() -> bool + Sync,
     {
-        let index = indexing_context.index;
-        let rtxn = index.read_txn()?;
-
-        let stop_words = index.stop_words(&rtxn)?;
-        let allowed_separators = index.allowed_separators(&rtxn)?;
+        // Warning: this is duplicated code from extract_word_pair_proximity_docids.rs
+        let rtxn = indexing_context.index.read_txn()?;
+        let stop_words = indexing_context.index.stop_words(&rtxn)?;
+        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
         let allowed_separators: Option<Vec<_>> =
             allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let dictionary = index.dictionary(&rtxn)?;
+        let dictionary = indexing_context.index.dictionary(&rtxn)?;
         let dictionary: Option<Vec<_>> =
             dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let builder = tokenizer_builder(
+        let mut builder = tokenizer_builder(
             stop_words.as_ref(),
             allowed_separators.as_deref(),
             dictionary.as_deref(),
         );
-        let tokenizer = builder.into_tokenizer();
-
-        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
-        let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
+        let tokenizer = builder.build();
         let localized_attributes_rules =
-            index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
-
+            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
         let document_tokenizer = DocumentTokenizer {
             tokenizer: &tokenizer,
-            attribute_to_extract: attributes_to_extract.as_deref(),
-            attribute_to_skip: attributes_to_skip.as_slice(),
             localized_attributes_rules: &localized_attributes_rules,
             max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
         };
-
+        let extractor_data = WordDocidsExtractorData {
+            tokenizer: document_tokenizer,
+            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
+            buckets: rayon::current_num_threads(),
+            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
+        };
         let datastore = ThreadLocal::new();
-
         {
             let span =
                 tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
             let _entered = span.enter();
-
-            let extractor = WordDocidsExtractorData {
-                tokenizer: &document_tokenizer,
-                grenad_parameters: indexing_context.grenad_parameters,
-                buckets: rayon::current_num_threads(),
-            };
-
             extract(
                 document_changes,
-                &extractor,
+                &extractor_data,
                 indexing_context,
                 extractor_allocs,
                 &datastore,
```
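Both `run_extraction` bodies (the author flags the duplication explicitly in the comment above) repeat the same settings dance: index settings come back as owned `Option<Vec<String>>`, while the tokenizer builder wants `Option<&[&str]>`. A self-contained rendition of just that conversion:

```rust
fn main() {
    // Owned settings, as a database lookup would return them.
    let allowed_separators: Option<Vec<String>> =
        Some(vec!["-".to_string(), "_".to_string()]);

    // Re-borrow each String as &str...
    let borrowed: Option<Vec<&str>> =
        allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());

    // ...then hand out a slice view of the whole list.
    let as_slice: Option<&[&str]> = borrowed.as_deref();
    assert_eq!(as_slice, Some(&["-", "_"][..]));
}
```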
```diff
@@ -312,6 +308,7 @@ impl WordDocidsExtractors {
     fn extract_document_change(
         context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
         document_tokenizer: &DocumentTokenizer,
+        searchable_attributes: Option<&[&str]>,
         document_change: DocumentChange,
     ) -> Result<()> {
         let index = &context.index;
```
```diff
@@ -345,7 +342,9 @@
             }
             DocumentChange::Update(inner) => {
                 if !inner.has_changed_for_fields(
-                    document_tokenizer.attribute_to_extract,
+                    &mut |field_name: &str| {
+                        match_searchable_field(field_name, searchable_attributes)
+                    },
                     &context.rtxn,
                     context.index,
                     context.db_fields_ids_map,
```
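`has_changed_for_fields` now takes a predicate closure instead of the tokenizer's flat `attribute_to_extract` list, so the matching policy lives in one place (`match_searchable_field`). A simplified sketch of that inversion, with hypothetical stand-ins for the crate's types:

```rust
// Hypothetical stand-ins: the real `has_changed_for_fields` also consults
// the transaction, index, and fields-ids map.
#[derive(Clone, Copy, PartialEq, Debug)]
enum PatternMatch {
    Match,
    NoMatch,
}

fn has_changed_for_fields(
    changed_fields: &[&str],
    is_searchable: &mut impl FnMut(&str) -> PatternMatch,
) -> bool {
    // Only a change to a searchable field should trigger re-extraction.
    changed_fields.iter().any(|&field| is_searchable(field) == PatternMatch::Match)
}

fn main() {
    let searchable = ["title", "overview"];
    let mut predicate = |field: &str| {
        if searchable.contains(&field) { PatternMatch::Match } else { PatternMatch::NoMatch }
    };
    assert!(has_changed_for_fields(&["title", "price"], &mut predicate));
    assert!(!has_changed_for_fields(&["price"], &mut predicate));
}
```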
```diff
@@ -2,30 +2,114 @@ use std::cell::RefCell;
 use std::collections::VecDeque;
 use std::rc::Rc;
 
-use heed::RoTxn;
+use bumpalo::Bump;
 
-use super::tokenize_document::DocumentTokenizer;
-use super::SearchableExtractor;
+use super::match_searchable_field;
+use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
 use crate::proximity::{index_proximity, MAX_DISTANCE};
 use crate::update::new::document::Document;
 use crate::update::new::extract::cache::BalancedCaches;
-use crate::update::new::indexer::document_changes::DocumentChangeContext;
+use crate::update::new::indexer::document_changes::{
+    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
+};
 use crate::update::new::ref_cell_ext::RefCellExt as _;
+use crate::update::new::steps::IndexingStep;
+use crate::update::new::thread_local::{FullySend, ThreadLocal};
 use crate::update::new::DocumentChange;
-use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
+use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE};
 
+pub struct WordPairProximityDocidsExtractorData<'a> {
+    tokenizer: DocumentTokenizer<'a>,
+    searchable_attributes: Option<Vec<&'a str>>,
+    max_memory_by_thread: Option<usize>,
+    buckets: usize,
+}
+
+impl<'a, 'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractorData<'a> {
+    type Data = RefCell<BalancedCaches<'extractor>>;
+
+    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
+        Ok(RefCell::new(BalancedCaches::new_in(
+            self.buckets,
+            self.max_memory_by_thread,
+            extractor_alloc,
+        )))
+    }
+
+    fn process<'doc>(
+        &self,
+        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
+        context: &DocumentChangeContext<Self::Data>,
+    ) -> Result<()> {
+        for change in changes {
+            let change = change?;
+            WordPairProximityDocidsExtractor::extract_document_change(
+                context,
+                &self.tokenizer,
+                self.searchable_attributes.as_deref(),
+                change,
+            )?;
+        }
+        Ok(())
+    }
+}
 
 pub struct WordPairProximityDocidsExtractor;
 
-impl SearchableExtractor for WordPairProximityDocidsExtractor {
-    fn attributes_to_extract<'a>(
-        rtxn: &'a RoTxn,
-        index: &'a Index,
-    ) -> Result<Option<Vec<&'a str>>> {
-        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
-    }
+impl WordPairProximityDocidsExtractor {
+    pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
+        document_changes: &DC,
+        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
+        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+        step: IndexingStep,
+    ) -> Result<Vec<BalancedCaches<'extractor>>>
+    where
+        MSP: Fn() -> bool + Sync,
+    {
+        // Warning: this is duplicated code from extract_word_docids.rs
+        let rtxn = indexing_context.index.read_txn()?;
+        let stop_words = indexing_context.index.stop_words(&rtxn)?;
+        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
+        let allowed_separators: Option<Vec<_>> =
+            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let dictionary = indexing_context.index.dictionary(&rtxn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let mut builder = tokenizer_builder(
+            stop_words.as_ref(),
+            allowed_separators.as_deref(),
+            dictionary.as_deref(),
+        );
+        let tokenizer = builder.build();
+        let localized_attributes_rules =
+            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
+        let document_tokenizer = DocumentTokenizer {
+            tokenizer: &tokenizer,
+            localized_attributes_rules: &localized_attributes_rules,
+            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
+        };
+        let extractor_data = WordPairProximityDocidsExtractorData {
+            tokenizer: document_tokenizer,
+            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
+            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
+            buckets: rayon::current_num_threads(),
+        };
+        let datastore = ThreadLocal::new();
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
+            let _entered = span.enter();
+            extract(
+                document_changes,
+                &extractor_data,
+                indexing_context,
+                extractor_allocs,
+                &datastore,
+                step,
+            )?;
+        }
 
-    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
-        Ok(Vec::new())
+        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
     }
 
     // This method is reimplemented to count the number of words in the document in each field
```
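This whole `Extractor` implementation mirrors the word-docids one: `init_data` builds one cache per worker, wrapped in a `RefCell` so `process` can mutate it behind a shared reference. A reduced, self-contained sketch of that init-once/process-many shape (the trait and types here are stand-ins, not the crate's):

```rust
use std::cell::RefCell;

// Stand-ins for BalancedCaches and the indexing error type.
struct Cache {
    entries: usize,
}

trait Extractor {
    type Data;
    fn init_data(&self) -> Self::Data;
    fn process(
        &self,
        changes: impl Iterator<Item = Result<u32, String>>,
        data: &Self::Data,
    ) -> Result<(), String>;
}

struct ProximityExtractor;

impl Extractor for ProximityExtractor {
    type Data = RefCell<Cache>;

    fn init_data(&self) -> Self::Data {
        RefCell::new(Cache { entries: 0 })
    }

    fn process(
        &self,
        changes: impl Iterator<Item = Result<u32, String>>,
        data: &Self::Data,
    ) -> Result<(), String> {
        for change in changes {
            let _docid = change?; // propagate the first failure, like the real loop
            data.borrow_mut().entries += 1;
        }
        Ok(())
    }
}

fn main() {
    let extractor = ProximityExtractor;
    let data = extractor.init_data();
    extractor.process([Ok(1), Ok(2)].into_iter(), &data).unwrap();
    assert_eq!(data.borrow().entries, 2);
}
```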
```diff
@@ -34,6 +118,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
     fn extract_document_change(
         context: &DocumentChangeContext<RefCell<BalancedCaches>>,
         document_tokenizer: &DocumentTokenizer,
+        searchable_attributes: Option<&[&str]>,
         document_change: DocumentChange,
     ) -> Result<()> {
         let doc_alloc = &context.doc_alloc;
```
```diff
@@ -71,7 +156,9 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
             }
             DocumentChange::Update(inner) => {
                 if !inner.has_changed_for_fields(
-                    document_tokenizer.attribute_to_extract,
+                    &mut |field_name: &str| {
+                        match_searchable_field(field_name, searchable_attributes)
+                    },
                     rtxn,
                     index,
                     context.db_fields_ids_map,
```
```diff
@@ -2,145 +2,28 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod tokenize_document;
 
-use std::cell::RefCell;
-use std::marker::PhantomData;
-
-use bumpalo::Bump;
 pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
 pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
-use heed::RoTxn;
-use tokenize_document::{tokenizer_builder, DocumentTokenizer};
 
-use super::cache::BalancedCaches;
-use super::DocidsExtractor;
-use crate::update::new::indexer::document_changes::{
-    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
-};
-use crate::update::new::steps::IndexingStep;
-use crate::update::new::thread_local::{FullySend, ThreadLocal};
-use crate::update::new::DocumentChange;
-use crate::update::GrenadParameters;
-use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::attribute_patterns::{match_field_legacy, PatternMatch};
 
-pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
-    tokenizer: &'a DocumentTokenizer<'a>,
-    grenad_parameters: &'a GrenadParameters,
-    buckets: usize,
-    _ex: PhantomData<EX>,
-}
+pub fn match_searchable_field(
+    field_name: &str,
+    searchable_fields: Option<&[&str]>,
+) -> PatternMatch {
+    let Some(searchable_fields) = searchable_fields else {
+        // If no searchable fields are provided, consider all fields as searchable
+        return PatternMatch::Match;
+    };
 
-impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
-    for SearchableExtractorData<'a, EX>
-{
-    type Data = RefCell<BalancedCaches<'extractor>>;
-
-    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
-        Ok(RefCell::new(BalancedCaches::new_in(
-            self.buckets,
-            self.grenad_parameters.max_memory_by_thread(),
-            extractor_alloc,
-        )))
-    }
-
-    fn process<'doc>(
-        &self,
-        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
-        context: &DocumentChangeContext<Self::Data>,
-    ) -> Result<()> {
-        for change in changes {
-            let change = change?;
-            EX::extract_document_change(context, self.tokenizer, change)?;
+    let mut selection = PatternMatch::NoMatch;
+    for pattern in searchable_fields {
+        match match_field_legacy(&pattern, field_name) {
+            PatternMatch::Match => return PatternMatch::Match,
+            PatternMatch::Parent => selection = PatternMatch::Parent,
+            PatternMatch::NoMatch => (),
         }
-        Ok(())
     }
-}
 
-pub trait SearchableExtractor: Sized + Sync {
-    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
-        document_changes: &DC,
-        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
-        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
-        step: IndexingStep,
-    ) -> Result<Vec<BalancedCaches<'extractor>>>
-    where
-        MSP: Fn() -> bool + Sync,
-    {
-        let rtxn = indexing_context.index.read_txn()?;
-        let stop_words = indexing_context.index.stop_words(&rtxn)?;
-        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
-        let allowed_separators: Option<Vec<_>> =
-            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let dictionary = indexing_context.index.dictionary(&rtxn)?;
-        let dictionary: Option<Vec<_>> =
-            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-        let mut builder = tokenizer_builder(
-            stop_words.as_ref(),
-            allowed_separators.as_deref(),
-            dictionary.as_deref(),
-        );
-        let tokenizer = builder.build();
-
-        let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
-        let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
-        let localized_attributes_rules =
-            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
-
-        let document_tokenizer = DocumentTokenizer {
-            tokenizer: &tokenizer,
-            attribute_to_extract: attributes_to_extract.as_deref(),
-            attribute_to_skip: attributes_to_skip.as_slice(),
-            localized_attributes_rules: &localized_attributes_rules,
-            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
-        };
-
-        let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
-            tokenizer: &document_tokenizer,
-            grenad_parameters: indexing_context.grenad_parameters,
-            buckets: rayon::current_num_threads(),
-            _ex: PhantomData,
-        };
-
-        let datastore = ThreadLocal::new();
-
-        {
-            let span =
-                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
-            let _entered = span.enter();
-            extract(
-                document_changes,
-                &extractor_data,
-                indexing_context,
-                extractor_allocs,
-                &datastore,
-                step,
-            )?;
-        }
-
-        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
-    }
-
-    fn extract_document_change(
-        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
-        document_tokenizer: &DocumentTokenizer,
-        document_change: DocumentChange,
-    ) -> Result<()>;
-
-    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
-        -> Result<Option<Vec<&'a str>>>;
-
-    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
-}
-
-impl<T: SearchableExtractor> DocidsExtractor for T {
-    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
-        document_changes: &DC,
-        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
-        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
-        step: IndexingStep,
-    ) -> Result<Vec<BalancedCaches<'extractor>>>
-    where
-        MSP: Fn() -> bool + Sync,
-    {
-        Self::run_extraction(document_changes, indexing_context, extractor_allocs, step)
-    }
+    selection
 }
```
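`match_searchable_field` is the heart of the refactor: the first pattern that matches wins, a parent match is remembered as a fallback, and an absent searchable list means everything matches. Below is a self-contained sketch of the same selection loop; `match_field_simple` is a deliberately simplified stand-in for `match_field_legacy`, whose real rules live in `crate::attribute_patterns`:

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
enum PatternMatch {
    Match,
    Parent,
    NoMatch,
}

// Simplified stand-in: exact names match, and a dotted pattern makes its
// prefix a `Parent` (pattern "meta.author" makes field "meta" a parent).
fn match_field_simple(pattern: &str, field_name: &str) -> PatternMatch {
    if pattern == field_name {
        PatternMatch::Match
    } else if pattern.starts_with(field_name)
        && pattern.as_bytes().get(field_name.len()) == Some(&b'.')
    {
        PatternMatch::Parent
    } else {
        PatternMatch::NoMatch
    }
}

fn match_searchable_field(field_name: &str, searchable: Option<&[&str]>) -> PatternMatch {
    // No explicit searchable list: every field is searchable.
    let Some(searchable) = searchable else { return PatternMatch::Match };

    let mut selection = PatternMatch::NoMatch;
    for &pattern in searchable {
        match match_field_simple(pattern, field_name) {
            PatternMatch::Match => return PatternMatch::Match, // first hit wins
            PatternMatch::Parent => selection = PatternMatch::Parent,
            PatternMatch::NoMatch => (),
        }
    }
    selection
}

fn main() {
    let fields = ["title", "meta.author"];
    assert_eq!(match_searchable_field("title", Some(&fields[..])), PatternMatch::Match);
    assert_eq!(match_searchable_field("meta", Some(&fields[..])), PatternMatch::Parent);
    assert_eq!(match_searchable_field("price", Some(&fields[..])), PatternMatch::NoMatch);
    assert_eq!(match_searchable_field("anything", None), PatternMatch::Match);
}
```

The early return on `Match` means pattern order never changes the result, only how fast it is found; `Parent` merely signals that a nested field below this one could still match.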
```diff
@@ -198,7 +198,7 @@ where
     let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
     let _entered = span.enter();
 
-    <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
+    WordPairProximityDocidsExtractor::run_extraction(
         document_changes,
         indexing_context,
         extractor_allocs,
```
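Since `run_extraction` is now an inherent method on the extractor type rather than a blanket trait method, the call site can drop the fully qualified `<T as Trait>::` form. A minimal demonstration of why the plain path now resolves:

```rust
// Minimal demonstration: once a type has an inherent associated function
// with the right name, plain path syntax prefers it over a trait's.
trait DocidsExtractor {
    fn run_extraction() -> &'static str;
}

struct Extractor;

impl Extractor {
    // Inherent associated function: found first by path resolution.
    fn run_extraction() -> &'static str {
        "inherent"
    }
}

impl DocidsExtractor for Extractor {
    fn run_extraction() -> &'static str {
        "trait"
    }
}

fn main() {
    // Plain path syntax resolves to the inherent function...
    assert_eq!(Extractor::run_extraction(), "inherent");
    // ...while the trait impl stays reachable via fully qualified syntax.
    assert_eq!(<Extractor as DocidsExtractor>::run_extraction(), "trait");
}
```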