diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
index 1c1605b66..6220956fb 100644
--- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
+++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
@@ -3,9 +3,10 @@ use std::collections::HashMap;
 use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
 use serde_json::Value;
 
+use crate::attribute_patterns::PatternMatch;
 use crate::update::new::document::Document;
 use crate::update::new::extract::perm_json_p::{
-    seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, Selection,
+    seek_leaf_values_in_array, seek_leaf_values_in_object, Depth,
 };
 use crate::{
     FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
@@ -17,8 +18,6 @@ const MAX_DISTANCE: u32 = 8;
 
 pub struct DocumentTokenizer<'a> {
     pub tokenizer: &'a Tokenizer<'a>,
-    pub attribute_to_extract: Option<&'a [&'a str]>,
-    pub attribute_to_skip: &'a [&'a str],
     pub localized_attributes_rules: &'a [LocalizedAttributesRule],
     pub max_positions_per_attributes: u32,
 }
@@ -31,87 +30,94 @@ impl<'a> DocumentTokenizer<'a> {
         token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
     ) -> Result<()> {
         let mut field_position = HashMap::new();
+        let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
+            let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else {
+                return Err(UserError::AttributeLimitReached.into());
+            };
+
+            if meta.is_searchable() {
+                self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?;
+            }
+
+            // todo: should be a match on the field_name using `match_field_legacy` function,
+            // but for legacy reasons we iterate over all the fields to fill the field_id_map.
+            Ok(PatternMatch::Match)
+        };
 
         for entry in document.iter_top_level_fields() {
             let (field_name, value) = entry?;
-
-            let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
-                let Some(field_id) = field_id_map.id_or_insert(field_name) else {
-                    return Err(UserError::AttributeLimitReached.into());
-                };
-
-                if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
-                    != Selection::Select
-                {
-                    return Ok(());
-                }
-
-                let position = field_position
-                    .entry(field_id)
-                    .and_modify(|counter| *counter += MAX_DISTANCE)
-                    .or_insert(0);
-                if *position >= self.max_positions_per_attributes {
-                    return Ok(());
-                }
-
-                let text;
-                let tokens = match value {
-                    Value::Number(n) => {
-                        text = n.to_string();
-                        self.tokenizer.tokenize(text.as_str())
-                    }
-                    Value::Bool(b) => {
-                        text = b.to_string();
-                        self.tokenizer.tokenize(text.as_str())
-                    }
-                    Value::String(text) => {
-                        let locales = self
-                            .localized_attributes_rules
-                            .iter()
-                            .find(|rule| rule.match_str(field_name))
-                            .map(|rule| rule.locales());
-                        self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
-                    }
-                    _ => return Ok(()),
-                };
-
-                // create an iterator of token with their positions.
-                let tokens = process_tokens(*position, tokens)
-                    .take_while(|(p, _)| *p < self.max_positions_per_attributes);
-
-                for (index, token) in tokens {
-                    // keep a word only if it is not empty and fit in a LMDB key.
-                    let token = token.lemma().trim();
-                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                        *position = index;
-                        if let Ok(position) = (*position).try_into() {
-                            token_fn(field_name, field_id, position, token)?;
-                        }
-                    }
-                }
-
-                Ok(())
-            };
-
             // parse json.
             match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
                 Value::Object(object) => seek_leaf_values_in_object(
                     &object,
-                    None,
-                    &[],
                     field_name,
                     Depth::OnBaseKey,
                     &mut tokenize_field,
                 )?,
                 Value::Array(array) => seek_leaf_values_in_array(
                     &array,
-                    None,
-                    &[],
                     field_name,
                     Depth::OnBaseKey,
                     &mut tokenize_field,
                 )?,
-                value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
+                value => {
+                    tokenize_field(field_name, Depth::OnBaseKey, &value)?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    fn tokenize_field(
+        &self,
+        field_id: FieldId,
+        field_name: &str,
+        value: &Value,
+        token_fn: &mut impl FnMut(&str, u16, u16, &str) -> std::result::Result<(), crate::Error>,
+        field_position: &mut HashMap<FieldId, u32>,
+    ) -> Result<()> {
+        let position = field_position
+            .entry(field_id)
+            .and_modify(|counter| *counter += MAX_DISTANCE)
+            .or_insert(0);
+        if *position >= self.max_positions_per_attributes {
+            return Ok(());
+        }
+
+        let text;
+        let tokens = match value {
+            Value::Number(n) => {
+                text = n.to_string();
+                self.tokenizer.tokenize(text.as_str())
+            }
+            Value::Bool(b) => {
+                text = b.to_string();
+                self.tokenizer.tokenize(text.as_str())
+            }
+            Value::String(text) => {
+                let locales = self
+                    .localized_attributes_rules
+                    .iter()
+                    .find(|rule| rule.match_str(field_name) == PatternMatch::Match)
+                    .map(|rule| rule.locales());
+                self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
+            }
+            _ => return Ok(()),
+        };
+
+        // create an iterator of token with their positions.
+        let tokens = process_tokens(*position, tokens)
+            .take_while(|(p, _)| *p < self.max_positions_per_attributes);
+
+        for (index, token) in tokens {
+            // keep a word only if it is not empty and fit in a LMDB key.
+            let token = token.lemma().trim();
+            if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                *position = index;
+                if let Ok(position) = (*position).try_into() {
+                    token_fn(field_name, field_id, position, token)?;
+                }
             }
         }
 
@@ -215,8 +221,6 @@ mod test {
         let mut tb = TokenizerBuilder::default();
         let document_tokenizer = DocumentTokenizer {
             tokenizer: &tb.build(),
-            attribute_to_extract: None,
-            attribute_to_skip: &["not-me", "me-nether.nope"],
             localized_attributes_rules: &[],
            max_positions_per_attributes: 1000,
         };
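
For reviewers who want to poke at the new control flow outside the crate, here is a minimal standalone sketch, not milli's API. `walk`, the `searchable` map, and the two-variant `PatternMatch` below are hypothetical stand-ins for `seek_leaf_values_in_object`/`seek_leaf_values_in_array`, the metadata returned by `id_with_metadata_or_insert`, and `crate::attribute_patterns::PatternMatch`. It only illustrates the contract the diff relies on: the callback's return value steers the traversal, `is_searchable()`-style metadata gates tokenization, and repeated values of a field advance the base position by MAX_DISTANCE.

use std::collections::HashMap;

use serde_json::{json, Value};

// Simplified stand-in for crate::attribute_patterns::PatternMatch.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PatternMatch {
    Match,
    #[allow(dead_code)] // returning NoMatch would prune the subtree
    NoMatch,
}

const MAX_DISTANCE: u32 = 8;

// Visits nested values under a dotted field path; the callback's return value
// decides whether the traversal keeps descending, which is the contract the
// closure passed to the seek_leaf_values_in_* helpers has to honor.
fn walk(prefix: &str, value: &Value, cb: &mut dyn FnMut(&str, &Value) -> PatternMatch) {
    match value {
        Value::Object(object) => {
            for (key, child) in object {
                let path =
                    if prefix.is_empty() { key.clone() } else { format!("{prefix}.{key}") };
                if cb(&path, child) == PatternMatch::Match {
                    walk(&path, child, cb);
                }
            }
        }
        Value::Array(array) => {
            for child in array {
                walk(prefix, child, cb);
            }
        }
        leaf => {
            cb(prefix, leaf);
        }
    }
}

fn main() {
    // Hypothetical per-field metadata, standing in for the `meta` returned by
    // id_with_metadata_or_insert in the diff.
    let searchable: HashMap<&str, bool> =
        HashMap::from([("title", true), ("internal.secret", false)]);
    let mut field_position: HashMap<String, u32> = HashMap::new();
    let doc = json!({ "title": "hello world", "internal": { "secret": "do not index" } });

    walk("", &doc, &mut |field_name, value| {
        // Only searchable fields reach the "tokenizer".
        if *searchable.get(field_name).unwrap_or(&true) {
            if let Value::String(text) = value {
                // Repeated values of the same field restart MAX_DISTANCE further,
                // mirroring and_modify(|counter| *counter += MAX_DISTANCE).
                let base = field_position
                    .entry(field_name.to_string())
                    .and_modify(|counter| *counter += MAX_DISTANCE)
                    .or_insert(0);
                for (index, word) in text.split_whitespace().enumerate() {
                    println!("{field_name}: {word} @ {}", *base + index as u32);
                }
            }
        }
        PatternMatch::Match
    });
}

Returning PatternMatch::Match unconditionally mirrors the todo in the diff: even non-searchable fields keep the traversal going so the fields-id map is still filled.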