mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 20:07:09 +02:00
Parse every attribute and filter before tokenization
This commit is contained in:
parent
ff9c92c409
commit
4d616f8794
3 changed files with 33 additions and 50 deletions
|
@ -75,12 +75,12 @@ pub trait SearchableExtractor: Sized + Sync {
|
|||
let dictionary = indexing_context.index.dictionary(&rtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let builder = tokenizer_builder(
|
||||
let mut builder = tokenizer_builder(
|
||||
stop_words.as_ref(),
|
||||
allowed_separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
);
|
||||
let tokenizer = builder.into_tokenizer();
|
||||
let tokenizer = builder.build();
|
||||
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
|
||||
let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
|
||||
|
|
|
@ -40,6 +40,12 @@ impl<'a> DocumentTokenizer<'a> {
|
|||
return Err(UserError::AttributeLimitReached.into());
|
||||
};
|
||||
|
||||
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
|
||||
!= Selection::Select
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let position = field_position
|
||||
.entry(field_id)
|
||||
.and_modify(|counter| *counter += MAX_DISTANCE)
|
||||
|
@ -87,30 +93,25 @@ impl<'a> DocumentTokenizer<'a> {
|
|||
Ok(())
|
||||
};
|
||||
|
||||
// if the current field is searchable or contains a searchable attribute
|
||||
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
|
||||
!= Selection::Skip
|
||||
{
|
||||
// parse json.
|
||||
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(object) => seek_leaf_values_in_object(
|
||||
&object,
|
||||
self.attribute_to_extract,
|
||||
self.attribute_to_skip,
|
||||
field_name,
|
||||
Depth::OnBaseKey,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
Value::Array(array) => seek_leaf_values_in_array(
|
||||
&array,
|
||||
self.attribute_to_extract,
|
||||
self.attribute_to_skip,
|
||||
field_name,
|
||||
Depth::OnBaseKey,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
|
||||
}
|
||||
// parse json.
|
||||
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(object) => seek_leaf_values_in_object(
|
||||
&object,
|
||||
None,
|
||||
&[],
|
||||
field_name,
|
||||
Depth::OnBaseKey,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
Value::Array(array) => seek_leaf_values_in_array(
|
||||
&array,
|
||||
None,
|
||||
&[],
|
||||
field_name,
|
||||
Depth::OnBaseKey,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue