pub struct DocumentTokenizer<'a> {
    tokenizer: &'a Tokenizer<'a>,
    searchable_attributes: Option<&'a [String]>,
    localized_attributes_rules: &'a [LocalizedAttributesRule],
    max_positions_per_attributes: u32,
}

impl<'a> DocumentTokenizer<'a> {
    // pub fn new(tokenizer: &Tokenizer, settings: &InnerIndexSettings) -> Self {
    //     Self { tokenizer, settings }
    // }

    /// Tokenizes the searchable fields of the given `obkv` document and calls
    /// `token_fn` for every (field id, position, token) triple extracted from it.
    pub fn tokenize_document(
        &self,
        obkv: &KvReader<'_, FieldId>,
        field_id_map: &FieldsIdsMap,
        token_fn: impl Fn(FieldId, u16, &str),
    ) -> Result<()> {
        let mut field_position = HashMap::new();
        // Flatten the optional searchable attributes into the selector list used by
        // the permissive JSON pointer below; `None` means every field is searchable.
        let selectors: Vec<&str> = self
            .searchable_attributes
            .map(|attributes| attributes.iter().map(String::as_str).collect())
            .unwrap_or_default();

        for (field_id, field_bytes) in obkv.iter() {
            // Skip fields that are unknown to the fields ids map.
            let Some(field_name) = field_id_map.name(field_id) else { continue };

            let mut tokenize_field = |name: &str, value: &Value| {
                let Some(field_id) = field_id_map.id(name) else { return };
                match value {
                    Value::Number(number) => {
                        // A number is indexed as a single token; leave a hard-separator
                        // gap of 8 positions between successive values of the same field.
                        let token = number.to_string();
                        let position = field_position
                            .entry(field_id)
                            .and_modify(|counter| *counter += 8)
                            .or_insert(0);
                        token_fn(field_id, *position, token.as_str());
                    }
                    Value::String(text) => {
                        // Create an iterator of tokens with their positions, restricted
                        // to the locales of the first matching localized-attributes rule.
                        let locales = self
                            .localized_attributes_rules
                            .iter()
                            .find(|rule| rule.match_str(field_name))
                            .map(|rule| rule.locales(field_id));
                        let tokens = process_tokens(
                            self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
                        )
                        .take_while(|(p, _)| (*p as u32) < self.max_positions_per_attributes);

                        for (index, token) in tokens {
                            // Keep a word only if it is not empty and fits in an LMDB key.
                            let token = token.lemma().trim();
                            if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                                // Positions that do not fit in a u16 are skipped, as this
                                // closure cannot propagate a serialization error.
                                if let Ok(position) = u16::try_from(index) {
                                    token_fn(field_id, position, token);
                                }
                            }
                        }
                    }
                    _ => (),
                }
            };

            // If the current field is searchable or contains a searchable attribute.
            if self.searchable_attributes.map_or(true, |attributes| {
                attributes.iter().any(|name| contained_in(name, field_name))
            }) {
                // Parse the field as JSON and tokenize its leaf values.
                match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
                    Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
                        &object,
                        &selectors,
                        field_name,
                        &mut tokenize_field,
                    ),
                    Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
                        &array,
                        &selectors,
                        field_name,
                        &mut tokenize_field,
                    ),
                    value => tokenize_field(field_name, &value),
                }
            }
        }

        Ok(())
    }
}

/// Takes an iterator over tokens and computes their relative positions depending on separator kinds:
/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
/// otherwise we keep the standard proximity of 1 between words.
fn process_tokens<'a>(
    tokens: impl Iterator<Item = Token<'a>>,
) -> impl Iterator<Item = (usize, Token<'a>)> {
    tokens
        .skip_while(|token| token.is_separator())
        .scan((0, None), |(offset, prev_kind), mut token| {
            match token.kind {
                TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
                    *offset += match *prev_kind {
                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                        Some(_) => 1,
                        None => 0,
                    };
                    *prev_kind = Some(token.kind)
                }
                TokenKind::Separator(SeparatorKind::Hard) => {
                    *prev_kind = Some(token.kind);
                }
                TokenKind::Separator(SeparatorKind::Soft)
                    if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
                {
                    *prev_kind = Some(token.kind);
                }
                _ => token.kind = TokenKind::Unknown,
            }
            Some((*offset, token))
        })
        .filter(|(_, t)| t.is_word())
}

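// A minimal test sketch illustrating the position spacing above, assuming charabia's
// default tokenizer classifies whitespace as a `Soft` separator and `.` as a `Hard` one.
#[cfg(test)]
mod process_tokens_tests {
    use charabia::TokenizerBuilder;

    use super::process_tokens;

    #[test]
    fn hard_separators_leave_a_gap_of_eight() {
        let mut builder = TokenizerBuilder::default();
        let tokenizer = builder.build();
        let positions: Vec<usize> =
            process_tokens(tokenizer.tokenize("Hello world. Good morning"))
                .map(|(position, _token)| position)
                .collect();
        // Soft separators advance the position by 1, the hard `.` by 8.
        assert_eq!(positions, vec![0, 1, 9, 10]);
    }
}
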
/// Returns `true` if the `selector` matches the `key`.
///
/// ```text
/// Example:
/// `animaux` match `animaux`
/// `animaux.chien` match `animaux`
/// `animaux.chien` match `animaux.chien`
/// `animaux.chien.nom` match `animaux`
/// `animaux.chien.nom` match `animaux.chien`
/// -----------------------------------------
/// `animaux` doesn't match `animaux.chien`
/// `animaux.` doesn't match `animaux`
/// `animaux.ch` doesn't match `animaux.chien`
/// `animau` doesn't match `animaux`
/// ```
fn contained_in(selector: &str, key: &str) -> bool {
    selector.starts_with(key)
        && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
}

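// A minimal test sketch covering the containment examples from the doc comment above;
// it assumes `SPLIT_SYMBOL` is the `.` used in dotted selectors.
#[cfg(test)]
mod contained_in_tests {
    use super::contained_in;

    #[test]
    fn selector_containment() {
        assert!(contained_in("animaux", "animaux"));
        assert!(contained_in("animaux.chien", "animaux"));
        assert!(contained_in("animaux.chien.nom", "animaux.chien"));
        assert!(!contained_in("animaux", "animaux.chien"));
        assert!(!contained_in("animaux.ch", "animaux.chien"));
        assert!(!contained_in("animau", "animaux"));
    }
}
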
/// TODO move in permissive json pointer
mod perm_json_p {
    use serde_json::{Map, Value};

    use super::*;

    pub fn seek_leaf_values<'a>(
        value: &Map<String, Value>,
        selectors: impl IntoIterator<Item = &'a str>,
        mut seeker: impl FnMut(&str, &Value),
    ) {
        let selectors: Vec<_> = selectors.into_iter().collect();
        seek_leaf_values_in_object(value, &selectors, "", &mut seeker);
    }

    pub fn seek_leaf_values_in_object(
        value: &Map<String, Value>,
        selectors: &[&str],
        base_key: &str,
        seeker: &mut impl FnMut(&str, &Value),
    ) {
        for (key, value) in value.iter() {
            let base_key = if base_key.is_empty() {
                key.to_string()
            } else {
                format!("{}{}{}", base_key, SPLIT_SYMBOL, key)
            };

            // here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
            // so we check the contained_in on both sides; an empty selector list means no restriction.
            let should_continue = selectors.is_empty()
                || selectors.iter().any(|selector| {
                    contained_in(selector, &base_key) || contained_in(&base_key, selector)
                });

            if should_continue {
                match value {
                    Value::Object(object) => {
                        seek_leaf_values_in_object(object, selectors, &base_key, seeker)
                    }
                    Value::Array(array) => {
                        seek_leaf_values_in_array(array, selectors, &base_key, seeker)
                    }
                    value => seeker(&base_key, value),
                }
            }
        }
    }

    pub fn seek_leaf_values_in_array(
        values: &[Value],
        selectors: &[&str],
        base_key: &str,
        seeker: &mut impl FnMut(&str, &Value),
    ) {
        for value in values {
            match value {
                Value::Object(object) => {
                    seek_leaf_values_in_object(object, selectors, base_key, seeker)
                }
                Value::Array(array) => {
                    seek_leaf_values_in_array(array, selectors, base_key, seeker)
                }
                value => seeker(base_key, value),
            }
        }
    }
}
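
// A minimal test sketch showing how `perm_json_p::seek_leaf_values` walks nested objects
// and arrays and reports each leaf under a matching selector with its dotted key; the
// document and selector values are illustrative only.
#[cfg(test)]
mod perm_json_p_tests {
    use serde_json::{json, Value};

    use super::perm_json_p::seek_leaf_values;

    #[test]
    fn seeks_selected_leaves() {
        let document = json!({
            "animaux": { "chien": { "nom": "kefir" }, "chat": { "nom": "fluffy" } },
            "ignored": { "field": 42 }
        });
        let Value::Object(object) = document else { unreachable!() };

        let mut leaves = Vec::new();
        seek_leaf_values(&object, ["animaux.chien"], |key, value| {
            leaves.push((key.to_string(), value.clone()));
        });

        // Only the leaves under the `animaux.chien` selector are reported.
        assert_eq!(leaves, vec![("animaux.chien.nom".to_string(), json!("kefir"))]);
    }
}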