mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-04-10 16:41:43 +02:00
Separate calc_byte_length function
This commit is contained in:
parent
debd2b21b8
commit
2800e42243
@ -8,6 +8,7 @@ use std::cmp::{max, min};
|
||||
|
||||
use charabia::{Language, SeparatorKind, Token, Tokenizer};
|
||||
use either::Either;
|
||||
use itertools::Itertools;
|
||||
pub use matching_words::MatchingWords;
|
||||
use matching_words::{MatchType, PartialMatch};
|
||||
use r#match::{Match, MatchPosition};
|
||||
@ -229,12 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
||||
.iter()
|
||||
.map(|m| MatchBounds {
|
||||
start: tokens[m.get_first_token_pos()].byte_start,
|
||||
length: (m.get_first_token_pos()..m.get_last_token_pos() + 1)
|
||||
.map(|i| tokens[i].clone())
|
||||
.flat_map(|token| token.char_map.clone().unwrap_or(vec![(1, 1); token.char_end - token.char_start] /* Some token doesn't have a char map, here we treat them as single byte chars. */))
|
||||
.map(|(original, _)| original as usize)
|
||||
.take(m.char_count)
|
||||
.sum(),
|
||||
length: self.calc_byte_length(&tokens, m),
|
||||
indices: if array_indices.is_empty() {
|
||||
None
|
||||
} else {
|
||||
@ -245,6 +241,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the summed per-character lengths of the first `m.char_count`
/// characters covered by the match `m`.
///
/// Tokens are visited in order from `m.get_first_token_pos()` to
/// `m.get_last_token_pos()` (inclusive). For a token that carries a
/// `char_map`, the first component of each entry is added (presumably the
/// byte length of the character in the original text; confirm against the
/// tokenizer's documentation). For a token without a `char_map`, the UTF-8
/// length of each character of its lemma is added instead.
///
/// Counting stops as soon as `m.char_count` characters have been consumed, so
/// tokens past that point are never indexed.
///
/// # Panics
///
/// Panics if a token position that has to be visited is out of bounds for
/// `tokens`.
fn calc_byte_length(&self, tokens: &[Token<'t>], m: &Match) -> usize {
    // Number of characters that still have to be accounted for.
    let mut chars_left = m.char_count;
    let mut byte_length = 0;

    for i in m.get_first_token_pos()..=m.get_last_token_pos() {
        // Stop before touching the next token once the budget is exhausted.
        if chars_left == 0 {
            break;
        }

        let token = &tokens[i];
        match &token.char_map {
            Some(char_map) => {
                for (original, _) in char_map.iter().take(chars_left) {
                    byte_length += *original as usize;
                    chars_left -= 1;
                }
            }
            // No char map available: fall back to the lemma's own characters.
            None => {
                for c in token.lemma().chars().take(chars_left) {
                    byte_length += c.len_utf8();
                    chars_left -= 1;
                }
            }
        }
    }

    byte_length
}
|
||||
|
||||
/// Returns the bounds in byte index of the crop window.
|
||||
fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
|
||||
let (
|
||||
|
Loading…
x
Reference in New Issue
Block a user