diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs
index 80a19948e..d9009d92b 100644
--- a/crates/milli/src/search/new/matches/mod.rs
+++ b/crates/milli/src/search/new/matches/mod.rs
@@ -8,6 +8,7 @@ use std::cmp::{max, min};
 
 use charabia::{Language, SeparatorKind, Token, Tokenizer};
 use either::Either;
+use itertools::Itertools;
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch};
 use r#match::{Match, MatchPosition};
@@ -229,12 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 .iter()
                 .map(|m| MatchBounds {
                     start: tokens[m.get_first_token_pos()].byte_start,
-                    length: (m.get_first_token_pos()..m.get_last_token_pos() + 1)
-                        .map(|i| tokens[i].clone())
-                        .flat_map(|token| token.char_map.clone().unwrap_or(vec![(1, 1); token.char_end - token.char_start] /* Some token doesn't have a char map, here we treat them as single byte chars. */))
-                        .map(|(original, _)| original as usize)
-                        .take(m.char_count)
-                        .sum(),
+                    length: self.calc_byte_length(&tokens, m),
                     indices: if array_indices.is_empty() {
                         None
                     } else {
@@ -245,6 +241,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
+    fn calc_byte_length(&self, tokens: &Vec<Token<'_>>, m: &Match) -> usize {
+        (m.get_first_token_pos()..=m.get_last_token_pos())
+            .flat_map(|i| match &tokens[i].char_map {
+                Some(char_map) => {
+                    char_map.iter().map(|(original, _)| *original as usize).collect_vec()
+                }
+                None => tokens[i].lemma().chars().map(|c| c.len_utf8()).collect_vec(),
+            })
+            .take(m.char_count)
+            .sum()
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
         let (