From 39aca661dd16c5a3033c1d9ac449a92e8ed6a48a Mon Sep 17 00:00:00 2001
From: HikariLan
Date: Wed, 19 Mar 2025 23:12:29 +0800
Subject: [PATCH 1/3] Make _matchesPosition length byte based instead of char based

---
 crates/meilisearch/tests/search/formatted.rs | 12 ++++++------
 crates/milli/src/search/new/matches/mod.rs   |  8 ++++++--
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/crates/meilisearch/tests/search/formatted.rs b/crates/meilisearch/tests/search/formatted.rs
index 38935da5f..2b9383034 100644
--- a/crates/meilisearch/tests/search/formatted.rs
+++ b/crates/meilisearch/tests/search/formatted.rs
@@ -74,7 +74,7 @@ async fn formatted_contain_wildcard() {
         allow_duplicates! {
             assert_json_snapshot!(response["hits"][0],
                 { "._rankingScore" => "[score]" },
-                @r###"
+                @r#"
                 {
                   "_formatted": {
                     "id": "852",
@@ -84,12 +84,12 @@ async fn formatted_contain_wildcard() {
                     "cattos": [
                       {
                         "start": 0,
-                        "length": 5
+                        "length": 6
                       }
                     ]
                   }
                 }
-                "###);
+                "#);
         }
     }
 )
@@ -119,7 +119,7 @@ async fn formatted_contain_wildcard() {
         allow_duplicates! {
             assert_json_snapshot!(response["hits"][0],
                 { "._rankingScore" => "[score]" },
-                @r###"
+                @r#"
                 {
                   "id": 852,
                   "cattos": "pésti",
@@ -131,12 +131,12 @@ async fn formatted_contain_wildcard() {
                   "_matchesPosition": {
                     "cattos": [
                       {
                         "start": 0,
-                        "length": 5
+                        "length": 6
                       }
                     ]
                   }
                 }
-                "###)
+                "#)
         }
     })
     .await;

diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs
index 7f333d548..80a19948e 100644
--- a/crates/milli/src/search/new/matches/mod.rs
+++ b/crates/milli/src/search/new/matches/mod.rs
@@ -229,8 +229,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 .iter()
                 .map(|m| MatchBounds {
                     start: tokens[m.get_first_token_pos()].byte_start,
-                    // TODO: Why is this in chars, while start is in bytes?
-                    length: m.char_count,
+                    length: (m.get_first_token_pos()..m.get_last_token_pos() + 1)
+                        .map(|i| tokens[i].clone())
+                        .flat_map(|token| token.char_map.clone().unwrap_or(vec![(1, 1); token.char_end - token.char_start] /* Some token doesn't have a char map, here we treat them as single byte chars. */))
+                        .map(|(original, _)| original as usize)
+                        .take(m.char_count)
+                        .sum(),
                     indices: if array_indices.is_empty() {
                         None
                     } else {

From 2800e422432021135b82e7409a3933b9f6fc6e87 Mon Sep 17 00:00:00 2001
From: HikariLan
Date: Tue, 25 Mar 2025 00:47:17 +0800
Subject: [PATCH 2/3] Separate calc_byte_length function

---
 crates/milli/src/search/new/matches/mod.rs | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs
index 80a19948e..d9009d92b 100644
--- a/crates/milli/src/search/new/matches/mod.rs
+++ b/crates/milli/src/search/new/matches/mod.rs
@@ -8,6 +8,7 @@ use std::cmp::{max, min};
 
 use charabia::{Language, SeparatorKind, Token, Tokenizer};
 use either::Either;
+use itertools::Itertools;
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch};
 use r#match::{Match, MatchPosition};
@@ -229,12 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 .iter()
                 .map(|m| MatchBounds {
                     start: tokens[m.get_first_token_pos()].byte_start,
-                    length: (m.get_first_token_pos()..m.get_last_token_pos() + 1)
-                        .map(|i| tokens[i].clone())
-                        .flat_map(|token| token.char_map.clone().unwrap_or(vec![(1, 1); token.char_end - token.char_start] /* Some token doesn't have a char map, here we treat them as single byte chars. */))
-                        .map(|(original, _)| original as usize)
-                        .take(m.char_count)
-                        .sum(),
+                    length: self.calc_byte_length(&tokens, m),
                     indices: if array_indices.is_empty() {
                         None
                     } else {
@@ -245,6 +241,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
+    fn calc_byte_length(&self, tokens: &Vec<Token<'t>>, m: &Match) -> usize {
+        (m.get_first_token_pos()..=m.get_last_token_pos())
+            .flat_map(|i| match &tokens[i].char_map {
+                Some(char_map) => {
+                    char_map.iter().map(|(original, _)| *original as usize).collect_vec()
+                }
+                None => tokens[i].lemma().chars().map(|c| c.len_utf8()).collect_vec(),
+            })
+            .take(m.char_count)
+            .sum()
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
         let (

From 9d3037aa1ac86a8e1bf0edba3777bed2d63c880d Mon Sep 17 00:00:00 2001
From: HikariLan
Date: Tue, 25 Mar 2025 18:12:36 +0800
Subject: [PATCH 3/3] Fix clippy error

---
 crates/milli/src/search/new/matches/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs
index d9009d92b..6a81d7c4d 100644
--- a/crates/milli/src/search/new/matches/mod.rs
+++ b/crates/milli/src/search/new/matches/mod.rs
@@ -230,7 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 .iter()
                 .map(|m| MatchBounds {
                     start: tokens[m.get_first_token_pos()].byte_start,
-                    length: self.calc_byte_length(&tokens, m),
+                    length: self.calc_byte_length(tokens, m),
                     indices: if array_indices.is_empty() {
                         None
                     } else {
@@ -241,7 +241,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
-    fn calc_byte_length(&self, tokens: &Vec<Token<'t>>, m: &Match) -> usize {
+    fn calc_byte_length(&self, tokens: &[Token<'t>], m: &Match) -> usize {
         (m.get_first_token_pos()..=m.get_last_token_pos())
             .flat_map(|i| match &tokens[i].char_map {
                 Some(char_map) => {
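
Note on the approach: this series converts _matchesPosition.length from a character count to a byte count by walking the matched tokens and summing, for each matched character, its byte width in the original text. Charabia's char_map, when present, supplies that width as the first element of an (original, normalized) pair per character; when a token has no char map, the new calc_byte_length falls back to measuring the lemma's characters with len_utf8. The sketch below illustrates that calculation in isolation; FakeToken and byte_length are hypothetical stand-ins for charabia's Token and the patch's calc_byte_length, not the real API.

    /// Hypothetical stand-in for charabia's `Token`, carrying only the
    /// fields the calculation needs.
    struct FakeToken {
        /// Normalized form of the token.
        lemma: String,
        /// Per-character (original_byte_len, normalized_byte_len) pairs;
        /// `None` when the normalizer produced no map for this token.
        char_map: Option<Vec<(u8, u8)>>,
    }

    /// Byte length of a match covering `tokens`, truncated to the first
    /// `char_count` matched characters -- the same shape as the patch's
    /// `calc_byte_length`.
    fn byte_length(tokens: &[FakeToken], char_count: usize) -> usize {
        tokens
            .iter()
            .flat_map(|token| match &token.char_map {
                // Preferred path: each entry's first element is one
                // character's byte width in the pre-normalization text.
                Some(char_map) => char_map
                    .iter()
                    .map(|(original, _)| *original as usize)
                    .collect::<Vec<_>>(),
                // Fallback: measure the lemma's characters directly.
                None => token.lemma.chars().map(|c| c.len_utf8()).collect::<Vec<_>>(),
            })
            .take(char_count)
            .sum()
    }

    fn main() {
        // "pésti" is 5 chars but 6 bytes ("é" takes 2 bytes in UTF-8):
        // exactly the 5 -> 6 change captured by the updated test snapshots.
        let no_map = FakeToken { lemma: "pésti".to_string(), char_map: None };
        assert_eq!(byte_length(&[no_map], 5), 6);

        // With a char map, the original byte widths are summed even if
        // normalization changed the text.
        let mapped = FakeToken {
            lemma: "pésti".to_string(),
            char_map: Some(vec![(1, 1), (2, 2), (1, 1), (1, 1), (1, 1)]),
        };
        assert_eq!(byte_length(&[mapped], 5), 6);
    }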
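
On PATCH 3/3: the clippy error being fixed is the ptr_arg lint, which flags &Vec<T> parameters because a &[T] slice accepts strictly more argument shapes at no cost. A minimal illustration of that idiom; the names here are illustrative, not from the codebase.

    // Taking `&[T]` instead of `&Vec<T>` lets callers pass vectors,
    // arrays, and sub-slices alike; `&Vec<T>` only accepts a whole vector.
    fn total_bytes(words: &[&str]) -> usize {
        words.iter().map(|w| w.len()).sum()
    }

    fn main() {
        let words = vec!["pésti", "simba"];
        // A `&Vec<&str>` coerces to `&[&str]`, so existing call sites
        // keep working unchanged...
        assert_eq!(total_bytes(&words), 11); // 6 + 5 bytes
        // ...and sub-slices now work too, which `&Vec<_>` would reject.
        assert_eq!(total_bytes(&words[..1]), 6);
    }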