From 39aca661dd16c5a3033c1d9ac449a92e8ed6a48a Mon Sep 17 00:00:00 2001 From: HikariLan Date: Wed, 19 Mar 2025 23:12:29 +0800 Subject: [PATCH] Make _matchesPosition length byte based instead of char based --- crates/meilisearch/tests/search/formatted.rs | 12 ++++++------ crates/milli/src/search/new/matches/mod.rs | 8 ++++++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/crates/meilisearch/tests/search/formatted.rs b/crates/meilisearch/tests/search/formatted.rs index 38935da5f..2b9383034 100644 --- a/crates/meilisearch/tests/search/formatted.rs +++ b/crates/meilisearch/tests/search/formatted.rs @@ -74,7 +74,7 @@ async fn formatted_contain_wildcard() { allow_duplicates! { assert_json_snapshot!(response["hits"][0], { "._rankingScore" => "[score]" }, - @r###" + @r#" { "_formatted": { "id": "852", @@ -84,12 +84,12 @@ async fn formatted_contain_wildcard() { "cattos": [ { "start": 0, - "length": 5 + "length": 6 } ] } } - "###); + "#); } } ) @@ -119,7 +119,7 @@ async fn formatted_contain_wildcard() { allow_duplicates! { assert_json_snapshot!(response["hits"][0], { "._rankingScore" => "[score]" }, - @r###" + @r#" { "id": 852, "cattos": "pésti", @@ -131,12 +131,12 @@ async fn formatted_contain_wildcard() { "cattos": [ { "start": 0, - "length": 5 + "length": 6 } ] } } - "###) + "#) } }) .await; diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index 7f333d548..80a19948e 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -229,8 +229,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { .iter() .map(|m| MatchBounds { start: tokens[m.get_first_token_pos()].byte_start, - // TODO: Why is this in chars, while start is in bytes? 
- length: m.char_count, + length: (m.get_first_token_pos()..m.get_last_token_pos() + 1) + .map(|i| tokens[i].clone()) + .flat_map(|token| token.char_map.clone().unwrap_or(vec![(1, 1); token.char_end - token.char_start] /* Some tokens don't have a char map; here we treat each of their chars as a single byte. */)) + .map(|(original, _)| original as usize) + .take(m.char_count) + .sum(), indices: if array_indices.is_empty() { None } else {