From 39aca661dd16c5a3033c1d9ac449a92e8ed6a48a Mon Sep 17 00:00:00 2001
From: HikariLan <hikarilan@minecraft.kim>
Date: Wed, 19 Mar 2025 23:12:29 +0800
Subject: [PATCH] Make _matchesPosition length byte-based instead of char-based

---
 crates/meilisearch/tests/search/formatted.rs | 12 ++++++------
 crates/milli/src/search/new/matches/mod.rs   |  8 ++++++--
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/crates/meilisearch/tests/search/formatted.rs b/crates/meilisearch/tests/search/formatted.rs
index 38935da5f..2b9383034 100644
--- a/crates/meilisearch/tests/search/formatted.rs
+++ b/crates/meilisearch/tests/search/formatted.rs
@@ -74,7 +74,7 @@ async fn formatted_contain_wildcard() {
             allow_duplicates! {
               assert_json_snapshot!(response["hits"][0],
                     { "._rankingScore" => "[score]" },
-                    @r###"
+                    @r#"
               {
                 "_formatted": {
                   "id": "852",
@@ -84,12 +84,12 @@ async fn formatted_contain_wildcard() {
                   "cattos": [
                     {
                       "start": 0,
-                      "length": 5
+                      "length": 6
                     }
                   ]
                 }
               }
-              "###);
+              "#);
             }
     }
     )
@@ -119,7 +119,7 @@ async fn formatted_contain_wildcard() {
                 allow_duplicates! {
                   assert_json_snapshot!(response["hits"][0],
                  { "._rankingScore" => "[score]" },
-                 @r###"
+                 @r#"
                   {
                     "id": 852,
                     "cattos": "pésti",
@@ -131,12 +131,12 @@ async fn formatted_contain_wildcard() {
                       "cattos": [
                         {
                           "start": 0,
-                          "length": 5
+                          "length": 6
                         }
                       ]
                     }
                   }
-                  "###)
+                  "#)
              }
         })
         .await;
diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs
index 7f333d548..80a19948e 100644
--- a/crates/milli/src/search/new/matches/mod.rs
+++ b/crates/milli/src/search/new/matches/mod.rs
@@ -229,8 +229,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 .iter()
                 .map(|m| MatchBounds {
                     start: tokens[m.get_first_token_pos()].byte_start,
-                    // TODO: Why is this in chars, while start is in bytes?
-                    length: m.char_count,
+                    length: (m.get_first_token_pos()..m.get_last_token_pos() + 1)
+                        .map(|i| tokens[i].clone())
+                        .flat_map(|token| token.char_map.clone().unwrap_or(vec![(1, 1); token.char_end - token.char_start] /* Some tokens don't have a char map; here we treat each of their chars as a single byte. */))
+                        .map(|(original, _)| original as usize)
+                        .take(m.char_count)
+                        .sum(),
                     indices: if array_indices.is_empty() {
                         None
                     } else {