Merge pull request #5446 from shaokeyibb/main

Fix _matchesPosition length calculation
Authored by Many the fish on 2025-03-25 14:16:38 +00:00, committed by GitHub
commit eefefc482b
2 changed files with 20 additions and 8 deletions


@@ -74,7 +74,7 @@ async fn formatted_contain_wildcard() {
         allow_duplicates! {
             assert_json_snapshot!(response["hits"][0],
             { "._rankingScore" => "[score]" },
-            @r###"
+            @r#"
             {
               "_formatted": {
                 "id": "852",
@@ -84,12 +84,12 @@ async fn formatted_contain_wildcard() {
                 "cattos": [
                   {
                     "start": 0,
-                    "length": 5
+                    "length": 6
                   }
                 ]
               }
             }
-            "###);
+            "#);
         }
     }
 )
@@ -119,7 +119,7 @@ async fn formatted_contain_wildcard() {
         allow_duplicates! {
             assert_json_snapshot!(response["hits"][0],
             { "._rankingScore" => "[score]" },
-            @r###"
+            @r#"
             {
               "id": 852,
               "cattos": "pésti",
@@ -131,12 +131,12 @@ async fn formatted_contain_wildcard() {
                 "cattos": [
                   {
                     "start": 0,
-                    "length": 5
+                    "length": 6
                   }
                 ]
               }
             }
-            "###)
+            "#)
         }
     })
     .await;
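
The snapshot updates above are the user-visible effect of this fix: for the match "pésti", `_matchesPosition.length` changes from a character count (5) to a byte count (6). A minimal standalone illustration of why the two numbers differ (plain Rust, not Meilisearch code):

```rust
fn main() {
    let matched = "pésti";
    // Five characters...
    assert_eq!(matched.chars().count(), 5);
    // ...but six UTF-8 bytes, because 'é' encodes as two bytes.
    assert_eq!(matched.len(), 6);
}
```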


@@ -8,6 +8,7 @@ use std::cmp::{max, min};
 use charabia::{Language, SeparatorKind, Token, Tokenizer};
 use either::Either;
+use itertools::Itertools;
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch};
 use r#match::{Match, MatchPosition};
@@ -229,8 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             .iter()
             .map(|m| MatchBounds {
                 start: tokens[m.get_first_token_pos()].byte_start,
-                // TODO: Why is this in chars, while start is in bytes?
-                length: m.char_count,
+                length: self.calc_byte_length(tokens, m),
                 indices: if array_indices.is_empty() {
                     None
                 } else {
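
The hunk above is the core of the fix: `MatchBounds::start` is taken from `byte_start`, so `length` must be expressed in bytes as well for the two fields to be usable together. A hypothetical consumer sketch illustrating that constraint (the struct is trimmed to the two fields visible in this diff, and `highlight` is not a Meilisearch function):

```rust
struct MatchBounds {
    start: usize,  // byte offset into the original text
    length: usize, // now also in bytes, matching `start`
}

// Hypothetical consumer: slices the original text by byte offsets.
fn highlight(text: &str, m: &MatchBounds) -> String {
    let (before, rest) = text.split_at(m.start);
    let (hit, after) = rest.split_at(m.length);
    format!("{before}<em>{hit}</em>{after}")
}

fn main() {
    // With the old char-based length (5), the trailing 'i' was cut off:
    // "pésti".split_at(5) yields ("pést", "i").
    let bounds = MatchBounds { start: 0, length: 6 };
    assert_eq!(highlight("pésti", &bounds), "<em>pésti</em>");
}
```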
@@ -241,6 +241,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
+    fn calc_byte_length(&self, tokens: &[Token<'t>], m: &Match) -> usize {
+        (m.get_first_token_pos()..=m.get_last_token_pos())
+            .flat_map(|i| match &tokens[i].char_map {
+                Some(char_map) => {
+                    char_map.iter().map(|(original, _)| *original as usize).collect_vec()
+                }
+                None => tokens[i].lemma().chars().map(|c| c.len_utf8()).collect_vec(),
+            })
+            .take(m.char_count)
+            .sum()
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
         let (
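
Judging from the `Some` arm above, `char_map` carries the per-character byte lengths of the original text for tokens where normalization changed them; for tokens without one, `calc_byte_length` reduces to summing the UTF-8 width of the first `char_count` characters of the lemma. A dependency-free sketch of that simple case (`byte_length_of_chars` is a hypothetical helper, not part of the codebase):

```rust
// Hypothetical helper: byte length of the first `char_count` characters.
fn byte_length_of_chars(text: &str, char_count: usize) -> usize {
    text.chars().take(char_count).map(|c| c.len_utf8()).sum()
}

fn main() {
    assert_eq!(byte_length_of_chars("pésti", 5), 6); // the whole match
    assert_eq!(byte_length_of_chars("pésti", 4), 5); // "pést" only
}
```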