Make _matchesPosition length byte-based instead of char-based

This commit is contained in:
HikariLan 2025-03-19 23:12:29 +08:00
parent cf31a65a88
commit 39aca661dd
2 changed files with 12 additions and 8 deletions

View File

@ -74,7 +74,7 @@ async fn formatted_contain_wildcard() {
allow_duplicates! { allow_duplicates! {
assert_json_snapshot!(response["hits"][0], assert_json_snapshot!(response["hits"][0],
{ "._rankingScore" => "[score]" }, { "._rankingScore" => "[score]" },
@r###" @r#"
{ {
"_formatted": { "_formatted": {
"id": "852", "id": "852",
@ -84,12 +84,12 @@ async fn formatted_contain_wildcard() {
"cattos": [ "cattos": [
{ {
"start": 0, "start": 0,
"length": 5 "length": 6
} }
] ]
} }
} }
"###); "#);
} }
} }
) )
@ -119,7 +119,7 @@ async fn formatted_contain_wildcard() {
allow_duplicates! { allow_duplicates! {
assert_json_snapshot!(response["hits"][0], assert_json_snapshot!(response["hits"][0],
{ "._rankingScore" => "[score]" }, { "._rankingScore" => "[score]" },
@r###" @r#"
{ {
"id": 852, "id": 852,
"cattos": "pésti", "cattos": "pésti",
@ -131,12 +131,12 @@ async fn formatted_contain_wildcard() {
"cattos": [ "cattos": [
{ {
"start": 0, "start": 0,
"length": 5 "length": 6
} }
] ]
} }
} }
"###) "#)
} }
}) })
.await; .await;

View File

@ -229,8 +229,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
.iter() .iter()
.map(|m| MatchBounds { .map(|m| MatchBounds {
start: tokens[m.get_first_token_pos()].byte_start, start: tokens[m.get_first_token_pos()].byte_start,
// TODO: Why is this in chars, while start is in bytes? length: (m.get_first_token_pos()..m.get_last_token_pos() + 1)
length: m.char_count, .map(|i| tokens[i].clone())
.flat_map(|token| token.char_map.clone().unwrap_or(vec![(1, 1); token.char_end - token.char_start] /* Some tokens don't have a char map; here we treat each of their chars as a single-byte char. */))
.map(|(original, _)| original as usize)
.take(m.char_count)
.sum(),
indices: if array_indices.is_empty() { indices: if array_indices.is_empty() {
None None
} else { } else {