Merge pull request #5446 from shaokeyibb/main

Fix _matchesPosition length calculation
Authored by Many the fish on 2025-03-25 14:16:38 +00:00, committed by GitHub
commit eefefc482b
2 changed files with 20 additions and 8 deletions


@@ -74,7 +74,7 @@ async fn formatted_contain_wildcard() {
         allow_duplicates! {
             assert_json_snapshot!(response["hits"][0],
             { "._rankingScore" => "[score]" },
-            @r###"
+            @r#"
             {
               "_formatted": {
                 "id": "852",
@@ -84,12 +84,12 @@ async fn formatted_contain_wildcard() {
                 "cattos": [
                   {
                     "start": 0,
-                    "length": 5
+                    "length": 6
                   }
                 ]
               }
             }
-            "###);
+            "#);
         }
     }
 )
@@ -119,7 +119,7 @@ async fn formatted_contain_wildcard() {
         allow_duplicates! {
             assert_json_snapshot!(response["hits"][0],
             { "._rankingScore" => "[score]" },
-            @r###"
+            @r#"
             {
               "id": 852,
               "cattos": "pésti",
@@ -131,12 +131,12 @@ async fn formatted_contain_wildcard() {
                 "cattos": [
                   {
                     "start": 0,
-                    "length": 5
+                    "length": 6
                   }
                 ]
               }
             }
-            "###)
+            "#)
         }
     })
     .await;
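
The snapshot updates above are the user-visible effect of this fix: for the match "pésti", `_matchesPosition.length` changes from a character count (5) to a byte count (6). A minimal standalone illustration of why the two numbers differ (plain Rust, not Meilisearch code):

```rust
fn main() {
    let matched = "pésti";
    // Five characters...
    assert_eq!(matched.chars().count(), 5);
    // ...but six UTF-8 bytes, because 'é' encodes as two bytes.
    assert_eq!(matched.len(), 6);
}
```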


@@ -8,6 +8,7 @@ use std::cmp::{max, min};
 use charabia::{Language, SeparatorKind, Token, Tokenizer};
 use either::Either;
+use itertools::Itertools;
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch};
 use r#match::{Match, MatchPosition};
@@ -229,8 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             .iter()
             .map(|m| MatchBounds {
                 start: tokens[m.get_first_token_pos()].byte_start,
-                // TODO: Why is this in chars, while start is in bytes?
-                length: m.char_count,
+                length: self.calc_byte_length(tokens, m),
                 indices: if array_indices.is_empty() {
                     None
                 } else {
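
The hunk above is the core of the fix: `MatchBounds::start` is taken from `byte_start`, so `length` must be expressed in bytes as well for the two fields to be usable together. A hypothetical consumer sketch illustrating that constraint (the struct is trimmed to the two fields visible in this diff, and `highlight` is not a Meilisearch function):

```rust
struct MatchBounds {
    start: usize,  // byte offset into the original text
    length: usize, // now also in bytes, matching `start`
}

// Hypothetical consumer: slices the original text by byte offsets.
fn highlight(text: &str, m: &MatchBounds) -> String {
    let (before, rest) = text.split_at(m.start);
    let (hit, after) = rest.split_at(m.length);
    format!("{before}<em>{hit}</em>{after}")
}

fn main() {
    // With the old char-based length (5), the trailing 'i' was cut off:
    // "pésti".split_at(5) yields ("pést", "i").
    let bounds = MatchBounds { start: 0, length: 6 };
    assert_eq!(highlight("pésti", &bounds), "<em>pésti</em>");
}
```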
@@ -241,6 +241,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
+    fn calc_byte_length(&self, tokens: &[Token<'t>], m: &Match) -> usize {
+        (m.get_first_token_pos()..=m.get_last_token_pos())
+            .flat_map(|i| match &tokens[i].char_map {
+                Some(char_map) => {
+                    char_map.iter().map(|(original, _)| *original as usize).collect_vec()
+                }
+                None => tokens[i].lemma().chars().map(|c| c.len_utf8()).collect_vec(),
+            })
+            .take(m.char_count)
+            .sum()
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
         let (
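
Judging from the `Some` arm above, `char_map` carries the per-character byte lengths of the original text for tokens where normalization changed them; for tokens without one, `calc_byte_length` reduces to summing the UTF-8 width of the first `char_count` characters of the lemma. A dependency-free sketch of that simple case (`byte_length_of_chars` is a hypothetical helper, not part of the codebase):

```rust
// Hypothetical helper: byte length of the first `char_count` characters.
fn byte_length_of_chars(text: &str, char_count: usize) -> usize {
    text.chars().take(char_count).map(|c| c.len_utf8()).sum()
}

fn main() {
    assert_eq!(byte_length_of_chars("pésti", 5), 6); // the whole match
    assert_eq!(byte_length_of_chars("pésti", 4), 5); // "pést" only
}
```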