mirror of https://github.com/meilisearch/MeiliSearch, synced 2025-05-25 09:03:59 +02:00

Merge pull request #5446 from shaokeyibb/main

Fix _matchesPosition length calculation

Commit: eefefc482b
@@ -74,7 +74,7 @@ async fn formatted_contain_wildcard() {
     allow_duplicates! {
         assert_json_snapshot!(response["hits"][0],
             { "._rankingScore" => "[score]" },
-            @r###"
+            @r#"
             {
               "_formatted": {
                 "id": "852",
@@ -84,12 +84,12 @@ async fn formatted_contain_wildcard() {
                 "cattos": [
                   {
                     "start": 0,
-                    "length": 5
+                    "length": 6
                   }
                 ]
               }
             }
-            "###);
+            "#);
         }
     }
 )
@@ -119,7 +119,7 @@ async fn formatted_contain_wildcard() {
     allow_duplicates! {
         assert_json_snapshot!(response["hits"][0],
             { "._rankingScore" => "[score]" },
-            @r###"
+            @r#"
             {
               "id": 852,
               "cattos": "pésti",
@@ -131,12 +131,12 @@ async fn formatted_contain_wildcard() {
               "cattos": [
                 {
                   "start": 0,
-                  "length": 5
+                  "length": 6
                 }
               ]
             }
           }
-          "###)
+          "#)
     }
 })
 .await;
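What changed in these snapshots: `_matchesPosition.start` has always been a byte offset, but `length` used to be a character count. For the match "pésti" the two units disagree, because "é" occupies two bytes in UTF-8: the word is five characters but six bytes long, hence the 5 → 6 change. A minimal standalone sketch of that distinction (the helper name `byte_length_of_chars` is ours, not part of Meilisearch):

// Minimal sketch, independent of the Meilisearch codebase: the byte length
// of the first `char_count` characters of a matched string.
fn byte_length_of_chars(text: &str, char_count: usize) -> usize {
    text.chars().take(char_count).map(|c| c.len_utf8()).sum()
}

fn main() {
    let cattos = "pésti";
    assert_eq!(cattos.chars().count(), 5); // 5 characters...
    assert_eq!(byte_length_of_chars(cattos, 5), 6); // ...but 6 bytes: "é" is 2 bytes in UTF-8
}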
@@ -8,6 +8,7 @@ use std::cmp::{max, min};
 
 use charabia::{Language, SeparatorKind, Token, Tokenizer};
 use either::Either;
+use itertools::Itertools;
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch};
 use r#match::{Match, MatchPosition};
@@ -229,8 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             .iter()
             .map(|m| MatchBounds {
                 start: tokens[m.get_first_token_pos()].byte_start,
-                // TODO: Why is this in chars, while start is in bytes?
-                length: m.char_count,
+                length: self.calc_byte_length(tokens, m),
                 indices: if array_indices.is_empty() {
                     None
                 } else {
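This hunk answers the old TODO directly: instead of reporting the match's character count, `length` is now computed in bytes, the same unit as `start`. Based only on what the diff shows, the fields involved look roughly like this (a sketch; the real struct lives in the milli crate and may carry more fields):

// Field sketch reconstructed from the diff, not the full milli definition.
struct MatchBounds {
    start: usize,                // byte offset of the match in the original text
    length: usize,               // was m.char_count (chars); now a byte length
    indices: Option<Vec<usize>>, // positions within array fields, when any
}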
@@ -241,6 +241,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
+    fn calc_byte_length(&self, tokens: &[Token<'t>], m: &Match) -> usize {
+        (m.get_first_token_pos()..=m.get_last_token_pos())
+            .flat_map(|i| match &tokens[i].char_map {
+                Some(char_map) => {
+                    char_map.iter().map(|(original, _)| *original as usize).collect_vec()
+                }
+                None => tokens[i].lemma().chars().map(|c| c.len_utf8()).collect_vec(),
+            })
+            .take(m.char_count)
+            .sum()
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
         let (
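The new helper walks every token covered by the match and sums the original byte length of each character, stopping after `m.char_count` characters since a match may end mid-token. When charabia provides a `char_map` (per-character pairs of byte lengths: original vs. normalized), it uses the original side, so accents stripped or characters rewritten by normalization are still measured against the source text; otherwise it falls back to `len_utf8()` over the lemma's characters. The `itertools::Itertools` import added above supplies `collect_vec()`. A simplified, self-contained sketch of the same idea (the `(char_map, lemma)` tuple shape is ours, standing in for charabia's `Token`):

// Simplified sketch, not the milli implementation: sum the original byte
// length of the first `char_count` characters across the matched tokens.
// `char_map` mirrors charabia's Option<Vec<(u8, u8)>>: for each character,
// (bytes in the original text, bytes after normalization).
fn match_byte_length(
    tokens: &[(Option<Vec<(u8, u8)>>, &str)], // (char_map, lemma) per token
    char_count: usize,
) -> usize {
    tokens
        .iter()
        .flat_map(|(char_map, lemma)| match char_map {
            Some(map) => map.iter().map(|(original, _)| *original as usize).collect::<Vec<_>>(),
            None => lemma.chars().map(|c| c.len_utf8()).collect::<Vec<_>>(),
        })
        .take(char_count) // the match may stop partway through the last token
        .sum()
}

fn main() {
    // "pésti" as a single unnormalized token: 5 chars, 6 bytes.
    assert_eq!(match_byte_length(&[(None, "pésti")], 5), 6);
}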