mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
fix(parse_query): cyrillic bug
This commit is contained in:
parent
72736c0ea9
commit
ef9fc6c854
1 changed files with 62 additions and 2 deletions
|
@ -202,11 +202,11 @@ pub fn number_of_typos_allowed<'ctx>(
|
||||||
|
|
||||||
Ok(Box::new(move |word: &str| {
|
Ok(Box::new(move |word: &str| {
|
||||||
if !authorize_typos
|
if !authorize_typos
|
||||||
|| word.len() < min_len_one_typo as usize
|
|| word.chars().count() < min_len_one_typo as usize
|
||||||
|| exact_words.as_ref().is_some_and(|fst| fst.contains(word))
|
|| exact_words.as_ref().is_some_and(|fst| fst.contains(word))
|
||||||
{
|
{
|
||||||
0
|
0
|
||||||
} else if word.len() < min_len_two_typos as usize {
|
} else if word.chars().count() < min_len_two_typos as usize {
|
||||||
1
|
1
|
||||||
} else {
|
} else {
|
||||||
2
|
2
|
||||||
|
@ -381,3 +381,63 @@ mod tests {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Regression test for the Cyrillic typo-tolerance bug: the number of typos
/// allowed for a word must be derived from its character count rather than
/// its UTF-8 byte length, so an ASCII word and a Cyrillic word with the same
/// number of characters must be granted the same number of typos.
#[test]
fn test_unicode_typo_tolerance_fixed() -> Result<()> {
    let index = temp_index_with_documents();
    let txn = index.read_txn()?;
    let search_ctx = SearchContext::new(&index, &txn)?;

    let allowed_typos = number_of_typos_allowed(&search_ctx)?;

    // Latin sample: "doggy" is 5 characters and 5 bytes.
    let ascii_word = "doggy";
    let typos_for_ascii = allowed_typos(ascii_word);

    // Cyrillic sample: "собак" is 5 characters but 10 bytes.
    let cyrillic_word = "собак";
    let typos_for_cyrillic = allowed_typos(cyrillic_word);

    // Diagnostic output, only shown when the test fails or runs with `--nocapture`.
    eprintln!(
        "ASCII '{}': char_count={}, typos={}",
        ascii_word,
        ascii_word.chars().count(),
        typos_for_ascii
    );
    eprintln!(
        "Cyrillic '{}': char_count={}, typos={}",
        cyrillic_word,
        cyrillic_word.chars().count(),
        typos_for_cyrillic
    );

    // Same character count => same typo budget, regardless of byte length.
    assert_eq!(
        typos_for_ascii, typos_for_cyrillic,
        "Words with same character count should get same typo tolerance"
    );

    // With default settings (oneTypo=5, twoTypos=9), a 5-character word is
    // expected to be granted exactly one typo.
    assert_eq!(typos_for_ascii, 1, "5-character word should get 1 typo tolerance");
    assert_eq!(typos_for_cyrillic, 1, "5-character word should get 1 typo tolerance");

    Ok(())
}
|
||||||
|
|
||||||
|
/// Checks that words from several scripts which share the same character
/// count are granted the same number of typos, even though their UTF-8 byte
/// lengths differ.
#[test]
fn test_various_unicode_scripts() -> Result<()> {
    let temp_index = temp_index_with_documents();
    let rtxn = temp_index.read_txn()?;
    let ctx = SearchContext::new(&temp_index, &rtxn)?;

    let nbr_typos = number_of_typos_allowed(&ctx)?;

    // Every sample is exactly 5 characters long; only the byte length varies,
    // which is precisely what must NOT influence the typo budget.
    let five_char_words = [
        ("doggy", "ASCII"),    // 5 chars, 5 bytes
        ("café!", "Accented"), // 5 chars, 6 bytes (precomposed 'é' takes 2 bytes)
        ("собак", "Cyrillic"), // 5 chars, 10 bytes
    ];

    // With default settings, 5-char words get 1 typo.
    let expected_typos = 1;

    for (word, script) in five_char_words {
        let typos = nbr_typos(word);
        // `str::len` is the byte length, `chars().count()` the character count;
        // print both so a failure makes the discrepancy visible.
        eprintln!(
            "{} '{}': chars={}, bytes={}, typos={}",
            script,
            word,
            word.chars().count(),
            word.len(),
            typos
        );
        assert_eq!(
            typos, expected_typos,
            "{} word '{}' should get {} typo(s)",
            script, word, expected_typos
        );
    }

    Ok(())
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue