372: Fix Meilisearch 1714 r=Kerollmops a=ManyTheFish

The bug comes from the typo tolerance: to decide how many typos are accepted, we were counting the bytes in a word instead of its characters.
For Chinese script, where each character takes several bytes in UTF-8, we were allowing 2 typos on 3-character words.
We now count the number of characters instead of the number of bytes to assign the typo tolerance.

Related to [Meilisearch#1714](https://github.com/meilisearch/MeiliSearch/issues/1714)

Co-authored-by: many <maxime@meilisearch.com>
This commit is contained in:
bors[bot] 2021-09-28 11:59:45 +00:00 committed by GitHub
commit b2a332599e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 1 deletions

View File

@ -262,7 +262,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
/// and the provided word length. /// and the provided word length.
fn typos(word: String, authorize_typos: bool) -> QueryKind { fn typos(word: String, authorize_typos: bool) -> QueryKind {
if authorize_typos { if authorize_typos {
match word.len() { match word.chars().count() {
0..=4 => QueryKind::exact(word), 0..=4 => QueryKind::exact(word),
5..=8 => QueryKind::tolerant(1, word), 5..=8 => QueryKind::tolerant(1, word),
_ => QueryKind::tolerant(2, word), _ => QueryKind::tolerant(2, word),

View File

@ -981,4 +981,41 @@ mod tests {
let count = index.number_of_documents(&rtxn).unwrap(); let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 4); assert_eq!(count, 4);
} }
/// Regression test for Meilisearch issue #1714.
///
/// The typo budget of a query word used to be derived from its byte length,
/// so a 3-character Chinese word was treated as a long word and granted
/// 2 typos. This test indexes two documents that share the character `包`
/// and checks that searching for `化妆包` with typos authorized returns only
/// the document that actually contains that word.
#[test]
fn test_meilisearch_1714() {
    let path = tempfile::tempdir().unwrap();
    let mut options = EnvOpenOptions::new();
    options.map_size(10 * 1024 * 1024); // 10 MB
    let index = Index::new(options, &path).unwrap();

    let content = documents!([
        {"id": "123", "title": "小化妆包" },
        {"id": "456", "title": "Ipad 包" }
    ]);

    // Index both documents and commit so the read transaction below sees them.
    let mut wtxn = index.write_txn().unwrap();
    let builder = IndexDocuments::new(&mut wtxn, &index, 0);
    builder.execute(content, |_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let rtxn = index.read_txn().unwrap();

    // Only the first document should match.
    let count = index
        .word_docids
        .get(&rtxn, "化妆包")
        .unwrap()
        .expect("the word `化妆包` should have been indexed")
        .len();
    assert_eq!(count, 1);

    // Only the second document should match.
    // NOTE(review): assumes the tokenizer emits `包` as a standalone word for
    // the second title and not for the first one — confirm against the
    // tokenizer in use.
    let count = index
        .word_docids
        .get(&rtxn, "包")
        .unwrap()
        .expect("the word `包` should have been indexed")
        .len();
    assert_eq!(count, 1);

    let mut search = crate::Search::new(&rtxn, &index);
    search.query("化妆包");
    search.authorize_typos(true);
    search.optional_words(true);

    // only 1 document should be returned
    let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
    assert_eq!(documents_ids.len(), 1);
}
} }