From 19884162957e6c71eb95d4b5a1f989be442c1a1c Mon Sep 17 00:00:00 2001
From: many
Date: Tue, 28 Sep 2021 12:05:11 +0200
Subject: [PATCH 1/2] Add failing test related to Meilisearch#1714

---
 milli/src/update/index_documents/mod.rs | 37 +++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index b00dbf375..498a2a85d 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -981,4 +981,41 @@ mod tests {
         let count = index.number_of_documents(&rtxn).unwrap();
         assert_eq!(count, 4);
     }
+
+    #[test]
+    fn test_meilisearch_1714() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        let content = documents!([
+            {"id": "123", "title": "小化妆包" },
+            {"id": "456", "title": "Ipad 包" }
+        ]);
+
+        let mut wtxn = index.write_txn().unwrap();
+        let builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        // Only the first document should match.
+        let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len();
+        assert_eq!(count, 1);
+
+        // Only the second document should match.
+        let count = index.word_docids.get(&rtxn, "包").unwrap().unwrap().len();
+        assert_eq!(count, 1);
+
+        let mut search = crate::Search::new(&rtxn, &index);
+        search.query("化妆包");
+        search.authorize_typos(true);
+        search.optional_words(true);
+
+        // only 1 document should be returned
+        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
+        assert_eq!(documents_ids.len(), 1);
+    }
 }

From 8046ae4bd570b092a98c424dfbd4f5ac3e0678cc Mon Sep 17 00:00:00 2001
From: many
Date: Tue, 28 Sep 2021 12:10:43 +0200
Subject: [PATCH 2/2] Count the number of char instead of counting bytes to assign the typo tolerance

---
 milli/src/search/query_tree.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 8fa24b9d3..0744231ae 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -262,7 +262,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result QueryKind {
     if authorize_typos {
-        match word.len() {
+        match word.chars().count() {
             0..=4 => QueryKind::exact(word),
             5..=8 => QueryKind::tolerant(1, word),
             _ => QueryKind::tolerant(2, word),