From 161cb736ea501b1913c62917b622424c75a1f4a9 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 3 Jun 2025 10:37:29 +0200 Subject: [PATCH] Adapt tests to the Chinese word segmenter changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new Chinese segmenter is splitting words in smaller parts. The words `小化妆包` was previously seegmented as `小 / 化妆包` and is now segmented as `小 / 化妆 / 包`, which changes the tests results. --- crates/milli/src/update/index_documents/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 4acb78b9a..e0f85ca2d 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -1580,12 +1580,12 @@ mod tests { let rtxn = index.read_txn().unwrap(); // Only the first document should match. - let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len(); + let count = index.word_docids.get(&rtxn, "huàzhuāng").unwrap().unwrap().len(); assert_eq!(count, 1); // Only the second document should match. let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len(); - assert_eq!(count, 1); + assert_eq!(count, 2); let mut search = crate::Search::new(&rtxn, &index); search.query("化妆包");