From 2852349e68b8b03a8567ba2cf2547f3626c03924 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 17 Dec 2020 16:31:31 +0100 Subject: [PATCH] update tokenizer version --- Cargo.lock | 2 +- meilisearch-core/Cargo.toml | 2 +- meilisearch-core/src/raw_indexer.rs | 4 ++-- meilisearch-http/tests/search.rs | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a5f65b5fb..50bb4e9d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1699,7 +1699,7 @@ dependencies = [ [[package]] name = "meilisearch-tokenizer" version = "0.1.1" -source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.1#dedea5df4b52d94216a65091f237ac64673bab09" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.2#8d91cd52f30aa4b651a085c15056938f7b599646" dependencies = [ "character_converter", "cow-utils", diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index 8fbcba67b..dbf706e2c 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -26,7 +26,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } log = "0.4.11" meilisearch-error = { path = "../meilisearch-error", version = "0.17.0" } meilisearch-schema = { path = "../meilisearch-schema", version = "0.17.0" } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.1" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.2" } meilisearch-types = { path = "../meilisearch-types", version = "0.17.0" } once_cell = "1.5.2" ordered-float = { version = "2.0.1", features = ["serde"] } diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs index c78d1f4ff..d83c02a28 100644 --- a/meilisearch-core/src/raw_indexer.rs +++ b/meilisearch-core/src/raw_indexer.rs @@ -140,7 +140,7 @@ fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator< tokens .scan((0, None), |(offset, prev_kind), token| { match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Any => { + TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { *offset += match *prev_kind { Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, Some(_) => 1, @@ -227,7 +227,7 @@ mod tests { let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stopwords)); let analyzer = analyzer.analyze(text); let tokens: Vec<_> = process_tokens(analyzer.tokens()).map(|(_, t)| t.text().to_string()).collect(); - assert_eq!(tokens, ["为", "一", "包含", "一千多万", "目", "词", "的", "带", "标记", "平衡", "语料库"]); + assert_eq!(tokens, ["为", "一", "包含", "一千多万", "目词", "的", "带", "标记", "平衡", "语料库"]); } #[test] diff --git a/meilisearch-http/tests/search.rs b/meilisearch-http/tests/search.rs index 82804a019..cd1fae4bd 100644 --- a/meilisearch-http/tests/search.rs +++ b/meilisearch-http/tests/search.rs @@ -401,7 +401,7 @@ async fn search_with_attribute_to_highlight_wildcard_chinese() { "email": "SunTzu@chorizon.com", "phone": "+1 (810) 407-3258", "address": "吴国", - "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孙子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为乐安氏始祖,次子孙明为富春氏始祖。\r\n", + "about": "孫武(前544年-前470年或前496年),字長卿,春秋時期齊國人,著名軍事家、政治家,兵家代表人物。兵書《孙子兵法》的作者,後人尊稱為孫子、兵聖、東方兵聖,山東、蘇州等地尚有祀奉孫武的廟宇兵聖廟。其族人为乐安孙氏始祖,次子孙明为富春孙氏始祖。\r\n", "registered": "2014-10-20T10:13:32 -02:00", "latitude": 17.11935, "longitude": 65.38197,