From 8d0977233402c093095d8e7e7046cd5d0e6d7aed Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 6 Jun 2022 15:52:51 +0200 Subject: [PATCH 1/6] Update milli --- Cargo.lock | 99 +++++++++++++++++++------------------ meilisearch-auth/Cargo.toml | 2 +- meilisearch-lib/Cargo.toml | 2 +- 3 files changed, 53 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a1be24517..647c3ce4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -643,12 +643,33 @@ dependencies = [ ] [[package]] -name = "character_converter" -version = "1.0.0" +name = "charabia" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" +checksum = "4a26a3df4d9c9231eb1e757fe6b1c66c471e0c2cd5410265e7c3109a726663c4" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "lindera", + "lindera-core", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang", +] + +[[package]] +name = "character_converter" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7064c6e919124b6541c52fef59d88c3c3eabdf4bc97c13b14551df775aead02" dependencies = [ "bincode", + "fst", + "once_cell", ] [[package]] @@ -1101,8 +1122,8 @@ dependencies = [ [[package]] name = "filter-parser" -version = "0.28.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" +version = "0.29.1" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c" dependencies = [ "nom", "nom_locate", @@ -1126,8 +1147,8 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "0.28.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" +version = "0.29.1" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c" 
dependencies = [ "serde_json", ] @@ -1630,8 +1651,8 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "0.28.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" +version = "0.29.1" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c" dependencies = [ "serde_json", ] @@ -1709,9 +1730,9 @@ dependencies = [ [[package]] name = "lindera" -version = "0.12.6" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dea10df226936ff54f16d3922500e08ef4be2ba7c0070bec9ad4a1474316111" +checksum = "7d1c5db4b1d12637aa316dc1adb215f78fe79025080af750942516c5ff17d1a0" dependencies = [ "anyhow", "bincode", @@ -1731,9 +1752,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict-builder" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4392785248c3d8755c6fae9d0086d27ad7a1d6810155a2494fe5206e2021f471" +checksum = "73a3509fb497340571d49feddb57e1db2ce5248c4d449f2548d0ee8cb745eb1e" dependencies = [ "anyhow", "bincode", @@ -1751,9 +1772,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af63a4484334d4b83277621f1ba62fb83472858cc37fb4ab2181a4c19eebcb38" +checksum = "5d20d1b2c085393aed58625d741beca69410e1143fc35bc67ebc35c9885f9f74" dependencies = [ "anyhow", "bincode", @@ -1767,9 +1788,9 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "817ee62bc8973ec2457805df83796c59f074e49a4a0ee9baffe2663fe157f54a" +checksum = "b96b8050cded13927a99bcb8cbb0987f89fc8f35429fc153b4bc05ddc7a53a44" dependencies = [ "anyhow", "lzma-rs", @@ -1778,9 +1799,9 @@ dependencies = [ [[package]] name = "lindera-dictionary" 
-version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd57501ee44a6aba0431d043c7926347e29883a79d8fc3955b8837e4ad1fee3c" +checksum = "5abe3dddc22303402957edb4472ab0c996e0d93b3b00643de3bee8b28c2f9297" dependencies = [ "anyhow", "bincode", @@ -1790,9 +1811,9 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade3bd3faa5f0db629c26264663e901dee5f46221eb04c2c7b592bd7485d44f9" +checksum = "b8f4c111f6ad9eb9e015d02061af2ed36fc0255f29359294415c7c2f1ea5b5b6" dependencies = [ "bincode", "byteorder", @@ -1807,9 +1828,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee61f8dd6566738c5fd0ee9b1c11212ffc2d1f97af69c08a02cbb5c49995250a" +checksum = "a2b9893f22a4a7511ac70ff7d96cda9b8d7259b7d7121784183c73bc593ce6e7" dependencies = [ "anyhow", "bincode", @@ -1827,9 +1848,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01f05950d9adc7aa42aa8b16be1616f9625576c867179ac29372714eaed6993d" +checksum = "14282600ebfe7ab6fd4f3042143024ff9d74c09d58fd983d0c587839cf940d4a" dependencies = [ "anyhow", "bincode", @@ -1847,9 +1868,9 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3836c1278b8309ebf209c67bc7a935f4ce7c9246a578b250540398806a40b81d" +checksum = "b20825d46c95854e47c532c3e548dfec07c8f187c1ed89383cb6c35790338088" dependencies = [ "anyhow", "bincode", @@ -2131,24 +2152,6 @@ dependencies = [ "whoami", ] -[[package]] -name = "meilisearch-tokenizer" -version = "0.2.9" -source = 
"git+https://github.com/meilisearch/tokenizer.git?tag=v0.2.9#1dfc8ad9f5b338c39c3bc5fd5b2d0c1328314ddc" -dependencies = [ - "character_converter", - "cow-utils", - "deunicode", - "fst", - "jieba-rs", - "lindera", - "lindera-core", - "once_cell", - "slice-group-by", - "unicode-segmentation", - "whatlang", -] - [[package]] name = "memchr" version = "2.5.0" @@ -2175,13 +2178,14 @@ dependencies = [ [[package]] name = "milli" -version = "0.28.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" +version = "0.29.1" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c" dependencies = [ "bimap", "bincode", "bstr", "byteorder", + "charabia", "concat-arrays", "crossbeam-channel", "csv", @@ -2198,7 +2202,6 @@ dependencies = [ "levenshtein_automata", "log", "logging_timer", - "meilisearch-tokenizer", "memmap2", "obkv", "once_cell", diff --git a/meilisearch-auth/Cargo.toml b/meilisearch-auth/Cargo.toml index 29fa78a14..709898b22 100644 --- a/meilisearch-auth/Cargo.toml +++ b/meilisearch-auth/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" base64 = "0.13.0" enum-iterator = "0.7.0" meilisearch-error = { path = "../meilisearch-error" } -milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" } +milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.29.1" } rand = "0.8.4" serde = { version = "1.0.136", features = ["derive"] } serde_json = { version = "1.0.79", features = ["preserve_order"] } diff --git a/meilisearch-lib/Cargo.toml b/meilisearch-lib/Cargo.toml index e02882b39..730061675 100644 --- a/meilisearch-lib/Cargo.toml +++ b/meilisearch-lib/Cargo.toml @@ -30,7 +30,7 @@ lazy_static = "1.4.0" log = "0.4.14" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-error = { path = "../meilisearch-error" } -milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" } +milli = { git = 
"https://github.com/meilisearch/milli.git", tag = "v0.29.1" } mime = "0.3.16" num_cpus = "1.13.1" obkv = "0.2.0" From 173eea06e18d7b5df4cb965ae3113aa235612f96 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 6 Jun 2022 15:53:28 +0200 Subject: [PATCH 2/6] Replace old tokenizer by charabia --- meilisearch-lib/src/index/search.rs | 30 +++++++---------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/meilisearch-lib/src/index/search.rs b/meilisearch-lib/src/index/search.rs index 91a46600f..6a4a0a672 100644 --- a/meilisearch-lib/src/index/search.rs +++ b/meilisearch-lib/src/index/search.rs @@ -4,7 +4,7 @@ use std::str::FromStr; use std::time::Instant; use either::Either; -use milli::tokenizer::{Analyzer, AnalyzerConfig}; +use milli::tokenizer::TokenizerBuilder; use milli::{ AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, MatchBounds, MatcherBuilder, SortError, }; @@ -175,12 +175,9 @@ impl Index { &displayed_ids, ); - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); + let tokenizer = TokenizerBuilder::default().build(); - let mut formatter_builder = MatcherBuilder::from_matching_words(matching_words); + let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer); formatter_builder.crop_marker(query.crop_marker); formatter_builder.highlight_prefix(query.highlight_pre_tag); formatter_builder.highlight_suffix(query.highlight_post_tag); @@ -204,7 +201,6 @@ impl Index { &displayed_document, &fields_ids_map, &formatter_builder, - &analyzer, &formatted_options, query.show_matches_position, &displayed_ids, @@ -414,8 +410,7 @@ fn make_document( fn format_fields<'a, A: AsRef<[u8]>>( document: &Document, field_ids_map: &FieldsIdsMap, - builder: &MatcherBuilder, - analyzer: &'a Analyzer<'a, A>, + builder: &MatcherBuilder<'a, A>, formatted_options: &BTreeMap, compute_matches: bool, displayable_ids: &BTreeSet, @@ -446,7 
+441,6 @@ fn format_fields<'a, A: AsRef<[u8]>>( std::mem::take(value), builder, format, - analyzer, &mut infos, compute_matches, ); @@ -470,19 +464,14 @@ fn format_fields<'a, A: AsRef<[u8]>>( fn format_value<'a, A: AsRef<[u8]>>( value: Value, - builder: &MatcherBuilder, + builder: &MatcherBuilder<'a, A>, format_options: Option, - analyzer: &'a Analyzer<'a, A>, infos: &mut Vec, compute_matches: bool, ) -> Value { match value { Value::String(old_string) => { - // this will be removed with charabia - let analyzed = analyzer.analyze(&old_string); - let tokens: Vec<_> = analyzed.tokens().collect(); - - let mut matcher = builder.build(&tokens[..], &old_string); + let mut matcher = builder.build(&old_string); if compute_matches { let matches = matcher.matches(); infos.extend_from_slice(&matches[..]); @@ -507,7 +496,6 @@ fn format_value<'a, A: AsRef<[u8]>>( highlight: format_options.highlight, crop: None, }), - analyzer, infos, compute_matches, ) @@ -527,7 +515,6 @@ fn format_value<'a, A: AsRef<[u8]>>( highlight: format_options.highlight, crop: None, }), - analyzer, infos, compute_matches, ), @@ -536,12 +523,9 @@ fn format_value<'a, A: AsRef<[u8]>>( .collect(), ), Value::Number(number) => { - // this will be removed with charabia let s = number.to_string(); - let analyzed = analyzer.analyze(&s); - let tokens: Vec<_> = analyzed.tokens().collect(); - let mut matcher = builder.build(&tokens[..], &s); + let mut matcher = builder.build(&s); if compute_matches { let matches = matcher.matches(); infos.extend_from_slice(&matches[..]); From f5306eb5b0a66520fd59aa5a75cc5b86aa1108a4 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 6 Jun 2022 15:54:50 +0200 Subject: [PATCH 3/6] Set disabled_words to default when Index::exact_words returns None --- meilisearch-lib/src/index/index.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/meilisearch-lib/src/index/index.rs b/meilisearch-lib/src/index/index.rs index 9c6150cfb..1fe191c41 100644 --- 
a/meilisearch-lib/src/index/index.rs +++ b/meilisearch-lib/src/index/index.rs @@ -175,12 +175,10 @@ impl Index { two_typos: Setting::Set(self.min_word_len_two_typos(txn)?), }; - let disabled_words = self - .exact_words(txn)? - .into_stream() - .into_strs()? - .into_iter() - .collect(); + let disabled_words = match self.exact_words(txn)? { + Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(), + None => BTreeSet::new(), + }; let disabled_attributes = self .exact_attributes(txn)? From 09938c9b6fc0dc004c87030a26d0b0dc7f628cf3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 6 Jun 2022 18:33:07 +0200 Subject: [PATCH 4/6] Patch ranking rules error test --- meilisearch-http/tests/settings/get_settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch-http/tests/settings/get_settings.rs b/meilisearch-http/tests/settings/get_settings.rs index e79b3ed26..d3ac47625 100644 --- a/meilisearch-http/tests/settings/get_settings.rs +++ b/meilisearch-http/tests/settings/get_settings.rs @@ -283,7 +283,7 @@ async fn error_set_invalid_ranking_rules() { assert_eq!(response["status"], "failed"); let expected_error = json!({ - "message": r#"`manyTheFish` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules."#, + "message": r#"`manyTheFish` ranking rule is invalid. 
Valid ranking rules are words, typo, sort, proximity, attribute, exactness and custom ranking rules."#, "code": "invalid_ranking_rule", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_ranking_rule" From 0a16f71563c1709e07858d515e82bd31931b8d43 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 8 Jun 2022 12:11:35 +0200 Subject: [PATCH 5/6] Increase wait_task waiting time --- meilisearch-http/tests/common/index.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/meilisearch-http/tests/common/index.rs b/meilisearch-http/tests/common/index.rs index 275bec4cd..010535e21 100644 --- a/meilisearch-http/tests/common/index.rs +++ b/meilisearch-http/tests/common/index.rs @@ -89,9 +89,9 @@ impl Index<'_> { } pub async fn wait_task(&self, update_id: u64) -> Value { - // try 10 times to get status, or panic to not wait forever + // try several times to get status, or panic to not wait forever let url = format!("/tasks/{}", update_id); - for _ in 0..10 { + for _ in 0..100 { let (response, status_code) = self.service.get(&url).await; assert_eq!(200, status_code, "response: {}", response); @@ -99,7 +99,8 @@ impl Index<'_> { return response; } - sleep(Duration::from_secs(1)).await; + // wait 0.5 second.
+ sleep(Duration::from_millis(500)).await; } panic!("Timeout waiting for update id"); } From 55169ff91432bda80c692735713d31b2803d9790 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 8 Jun 2022 15:09:06 +0200 Subject: [PATCH 6/6] Fix test get_document_s_nested_attributes_to_retrieve --- meilisearch-http/tests/documents/get_documents.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/meilisearch-http/tests/documents/get_documents.rs b/meilisearch-http/tests/documents/get_documents.rs index 83e433b22..c15d3f7fa 100644 --- a/meilisearch-http/tests/documents/get_documents.rs +++ b/meilisearch-http/tests/documents/get_documents.rs @@ -43,7 +43,7 @@ async fn get_document() { ]); let (_, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(0).await; + index.wait_task(1).await; let (response, code) = index.get_document(0, None).await; assert_eq!(code, 200); assert_eq!( @@ -306,7 +306,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { ]); let (_, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(0).await; + index.wait_task(1).await; let (response, code) = index .get_document(