diff --git a/Cargo.lock b/Cargo.lock index d4d977ab3..8e3ac34e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -643,12 +643,33 @@ dependencies = [ ] [[package]] -name = "character_converter" -version = "1.0.0" +name = "charabia" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" +checksum = "4a26a3df4d9c9231eb1e757fe6b1c66c471e0c2cd5410265e7c3109a726663c4" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "lindera", + "lindera-core", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang", +] + +[[package]] +name = "character_converter" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7064c6e919124b6541c52fef59d88c3c3eabdf4bc97c13b14551df775aead02" dependencies = [ "bincode", + "fst", + "once_cell", ] [[package]] @@ -1102,8 +1123,8 @@ dependencies = [ [[package]] name = "filter-parser" -version = "0.28.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" +version = "0.29.1" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c" dependencies = [ "nom", "nom_locate", @@ -1127,8 +1148,8 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "0.28.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" +version = "0.29.1" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c" dependencies = [ "serde_json", ] @@ -1640,8 +1661,8 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "0.28.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" +version = "0.29.1" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c" dependencies = [ "serde_json", ] @@ -1719,9 +1740,9 @@ dependencies = [ [[package]] name = "lindera" -version = "0.12.6" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dea10df226936ff54f16d3922500e08ef4be2ba7c0070bec9ad4a1474316111" +checksum = "7d1c5db4b1d12637aa316dc1adb215f78fe79025080af750942516c5ff17d1a0" dependencies = [ "anyhow", "bincode", @@ -1741,9 +1762,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict-builder" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4392785248c3d8755c6fae9d0086d27ad7a1d6810155a2494fe5206e2021f471" +checksum = "73a3509fb497340571d49feddb57e1db2ce5248c4d449f2548d0ee8cb745eb1e" dependencies = [ "anyhow", "bincode", @@ -1761,9 +1782,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af63a4484334d4b83277621f1ba62fb83472858cc37fb4ab2181a4c19eebcb38" +checksum = "5d20d1b2c085393aed58625d741beca69410e1143fc35bc67ebc35c9885f9f74" dependencies = [ "anyhow", "bincode", @@ -1777,9 +1798,9 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "817ee62bc8973ec2457805df83796c59f074e49a4a0ee9baffe2663fe157f54a" +checksum = "b96b8050cded13927a99bcb8cbb0987f89fc8f35429fc153b4bc05ddc7a53a44" dependencies = [ "anyhow", "lzma-rs", @@ -1788,9 +1809,9 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd57501ee44a6aba0431d043c7926347e29883a79d8fc3955b8837e4ad1fee3c" +checksum = "5abe3dddc22303402957edb4472ab0c996e0d93b3b00643de3bee8b28c2f9297" dependencies = [ "anyhow", "bincode", @@ -1800,9 +1821,9 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade3bd3faa5f0db629c26264663e901dee5f46221eb04c2c7b592bd7485d44f9" +checksum = "b8f4c111f6ad9eb9e015d02061af2ed36fc0255f29359294415c7c2f1ea5b5b6" dependencies = [ "bincode", "byteorder", @@ -1817,9 +1838,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee61f8dd6566738c5fd0ee9b1c11212ffc2d1f97af69c08a02cbb5c49995250a" +checksum = "a2b9893f22a4a7511ac70ff7d96cda9b8d7259b7d7121784183c73bc593ce6e7" dependencies = [ "anyhow", "bincode", @@ -1837,9 +1858,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01f05950d9adc7aa42aa8b16be1616f9625576c867179ac29372714eaed6993d" +checksum = "14282600ebfe7ab6fd4f3042143024ff9d74c09d58fd983d0c587839cf940d4a" dependencies = [ "anyhow", "bincode", @@ -1857,9 +1878,9 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" -version = "0.12.6" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3836c1278b8309ebf209c67bc7a935f4ce7c9246a578b250540398806a40b81d" +checksum = "b20825d46c95854e47c532c3e548dfec07c8f187c1ed89383cb6c35790338088" dependencies = [ "anyhow", "bincode", @@ -2142,24 +2163,6 @@ dependencies = [ "whoami", ] -[[package]] -name = "meilisearch-tokenizer" -version = "0.2.9" -source = "git+https://github.com/meilisearch/tokenizer.git?tag=v0.2.9#1dfc8ad9f5b338c39c3bc5fd5b2d0c1328314ddc" -dependencies = [ - "character_converter", - "cow-utils", - "deunicode", - "fst", - "jieba-rs", - "lindera", - "lindera-core", - "once_cell", - "slice-group-by", - "unicode-segmentation", - "whatlang", -] - [[package]] name = "memchr" version = "2.5.0" @@ -2186,13 +2189,14 @@ dependencies = [ [[package]] name = "milli" -version = "0.28.0" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" +version = "0.29.1" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c" dependencies = [ "bimap", "bincode", "bstr", "byteorder", + "charabia", "concat-arrays", "crossbeam-channel", "csv", @@ -2209,7 +2213,6 @@ dependencies = [ "levenshtein_automata", "log", "logging_timer", - "meilisearch-tokenizer", "memmap2", "obkv", "once_cell", diff --git a/meilisearch-auth/Cargo.toml b/meilisearch-auth/Cargo.toml index bb4a9382c..fe76561d8 100644 --- a/meilisearch-auth/Cargo.toml +++ b/meilisearch-auth/Cargo.toml @@ -8,7 +8,7 @@ base64 = "0.13.0" enum-iterator = "0.7.0" hmac = "0.12.1" meilisearch-error = { path = "../meilisearch-error" } -milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" } +milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.29.1" } rand = "0.8.4" serde = { version = "1.0.136", features = ["derive"] } serde_json = { version = "1.0.79", features = ["preserve_order"] } diff --git a/meilisearch-http/tests/common/index.rs b/meilisearch-http/tests/common/index.rs index 275bec4cd..010535e21 100644 --- a/meilisearch-http/tests/common/index.rs +++ b/meilisearch-http/tests/common/index.rs @@ -89,9 +89,9 @@ impl Index<'_> { } pub async fn wait_task(&self, update_id: u64) -> Value { - // try 10 times to get status, or panic to not wait forever + // try several times to get status, or panic to not wait forever let url = format!("/tasks/{}", update_id); - for _ in 0..10 { + for _ in 0..100 { let (response, status_code) = self.service.get(&url).await; assert_eq!(200, status_code, "response: {}", response); @@ -99,7 +99,8 @@ impl Index<'_> { return response; } - sleep(Duration::from_secs(1)).await; + // wait 0.5 second. + sleep(Duration::from_millis(500)).await; } panic!("Timeout waiting for update id"); } diff --git a/meilisearch-http/tests/documents/get_documents.rs b/meilisearch-http/tests/documents/get_documents.rs index 83e433b22..c15d3f7fa 100644 --- a/meilisearch-http/tests/documents/get_documents.rs +++ b/meilisearch-http/tests/documents/get_documents.rs @@ -43,7 +43,7 @@ async fn get_document() { ]); let (_, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(0).await; + index.wait_task(1).await; let (response, code) = index.get_document(0, None).await; assert_eq!(code, 200); assert_eq!( @@ -306,7 +306,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { ]); let (_, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(0).await; + index.wait_task(1).await; let (response, code) = index .get_document( diff --git a/meilisearch-http/tests/settings/get_settings.rs b/meilisearch-http/tests/settings/get_settings.rs index e79b3ed26..d3ac47625 100644 --- a/meilisearch-http/tests/settings/get_settings.rs +++ b/meilisearch-http/tests/settings/get_settings.rs @@ -283,7 +283,7 @@ async fn error_set_invalid_ranking_rules() { assert_eq!(response["status"], "failed"); let expected_error = json!({ - "message": r#"`manyTheFish` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules."#, + "message": r#"`manyTheFish` ranking rule is invalid. Valid ranking rules are words, typo, sort, proximity, attribute, exactness and custom ranking rules."#, "code": "invalid_ranking_rule", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_ranking_rule" diff --git a/meilisearch-lib/Cargo.toml b/meilisearch-lib/Cargo.toml index e02882b39..730061675 100644 --- a/meilisearch-lib/Cargo.toml +++ b/meilisearch-lib/Cargo.toml @@ -30,7 +30,7 @@ lazy_static = "1.4.0" log = "0.4.14" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-error = { path = "../meilisearch-error" } -milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" } +milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.29.1" } mime = "0.3.16" num_cpus = "1.13.1" obkv = "0.2.0" diff --git a/meilisearch-lib/src/index/index.rs b/meilisearch-lib/src/index/index.rs index 9c6150cfb..1fe191c41 100644 --- a/meilisearch-lib/src/index/index.rs +++ b/meilisearch-lib/src/index/index.rs @@ -175,12 +175,10 @@ impl Index { two_typos: Setting::Set(self.min_word_len_two_typos(txn)?), }; - let disabled_words = self - .exact_words(txn)? - .into_stream() - .into_strs()? - .into_iter() - .collect(); + let disabled_words = match self.exact_words(txn)? { + Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(), + None => BTreeSet::new(), + }; let disabled_attributes = self .exact_attributes(txn)? diff --git a/meilisearch-lib/src/index/search.rs b/meilisearch-lib/src/index/search.rs index 91a46600f..6a4a0a672 100644 --- a/meilisearch-lib/src/index/search.rs +++ b/meilisearch-lib/src/index/search.rs @@ -4,7 +4,7 @@ use std::str::FromStr; use std::time::Instant; use either::Either; -use milli::tokenizer::{Analyzer, AnalyzerConfig}; +use milli::tokenizer::TokenizerBuilder; use milli::{ AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, MatchBounds, MatcherBuilder, SortError, }; @@ -175,12 +175,9 @@ impl Index { &displayed_ids, ); - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); + let tokenizer = TokenizerBuilder::default().build(); - let mut formatter_builder = MatcherBuilder::from_matching_words(matching_words); + let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer); formatter_builder.crop_marker(query.crop_marker); formatter_builder.highlight_prefix(query.highlight_pre_tag); formatter_builder.highlight_suffix(query.highlight_post_tag); @@ -204,7 +201,6 @@ impl Index { &displayed_document, &fields_ids_map, &formatter_builder, - &analyzer, &formatted_options, query.show_matches_position, &displayed_ids, @@ -414,8 +410,7 @@ fn make_document( fn format_fields<'a, A: AsRef<[u8]>>( document: &Document, field_ids_map: &FieldsIdsMap, - builder: &MatcherBuilder, - analyzer: &'a Analyzer<'a, A>, + builder: &MatcherBuilder<'a, A>, formatted_options: &BTreeMap, compute_matches: bool, displayable_ids: &BTreeSet, @@ -446,7 +441,6 @@ fn format_fields<'a, A: AsRef<[u8]>>( std::mem::take(value), builder, format, - analyzer, &mut infos, compute_matches, ); @@ -470,19 +464,14 @@ fn format_fields<'a, A: AsRef<[u8]>>( fn format_value<'a, A: AsRef<[u8]>>( value: Value, - builder: &MatcherBuilder, + builder: &MatcherBuilder<'a, A>, format_options: Option, - analyzer: &'a Analyzer<'a, A>, infos: &mut Vec, compute_matches: bool, ) -> Value { match value { Value::String(old_string) => { - // this will be removed with charabia - let analyzed = analyzer.analyze(&old_string); - let tokens: Vec<_> = analyzed.tokens().collect(); - - let mut matcher = builder.build(&tokens[..], &old_string); + let mut matcher = builder.build(&old_string); if compute_matches { let matches = matcher.matches(); infos.extend_from_slice(&matches[..]); @@ -507,7 +496,6 @@ fn format_value<'a, A: AsRef<[u8]>>( highlight: format_options.highlight, crop: None, }), - analyzer, infos, compute_matches, ) @@ -527,7 +515,6 @@ fn format_value<'a, A: AsRef<[u8]>>( highlight: format_options.highlight, crop: None, }), - analyzer, infos, compute_matches, ), @@ -536,12 +523,9 @@ fn format_value<'a, A: AsRef<[u8]>>( .collect(), ), Value::Number(number) => { - // this will be removed with charabia let s = number.to_string(); - let analyzed = analyzer.analyze(&s); - let tokens: Vec<_> = analyzed.tokens().collect(); - let mut matcher = builder.build(&tokens[..], &s); + let mut matcher = builder.build(&s); if compute_matches { let matches = matcher.matches(); infos.extend_from_slice(&matches[..]);