2468: Update milli 0.29 r=Kerollmops a=ManyTheFish

- [x] Update milli to 0.29
- [x] Integrate charabia
- [x] Set disabled_words to default when Index::exact_words returns None
- [x] Fix ranking rules integration test

fixes #2375
fixes #2144
fixes #2417
fixes #2407

Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
bors[bot] 2022-06-08 14:29:20 +00:00 committed by GitHub
commit 6171f17f1d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 71 additions and 85 deletions

99
Cargo.lock generated
View File

@ -643,12 +643,33 @@ dependencies = [
] ]
[[package]] [[package]]
name = "character_converter" name = "charabia"
version = "1.0.0" version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" checksum = "4a26a3df4d9c9231eb1e757fe6b1c66c471e0c2cd5410265e7c3109a726663c4"
dependencies = [
"character_converter",
"cow-utils",
"deunicode",
"fst",
"jieba-rs",
"lindera",
"lindera-core",
"once_cell",
"slice-group-by",
"unicode-segmentation",
"whatlang",
]
[[package]]
name = "character_converter"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7064c6e919124b6541c52fef59d88c3c3eabdf4bc97c13b14551df775aead02"
dependencies = [ dependencies = [
"bincode", "bincode",
"fst",
"once_cell",
] ]
[[package]] [[package]]
@ -1102,8 +1123,8 @@ dependencies = [
[[package]] [[package]]
name = "filter-parser" name = "filter-parser"
version = "0.28.0" version = "0.29.1"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
dependencies = [ dependencies = [
"nom", "nom",
"nom_locate", "nom_locate",
@ -1127,8 +1148,8 @@ dependencies = [
[[package]] [[package]]
name = "flatten-serde-json" name = "flatten-serde-json"
version = "0.28.0" version = "0.29.1"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
dependencies = [ dependencies = [
"serde_json", "serde_json",
] ]
@ -1640,8 +1661,8 @@ dependencies = [
[[package]] [[package]]
name = "json-depth-checker" name = "json-depth-checker"
version = "0.28.0" version = "0.29.1"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
dependencies = [ dependencies = [
"serde_json", "serde_json",
] ]
@ -1719,9 +1740,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera" name = "lindera"
version = "0.12.6" version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dea10df226936ff54f16d3922500e08ef4be2ba7c0070bec9ad4a1474316111" checksum = "7d1c5db4b1d12637aa316dc1adb215f78fe79025080af750942516c5ff17d1a0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -1741,9 +1762,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-cc-cedict-builder" name = "lindera-cc-cedict-builder"
version = "0.12.6" version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4392785248c3d8755c6fae9d0086d27ad7a1d6810155a2494fe5206e2021f471" checksum = "73a3509fb497340571d49feddb57e1db2ce5248c4d449f2548d0ee8cb745eb1e"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -1761,9 +1782,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-core" name = "lindera-core"
version = "0.12.6" version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af63a4484334d4b83277621f1ba62fb83472858cc37fb4ab2181a4c19eebcb38" checksum = "5d20d1b2c085393aed58625d741beca69410e1143fc35bc67ebc35c9885f9f74"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -1777,9 +1798,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-decompress" name = "lindera-decompress"
version = "0.12.6" version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "817ee62bc8973ec2457805df83796c59f074e49a4a0ee9baffe2663fe157f54a" checksum = "b96b8050cded13927a99bcb8cbb0987f89fc8f35429fc153b4bc05ddc7a53a44"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"lzma-rs", "lzma-rs",
@ -1788,9 +1809,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-dictionary" name = "lindera-dictionary"
version = "0.12.6" version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd57501ee44a6aba0431d043c7926347e29883a79d8fc3955b8837e4ad1fee3c" checksum = "5abe3dddc22303402957edb4472ab0c996e0d93b3b00643de3bee8b28c2f9297"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -1800,9 +1821,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-ipadic" name = "lindera-ipadic"
version = "0.12.6" version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade3bd3faa5f0db629c26264663e901dee5f46221eb04c2c7b592bd7485d44f9" checksum = "b8f4c111f6ad9eb9e015d02061af2ed36fc0255f29359294415c7c2f1ea5b5b6"
dependencies = [ dependencies = [
"bincode", "bincode",
"byteorder", "byteorder",
@ -1817,9 +1838,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-ipadic-builder" name = "lindera-ipadic-builder"
version = "0.12.6" version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee61f8dd6566738c5fd0ee9b1c11212ffc2d1f97af69c08a02cbb5c49995250a" checksum = "a2b9893f22a4a7511ac70ff7d96cda9b8d7259b7d7121784183c73bc593ce6e7"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -1837,9 +1858,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-ko-dic-builder" name = "lindera-ko-dic-builder"
version = "0.12.6" version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01f05950d9adc7aa42aa8b16be1616f9625576c867179ac29372714eaed6993d" checksum = "14282600ebfe7ab6fd4f3042143024ff9d74c09d58fd983d0c587839cf940d4a"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -1857,9 +1878,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-unidic-builder" name = "lindera-unidic-builder"
version = "0.12.6" version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3836c1278b8309ebf209c67bc7a935f4ce7c9246a578b250540398806a40b81d" checksum = "b20825d46c95854e47c532c3e548dfec07c8f187c1ed89383cb6c35790338088"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -2142,24 +2163,6 @@ dependencies = [
"whoami", "whoami",
] ]
[[package]]
name = "meilisearch-tokenizer"
version = "0.2.9"
source = "git+https://github.com/meilisearch/tokenizer.git?tag=v0.2.9#1dfc8ad9f5b338c39c3bc5fd5b2d0c1328314ddc"
dependencies = [
"character_converter",
"cow-utils",
"deunicode",
"fst",
"jieba-rs",
"lindera",
"lindera-core",
"once_cell",
"slice-group-by",
"unicode-segmentation",
"whatlang",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.5.0" version = "2.5.0"
@ -2186,13 +2189,14 @@ dependencies = [
[[package]] [[package]]
name = "milli" name = "milli"
version = "0.28.0" version = "0.29.1"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
dependencies = [ dependencies = [
"bimap", "bimap",
"bincode", "bincode",
"bstr", "bstr",
"byteorder", "byteorder",
"charabia",
"concat-arrays", "concat-arrays",
"crossbeam-channel", "crossbeam-channel",
"csv", "csv",
@ -2209,7 +2213,6 @@ dependencies = [
"levenshtein_automata", "levenshtein_automata",
"log", "log",
"logging_timer", "logging_timer",
"meilisearch-tokenizer",
"memmap2", "memmap2",
"obkv", "obkv",
"once_cell", "once_cell",

View File

@ -8,7 +8,7 @@ base64 = "0.13.0"
enum-iterator = "0.7.0" enum-iterator = "0.7.0"
hmac = "0.12.1" hmac = "0.12.1"
meilisearch-error = { path = "../meilisearch-error" } meilisearch-error = { path = "../meilisearch-error" }
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" } milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.29.1" }
rand = "0.8.4" rand = "0.8.4"
serde = { version = "1.0.136", features = ["derive"] } serde = { version = "1.0.136", features = ["derive"] }
serde_json = { version = "1.0.79", features = ["preserve_order"] } serde_json = { version = "1.0.79", features = ["preserve_order"] }

View File

@ -89,9 +89,9 @@ impl Index<'_> {
} }
pub async fn wait_task(&self, update_id: u64) -> Value { pub async fn wait_task(&self, update_id: u64) -> Value {
// try 10 times to get status, or panic to not wait forever // try several times to get status, or panic to not wait forever
let url = format!("/tasks/{}", update_id); let url = format!("/tasks/{}", update_id);
for _ in 0..10 { for _ in 0..100 {
let (response, status_code) = self.service.get(&url).await; let (response, status_code) = self.service.get(&url).await;
assert_eq!(200, status_code, "response: {}", response); assert_eq!(200, status_code, "response: {}", response);
@ -99,7 +99,8 @@ impl Index<'_> {
return response; return response;
} }
sleep(Duration::from_secs(1)).await; // wait 0.5 second.
sleep(Duration::from_millis(500)).await;
} }
panic!("Timeout waiting for update id"); panic!("Timeout waiting for update id");
} }

View File

@ -43,7 +43,7 @@ async fn get_document() {
]); ]);
let (_, code) = index.add_documents(documents, None).await; let (_, code) = index.add_documents(documents, None).await;
assert_eq!(code, 202); assert_eq!(code, 202);
index.wait_task(0).await; index.wait_task(1).await;
let (response, code) = index.get_document(0, None).await; let (response, code) = index.get_document(0, None).await;
assert_eq!(code, 200); assert_eq!(code, 200);
assert_eq!( assert_eq!(
@ -306,7 +306,7 @@ async fn get_document_s_nested_attributes_to_retrieve() {
]); ]);
let (_, code) = index.add_documents(documents, None).await; let (_, code) = index.add_documents(documents, None).await;
assert_eq!(code, 202); assert_eq!(code, 202);
index.wait_task(0).await; index.wait_task(1).await;
let (response, code) = index let (response, code) = index
.get_document( .get_document(

View File

@ -283,7 +283,7 @@ async fn error_set_invalid_ranking_rules() {
assert_eq!(response["status"], "failed"); assert_eq!(response["status"], "failed");
let expected_error = json!({ let expected_error = json!({
"message": r#"`manyTheFish` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules."#, "message": r#"`manyTheFish` ranking rule is invalid. Valid ranking rules are words, typo, sort, proximity, attribute, exactness and custom ranking rules."#,
"code": "invalid_ranking_rule", "code": "invalid_ranking_rule",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_ranking_rule" "link": "https://docs.meilisearch.com/errors#invalid_ranking_rule"

View File

@ -30,7 +30,7 @@ lazy_static = "1.4.0"
log = "0.4.14" log = "0.4.14"
meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-error = { path = "../meilisearch-error" } meilisearch-error = { path = "../meilisearch-error" }
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" } milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.29.1" }
mime = "0.3.16" mime = "0.3.16"
num_cpus = "1.13.1" num_cpus = "1.13.1"
obkv = "0.2.0" obkv = "0.2.0"

View File

@ -175,12 +175,10 @@ impl Index {
two_typos: Setting::Set(self.min_word_len_two_typos(txn)?), two_typos: Setting::Set(self.min_word_len_two_typos(txn)?),
}; };
let disabled_words = self let disabled_words = match self.exact_words(txn)? {
.exact_words(txn)? Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(),
.into_stream() None => BTreeSet::new(),
.into_strs()? };
.into_iter()
.collect();
let disabled_attributes = self let disabled_attributes = self
.exact_attributes(txn)? .exact_attributes(txn)?

View File

@ -4,7 +4,7 @@ use std::str::FromStr;
use std::time::Instant; use std::time::Instant;
use either::Either; use either::Either;
use milli::tokenizer::{Analyzer, AnalyzerConfig}; use milli::tokenizer::TokenizerBuilder;
use milli::{ use milli::{
AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, MatchBounds, MatcherBuilder, SortError, AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, MatchBounds, MatcherBuilder, SortError,
}; };
@ -175,12 +175,9 @@ impl Index {
&displayed_ids, &displayed_ids,
); );
let stop_words = fst::Set::default(); let tokenizer = TokenizerBuilder::default().build();
let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words);
let analyzer = Analyzer::new(config);
let mut formatter_builder = MatcherBuilder::from_matching_words(matching_words); let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer);
formatter_builder.crop_marker(query.crop_marker); formatter_builder.crop_marker(query.crop_marker);
formatter_builder.highlight_prefix(query.highlight_pre_tag); formatter_builder.highlight_prefix(query.highlight_pre_tag);
formatter_builder.highlight_suffix(query.highlight_post_tag); formatter_builder.highlight_suffix(query.highlight_post_tag);
@ -204,7 +201,6 @@ impl Index {
&displayed_document, &displayed_document,
&fields_ids_map, &fields_ids_map,
&formatter_builder, &formatter_builder,
&analyzer,
&formatted_options, &formatted_options,
query.show_matches_position, query.show_matches_position,
&displayed_ids, &displayed_ids,
@ -414,8 +410,7 @@ fn make_document(
fn format_fields<'a, A: AsRef<[u8]>>( fn format_fields<'a, A: AsRef<[u8]>>(
document: &Document, document: &Document,
field_ids_map: &FieldsIdsMap, field_ids_map: &FieldsIdsMap,
builder: &MatcherBuilder, builder: &MatcherBuilder<'a, A>,
analyzer: &'a Analyzer<'a, A>,
formatted_options: &BTreeMap<FieldId, FormatOptions>, formatted_options: &BTreeMap<FieldId, FormatOptions>,
compute_matches: bool, compute_matches: bool,
displayable_ids: &BTreeSet<FieldId>, displayable_ids: &BTreeSet<FieldId>,
@ -446,7 +441,6 @@ fn format_fields<'a, A: AsRef<[u8]>>(
std::mem::take(value), std::mem::take(value),
builder, builder,
format, format,
analyzer,
&mut infos, &mut infos,
compute_matches, compute_matches,
); );
@ -470,19 +464,14 @@ fn format_fields<'a, A: AsRef<[u8]>>(
fn format_value<'a, A: AsRef<[u8]>>( fn format_value<'a, A: AsRef<[u8]>>(
value: Value, value: Value,
builder: &MatcherBuilder, builder: &MatcherBuilder<'a, A>,
format_options: Option<FormatOptions>, format_options: Option<FormatOptions>,
analyzer: &'a Analyzer<'a, A>,
infos: &mut Vec<MatchBounds>, infos: &mut Vec<MatchBounds>,
compute_matches: bool, compute_matches: bool,
) -> Value { ) -> Value {
match value { match value {
Value::String(old_string) => { Value::String(old_string) => {
// this will be removed with charabia let mut matcher = builder.build(&old_string);
let analyzed = analyzer.analyze(&old_string);
let tokens: Vec<_> = analyzed.tokens().collect();
let mut matcher = builder.build(&tokens[..], &old_string);
if compute_matches { if compute_matches {
let matches = matcher.matches(); let matches = matcher.matches();
infos.extend_from_slice(&matches[..]); infos.extend_from_slice(&matches[..]);
@ -507,7 +496,6 @@ fn format_value<'a, A: AsRef<[u8]>>(
highlight: format_options.highlight, highlight: format_options.highlight,
crop: None, crop: None,
}), }),
analyzer,
infos, infos,
compute_matches, compute_matches,
) )
@ -527,7 +515,6 @@ fn format_value<'a, A: AsRef<[u8]>>(
highlight: format_options.highlight, highlight: format_options.highlight,
crop: None, crop: None,
}), }),
analyzer,
infos, infos,
compute_matches, compute_matches,
), ),
@ -536,12 +523,9 @@ fn format_value<'a, A: AsRef<[u8]>>(
.collect(), .collect(),
), ),
Value::Number(number) => { Value::Number(number) => {
// this will be removed with charabia
let s = number.to_string(); let s = number.to_string();
let analyzed = analyzer.analyze(&s);
let tokens: Vec<_> = analyzed.tokens().collect();
let mut matcher = builder.build(&tokens[..], &s); let mut matcher = builder.build(&s);
if compute_matches { if compute_matches {
let matches = matcher.matches(); let matches = matcher.matches();
infos.extend_from_slice(&matches[..]); infos.extend_from_slice(&matches[..]);