2468: Update milli 0.29 r=Kerollmops a=ManyTheFish

- [x] Update milli to 0.29
- [x] Integrate charabia
- [x] Set disabled_words to default when Index::exact_words returns None
- [x] Fix ranking rules integration test

fixes #2375
fixes #2144
fixes #2417
fixes #2407

Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
bors[bot] 2022-06-08 14:29:20 +00:00 committed by GitHub
commit 6171f17f1d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 71 additions and 85 deletions

99
Cargo.lock generated
View File

@ -643,12 +643,33 @@ dependencies = [
]
[[package]]
name = "character_converter"
version = "1.0.0"
name = "charabia"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c"
checksum = "4a26a3df4d9c9231eb1e757fe6b1c66c471e0c2cd5410265e7c3109a726663c4"
dependencies = [
"character_converter",
"cow-utils",
"deunicode",
"fst",
"jieba-rs",
"lindera",
"lindera-core",
"once_cell",
"slice-group-by",
"unicode-segmentation",
"whatlang",
]
[[package]]
name = "character_converter"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7064c6e919124b6541c52fef59d88c3c3eabdf4bc97c13b14551df775aead02"
dependencies = [
"bincode",
"fst",
"once_cell",
]
[[package]]
@ -1102,8 +1123,8 @@ dependencies = [
[[package]]
name = "filter-parser"
version = "0.28.0"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
version = "0.29.1"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
dependencies = [
"nom",
"nom_locate",
@ -1127,8 +1148,8 @@ dependencies = [
[[package]]
name = "flatten-serde-json"
version = "0.28.0"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
version = "0.29.1"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
dependencies = [
"serde_json",
]
@ -1640,8 +1661,8 @@ dependencies = [
[[package]]
name = "json-depth-checker"
version = "0.28.0"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
version = "0.29.1"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
dependencies = [
"serde_json",
]
@ -1719,9 +1740,9 @@ dependencies = [
[[package]]
name = "lindera"
version = "0.12.6"
version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dea10df226936ff54f16d3922500e08ef4be2ba7c0070bec9ad4a1474316111"
checksum = "7d1c5db4b1d12637aa316dc1adb215f78fe79025080af750942516c5ff17d1a0"
dependencies = [
"anyhow",
"bincode",
@ -1741,9 +1762,9 @@ dependencies = [
[[package]]
name = "lindera-cc-cedict-builder"
version = "0.12.6"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4392785248c3d8755c6fae9d0086d27ad7a1d6810155a2494fe5206e2021f471"
checksum = "73a3509fb497340571d49feddb57e1db2ce5248c4d449f2548d0ee8cb745eb1e"
dependencies = [
"anyhow",
"bincode",
@ -1761,9 +1782,9 @@ dependencies = [
[[package]]
name = "lindera-core"
version = "0.12.6"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af63a4484334d4b83277621f1ba62fb83472858cc37fb4ab2181a4c19eebcb38"
checksum = "5d20d1b2c085393aed58625d741beca69410e1143fc35bc67ebc35c9885f9f74"
dependencies = [
"anyhow",
"bincode",
@ -1777,9 +1798,9 @@ dependencies = [
[[package]]
name = "lindera-decompress"
version = "0.12.6"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "817ee62bc8973ec2457805df83796c59f074e49a4a0ee9baffe2663fe157f54a"
checksum = "b96b8050cded13927a99bcb8cbb0987f89fc8f35429fc153b4bc05ddc7a53a44"
dependencies = [
"anyhow",
"lzma-rs",
@ -1788,9 +1809,9 @@ dependencies = [
[[package]]
name = "lindera-dictionary"
version = "0.12.6"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd57501ee44a6aba0431d043c7926347e29883a79d8fc3955b8837e4ad1fee3c"
checksum = "5abe3dddc22303402957edb4472ab0c996e0d93b3b00643de3bee8b28c2f9297"
dependencies = [
"anyhow",
"bincode",
@ -1800,9 +1821,9 @@ dependencies = [
[[package]]
name = "lindera-ipadic"
version = "0.12.6"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade3bd3faa5f0db629c26264663e901dee5f46221eb04c2c7b592bd7485d44f9"
checksum = "b8f4c111f6ad9eb9e015d02061af2ed36fc0255f29359294415c7c2f1ea5b5b6"
dependencies = [
"bincode",
"byteorder",
@ -1817,9 +1838,9 @@ dependencies = [
[[package]]
name = "lindera-ipadic-builder"
version = "0.12.6"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee61f8dd6566738c5fd0ee9b1c11212ffc2d1f97af69c08a02cbb5c49995250a"
checksum = "a2b9893f22a4a7511ac70ff7d96cda9b8d7259b7d7121784183c73bc593ce6e7"
dependencies = [
"anyhow",
"bincode",
@ -1837,9 +1858,9 @@ dependencies = [
[[package]]
name = "lindera-ko-dic-builder"
version = "0.12.6"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01f05950d9adc7aa42aa8b16be1616f9625576c867179ac29372714eaed6993d"
checksum = "14282600ebfe7ab6fd4f3042143024ff9d74c09d58fd983d0c587839cf940d4a"
dependencies = [
"anyhow",
"bincode",
@ -1857,9 +1878,9 @@ dependencies = [
[[package]]
name = "lindera-unidic-builder"
version = "0.12.6"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3836c1278b8309ebf209c67bc7a935f4ce7c9246a578b250540398806a40b81d"
checksum = "b20825d46c95854e47c532c3e548dfec07c8f187c1ed89383cb6c35790338088"
dependencies = [
"anyhow",
"bincode",
@ -2142,24 +2163,6 @@ dependencies = [
"whoami",
]
[[package]]
name = "meilisearch-tokenizer"
version = "0.2.9"
source = "git+https://github.com/meilisearch/tokenizer.git?tag=v0.2.9#1dfc8ad9f5b338c39c3bc5fd5b2d0c1328314ddc"
dependencies = [
"character_converter",
"cow-utils",
"deunicode",
"fst",
"jieba-rs",
"lindera",
"lindera-core",
"once_cell",
"slice-group-by",
"unicode-segmentation",
"whatlang",
]
[[package]]
name = "memchr"
version = "2.5.0"
@ -2186,13 +2189,14 @@ dependencies = [
[[package]]
name = "milli"
version = "0.28.0"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
version = "0.29.1"
source = "git+https://github.com/meilisearch/milli.git?tag=v0.29.1#7313d6c5331e7dc13e9ded70b60b1f56dd7e583c"
dependencies = [
"bimap",
"bincode",
"bstr",
"byteorder",
"charabia",
"concat-arrays",
"crossbeam-channel",
"csv",
@ -2209,7 +2213,6 @@ dependencies = [
"levenshtein_automata",
"log",
"logging_timer",
"meilisearch-tokenizer",
"memmap2",
"obkv",
"once_cell",

View File

@ -8,7 +8,7 @@ base64 = "0.13.0"
enum-iterator = "0.7.0"
hmac = "0.12.1"
meilisearch-error = { path = "../meilisearch-error" }
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" }
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.29.1" }
rand = "0.8.4"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = { version = "1.0.79", features = ["preserve_order"] }

View File

@ -89,9 +89,9 @@ impl Index<'_> {
}
pub async fn wait_task(&self, update_id: u64) -> Value {
// try 10 times to get status, or panic to not wait forever
// try several times to get status, or panic to not wait forever
let url = format!("/tasks/{}", update_id);
for _ in 0..10 {
for _ in 0..100 {
let (response, status_code) = self.service.get(&url).await;
assert_eq!(200, status_code, "response: {}", response);
@ -99,7 +99,8 @@ impl Index<'_> {
return response;
}
sleep(Duration::from_secs(1)).await;
// wait 0.5 second.
sleep(Duration::from_millis(500)).await;
}
panic!("Timeout waiting for update id");
}

View File

@ -43,7 +43,7 @@ async fn get_document() {
]);
let (_, code) = index.add_documents(documents, None).await;
assert_eq!(code, 202);
index.wait_task(0).await;
index.wait_task(1).await;
let (response, code) = index.get_document(0, None).await;
assert_eq!(code, 200);
assert_eq!(
@ -306,7 +306,7 @@ async fn get_document_s_nested_attributes_to_retrieve() {
]);
let (_, code) = index.add_documents(documents, None).await;
assert_eq!(code, 202);
index.wait_task(0).await;
index.wait_task(1).await;
let (response, code) = index
.get_document(

View File

@ -283,7 +283,7 @@ async fn error_set_invalid_ranking_rules() {
assert_eq!(response["status"], "failed");
let expected_error = json!({
"message": r#"`manyTheFish` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules."#,
"message": r#"`manyTheFish` ranking rule is invalid. Valid ranking rules are words, typo, sort, proximity, attribute, exactness and custom ranking rules."#,
"code": "invalid_ranking_rule",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_ranking_rule"

View File

@ -30,7 +30,7 @@ lazy_static = "1.4.0"
log = "0.4.14"
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-error = { path = "../meilisearch-error" }
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" }
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.29.1" }
mime = "0.3.16"
num_cpus = "1.13.1"
obkv = "0.2.0"

View File

@ -175,12 +175,10 @@ impl Index {
two_typos: Setting::Set(self.min_word_len_two_typos(txn)?),
};
let disabled_words = self
.exact_words(txn)?
.into_stream()
.into_strs()?
.into_iter()
.collect();
let disabled_words = match self.exact_words(txn)? {
Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(),
None => BTreeSet::new(),
};
let disabled_attributes = self
.exact_attributes(txn)?

View File

@ -4,7 +4,7 @@ use std::str::FromStr;
use std::time::Instant;
use either::Either;
use milli::tokenizer::{Analyzer, AnalyzerConfig};
use milli::tokenizer::TokenizerBuilder;
use milli::{
AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, MatchBounds, MatcherBuilder, SortError,
};
@ -175,12 +175,9 @@ impl Index {
&displayed_ids,
);
let stop_words = fst::Set::default();
let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words);
let analyzer = Analyzer::new(config);
let tokenizer = TokenizerBuilder::default().build();
let mut formatter_builder = MatcherBuilder::from_matching_words(matching_words);
let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer);
formatter_builder.crop_marker(query.crop_marker);
formatter_builder.highlight_prefix(query.highlight_pre_tag);
formatter_builder.highlight_suffix(query.highlight_post_tag);
@ -204,7 +201,6 @@ impl Index {
&displayed_document,
&fields_ids_map,
&formatter_builder,
&analyzer,
&formatted_options,
query.show_matches_position,
&displayed_ids,
@ -414,8 +410,7 @@ fn make_document(
fn format_fields<'a, A: AsRef<[u8]>>(
document: &Document,
field_ids_map: &FieldsIdsMap,
builder: &MatcherBuilder,
analyzer: &'a Analyzer<'a, A>,
builder: &MatcherBuilder<'a, A>,
formatted_options: &BTreeMap<FieldId, FormatOptions>,
compute_matches: bool,
displayable_ids: &BTreeSet<FieldId>,
@ -446,7 +441,6 @@ fn format_fields<'a, A: AsRef<[u8]>>(
std::mem::take(value),
builder,
format,
analyzer,
&mut infos,
compute_matches,
);
@ -470,19 +464,14 @@ fn format_fields<'a, A: AsRef<[u8]>>(
fn format_value<'a, A: AsRef<[u8]>>(
value: Value,
builder: &MatcherBuilder,
builder: &MatcherBuilder<'a, A>,
format_options: Option<FormatOptions>,
analyzer: &'a Analyzer<'a, A>,
infos: &mut Vec<MatchBounds>,
compute_matches: bool,
) -> Value {
match value {
Value::String(old_string) => {
// this will be removed with charabia
let analyzed = analyzer.analyze(&old_string);
let tokens: Vec<_> = analyzed.tokens().collect();
let mut matcher = builder.build(&tokens[..], &old_string);
let mut matcher = builder.build(&old_string);
if compute_matches {
let matches = matcher.matches();
infos.extend_from_slice(&matches[..]);
@ -507,7 +496,6 @@ fn format_value<'a, A: AsRef<[u8]>>(
highlight: format_options.highlight,
crop: None,
}),
analyzer,
infos,
compute_matches,
)
@ -527,7 +515,6 @@ fn format_value<'a, A: AsRef<[u8]>>(
highlight: format_options.highlight,
crop: None,
}),
analyzer,
infos,
compute_matches,
),
@ -536,12 +523,9 @@ fn format_value<'a, A: AsRef<[u8]>>(
.collect(),
),
Value::Number(number) => {
// this will be removed with charabia
let s = number.to_string();
let analyzed = analyzer.analyze(&s);
let tokens: Vec<_> = analyzed.tokens().collect();
let mut matcher = builder.build(&tokens[..], &s);
let mut matcher = builder.build(&s);
if compute_matches {
let matches = matcher.matches();
infos.extend_from_slice(&matches[..]);