From 877717cb2675e154eaa98d947651c2e2405c485a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 19 Sep 2024 08:34:04 +0200 Subject: [PATCH 1/5] Add a test using Swedish documents --- meilisearch/tests/search/locales.rs | 122 ++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/meilisearch/tests/search/locales.rs b/meilisearch/tests/search/locales.rs index 4724f975d..53bcece06 100644 --- a/meilisearch/tests/search/locales.rs +++ b/meilisearch/tests/search/locales.rs @@ -1143,3 +1143,125 @@ async fn facet_search_with_localized_attributes() { } "###); } +#[actix_rt::test] +async fn swedish_search() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = json!([ + {"id": "tra1-1", "product": "trä"}, + {"id": "tra2-1", "product": "traktor"}, + {"id": "tra1-2", "product": "träbjälke"}, + {"id": "tra2-2", "product": "trafiksignal"}, + ]); + index.add_documents(documents, None).await; + let (_response, _) = index + .update_settings(json!({ + "searchableAttributes": ["product"], + "localizedAttributes": [ + // force swedish + {"attributePatterns": ["product"], "locales": ["swe"]} + ] + })) + .await; + index.wait_task(1).await; + + // infer swedish + index + .search(json!({"q": "trä", "attributesToRetrieve": ["product"]}), |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "product": "trä" + }, + { + "product": "träbjälke" + } + ], + "query": "trä", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2 + } + "###); + snapshot!(code, @"200 OK"); + }) + .await; + + index + .search(json!({"q": "tra", "attributesToRetrieve": ["product"]}), |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "product": "traktor" + }, + { + "product": "trafiksignal" + } + ], + "query": "tra", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2 + } + "###); + snapshot!(code, @"200 OK"); + }) + .await; + + // force swedish 
+ index + .search( + json!({"q": "trä", "locales": ["swe"], "attributesToRetrieve": ["product"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "product": "trä" + }, + { + "product": "träbjälke" + } + ], + "query": "trä", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + index + .search( + json!({"q": "tra", "locales": ["swe"], "attributesToRetrieve": ["product"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "product": "traktor" + }, + { + "product": "trafiksignal" + } + ], + "query": "tra", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} From bbaee3dbc63640984051e3eb37e7fc0e57dd873e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 19 Sep 2024 08:34:51 +0200 Subject: [PATCH 2/5] Add Swedish pipeline in all-tokenization feature --- milli/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 79b61b4f1..8a5ba366f 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -106,6 +106,7 @@ all-tokenizations = [ "charabia/greek", "charabia/khmer", "charabia/vietnamese", + "charabia/swedish-recomposition", ] # Use POSIX semaphores instead of SysV semaphores in LMDB From f77661ec44f48eddc4fe7f4538815322e363b0c4 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 19 Sep 2024 16:08:59 +0200 Subject: [PATCH 3/5] Update Charabia v0.9.1 --- Cargo.lock | 6 +++--- meilisearch-types/src/locales.rs | 11 ++++++++--- meilisearch/tests/search/locales.rs | 6 +++--- milli/Cargo.toml | 2 +- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1af89d382..922ec606d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -933,9 +933,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.9.0" +version = "0.9.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "03cd8f290cae94934cdd0103c14c2de9faf2d7d85be0d24d511af2bf1b14119d" +checksum = "55ff52497324e7d168505a16949ae836c14595606fab94687238d2f6c8d4c798" dependencies = [ "aho-corasick", "csv", @@ -2838,7 +2838,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e310b3a6b5907f99202fcdb4960ff45b93735d7c7d96b760fcff8db2dc0e103d" dependencies = [ "cfg-if", - "windows-targets 0.48.1", + "windows-targets 0.52.4", ] [[package]] diff --git a/meilisearch-types/src/locales.rs b/meilisearch-types/src/locales.rs index 36c45aac3..8d746779e 100644 --- a/meilisearch-types/src/locales.rs +++ b/meilisearch-types/src/locales.rs @@ -39,12 +39,14 @@ macro_rules! make_locale { pub enum Locale { $($iso_639_1,)+ $($iso_639_3,)+ + Cmn, } impl From for Locale { fn from(other: milli::tokenizer::Language) -> Locale { match other { $(milli::tokenizer::Language::$iso_639_3 => Locale::$iso_639_3,)+ + milli::tokenizer::Language::Cmn => Locale::Cmn, } } } @@ -54,6 +56,7 @@ macro_rules! make_locale { match other { $(Locale::$iso_639_1 => milli::tokenizer::Language::$iso_639_3,)+ $(Locale::$iso_639_3 => milli::tokenizer::Language::$iso_639_3,)+ + Locale::Cmn => milli::tokenizer::Language::Cmn, } } } @@ -65,6 +68,7 @@ macro_rules! make_locale { let locale = match s { $($iso_639_1_str => Locale::$iso_639_1,)+ $($iso_639_3_str => Locale::$iso_639_3,)+ + "cmn" => Locale::Cmn, _ => return Err(LocaleFormatError { invalid_locale: s.to_string() }), }; @@ -79,8 +83,9 @@ macro_rules! 
make_locale { impl std::fmt::Display for LocaleFormatError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let valid_locales = [$($iso_639_1_str),+,$($iso_639_3_str),+].join(", "); - write!(f, "Unsupported locale `{}`, expected one of {}", self.invalid_locale, valid_locales) + let mut valid_locales = [$($iso_639_1_str),+,$($iso_639_3_str),+,"cmn"]; + valid_locales.sort_by(|left, right| left.len().cmp(&right.len()).then(left.cmp(right))); + write!(f, "Unsupported locale `{}`, expected one of {}", self.invalid_locale, valid_locales.join(", ")) } } @@ -99,7 +104,6 @@ make_locale!( (Bg, "bg") => (Bul, "bul"), (Ca, "ca") => (Cat, "cat"), (Cs, "cs") => (Ces, "ces"), - (Zh, "zh") => (Cmn, "cmn"), (Da, "da") => (Dan, "dan"), (De, "de") => (Deu, "deu"), (El, "el") => (Ell, "ell"), @@ -157,5 +161,6 @@ make_locale!( (Uz, "uz") => (Uzb, "uzb"), (Vi, "vi") => (Vie, "vie"), (Yi, "yi") => (Yid, "yid"), + (Zh, "zh") => (Zho, "zho"), (Zu, "zu") => (Zul, "zul"), ); diff --git a/meilisearch/tests/search/locales.rs b/meilisearch/tests/search/locales.rs index 53bcece06..408a01b0b 100644 --- a/meilisearch/tests/search/locales.rs +++ b/meilisearch/tests/search/locales.rs @@ -922,7 +922,7 @@ async fn invalid_locales() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Unknown value `invalid` at `.locales[0]`: expected one of `af`, `ak`, `am`, `ar`, `az`, `be`, `bn`, `bg`, `ca`, `cs`, `zh`, `da`, `de`, `el`, `en`, `eo`, `et`, `fi`, `fr`, `gu`, `he`, `hi`, `hr`, `hu`, `hy`, `id`, `it`, `jv`, `ja`, `kn`, `ka`, `km`, `ko`, `la`, `lv`, `lt`, `ml`, `mr`, `mk`, `my`, `ne`, `nl`, `nb`, `or`, `pa`, `fa`, `pl`, `pt`, `ro`, `ru`, `si`, `sk`, `sl`, `sn`, `es`, `sr`, `sv`, `ta`, `te`, `tl`, `th`, `tk`, `tr`, `uk`, `ur`, `uz`, `vi`, `yi`, `zu`, `afr`, `aka`, `amh`, `ara`, `aze`, `bel`, `ben`, `bul`, `cat`, `ces`, `cmn`, `dan`, `deu`, `ell`, `eng`, `epo`, `est`, `fin`, `fra`, `guj`, `heb`, `hin`, `hrv`, `hun`, `hye`, `ind`, `ita`, 
`jav`, `jpn`, `kan`, `kat`, `khm`, `kor`, `lat`, `lav`, `lit`, `mal`, `mar`, `mkd`, `mya`, `nep`, `nld`, `nob`, `ori`, `pan`, `pes`, `pol`, `por`, `ron`, `rus`, `sin`, `slk`, `slv`, `sna`, `spa`, `srp`, `swe`, `tam`, `tel`, `tgl`, `tha`, `tuk`, `tur`, `ukr`, `urd`, `uzb`, `vie`, `yid`, `zul`", + "message": "Unknown value `invalid` at `.locales[0]`: expected one of `af`, `ak`, `am`, `ar`, `az`, `be`, `bn`, `bg`, `ca`, `cs`, `da`, `de`, `el`, `en`, `eo`, `et`, `fi`, `fr`, `gu`, `he`, `hi`, `hr`, `hu`, `hy`, `id`, `it`, `jv`, `ja`, `kn`, `ka`, `km`, `ko`, `la`, `lv`, `lt`, `ml`, `mr`, `mk`, `my`, `ne`, `nl`, `nb`, `or`, `pa`, `fa`, `pl`, `pt`, `ro`, `ru`, `si`, `sk`, `sl`, `sn`, `es`, `sr`, `sv`, `ta`, `te`, `tl`, `th`, `tk`, `tr`, `uk`, `ur`, `uz`, `vi`, `yi`, `zh`, `zu`, `afr`, `aka`, `amh`, `ara`, `aze`, `bel`, `ben`, `bul`, `cat`, `ces`, `dan`, `deu`, `ell`, `eng`, `epo`, `est`, `fin`, `fra`, `guj`, `heb`, `hin`, `hrv`, `hun`, `hye`, `ind`, `ita`, `jav`, `jpn`, `kan`, `kat`, `khm`, `kor`, `lat`, `lav`, `lit`, `mal`, `mar`, `mkd`, `mya`, `nep`, `nld`, `nob`, `ori`, `pan`, `pes`, `pol`, `por`, `ron`, `rus`, `sin`, `slk`, `slv`, `sna`, `spa`, `srp`, `swe`, `tam`, `tel`, `tgl`, `tha`, `tuk`, `tur`, `ukr`, `urd`, `uzb`, `vie`, `yid`, `zho`, `zul`, `cmn`", "code": "invalid_search_locales", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_locales" @@ -935,7 +935,7 @@ async fn invalid_locales() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid value in parameter `locales`: Unsupported locale `invalid`, expected one of af, ak, am, ar, az, be, bn, bg, ca, cs, zh, da, de, el, en, eo, et, fi, fr, gu, he, hi, hr, hu, hy, id, it, jv, ja, kn, ka, km, ko, la, lv, lt, ml, mr, mk, my, ne, nl, nb, or, pa, fa, pl, pt, ro, ru, si, sk, sl, sn, es, sr, sv, ta, te, tl, th, tk, tr, uk, ur, uz, vi, yi, zu, afr, aka, amh, ara, aze, bel, ben, bul, cat, ces, cmn, dan, deu, ell, eng, epo, est, fin, fra, 
guj, heb, hin, hrv, hun, hye, ind, ita, jav, jpn, kan, kat, khm, kor, lat, lav, lit, mal, mar, mkd, mya, nep, nld, nob, ori, pan, pes, pol, por, ron, rus, sin, slk, slv, sna, spa, srp, swe, tam, tel, tgl, tha, tuk, tur, ukr, urd, uzb, vie, yid, zul", + "message": "Invalid value in parameter `locales`: Unsupported locale `invalid`, expected one of af, ak, am, ar, az, be, bg, bn, ca, cs, da, de, el, en, eo, es, et, fa, fi, fr, gu, he, hi, hr, hu, hy, id, it, ja, jv, ka, km, kn, ko, la, lt, lv, mk, ml, mr, my, nb, ne, nl, or, pa, pl, pt, ro, ru, si, sk, sl, sn, sr, sv, ta, te, th, tk, tl, tr, uk, ur, uz, vi, yi, zh, zu, afr, aka, amh, ara, aze, bel, ben, bul, cat, ces, cmn, dan, deu, ell, eng, epo, est, fin, fra, guj, heb, hin, hrv, hun, hye, ind, ita, jav, jpn, kan, kat, khm, kor, lat, lav, lit, mal, mar, mkd, mya, nep, nld, nob, ori, pan, pes, pol, por, ron, rus, sin, slk, slv, sna, spa, srp, swe, tam, tel, tgl, tha, tuk, tur, ukr, urd, uzb, vie, yid, zho, zul", "code": "invalid_search_locales", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_locales" @@ -957,7 +957,7 @@ async fn invalid_localized_attributes_rules() { .await; snapshot!(response, @r###" { - "message": "Unknown value `japan` at `.localizedAttributes[0].locales[0]`: expected one of `af`, `ak`, `am`, `ar`, `az`, `be`, `bn`, `bg`, `ca`, `cs`, `zh`, `da`, `de`, `el`, `en`, `eo`, `et`, `fi`, `fr`, `gu`, `he`, `hi`, `hr`, `hu`, `hy`, `id`, `it`, `jv`, `ja`, `kn`, `ka`, `km`, `ko`, `la`, `lv`, `lt`, `ml`, `mr`, `mk`, `my`, `ne`, `nl`, `nb`, `or`, `pa`, `fa`, `pl`, `pt`, `ro`, `ru`, `si`, `sk`, `sl`, `sn`, `es`, `sr`, `sv`, `ta`, `te`, `tl`, `th`, `tk`, `tr`, `uk`, `ur`, `uz`, `vi`, `yi`, `zu`, `afr`, `aka`, `amh`, `ara`, `aze`, `bel`, `ben`, `bul`, `cat`, `ces`, `cmn`, `dan`, `deu`, `ell`, `eng`, `epo`, `est`, `fin`, `fra`, `guj`, `heb`, `hin`, `hrv`, `hun`, `hye`, `ind`, `ita`, `jav`, `jpn`, `kan`, `kat`, `khm`, `kor`, `lat`, `lav`, `lit`, `mal`, `mar`, `mkd`, `mya`, 
`nep`, `nld`, `nob`, `ori`, `pan`, `pes`, `pol`, `por`, `ron`, `rus`, `sin`, `slk`, `slv`, `sna`, `spa`, `srp`, `swe`, `tam`, `tel`, `tgl`, `tha`, `tuk`, `tur`, `ukr`, `urd`, `uzb`, `vie`, `yid`, `zul`", + "message": "Unknown value `japan` at `.localizedAttributes[0].locales[0]`: expected one of `af`, `ak`, `am`, `ar`, `az`, `be`, `bn`, `bg`, `ca`, `cs`, `da`, `de`, `el`, `en`, `eo`, `et`, `fi`, `fr`, `gu`, `he`, `hi`, `hr`, `hu`, `hy`, `id`, `it`, `jv`, `ja`, `kn`, `ka`, `km`, `ko`, `la`, `lv`, `lt`, `ml`, `mr`, `mk`, `my`, `ne`, `nl`, `nb`, `or`, `pa`, `fa`, `pl`, `pt`, `ro`, `ru`, `si`, `sk`, `sl`, `sn`, `es`, `sr`, `sv`, `ta`, `te`, `tl`, `th`, `tk`, `tr`, `uk`, `ur`, `uz`, `vi`, `yi`, `zh`, `zu`, `afr`, `aka`, `amh`, `ara`, `aze`, `bel`, `ben`, `bul`, `cat`, `ces`, `dan`, `deu`, `ell`, `eng`, `epo`, `est`, `fin`, `fra`, `guj`, `heb`, `hin`, `hrv`, `hun`, `hye`, `ind`, `ita`, `jav`, `jpn`, `kan`, `kat`, `khm`, `kor`, `lat`, `lav`, `lit`, `mal`, `mar`, `mkd`, `mya`, `nep`, `nld`, `nob`, `ori`, `pan`, `pes`, `pol`, `por`, `ron`, `rus`, `sin`, `slk`, `slv`, `sna`, `spa`, `srp`, `swe`, `tam`, `tel`, `tgl`, `tha`, `tuk`, `tur`, `ukr`, `urd`, `uzb`, `vie`, `yid`, `zho`, `zul`, `cmn`", "code": "invalid_settings_localized_attributes", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_settings_localized_attributes" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 8a5ba366f..e0a85ea8f 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ bincode = "1.3.3" bstr = "1.9.1" bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.9.0", default-features = false } +charabia = { version = "0.9.1", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.13" deserr = "0.6.2" From 7d6768e4c4841cca4f01c098b9829c63a6ed1377 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 19 Sep 2024 13:30:07 +0200 Subject: [PATCH 4/5] Add german tokenization 
pipeline --- meilisearch-types/Cargo.toml | 2 ++ meilisearch/Cargo.toml | 1 + milli/Cargo.toml | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml index 73306c4dc..cb4937e57 100644 --- a/meilisearch-types/Cargo.toml +++ b/meilisearch-types/Cargo.toml @@ -66,3 +66,5 @@ khmer = ["milli/khmer"] vietnamese = ["milli/vietnamese"] # force swedish character recomposition swedish-recomposition = ["milli/swedish-recomposition"] +# allow german specialized tokenization +german = ["milli/german"] diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index e614ecc6a..2a16e1017 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -153,6 +153,7 @@ greek = ["meilisearch-types/greek"] khmer = ["meilisearch-types/khmer"] vietnamese = ["meilisearch-types/vietnamese"] swedish-recomposition = ["meilisearch-types/swedish-recomposition"] +german = ["meilisearch-types/german"] [package.metadata.mini-dashboard] assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index e0a85ea8f..8c77f338c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -107,6 +107,7 @@ all-tokenizations = [ "charabia/khmer", "charabia/vietnamese", "charabia/swedish-recomposition", + "charabia/german-segmentation", ] # Use POSIX semaphores instead of SysV semaphores in LMDB @@ -139,6 +140,9 @@ khmer = ["charabia/khmer"] # allow vietnamese specialized tokenization vietnamese = ["charabia/vietnamese"] +# allow german specialized tokenization +german = ["charabia/german-segmentation"] + # force swedish character recomposition swedish-recomposition = ["charabia/swedish-recomposition"] From 465afe01b2e48e351cff7bd41fe7a65549958eaf Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 19 Sep 2024 13:41:57 +0200 Subject: [PATCH 5/5] Add test for German --- meilisearch/tests/search/locales.rs | 70 +++++++++++++++++++++++++++++ 1 file changed, 70
insertions(+) diff --git a/meilisearch/tests/search/locales.rs b/meilisearch/tests/search/locales.rs index 408a01b0b..c01d854e2 100644 --- a/meilisearch/tests/search/locales.rs +++ b/meilisearch/tests/search/locales.rs @@ -1143,6 +1143,7 @@ async fn facet_search_with_localized_attributes() { } "###); } + #[actix_rt::test] async fn swedish_search() { let server = Server::new().await; @@ -1265,3 +1266,72 @@ async fn swedish_search() { ) .await; } + +#[actix_rt::test] +async fn german_search() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = json!([ + {"id": 1, "product": "Interkulturalität"}, + {"id": 2, "product": "Wissensorganisation"}, + ]); + index.add_documents(documents, None).await; + let (_response, _) = index + .update_settings(json!({ + "searchableAttributes": ["product"], + "localizedAttributes": [ + // force german + {"attributePatterns": ["product"], "locales": ["deu"]} + ] + })) + .await; + index.wait_task(1).await; + + // infer german + index + .search( + json!({"q": "kulturalität", "attributesToRetrieve": ["product"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "product": "Interkulturalität" + } + ], + "query": "kulturalität", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + index + .search( + json!({"q": "organisation", "attributesToRetrieve": ["product"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "product": "Wissensorganisation" + } + ], + "query": "organisation", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +}