4945: Add swedish in default pipelines r=dureuill a=ManyTheFish

# Summary
## Fix Swedish support

In Swedish the characters `å`/`ä`/`ö` are completely different than `a` or `o`  and should not be normalized as the same character.
because the Swedish specialized pipeline was not activated by default, these characters were normalized even with the settings:
```json
{
  "localizedAttributes": [ { "locales": ["swe"], "attributePatterns": ["*"] } ]
}
```

## Update Charabia adding German support

German segmentation will now be activated using the setting:
```json
{
  "localizedAttributes": [ { "locales": ["deu"], "attributePatterns": ["*"] } ]
}
```

# TODO

- [x] Activate Swedish Pipeline
- [x] Add a test to avoid future regressions
- [x] Update Charabia


Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-09-19 14:42:03 +00:00 committed by GitHub
commit bd34ed01d9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 215 additions and 10 deletions

View file

@ -153,6 +153,7 @@ greek = ["meilisearch-types/greek"]
khmer = ["meilisearch-types/khmer"]
vietnamese = ["meilisearch-types/vietnamese"]
swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
german = ["meilisearch-types/german"]
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip"

View file

@ -922,7 +922,7 @@ async fn invalid_locales() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Unknown value `invalid` at `.locales[0]`: expected one of `af`, `ak`, `am`, `ar`, `az`, `be`, `bn`, `bg`, `ca`, `cs`, `zh`, `da`, `de`, `el`, `en`, `eo`, `et`, `fi`, `fr`, `gu`, `he`, `hi`, `hr`, `hu`, `hy`, `id`, `it`, `jv`, `ja`, `kn`, `ka`, `km`, `ko`, `la`, `lv`, `lt`, `ml`, `mr`, `mk`, `my`, `ne`, `nl`, `nb`, `or`, `pa`, `fa`, `pl`, `pt`, `ro`, `ru`, `si`, `sk`, `sl`, `sn`, `es`, `sr`, `sv`, `ta`, `te`, `tl`, `th`, `tk`, `tr`, `uk`, `ur`, `uz`, `vi`, `yi`, `zu`, `afr`, `aka`, `amh`, `ara`, `aze`, `bel`, `ben`, `bul`, `cat`, `ces`, `cmn`, `dan`, `deu`, `ell`, `eng`, `epo`, `est`, `fin`, `fra`, `guj`, `heb`, `hin`, `hrv`, `hun`, `hye`, `ind`, `ita`, `jav`, `jpn`, `kan`, `kat`, `khm`, `kor`, `lat`, `lav`, `lit`, `mal`, `mar`, `mkd`, `mya`, `nep`, `nld`, `nob`, `ori`, `pan`, `pes`, `pol`, `por`, `ron`, `rus`, `sin`, `slk`, `slv`, `sna`, `spa`, `srp`, `swe`, `tam`, `tel`, `tgl`, `tha`, `tuk`, `tur`, `ukr`, `urd`, `uzb`, `vie`, `yid`, `zul`",
"message": "Unknown value `invalid` at `.locales[0]`: expected one of `af`, `ak`, `am`, `ar`, `az`, `be`, `bn`, `bg`, `ca`, `cs`, `da`, `de`, `el`, `en`, `eo`, `et`, `fi`, `fr`, `gu`, `he`, `hi`, `hr`, `hu`, `hy`, `id`, `it`, `jv`, `ja`, `kn`, `ka`, `km`, `ko`, `la`, `lv`, `lt`, `ml`, `mr`, `mk`, `my`, `ne`, `nl`, `nb`, `or`, `pa`, `fa`, `pl`, `pt`, `ro`, `ru`, `si`, `sk`, `sl`, `sn`, `es`, `sr`, `sv`, `ta`, `te`, `tl`, `th`, `tk`, `tr`, `uk`, `ur`, `uz`, `vi`, `yi`, `zh`, `zu`, `afr`, `aka`, `amh`, `ara`, `aze`, `bel`, `ben`, `bul`, `cat`, `ces`, `dan`, `deu`, `ell`, `eng`, `epo`, `est`, `fin`, `fra`, `guj`, `heb`, `hin`, `hrv`, `hun`, `hye`, `ind`, `ita`, `jav`, `jpn`, `kan`, `kat`, `khm`, `kor`, `lat`, `lav`, `lit`, `mal`, `mar`, `mkd`, `mya`, `nep`, `nld`, `nob`, `ori`, `pan`, `pes`, `pol`, `por`, `ron`, `rus`, `sin`, `slk`, `slv`, `sna`, `spa`, `srp`, `swe`, `tam`, `tel`, `tgl`, `tha`, `tuk`, `tur`, `ukr`, `urd`, `uzb`, `vie`, `yid`, `zho`, `zul`, `cmn`",
"code": "invalid_search_locales",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_locales"
@ -935,7 +935,7 @@ async fn invalid_locales() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `locales`: Unsupported locale `invalid`, expected one of af, ak, am, ar, az, be, bn, bg, ca, cs, zh, da, de, el, en, eo, et, fi, fr, gu, he, hi, hr, hu, hy, id, it, jv, ja, kn, ka, km, ko, la, lv, lt, ml, mr, mk, my, ne, nl, nb, or, pa, fa, pl, pt, ro, ru, si, sk, sl, sn, es, sr, sv, ta, te, tl, th, tk, tr, uk, ur, uz, vi, yi, zu, afr, aka, amh, ara, aze, bel, ben, bul, cat, ces, cmn, dan, deu, ell, eng, epo, est, fin, fra, guj, heb, hin, hrv, hun, hye, ind, ita, jav, jpn, kan, kat, khm, kor, lat, lav, lit, mal, mar, mkd, mya, nep, nld, nob, ori, pan, pes, pol, por, ron, rus, sin, slk, slv, sna, spa, srp, swe, tam, tel, tgl, tha, tuk, tur, ukr, urd, uzb, vie, yid, zul",
"message": "Invalid value in parameter `locales`: Unsupported locale `invalid`, expected one of af, ak, am, ar, az, be, bg, bn, ca, cs, da, de, el, en, eo, es, et, fa, fi, fr, gu, he, hi, hr, hu, hy, id, it, ja, jv, ka, km, kn, ko, la, lt, lv, mk, ml, mr, my, nb, ne, nl, or, pa, pl, pt, ro, ru, si, sk, sl, sn, sr, sv, ta, te, th, tk, tl, tr, uk, ur, uz, vi, yi, zh, zu, afr, aka, amh, ara, aze, bel, ben, bul, cat, ces, cmn, dan, deu, ell, eng, epo, est, fin, fra, guj, heb, hin, hrv, hun, hye, ind, ita, jav, jpn, kan, kat, khm, kor, lat, lav, lit, mal, mar, mkd, mya, nep, nld, nob, ori, pan, pes, pol, por, ron, rus, sin, slk, slv, sna, spa, srp, swe, tam, tel, tgl, tha, tuk, tur, ukr, urd, uzb, vie, yid, zho, zul",
"code": "invalid_search_locales",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_locales"
@ -957,7 +957,7 @@ async fn invalid_localized_attributes_rules() {
.await;
snapshot!(response, @r###"
{
"message": "Unknown value `japan` at `.localizedAttributes[0].locales[0]`: expected one of `af`, `ak`, `am`, `ar`, `az`, `be`, `bn`, `bg`, `ca`, `cs`, `zh`, `da`, `de`, `el`, `en`, `eo`, `et`, `fi`, `fr`, `gu`, `he`, `hi`, `hr`, `hu`, `hy`, `id`, `it`, `jv`, `ja`, `kn`, `ka`, `km`, `ko`, `la`, `lv`, `lt`, `ml`, `mr`, `mk`, `my`, `ne`, `nl`, `nb`, `or`, `pa`, `fa`, `pl`, `pt`, `ro`, `ru`, `si`, `sk`, `sl`, `sn`, `es`, `sr`, `sv`, `ta`, `te`, `tl`, `th`, `tk`, `tr`, `uk`, `ur`, `uz`, `vi`, `yi`, `zu`, `afr`, `aka`, `amh`, `ara`, `aze`, `bel`, `ben`, `bul`, `cat`, `ces`, `cmn`, `dan`, `deu`, `ell`, `eng`, `epo`, `est`, `fin`, `fra`, `guj`, `heb`, `hin`, `hrv`, `hun`, `hye`, `ind`, `ita`, `jav`, `jpn`, `kan`, `kat`, `khm`, `kor`, `lat`, `lav`, `lit`, `mal`, `mar`, `mkd`, `mya`, `nep`, `nld`, `nob`, `ori`, `pan`, `pes`, `pol`, `por`, `ron`, `rus`, `sin`, `slk`, `slv`, `sna`, `spa`, `srp`, `swe`, `tam`, `tel`, `tgl`, `tha`, `tuk`, `tur`, `ukr`, `urd`, `uzb`, `vie`, `yid`, `zul`",
"message": "Unknown value `japan` at `.localizedAttributes[0].locales[0]`: expected one of `af`, `ak`, `am`, `ar`, `az`, `be`, `bn`, `bg`, `ca`, `cs`, `da`, `de`, `el`, `en`, `eo`, `et`, `fi`, `fr`, `gu`, `he`, `hi`, `hr`, `hu`, `hy`, `id`, `it`, `jv`, `ja`, `kn`, `ka`, `km`, `ko`, `la`, `lv`, `lt`, `ml`, `mr`, `mk`, `my`, `ne`, `nl`, `nb`, `or`, `pa`, `fa`, `pl`, `pt`, `ro`, `ru`, `si`, `sk`, `sl`, `sn`, `es`, `sr`, `sv`, `ta`, `te`, `tl`, `th`, `tk`, `tr`, `uk`, `ur`, `uz`, `vi`, `yi`, `zh`, `zu`, `afr`, `aka`, `amh`, `ara`, `aze`, `bel`, `ben`, `bul`, `cat`, `ces`, `dan`, `deu`, `ell`, `eng`, `epo`, `est`, `fin`, `fra`, `guj`, `heb`, `hin`, `hrv`, `hun`, `hye`, `ind`, `ita`, `jav`, `jpn`, `kan`, `kat`, `khm`, `kor`, `lat`, `lav`, `lit`, `mal`, `mar`, `mkd`, `mya`, `nep`, `nld`, `nob`, `ori`, `pan`, `pes`, `pol`, `por`, `ron`, `rus`, `sin`, `slk`, `slv`, `sna`, `spa`, `srp`, `swe`, `tam`, `tel`, `tgl`, `tha`, `tuk`, `tur`, `ukr`, `urd`, `uzb`, `vie`, `yid`, `zho`, `zul`, `cmn`",
"code": "invalid_settings_localized_attributes",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_settings_localized_attributes"
@ -1143,3 +1143,195 @@ async fn facet_search_with_localized_attributes() {
}
"###);
}
#[actix_rt::test]
async fn swedish_search() {
let server = Server::new().await;
let index = server.index("test");
let documents = json!([
{"id": "tra1-1", "product": "trä"},
{"id": "tra2-1", "product": "traktor"},
{"id": "tra1-2", "product": "träbjälke"},
{"id": "tra2-2", "product": "trafiksignal"},
]);
index.add_documents(documents, None).await;
let (_response, _) = index
.update_settings(json!({
"searchableAttributes": ["product"],
"localizedAttributes": [
// force swedish
{"attributePatterns": ["product"], "locales": ["swe"]}
]
}))
.await;
index.wait_task(1).await;
// infer swedish
index
.search(json!({"q": "trä", "attributesToRetrieve": ["product"]}), |response, code| {
snapshot!(response, @r###"
{
"hits": [
{
"product": "trä"
},
{
"product": "träbjälke"
}
],
"query": "trä",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 2
}
"###);
snapshot!(code, @"200 OK");
})
.await;
index
.search(json!({"q": "tra", "attributesToRetrieve": ["product"]}), |response, code| {
snapshot!(response, @r###"
{
"hits": [
{
"product": "traktor"
},
{
"product": "trafiksignal"
}
],
"query": "tra",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 2
}
"###);
snapshot!(code, @"200 OK");
})
.await;
// force swedish
index
.search(
json!({"q": "trä", "locales": ["swe"], "attributesToRetrieve": ["product"]}),
|response, code| {
snapshot!(response, @r###"
{
"hits": [
{
"product": "trä"
},
{
"product": "träbjälke"
}
],
"query": "trä",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 2
}
"###);
snapshot!(code, @"200 OK");
},
)
.await;
index
.search(
json!({"q": "tra", "locales": ["swe"], "attributesToRetrieve": ["product"]}),
|response, code| {
snapshot!(response, @r###"
{
"hits": [
{
"product": "traktor"
},
{
"product": "trafiksignal"
}
],
"query": "tra",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 2
}
"###);
snapshot!(code, @"200 OK");
},
)
.await;
}
#[actix_rt::test]
async fn german_search() {
let server = Server::new().await;
let index = server.index("test");
let documents = json!([
{"id": 1, "product": "Interkulturalität"},
{"id": 2, "product": "Wissensorganisation"},
]);
index.add_documents(documents, None).await;
let (_response, _) = index
.update_settings(json!({
"searchableAttributes": ["product"],
"localizedAttributes": [
// force swedish
{"attributePatterns": ["product"], "locales": ["deu"]}
]
}))
.await;
index.wait_task(1).await;
// infer swedish
index
.search(
json!({"q": "kulturalität", "attributesToRetrieve": ["product"]}),
|response, code| {
snapshot!(response, @r###"
{
"hits": [
{
"product": "Interkulturalität"
}
],
"query": "kulturalität",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 1
}
"###);
snapshot!(code, @"200 OK");
},
)
.await;
index
.search(
json!({"q": "organisation", "attributesToRetrieve": ["product"]}),
|response, code| {
snapshot!(response, @r###"
{
"hits": [
{
"product": "Wissensorganisation"
}
],
"query": "organisation",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 1
}
"###);
snapshot!(code, @"200 OK");
},
)
.await;
}