4881: Infer locales from index settings r=curquiza a=ManyTheFish

# Pull Request

## Related issue
Fixes #4828
Fixes #4816
## What does this PR do?
- Add some tests using `AttributesToSearchOn`
- Make the search infer the language from the index settings when the `locales` field is not specified


CI is now working:
https://github.com/meilisearch/meilisearch/actions/runs/10490050545/job/29055955667



Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-08-21 14:18:16 +00:00 committed by GitHub
commit 36d8684dc8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 385 additions and 7 deletions

View File

@ -386,12 +386,39 @@ async fn force_locales() {
|response, code| {
snapshot!(response, @r###"
{
"hits": [],
"hits": [
{
"name_zh": "进击的巨人",
"author_zh": "諫山創",
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
"id": 853,
"_vectors": {
"manual": [
1.0,
2.0,
3.0
]
},
"_formatted": {
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
"author_zh": "諫山創",
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
"id": "853",
"_vectors": {
"manual": [
"1.0",
"2.0",
"3.0"
]
}
}
}
],
"query": "\"进击的巨人\"",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 0
"estimatedTotalHits": 1
}
"###);
snapshot!(code, @"200 OK");
@ -483,12 +510,39 @@ async fn force_locales_with_pattern() {
|response, code| {
snapshot!(response, @r###"
{
"hits": [],
"hits": [
{
"name_zh": "进击的巨人",
"author_zh": "諫山創",
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
"id": 853,
"_vectors": {
"manual": [
1.0,
2.0,
3.0
]
},
"_formatted": {
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
"author_zh": "諫山創",
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
"id": "853",
"_vectors": {
"manual": [
"1.0",
"2.0",
"3.0"
]
}
}
}
],
"query": "\"进击的巨人\"",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 0
"estimatedTotalHits": 1
}
"###);
snapshot!(code, @"200 OK");
@ -761,6 +815,275 @@ async fn force_different_locales_with_pattern() {
.await;
}
/// Exercises searches that carry no `locales` parameter against an index whose
/// `localizedAttributes` mix forced locales with an unconstrained (`[]`) rule.
///
/// The same quoted query is sent twice: once over every searchable attribute
/// (snapshot expects no hit) and once restricted to `*_zh` attributes through
/// `attributesToSearchOn` (snapshot expects document 853).
#[actix_rt::test]
async fn auto_infer_locales_at_search_with_attributes_to_search_on() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
let (response, _) = index
.update_settings(
json!({
"searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"],
"localizedAttributes": [
// force japanese on the `*_zh` attributes
{"attributePatterns": ["*_zh"], "locales": ["jpn"]},
// force chinese on the `*_ja` attributes
{"attributePatterns": ["*_ja"], "locales": ["cmn"]},
// no locale constraint on the `*_en` attributes
{"attributePatterns": ["*_en"], "locales": []}
]
}),
)
.await;
// The settings update is the first task enqueued on this index (taskUid 0).
snapshot!(response, @r###"
{
"taskUid": 0,
"indexUid": "test",
"status": "enqueued",
"type": "settingsUpdate",
"enqueuedAt": "[date]"
}
"###);
index.add_documents(documents, None).await;
// Wait for task 1, presumably the document addition enqueued just above.
index.wait_task(1).await;
// No `locales` and no `attributesToSearchOn`: every searchable attribute is considered,
// including the `*_en` ones that have an empty locale list. The snapshot expects no hit.
// NOTE(review): presumably the empty list disables inference and the query language is
// auto-detected instead — confirm against the search implementation.
index
.search(
json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
|response, code| {
snapshot!(response, @r###"
{
"hits": [],
"query": "\"进击的巨人\"",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 0
}
"###);
snapshot!(code, @"200 OK");
},
)
.await;
// Restrict the search to `name_zh` and `description_zh`; the snapshot expects document 853.
// NOTE(review): the previous comment read "should infer chinese", but the settings above map
// `*_zh` to `jpn`, so the locale inferred here is presumably Japanese — confirm.
index
.search(
json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"], "attributesToSearchOn": ["name_zh", "description_zh"]}),
|response, code| {
snapshot!(response, @r###"
{
"hits": [
{
"name_zh": "进击的巨人",
"author_zh": "諫山創",
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
"id": 853,
"_vectors": {
"manual": [
1.0,
2.0,
3.0
]
},
"_formatted": {
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
"author_zh": "諫山創",
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
"id": "853",
"_vectors": {
"manual": [
"1.0",
"2.0",
"3.0"
]
}
}
}
],
"query": "\"进击的巨人\"",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 1
}
"###);
snapshot!(code, @"200 OK");
},
)
.await;
}
/// Exercises searches that carry no `locales` parameter against an index where a single
/// `localizedAttributes` rule forces `jpn` on every attribute (pattern `*`).
///
/// Each search below sends the same quoted query and the snapshot expects document 853
/// to be returned with highlighted `_formatted` fields.
#[actix_rt::test]
async fn auto_infer_locales_at_search() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
let (response, _) = index
.update_settings(
json!({
"searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"],
"localizedAttributes": [
// force japanese on every attribute
{"attributePatterns": ["*"], "locales": ["jpn"]},
]
}),
)
.await;
// The settings update is the first task enqueued on this index (taskUid 0).
snapshot!(response, @r###"
{
"taskUid": 0,
"indexUid": "test",
"status": "enqueued",
"type": "settingsUpdate",
"enqueuedAt": "[date]"
}
"###);
index.add_documents(documents, None).await;
// Wait for task 1, presumably the document addition enqueued just above.
index.wait_task(1).await;
// First search: no `locales` in the request; the snapshot expects exactly one hit.
index
.search(
json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
|response, code| {
snapshot!(response, @r###"
{
"hits": [
{
"name_zh": "进击的巨人",
"author_zh": "諫山創",
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
"id": 853,
"_vectors": {
"manual": [
1.0,
2.0,
3.0
]
},
"_formatted": {
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
"author_zh": "諫山創",
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
"id": "853",
"_vectors": {
"manual": [
"1.0",
"2.0",
"3.0"
]
}
}
}
],
"query": "\"进击的巨人\"",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 1
}
"###);
snapshot!(code, @"200 OK");
},
)
.await;
// Second search: same request and same expected snapshot as the first one.
// NOTE(review): this and the next search are identical to the one above — possibly a
// copy-paste where different parameters were intended; confirm.
index
.search(
json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
|response, code| {
snapshot!(response, @r###"
{
"hits": [
{
"name_zh": "进击的巨人",
"author_zh": "諫山創",
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
"id": 853,
"_vectors": {
"manual": [
1.0,
2.0,
3.0
]
},
"_formatted": {
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
"author_zh": "諫山創",
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
"id": "853",
"_vectors": {
"manual": [
"1.0",
"2.0",
"3.0"
]
}
}
}
],
"query": "\"进击的巨人\"",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 1
}
"###);
snapshot!(code, @"200 OK");
},
)
.await;
// Third search: again the same request and the same expected snapshot.
index
.search(
json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
|response, code| {
snapshot!(response, @r###"
{
"hits": [
{
"name_zh": "进击的巨人",
"author_zh": "諫山創",
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
"id": 853,
"_vectors": {
"manual": [
1.0,
2.0,
3.0
]
},
"_formatted": {
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
"author_zh": "諫山創",
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
"id": "853",
"_vectors": {
"manual": [
"1.0",
"2.0",
"3.0"
]
}
}
}
],
"query": "\"进击的巨人\"",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 1
}
"###);
snapshot!(code, @"200 OK");
},
)
.await;
}
#[actix_rt::test]
async fn force_different_locales_with_pattern_nested() {
let server = Server::new().await;

View File

@ -7,6 +7,7 @@ mod facet_search;
mod formatted;
mod geo;
mod hybrid;
#[cfg(not(feature = "chinese-pinyin"))]
mod locales;
mod matching_strategy;
mod multi;
@ -392,6 +393,7 @@ async fn negative_special_cases_search() {
}
#[cfg(feature = "default")]
#[cfg(not(feature = "chinese-pinyin"))]
#[actix_rt::test]
async fn test_kanji_language_detection() {
let server = Server::new().await;

View File

@ -90,6 +90,21 @@ impl LocalizedFieldIds {
/// Looks up the locales registered for `fields_id`.
///
/// Returns `None` when the field id has no entry in `field_id_to_locales`,
/// otherwise a borrowed slice over the stored locales.
pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> {
    let registered = self.field_id_to_locales.get(&fields_id)?;
    Some(registered.as_slice())
}
/// Collects the locales of every registered field into one sorted, duplicate-free list.
///
/// Returns an empty `Vec` when no field is registered, or as soon as at least one
/// registered field has an empty locale list: such a field is considered not
/// localized, so no overall locale set is reported.
pub fn all_locales(&self) -> Vec<Language> {
    let per_field = || self.field_id_to_locales.values();

    // One field without locales is enough to treat the whole set as not localized.
    if per_field().any(|langs| langs.is_empty()) {
        return Vec::new();
    }

    let mut merged: Vec<Language> = per_field().flatten().copied().collect();
    // Sorting first makes equal locales adjacent so that `dedup` removes every duplicate.
    merged.sort();
    merged.dedup();
    merged
}
}
#[cfg(test)]

View File

@ -360,6 +360,7 @@ mod test {
use super::*;
#[cfg(feature = "japanese")]
#[cfg(not(feature = "chinese-pinyin"))]
#[test]
fn test_kanji_language_detection() {
use crate::index::tests::TempIndex;

View File

@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use self::vector_sort::VectorSort;
use crate::localized_attributes_rules::LocalizedFieldIds;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::apply_distinct_rule;
use crate::vector::Embedder;
@ -671,9 +672,44 @@ pub fn execute_search(
tokbuilder.words_dict(dictionary);
}
if let Some(locales) = locales {
tokbuilder.allow_list(locales);
}
let db_locales;
match locales {
Some(locales) => {
if !locales.is_empty() {
tokbuilder.allow_list(locales);
}
}
None => {
// If no locales are specified, we use the locales specified in the localized attributes rules
let localized_attributes_rules = ctx.index.localized_attributes_rules(ctx.txn)?;
let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
let searchable_fields = ctx.index.searchable_fields_ids(ctx.txn)?;
let localized_fields = match &ctx.restricted_fids {
// if AttributeToSearchOn is set, use the restricted list of ids
Some(restricted_fids) => {
let iter = restricted_fids
.exact
.iter()
.chain(restricted_fids.tolerant.iter())
.map(|(fid, _)| *fid);
LocalizedFieldIds::new(&localized_attributes_rules, &fields_ids_map, iter)
}
// Otherwise use the full list of ids coming from the index searchable fields
None => LocalizedFieldIds::new(
&localized_attributes_rules,
&fields_ids_map,
searchable_fields.into_iter(),
),
};
db_locales = localized_fields.all_locales();
if !db_locales.is_empty() {
tokbuilder.allow_list(&db_locales);
}
}
};
let tokenizer = tokbuilder.build();
drop(entered);

View File

@ -6,6 +6,7 @@ pub mod exactness;
pub mod geo_sort;
pub mod integration;
#[cfg(feature = "all-tokenizations")]
#[cfg(not(feature = "chinese-pinyin"))]
pub mod language;
pub mod ngram_split_words;
pub mod proximity;