mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-23 11:47:28 +01:00
Merge #4881
4881: Infer locales from index settings r=curquiza a=ManyTheFish # Pull Request ## Related issue Fixes #4828 Fixes #4816 ## What does this PR do? - Add some tests using `AttributesToSearchOn` - Make the search infer the language based on the index settings when the `locales` field is not specified CI is now working: https://github.com/meilisearch/meilisearch/actions/runs/10490050545/job/29055955667 Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
commit
36d8684dc8
@ -386,12 +386,39 @@ async fn force_locales() {
|
||||
|response, code| {
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"hits": [],
|
||||
"hits": [
|
||||
{
|
||||
"name_zh": "进击的巨人",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
|
||||
"id": 853,
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1.0,
|
||||
2.0,
|
||||
3.0
|
||||
]
|
||||
},
|
||||
"_formatted": {
|
||||
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
|
||||
"id": "853",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
"1.0",
|
||||
"2.0",
|
||||
"3.0"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "\"进击的巨人\"",
|
||||
"processingTimeMs": "[duration]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 0
|
||||
"estimatedTotalHits": 1
|
||||
}
|
||||
"###);
|
||||
snapshot!(code, @"200 OK");
|
||||
@ -483,12 +510,39 @@ async fn force_locales_with_pattern() {
|
||||
|response, code| {
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"hits": [],
|
||||
"hits": [
|
||||
{
|
||||
"name_zh": "进击的巨人",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
|
||||
"id": 853,
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1.0,
|
||||
2.0,
|
||||
3.0
|
||||
]
|
||||
},
|
||||
"_formatted": {
|
||||
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
|
||||
"id": "853",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
"1.0",
|
||||
"2.0",
|
||||
"3.0"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "\"进击的巨人\"",
|
||||
"processingTimeMs": "[duration]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 0
|
||||
"estimatedTotalHits": 1
|
||||
}
|
||||
"###);
|
||||
snapshot!(code, @"200 OK");
|
||||
@ -761,6 +815,275 @@ async fn force_different_locales_with_pattern() {
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn auto_infer_locales_at_search_with_attributes_to_search_on() {
|
||||
let server = Server::new().await;
|
||||
|
||||
let index = server.index("test");
|
||||
let documents = DOCUMENTS.clone();
|
||||
let (response, _) = index
|
||||
.update_settings(
|
||||
json!({
|
||||
"searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"],
|
||||
"localizedAttributes": [
|
||||
// force japanese
|
||||
{"attributePatterns": ["*_zh"], "locales": ["jpn"]},
|
||||
// force chinese
|
||||
{"attributePatterns": ["*_ja"], "locales": ["cmn"]},
|
||||
// any language
|
||||
{"attributePatterns": ["*_en"], "locales": []}
|
||||
]
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"taskUid": 0,
|
||||
"indexUid": "test",
|
||||
"status": "enqueued",
|
||||
"type": "settingsUpdate",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
index.add_documents(documents, None).await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
// auto infer any language
|
||||
index
|
||||
.search(
|
||||
json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
|
||||
|response, code| {
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"hits": [],
|
||||
"query": "\"进击的巨人\"",
|
||||
"processingTimeMs": "[duration]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 0
|
||||
}
|
||||
"###);
|
||||
snapshot!(code, @"200 OK");
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
// should infer the locale configured for the `*_zh` attributes (`jpn` in the settings above)
|
||||
index
|
||||
.search(
|
||||
json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"], "attributesToSearchOn": ["name_zh", "description_zh"]}),
|
||||
|response, code| {
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"hits": [
|
||||
{
|
||||
"name_zh": "进击的巨人",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
|
||||
"id": 853,
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1.0,
|
||||
2.0,
|
||||
3.0
|
||||
]
|
||||
},
|
||||
"_formatted": {
|
||||
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
|
||||
"id": "853",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
"1.0",
|
||||
"2.0",
|
||||
"3.0"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "\"进击的巨人\"",
|
||||
"processingTimeMs": "[duration]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 1
|
||||
}
|
||||
"###);
|
||||
snapshot!(code, @"200 OK");
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn auto_infer_locales_at_search() {
|
||||
let server = Server::new().await;
|
||||
|
||||
let index = server.index("test");
|
||||
let documents = DOCUMENTS.clone();
|
||||
let (response, _) = index
|
||||
.update_settings(
|
||||
json!({
|
||||
"searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"],
|
||||
"localizedAttributes": [
|
||||
// force japanese
|
||||
{"attributePatterns": ["*"], "locales": ["jpn"]},
|
||||
]
|
||||
}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"taskUid": 0,
|
||||
"indexUid": "test",
|
||||
"status": "enqueued",
|
||||
"type": "settingsUpdate",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
index.add_documents(documents, None).await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
index
|
||||
.search(
|
||||
json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
|
||||
|response, code| {
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"hits": [
|
||||
{
|
||||
"name_zh": "进击的巨人",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
|
||||
"id": 853,
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1.0,
|
||||
2.0,
|
||||
3.0
|
||||
]
|
||||
},
|
||||
"_formatted": {
|
||||
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
|
||||
"id": "853",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
"1.0",
|
||||
"2.0",
|
||||
"3.0"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "\"进击的巨人\"",
|
||||
"processingTimeMs": "[duration]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 1
|
||||
}
|
||||
"###);
|
||||
snapshot!(code, @"200 OK");
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
index
|
||||
.search(
|
||||
json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
|
||||
|response, code| {
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"hits": [
|
||||
{
|
||||
"name_zh": "进击的巨人",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
|
||||
"id": 853,
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1.0,
|
||||
2.0,
|
||||
3.0
|
||||
]
|
||||
},
|
||||
"_formatted": {
|
||||
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
|
||||
"id": "853",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
"1.0",
|
||||
"2.0",
|
||||
"3.0"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "\"进击的巨人\"",
|
||||
"processingTimeMs": "[duration]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 1
|
||||
}
|
||||
"###);
|
||||
snapshot!(code, @"200 OK");
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
index
|
||||
.search(
|
||||
json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
|
||||
|response, code| {
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"hits": [
|
||||
{
|
||||
"name_zh": "进击的巨人",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。",
|
||||
"id": 853,
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1.0,
|
||||
2.0,
|
||||
3.0
|
||||
]
|
||||
},
|
||||
"_formatted": {
|
||||
"name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
|
||||
"author_zh": "諫山創",
|
||||
"description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列,由諫山 創作画。",
|
||||
"id": "853",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
"1.0",
|
||||
"2.0",
|
||||
"3.0"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "\"进击的巨人\"",
|
||||
"processingTimeMs": "[duration]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 1
|
||||
}
|
||||
"###);
|
||||
snapshot!(code, @"200 OK");
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn force_different_locales_with_pattern_nested() {
|
||||
let server = Server::new().await;
|
||||
|
@ -7,6 +7,7 @@ mod facet_search;
|
||||
mod formatted;
|
||||
mod geo;
|
||||
mod hybrid;
|
||||
#[cfg(not(feature = "chinese-pinyin"))]
|
||||
mod locales;
|
||||
mod matching_strategy;
|
||||
mod multi;
|
||||
@ -392,6 +393,7 @@ async fn negative_special_cases_search() {
|
||||
}
|
||||
|
||||
#[cfg(feature = "default")]
|
||||
#[cfg(not(feature = "chinese-pinyin"))]
|
||||
#[actix_rt::test]
|
||||
async fn test_kanji_language_detection() {
|
||||
let server = Server::new().await;
|
||||
|
@ -90,6 +90,21 @@ impl LocalizedFieldIds {
|
||||
pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> {
|
||||
self.field_id_to_locales.get(&fields_id).map(Vec::as_slice)
|
||||
}
|
||||
|
||||
pub fn all_locales(&self) -> Vec<Language> {
|
||||
let mut locales = Vec::new();
|
||||
for field_locales in self.field_id_to_locales.values() {
|
||||
if !field_locales.is_empty() {
|
||||
locales.extend(field_locales);
|
||||
} else {
|
||||
// If a field has no locales, we consider it as not localized
|
||||
return Vec::new();
|
||||
}
|
||||
}
|
||||
locales.sort();
|
||||
locales.dedup();
|
||||
locales
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -360,6 +360,7 @@ mod test {
|
||||
use super::*;
|
||||
|
||||
#[cfg(feature = "japanese")]
|
||||
#[cfg(not(feature = "chinese-pinyin"))]
|
||||
#[test]
|
||||
fn test_kanji_language_detection() {
|
||||
use crate::index::tests::TempIndex;
|
||||
|
@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy;
|
||||
use self::graph_based_ranking_rule::Words;
|
||||
use self::interner::Interned;
|
||||
use self::vector_sort::VectorSort;
|
||||
use crate::localized_attributes_rules::LocalizedFieldIds;
|
||||
use crate::score_details::{ScoreDetails, ScoringStrategy};
|
||||
use crate::search::new::distinct::apply_distinct_rule;
|
||||
use crate::vector::Embedder;
|
||||
@ -671,9 +672,44 @@ pub fn execute_search(
|
||||
tokbuilder.words_dict(dictionary);
|
||||
}
|
||||
|
||||
if let Some(locales) = locales {
|
||||
tokbuilder.allow_list(locales);
|
||||
}
|
||||
let db_locales;
|
||||
match locales {
|
||||
Some(locales) => {
|
||||
if !locales.is_empty() {
|
||||
tokbuilder.allow_list(locales);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// If no locales are specified, we use the locales specified in the localized attributes rules
|
||||
let localized_attributes_rules = ctx.index.localized_attributes_rules(ctx.txn)?;
|
||||
let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
|
||||
let searchable_fields = ctx.index.searchable_fields_ids(ctx.txn)?;
|
||||
|
||||
let localized_fields = match &ctx.restricted_fids {
|
||||
// if AttributeToSearchOn is set, use the restricted list of ids
|
||||
Some(restricted_fids) => {
|
||||
let iter = restricted_fids
|
||||
.exact
|
||||
.iter()
|
||||
.chain(restricted_fids.tolerant.iter())
|
||||
.map(|(fid, _)| *fid);
|
||||
|
||||
LocalizedFieldIds::new(&localized_attributes_rules, &fields_ids_map, iter)
|
||||
}
|
||||
// Otherwise use the full list of ids coming from the index searchable fields
|
||||
None => LocalizedFieldIds::new(
|
||||
&localized_attributes_rules,
|
||||
&fields_ids_map,
|
||||
searchable_fields.into_iter(),
|
||||
),
|
||||
};
|
||||
|
||||
db_locales = localized_fields.all_locales();
|
||||
if !db_locales.is_empty() {
|
||||
tokbuilder.allow_list(&db_locales);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let tokenizer = tokbuilder.build();
|
||||
drop(entered);
|
||||
|
@ -6,6 +6,7 @@ pub mod exactness;
|
||||
pub mod geo_sort;
|
||||
pub mod integration;
|
||||
#[cfg(feature = "all-tokenizations")]
|
||||
#[cfg(not(feature = "chinese-pinyin"))]
|
||||
pub mod language;
|
||||
pub mod ngram_split_words;
|
||||
pub mod proximity;
|
||||
|
Loading…
x
Reference in New Issue
Block a user