From cc02920f2b038b05ddf99aa0ff8ef09db0b9b2b2 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 23 Jul 2024 14:59:31 +0200 Subject: [PATCH 1/9] Update charabia --- Cargo.lock | 37 +--- milli/Cargo.toml | 2 +- milli/src/index.rs | 23 +++ milli/src/search/new/mod.rs | 6 +- .../extract/extract_docid_word_positions.rs | 161 ++---------------- .../src/update/index_documents/extract/mod.rs | 16 +- .../src/update/index_documents/typed_chunk.rs | 34 +--- 7 files changed, 49 insertions(+), 230 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c72053be7..547f9c0e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -934,19 +934,15 @@ dependencies = [ [[package]] name = "charabia" version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9868a22f10dee80498a8a2b6c641d80bf28ea4495fcf71c2dc4836c2dd23958c" +source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#a95a9217265cee515708a679a2ed08ced1ac58a3" dependencies = [ "aho-corasick", - "cow-utils", "csv", - "deunicode", "either", "fst", "irg-kvariants", "jieba-rs", "lindera", - "litemap", "once_cell", "pinyin", "serde", @@ -954,7 +950,6 @@ dependencies = [ "unicode-normalization", "wana_kana", "whatlang", - "zerovec", ] [[package]] @@ -1145,12 +1140,6 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" -[[package]] -name = "cow-utils" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" - [[package]] name = "cpufeatures" version = "0.2.12" @@ -1551,12 +1540,6 @@ dependencies = [ "syn 2.0.60", ] -[[package]] -name = "deunicode" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00" - [[package]] name = "digest" version = "0.10.7" @@ -2666,8 +2649,7 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" [[package]] name = "irg-kvariants" version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26" +source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#a95a9217265cee515708a679a2ed08ced1ac58a3" dependencies = [ "csv", "once_cell", @@ -3278,12 +3260,6 @@ dependencies = [ "unicode-segmentation", ] -[[package]] -name = "litemap" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" - [[package]] name = "lmdb-master-sys" version = "0.2.2" @@ -6506,15 +6482,6 @@ dependencies = [ "syn 2.0.60", ] -[[package]] -name = "zerovec" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" -dependencies = [ - "zerofrom", -] - [[package]] name = "zip" version = "1.1.4" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index e635bbcf4..1a81f6f8c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ bincode = "1.3.3" bstr = "1.9.1" bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.8.12", default-features = false } +charabia = { git = "https://github.com/meilisearch/charabia.git", branch = "simplify-lang-detection", 
default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.13"
deserr = "0.6.2"
diff --git a/milli/src/index.rs b/milli/src/index.rs
index afe212f57..194f18faa 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1604,6 +1604,29 @@ impl Index {
         Ok(script_language)
     }
 
+    pub fn languages(&self, rtxn: &RoTxn<'_>) -> heed::Result<Vec<Language>> {
+        let mut script_language_doc_count: Vec<(Language, u64)> = Vec::new();
+        let mut total = 0;
+        for sl in self.script_language_docids.iter(rtxn)? {
+            let ((_script, language), docids) = sl?;
+
+            // keep only languages that contain at least 1 document.
+            let remaining_documents_count = docids.len();
+            total += remaining_documents_count;
+            if remaining_documents_count > 0 {
+                script_language_doc_count.push((language, remaining_documents_count));
+            }
+        }
+
+        let threshold = total / 20; // 5% (arbitrary)
+
+        Ok(script_language_doc_count
+            .into_iter()
+            .filter(|(_, count)| *count > threshold)
+            .map(|(language, _)| language)
+            .collect())
+    }
+
     /// Put the embedding configs:
     /// 1. The name of the embedder
     /// 2. The configuration option for this embedder
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index f6a4a802c..78b7a0446 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -670,9 +670,9 @@ pub fn execute_search(
         tokbuilder.words_dict(dictionary);
     }
 
-    let script_lang_map = ctx.index.script_language(ctx.txn)?;
-    if !script_lang_map.is_empty() {
-        tokbuilder.allow_list(&script_lang_map);
+    let languages = ctx.index.languages(ctx.txn)?;
+    if !languages.is_empty() {
+        tokbuilder.allow_list(&languages);
     }
 
     let tokenizer = tokbuilder.build();
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 721d67e96..748a3886a 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -1,10 +1,9 @@
-use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::BufReader;
 use std::{io, mem, str};
 
-use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
+use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
 use obkv::{KvReader, KvWriterU16};
 use roaring::RoaringBitmap;
 use serde_json::Value;
@@ -12,11 +11,9 @@ use serde_json::Value;
 use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
 use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
-use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
 
-pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
-
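// A self-contained sketch (illustrative names, not milli's API) of the
// per-language thresholding performed by the new `Index::languages()` in
// milli/src/index.rs above: a language stays on the tokenizer allow-list
// only when it covers more than 5% of the indexed documents, the same
// `total / 20` ratio used by the patch.
fn frequent_languages(doc_count_per_language: Vec<(&'static str, u64)>) -> Vec<&'static str> {
    let total: u64 = doc_count_per_language.iter().map(|(_, count)| count).sum();
    let threshold = total / 20; // 5% (arbitrary), as in the patch
    doc_count_per_language
        .into_iter()
        .filter(|(_, count)| *count > threshold)
        .map(|(language, _)| language)
        .collect()
}

fn main() {
    // Only 3 documents out of 100 were detected as Cmn: that is under the
    // 5% threshold, so Cmn is dropped and cannot skew query tokenization.
    let counts = vec![("Eng", 80), ("Jpn", 17), ("Cmn", 3)];
    assert_eq!(frequent_languages(counts), vec!["Eng", "Jpn"]);
}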
 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
 ///
@@ -28,7 +25,7 @@ pub fn extract_docid_word_positions(
     indexer: GrenadParameters,
     settings_diff: &InnerIndexSettingsDiff,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
+) -> Result<grenad::Reader<BufReader<File>>> {
     let max_positions_per_attributes = max_positions_per_attributes
         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
@@ -36,7 +33,6 @@ pub fn extract_docid_word_positions(
 
     // initialize destination values.
     let mut documents_ids = RoaringBitmap::new();
-    let mut script_language_docids = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
         keep_latest_obkv,
@@ -109,9 +105,9 @@ pub fn extract_docid_word_positions(
         let (del, add): (Result<_>, Result<_>) = rayon::join(
             || {
                 // deletions
-                lang_safe_tokens_from_document(
+                tokens_from_document(
                     &obkv,
-                    &settings_diff.old,
+                    &settings_diff.old.searchable_fields_ids,
                     &del_tokenizer,
                     max_positions_per_attributes,
                     DelAdd::Deletion,
@@ -120,9 +116,9 @@ pub fn extract_docid_word_positions(
             },
             || {
                 // additions
-                lang_safe_tokens_from_document(
+                tokens_from_document(
                     &obkv,
-                    &settings_diff.new,
+                    &settings_diff.new.searchable_fields_ids,
                     &add_tokenizer,
                     max_positions_per_attributes,
                     DelAdd::Addition,
@@ -131,8 +127,8 @@ pub fn extract_docid_word_positions(
             },
         );
 
-        let (del_obkv, del_script_language_word_count) = del?;
-        let (add_obkv, add_script_language_word_count) = add?;
+        let del_obkv = del?;
+        let add_obkv = add?;
 
         // merge deletions and additions.
         // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
@@ -150,31 +146,10 @@ pub fn extract_docid_word_positions(
             key_buffer.extend_from_slice(&field_id.to_be_bytes());
             docid_word_positions_sorter.insert(&key_buffer, value)?;
         }
-
-        // update script_language_docids deletions.
-        for (script, languages_frequency) in del_script_language_word_count {
-            for (language, _) in languages_frequency {
-                let entry = script_language_docids
-                    .entry((script, language))
-                    .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
-                entry.0.push(document_id);
-            }
-        }
-
-        // update script_language_docids additions.
-        for (script, languages_frequency) in add_script_language_word_count {
-            for (language, _) in languages_frequency {
-                let entry = script_language_docids
-                    .entry((script, language))
-                    .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
-                entry.1.push(document_id);
-            }
-        }
     }
 
     // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
     sorter_into_reader(docid_word_positions_sorter, indexer)
-        .map(|reader| (reader, script_language_docids))
 }
 
 /// Check if any searchable fields of a document changed.
@@ -205,7 +180,7 @@ fn tokenizer_builder<'a>(
     stop_words: Option<&'a fst::Set<Vec<u8>>>,
     allowed_separators: Option<&'a [&str]>,
     dictionary: Option<&'a [&str]>,
-    script_language: Option<&'a HashMap<Script, Vec<Language>>>,
+    languages: Option<&'a Vec<Language>>,
 ) -> TokenizerBuilder<'a, Vec<u8>> {
     let mut tokenizer_builder = TokenizerBuilder::new();
     if let Some(stop_words) = stop_words {
@@ -218,81 +193,13 @@ fn tokenizer_builder<'a>(
         tokenizer_builder.separators(separators);
     }
 
-    if let Some(script_language) = script_language {
-        tokenizer_builder.allow_list(script_language);
+    if let Some(languages) = languages {
+        tokenizer_builder.allow_list(languages);
     }
 
     tokenizer_builder
 }
 
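// A minimal sketch of the deletion/addition split that replaces the removed
// language-detection rerun below: the old and new sides of a document update
// are tokenized on two threads with `rayon::join` (requires the `rayon`
// crate; `tokenize_side` is a hypothetical stand-in for the real
// `tokens_from_document`), and either side's error is surfaced afterwards.
fn tokenize_side(text: &str) -> Result<Vec<String>, std::io::Error> {
    Ok(text.split_whitespace().map(str::to_owned).collect())
}

fn main() -> Result<(), std::io::Error> {
    let (del, add) = rayon::join(
        || tokenize_side("old searchable content"), // deletions
        || tokenize_side("brand new searchable content"), // additions
    );
    let (del, add) = (del?, add?);
    assert_eq!(del.len(), 3);
    assert_eq!(add.len(), 4);
    Ok(())
}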
-/// Extract words mapped with their positions of a document,
-/// ensuring no Language detection mistakes was made.
-fn lang_safe_tokens_from_document<'a>(
-    obkv: &KvReader<'_, FieldId>,
-    settings: &InnerIndexSettings,
-    tokenizer: &Tokenizer<'_>,
-    max_positions_per_attributes: u32,
-    del_add: DelAdd,
-    buffers: &'a mut Buffers,
-) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
-    let mut script_language_word_count = HashMap::new();
-
-    tokens_from_document(
-        obkv,
-        &settings.searchable_fields_ids,
-        tokenizer,
-        max_positions_per_attributes,
-        del_add,
-        buffers,
-        &mut script_language_word_count,
-    )?;
-
-    // if we detect a potetial mistake in the language detection,
-    // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
-    // context: https://github.com/meilisearch/meilisearch/issues/3565
-    if script_language_word_count
-        .values()
-        .map(Vec::as_slice)
-        .any(potential_language_detection_error)
-    {
-        // build an allow list with the most frequent detected languages in the document.
-        let script_language: HashMap<_, _> =
-            script_language_word_count.iter().filter_map(most_frequent_languages).collect();
-
-        // if the allow list is empty, meaning that no Language is considered frequent,
-        // then we don't rerun the extraction.
-        if !script_language.is_empty() {
-            // build a new temporary tokenizer including the allow list.
-            let stop_words = settings.stop_words.as_ref();
-            let separators: Option<Vec<&str>> = settings
-                .allowed_separators
-                .as_ref()
-                .map(|s| s.iter().map(String::as_str).collect());
-            let dictionary: Option<Vec<&str>> =
-                settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-            let mut builder =
-                tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
-            let tokenizer = builder.build();
-
-            script_language_word_count.clear();
-
-            // rerun the extraction.
-            tokens_from_document(
-                obkv,
-                &settings.searchable_fields_ids,
-                &tokenizer,
-                max_positions_per_attributes,
-                del_add,
-                buffers,
-                &mut script_language_word_count,
-            )?;
-        }
-    }
-
-    // returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
-    Ok((&buffers.obkv_buffer, script_language_word_count))
-}
-
 /// Extract words mapped with their positions of a document.
 fn tokens_from_document<'a>(
     obkv: &KvReader<'a, FieldId>,
@@ -301,7 +208,6 @@ fn tokens_from_document<'a>(
     max_positions_per_attributes: u32,
     del_add: DelAdd,
     buffers: &'a mut Buffers,
-    script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
 ) -> Result<&'a [u8]> {
     buffers.obkv_buffer.clear();
     let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
@@ -326,16 +232,6 @@ fn tokens_from_document<'a>(
                 .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
             for (index, token) in tokens {
-                // if a language has been detected for the token, we update the counter.
-                if let Some(language) = token.language {
-                    let script = token.script;
-                    let entry = script_language_word_count.entry(script).or_default();
-                    match entry.iter_mut().find(|(l, _)| *l == language) {
-                        Some((_, n)) => *n += 1,
-                        None => entry.push((language, 1)),
-                    }
-                }
-
                 // keep a word only if it is not empty and fit in a LMDB key.
                 let token = token.lemma().trim();
                 if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
@@ -423,39 +319,6 @@ fn process_tokens<'a>(
         .filter(|(_, t)| t.is_word())
 }
 
-fn potential_language_detection_error(languages_frequency: &[(Language, usize)]) -> bool {
-    if languages_frequency.len() > 1 {
-        let threshold = compute_language_frequency_threshold(languages_frequency);
-        languages_frequency.iter().any(|(_, c)| *c <= threshold)
-    } else {
-        false
-    }
-}
-
-fn most_frequent_languages(
-    (script, languages_frequency): (&Script, &Vec<(Language, usize)>),
-) -> Option<(Script, Vec<Language>)> {
-    if languages_frequency.len() > 1 {
-        let threshold = compute_language_frequency_threshold(languages_frequency);
-
-        let languages: Vec<_> =
-            languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect();
-
-        if languages.is_empty() {
-            None
-        } else {
-            Some((*script, languages))
-        }
-    } else {
-        None
-    }
-}
-
-fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
-    let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
-    total / 10 // 10% is a completely arbitrary value.
-}
-
 #[derive(Default)]
 struct Buffers {
     // the field buffer for each fields desserialization, and must be cleared between each field.
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 57d9d5e42..6c23a8da9 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -345,21 +345,17 @@ fn send_and_extract_flattened_documents_data(
     let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
-                let (docid_word_positions_chunk, script_language_pair) =
-                    extract_docid_word_positions(
-                        flattened_documents_chunk.clone(),
-                        indexer,
-                        &settings_diff,
-                        max_positions_per_attributes,
-                    )?;
+                let docid_word_positions_chunk = extract_docid_word_positions(
+                    flattened_documents_chunk.clone(),
+                    indexer,
+                    &settings_diff,
+                    max_positions_per_attributes,
+                )?;
 
                 // send docid_word_positions_chunk to DB writer
                 let docid_word_positions_chunk =
                     unsafe { as_cloneable_grenad(&docid_word_positions_chunk)?
}; - let _ = - lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair))); - Ok(docid_word_positions_chunk) }, || { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index c5cf35ca8..9de95778b 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,10 +1,9 @@ -use std::collections::{BTreeSet, HashMap}; +use std::collections::BTreeSet; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; use bytemuck::allocation::pod_collect_to_vec; -use charabia::{Language, Script}; use grenad::{Merger, MergerBuilder}; use heed::types::Bytes; use heed::{BytesDecode, RwTxn}; @@ -94,7 +93,6 @@ pub(crate) enum TypedChunk { add_to_user_provided: RoaringBitmap, remove_from_user_provided: RoaringBitmap, }, - ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } impl TypedChunk { @@ -113,8 +111,7 @@ impl TypedChunk { | (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_)) | (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_)) | (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_)) - | (GeoPoints(_), GeoPoints(_)) - | (ScriptLanguageDocids(_), ScriptLanguageDocids(_)) => true, + | (GeoPoints(_), GeoPoints(_)) => true, ( VectorPoints { embedder_name: left, expected_dimension: left_dim, .. }, VectorPoints { embedder_name: right, expected_dimension: right_dim, .. }, @@ -775,33 +772,6 @@ pub(crate) fn write_typed_chunk_into_index( tracing::debug!("Finished vector chunk for {}", embedder_name); } - TypedChunk::ScriptLanguageDocids(_) => { - let span = tracing::trace_span!(target: "indexing::write_db", "script_language_docids"); - let _entered = span.enter(); - - for typed_chunk in typed_chunks { - let TypedChunk::ScriptLanguageDocids(sl_map) = typed_chunk else { unreachable!() }; - for (key, (deletion, addition)) in sl_map { - let mut db_key_exists = false; - let final_value = match index.script_language_docids.get(wtxn, &key)? { - Some(db_values) => { - db_key_exists = true; - (db_values - deletion) | addition - } - None => addition, - }; - - if final_value.is_empty() { - // If the database entry exists, delete it. 
- if db_key_exists { - index.script_language_docids.delete(wtxn, &key)?; - } - } else { - index.script_language_docids.put(wtxn, &key, &final_value)?; - } - } - } - } } Ok((RoaringBitmap::new(), is_merged_database)) From d82f8fd904bc291689673685a7b38bd749b9af01 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 23 Jul 2024 13:33:57 +0200 Subject: [PATCH 2/9] Add tests --- meilisearch/tests/search/locales.rs | 662 ++++++++++++++++++++++++++++ meilisearch/tests/search/mod.rs | 1 + 2 files changed, 663 insertions(+) create mode 100644 meilisearch/tests/search/locales.rs diff --git a/meilisearch/tests/search/locales.rs b/meilisearch/tests/search/locales.rs new file mode 100644 index 000000000..722694ba3 --- /dev/null +++ b/meilisearch/tests/search/locales.rs @@ -0,0 +1,662 @@ +use meili_snap::*; +use once_cell::sync::Lazy; + +use crate::common::{Server, Value}; +use crate::json; + +static NESTED_DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "id": 852, + "document_en": { + "name": "Attack on Titan", + "description": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "author": "Hajime Isayama", + }, + "document_ja": { + "name": "進撃の巨人", + "description": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "author": "諫山 創", + }, + "document_zh": { + "name": "进击的巨人", + "description": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "author": "諫山創", + }, + "_vectors": { "manual": [1, 2, 3]}, + }, + { + "id": 654, + "document_en": + { + "name": "One Piece", + "description": "One Piece is a Japanese manga series written and illustrated by Eiichiro Oda", + "author": "Eiichiro Oda", + }, + "document_ja": { + "name": "ワンピース", + "description": "ワンピースは、日本の漫画シリーズであり、尾田 栄一郎によって作画されている。", + "author": "尾田 栄一郎", + }, + "document_zh": { + "name": "ONE PIECE", + "description": "海贼王》是尾田荣一郎创作的日本漫画系列。", + "author": "尾田 栄一郎", + }, + "_vectors": { "manual": [1, 2, 54] }, + } + ]) +}); + +static DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "id": 852, + "name_en": "Attack on Titan", + "description_en": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "author_en": "Hajime Isayama", + "name_ja": "進撃の巨人", + "description_ja": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "author_ja": "諫山 創", + "_vectors": { "manual": [1, 2, 3]}, + }, + { + "id": 853, + "name_zh": "进击的巨人", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "author_zh": "諫山創", + "_vectors": { "manual": [1, 2, 3]}, + }, + { + "id": 654, + "name_en": "One Piece", + "description_en": "One Piece is a Japanese manga series written and illustrated by Eiichiro Oda", + "author_en": "Eiichiro Oda", + "name_ja": "ワンピース", + "description_ja": "ワンピースは、日本の漫画シリーズであり、尾田 栄一郎によって作画されている。", + "author_ja": "尾田 栄一郎", + "_vectors": { "manual": [1, 2, 54] }, + }, + { + "id": 655, + "name_zh": "ONE PIECE", + "description_zh": "海贼王》是尾田荣一郎创作的日本漫画系列。", + "author_zh": "尾田 栄一郎", + "_vectors": { "manual": [1, 2, 54] }, + } + ]) +}); + +#[actix_rt::test] +async fn simple_search() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + index + .update_settings( + json!({"searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"]}), + ) + .await; + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // english + index + .search(json!({"q": "Atta", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "id": 852 + } + ], + 
"query": "Atta", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }) + .await; + + // japanese + index + .search(json!({"q": "進撃", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "id": 853 + } + ], + "query": "進撃", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }) + .await; + + index + .search( + json!({"q": "進撃", "attributesToRetrieve": ["id"], "locales": ["jpn"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "id": 852 + }, + { + "id": 853 + } + ], + "query": "進撃", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // chinese + index + .search(json!({"q": "进击", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "id": 853 + }, + { + "id": 852 + } + ], + "query": "进击", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2 + } + "###); + snapshot!(code, @"200 OK"); + }) + .await; +} + +#[actix_rt::test] +async fn force_locales() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + let (response, _) = index + .update_settings( + json!({ + "searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["name_ja", "name_zh", "author_ja", "author_zh", "description_ja", "description_zh"], "locales": ["jpn"]} + ] + }), + ) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // chinese detection + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // force japanese + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"], "locales": ["jpn"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "id": 853 + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} + +#[actix_rt::test] +async fn force_locales_with_pattern() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + let (response, _) = index + .update_settings( + json!({ + "searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["*_ja", "*_zh"], "locales": ["jpn"]} + ] + }), + ) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + 
index.wait_task(1).await; + + // chinese detection + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // force japanese + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"], "locales": ["jpn"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "id": 853 + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} + +#[actix_rt::test] +async fn force_locales_with_pattern_nested() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = NESTED_DOCUMENTS.clone(); + let (response, _) = index + .update_settings(json!({ + "searchableAttributes": ["document_en", "document_ja", "document_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["document_ja.*", "*_zh.*"], "locales": ["jpn"]} + ] + })) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // chinese + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"], "locales": ["cmn"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // force japanese + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"], "locales": ["jpn"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "id": 852 + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} + +#[actix_rt::test] +async fn invalid_locales() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + index + .update_settings( + json!({"searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"]}), + ) + .await; + index.add_documents(documents, None).await; + index.wait_task(1).await; + + let (response, code) = index + .search_post(json!({"q": "Atta", "attributesToRetrieve": ["id"], "locales": ["invalid"]})) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Unknown value `invalid` at `.locales[0]`: expected one of `epo`, `eng`, `rus`, `cmn`, `spa`, `por`, `ita`, `ben`, `fra`, `deu`, `ukr`, `kat`, `ara`, `hin`, `jpn`, `heb`, `yid`, `pol`, `amh`, `jav`, `kor`, `nob`, `dan`, `swe`, `fin`, `tur`, `nld`, `hun`, `ces`, `ell`, `bul`, `bel`, `mar`, `kan`, `ron`, `slv`, `hrv`, `srp`, `mkd`, `lit`, `lav`, `est`, `tam`, `vie`, `urd`, `tha`, `guj`, `uzb`, `pan`, `aze`, `ind`, `tel`, `pes`, `mal`, `ori`, `mya`, `nep`, `sin`, `khm`, `tuk`, `aka`, `zul`, `sna`, `afr`, `lat`, `slk`, `cat`, `tgl`, `hye`", + "code": "invalid_search_locales", + "type": "invalid_request", + "link": 
"https://docs.meilisearch.com/errors#invalid_search_locales" + } + "###); + + let (response, code) = index + .search_get( + &yaup::to_string( + &json!({"q": "Atta", "attributesToRetrieve": ["id"], "locales": ["invalid"]}), + ) + .unwrap(), + ) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `locales`: Unknown value `invalid`, expected one of `epo`, `eng`, `rus`, `cmn`, `spa`, `por`, `ita`, `ben`, `fra`, `deu`, `ukr`, `kat`, `ara`, `hin`, `jpn`, `heb`, `yid`, `pol`, `amh`, `jav`, `kor`, `nob`, `dan`, `swe`, `fin`, `tur`, `nld`, `hun`, `ces`, `ell`, `bul`, `bel`, `mar`, `kan`, `ron`, `slv`, `hrv`, `srp`, `mkd`, `lit`, `lav`, `est`, `tam`, `vie`, `urd`, `tha`, `guj`, `uzb`, `pan`, `aze`, `ind`, `tel`, `pes`, `mal`, `ori`, `mya`, `nep`, `sin`, `khm`, `tuk`, `aka`, `zul`, `sna`, `afr`, `lat`, `slk`, `cat`, `tgl`, `hye`", + "code": "invalid_search_locales", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_locales" + } + "###); +} + +#[actix_rt::test] +async fn invalid_localized_attributes_rules() { + let server = Server::new().await; + + let index = server.index("test"); + let (response, _) = index + .update_settings(json!({ + "localizedAttributes": [ + {"attributePatterns": ["*_ja", "*_zh"], "locales": ["japan"]} + ] + })) + .await; + snapshot!(response, @r###" + { + "message": "Unknown value `japan` at `.localizedAttributes[0].locales[0]`: expected one of `epo`, `eng`, `rus`, `cmn`, `spa`, `por`, `ita`, `ben`, `fra`, `deu`, `ukr`, `kat`, `ara`, `hin`, `jpn`, `heb`, `yid`, `pol`, `amh`, `jav`, `kor`, `nob`, `dan`, `swe`, `fin`, `tur`, `nld`, `hun`, `ces`, `ell`, `bul`, `bel`, `mar`, `kan`, `ron`, `slv`, `hrv`, `srp`, `mkd`, `lit`, `lav`, `est`, `tam`, `vie`, `urd`, `tha`, `guj`, `uzb`, `pan`, `aze`, `ind`, `tel`, `pes`, `mal`, `ori`, `mya`, `nep`, `sin`, `khm`, `tuk`, `aka`, `zul`, `sna`, `afr`, `lat`, `slk`, `cat`, `tgl`, `hye`", + "code": "invalid_settings_localized_attributes", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_localized_attributes" + } + "###); + + let (response, _) = index + .update_settings(json!({ + "localizedAttributes": [ + {"attributePatterns": ["*_ja", "*_zh"], "locales": "jpn"} + ] + })) + .await; + snapshot!(response, @r###" + { + "message": "Invalid value type at `.localizedAttributes[0].locales`: expected an array, but found a string: `\"jpn\"`", + "code": "invalid_settings_localized_attributes", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_localized_attributes" + } + "###); + + let (response, _) = index + .update_settings(json!({ + "localizedAttributes": [ + {"attributePatterns": "*_ja", "locales": ["jpn"]} + ] + })) + .await; + snapshot!(response, @r###" + { + "message": "Invalid value type at `.localizedAttributes[0].attributePatterns`: expected an array, but found a string: `\"*_ja\"`", + "code": "invalid_settings_localized_attributes", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_localized_attributes" + } + "###); + + let (response, _) = index + .update_settings(json!({ + "localizedAttributes": [ + {"locales": ["jpn"]} + ] + })) + .await; + snapshot!(response, @r###" + { + "message": "Missing field `attributePatterns` inside `.localizedAttributes[0]`", + "code": "invalid_settings_localized_attributes", + "type": "invalid_request", + "link": 
"https://docs.meilisearch.com/errors#invalid_settings_localized_attributes" + } + "###); +} + +#[actix_rt::test] +async fn simple_facet_search() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + let (response, _) = index + .update_settings(json!({ + "filterableAttributes": ["name_en", "name_ja", "name_zh"], + })) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + let (response, _) = index + .facet_search(json!({"facetName": "name_zh", "facetQuery": "進撃", "locales": ["cmn"]})) + .await; + + snapshot!(response, @r###" + { + "facetHits": [ + { + "value": "进击的巨人", + "count": 1 + } + ], + "facetQuery": "進撃", + "processingTimeMs": "[duration]" + } + "###); + + let (response, _) = index + .facet_search(json!({"facetName": "name_zh", "facetQuery": "進撃", "locales": ["jpn"]})) + .await; + + snapshot!(response, @r###" + { + "facetHits": [ + { + "value": "进击的巨人", + "count": 1 + } + ], + "facetQuery": "進撃", + "processingTimeMs": "[duration]" + } + "###); +} + +#[actix_rt::test] +async fn facet_search_with_localized_attributes() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + let (response, _) = index + .update_settings(json!({ + "filterableAttributes": ["name_ja", "name_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["*_ja", "*_zh"], "locales": ["jpn"]} + ] + })) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + let (response, _) = index + .facet_search(json!({"facetName": "name_zh", "facetQuery": "进击", "locales": ["cmn"]})) + .await; + + snapshot!(response, @r###" + { + "facetHits": [], + "facetQuery": "进击", + "processingTimeMs": "[duration]" + } + "###); + + let (response, _) = index + .facet_search(json!({"facetName": "name_zh", "facetQuery": "进击", "locales": ["jpn"]})) + .await; + + snapshot!(response, @r###" + { + "facetHits": [ + { + "value": "进击的巨人", + "count": 1 + } + ], + "facetQuery": "进击", + "processingTimeMs": "[duration]" + } + "###); + + let (response, _) = + index.facet_search(json!({"facetName": "name_zh", "facetQuery": "进击"})).await; + + snapshot!(response, @r###" + { + "facetHits": [ + { + "value": "进击的巨人", + "count": 1 + } + ], + "facetQuery": "进击", + "processingTimeMs": "[duration]" + } + "###); +} diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 7f4648e57..301ef9aa2 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -7,6 +7,7 @@ mod facet_search; mod formatted; mod geo; mod hybrid; +mod locales; mod matching_strategy; mod multi; mod pagination; From 90c0a6db7ddf5b0d8535e1a6a20987776f7b3e82 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 23 Jul 2024 14:09:27 +0200 Subject: [PATCH 3/9] Implement localized search --- meilisearch-types/src/error.rs | 1 + meilisearch-types/src/lib.rs | 1 + meilisearch-types/src/locales.rs | 132 ++++++++++++++++++ .../src/analytics/segment_analytics.rs | 22 ++- meilisearch/src/routes/indexes/search.rs | 4 + meilisearch/src/search/federated.rs | 10 +- meilisearch/src/search/mod.rs | 61 ++++++-- milli/examples/search.rs | 1 + 
 milli/src/search/facet/search.rs              | 24 +++-
 milli/src/search/hybrid.rs                    |  1 +
 milli/src/search/mod.rs                       | 11 ++
 milli/src/search/new/matches/mod.rs           | 84 ++++++-----
 milli/src/search/new/mod.rs                   |  8 +-
 .../src/search/new/query_term/parse_query.rs  |  2 +-
 14 files changed, 292 insertions(+), 70 deletions(-)
 create mode 100644 meilisearch-types/src/locales.rs

diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs
index d27d6cd3d..e56949b57 100644
--- a/meilisearch-types/src/error.rs
+++ b/meilisearch-types/src/error.rs
@@ -256,6 +256,7 @@ InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ;
 InvalidSearchCropMarker           , InvalidRequest , BAD_REQUEST ;
 InvalidSearchFacets               , InvalidRequest , BAD_REQUEST ;
 InvalidSearchSemanticRatio        , InvalidRequest , BAD_REQUEST ;
+InvalidSearchLocales              , InvalidRequest , BAD_REQUEST ;
 InvalidFacetSearchFacetName       , InvalidRequest , BAD_REQUEST ;
 InvalidSimilarId                  , InvalidRequest , BAD_REQUEST ;
 InvalidSearchFilter               , InvalidRequest , BAD_REQUEST ;
diff --git a/meilisearch-types/src/lib.rs b/meilisearch-types/src/lib.rs
index e4f5cbeb4..d6049e667 100644
--- a/meilisearch-types/src/lib.rs
+++ b/meilisearch-types/src/lib.rs
@@ -7,6 +7,7 @@ pub mod features;
 pub mod index_uid;
 pub mod index_uid_pattern;
 pub mod keys;
+pub mod locales;
 pub mod settings;
 pub mod star_or;
 pub mod task_view;
diff --git a/meilisearch-types/src/locales.rs b/meilisearch-types/src/locales.rs
new file mode 100644
index 000000000..14972fc33
--- /dev/null
+++ b/meilisearch-types/src/locales.rs
@@ -0,0 +1,132 @@
+use deserr::Deserr;
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+
+use milli::LocalizedAttributesRule;
+
+/// Generate a Locale enum and its From and Into implementations for milli::tokenizer::Language.
+///
+/// this enum implements `Deserr` in order to be used in the API.
+macro_rules! make_locale {
+    ($($language:tt), +) => {
+        #[derive(Debug, Copy, Clone, PartialEq, Eq, Deserr, Serialize, Deserialize, Ord, PartialOrd)]
+        #[deserr(rename_all = camelCase)]
+        #[serde(rename_all = "camelCase")]
+        pub enum Locale {
+            $($language),+,
+        }
+
+        impl From<milli::tokenizer::Language> for Locale {
+            fn from(other: milli::tokenizer::Language) -> Locale {
+                match other {
+                    $(milli::tokenizer::Language::$language => Locale::$language), +
+                }
+            }
+        }
+
+        impl From<Locale> for milli::tokenizer::Language {
+            fn from(other: Locale) -> milli::tokenizer::Language {
+                match other {
+                    $(Locale::$language => milli::tokenizer::Language::$language), +,
+                }
+            }
+        }
+
+        #[derive(Debug)]
+        pub struct LocaleFormatError {
+            pub invalid_locale: String,
+        }
+
+        impl std::fmt::Display for LocaleFormatError {
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                let valid_locales = [$(Locale::$language),+].iter().map(|l| format!("`{}`", json!(l).as_str().unwrap())).collect::<Vec<_>>().join(", ");
+                write!(f, "Unknown value `{}`, expected one of {}", self.invalid_locale, valid_locales)
+            }
+        }
+
+        impl std::error::Error for LocaleFormatError {}
+
+        impl std::str::FromStr for Locale {
+            type Err = LocaleFormatError;
+
+            fn from_str(s: &str) -> Result<Self, Self::Err> {
+                milli::tokenizer::Language::from_code(s).map(Self::from).ok_or(LocaleFormatError {
+                    invalid_locale: s.to_string(),
+                })
+            }
+        }
+    };
+}
+
+make_locale!
{ + Epo, + Eng, + Rus, + Cmn, + Spa, + Por, + Ita, + Ben, + Fra, + Deu, + Ukr, + Kat, + Ara, + Hin, + Jpn, + Heb, + Yid, + Pol, + Amh, + Jav, + Kor, + Nob, + Dan, + Swe, + Fin, + Tur, + Nld, + Hun, + Ces, + Ell, + Bul, + Bel, + Mar, + Kan, + Ron, + Slv, + Hrv, + Srp, + Mkd, + Lit, + Lav, + Est, + Tam, + Vie, + Urd, + Tha, + Guj, + Uzb, + Pan, + Aze, + Ind, + Tel, + Pes, + Mal, + Ori, + Mya, + Nep, + Sin, + Khm, + Tuk, + Aka, + Zul, + Sna, + Afr, + Lat, + Slk, + Cat, + Tgl, + Hye +} diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 487eaf003..407b90658 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -1,4 +1,4 @@ -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BTreeSet, BinaryHeap, HashMap, HashSet}; use std::fs; use std::mem::take; use std::path::{Path, PathBuf}; @@ -10,6 +10,7 @@ use actix_web::HttpRequest; use byte_unit::Byte; use index_scheduler::IndexScheduler; use meilisearch_auth::{AuthController, AuthFilter}; +use meilisearch_types::locales::Locale; use meilisearch_types::InstanceUid; use once_cell::sync::Lazy; use regex::Regex; @@ -653,6 +654,9 @@ pub struct SearchAggregator { // every time a search is done, we increment the counter linked to the used settings matching_strategy: HashMap, + // List of the unique Locales passed as parameter + locales: BTreeSet, + // pagination max_limit: usize, max_offset: usize, @@ -707,6 +711,7 @@ impl SearchAggregator { attributes_to_search_on, hybrid, ranking_score_threshold, + locales, } = query; let mut ret = Self::default(); @@ -774,6 +779,10 @@ impl SearchAggregator { ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); + if let Some(locales) = locales { + ret.locales = locales.into_iter().copied().collect(); + } + ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER(); @@ -859,6 +868,7 @@ impl SearchAggregator { total_degraded, total_used_negative_operator, ranking_score_threshold, + ref mut locales, } = other; if self.timestamp.is_none() { @@ -947,6 +957,9 @@ impl SearchAggregator { self.show_ranking_score |= show_ranking_score; self.show_ranking_score_details |= show_ranking_score_details; self.ranking_score_threshold |= ranking_score_threshold; + + // locales + self.locales.append(locales); } pub fn into_event(self, user: &User, event_name: &str) -> Option { @@ -991,6 +1004,7 @@ impl SearchAggregator { total_degraded, total_used_negative_operator, ranking_score_threshold, + locales, } = self; if total_received == 0 { @@ -1060,6 +1074,7 @@ impl SearchAggregator { "matching_strategy": { "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), }, + "locales": locales, "scoring": { "show_ranking_score": show_ranking_score, "show_ranking_score_details": show_ranking_score_details, @@ -1150,6 +1165,7 @@ impl MultiSearchAggregator { attributes_to_search_on: _, hybrid: _, ranking_score_threshold: _, + locales: _, } = query; index_uid.as_str() @@ -1307,6 +1323,7 @@ impl FacetSearchAggregator { attributes_to_search_on, hybrid, ranking_score_threshold, + locales, } = query; let mut ret = Self::default(); @@ -1322,7 +1339,8 @@ impl FacetSearchAggregator { || *matching_strategy != MatchingStrategy::default() || attributes_to_search_on.is_some() || 
hybrid.is_some() - || ranking_score_threshold.is_some(); + || ranking_score_threshold.is_some() + || locales.is_some(); ret } diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 836b96147..e60f95948 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -7,6 +7,7 @@ use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; +use meilisearch_types::locales::Locale; use meilisearch_types::milli; use meilisearch_types::serde_cs::vec::CS; use serde_json::Value; @@ -89,6 +90,8 @@ pub struct SearchQueryGet { pub hybrid_semantic_ratio: Option, #[deserr(default, error = DeserrQueryParamError)] pub ranking_score_threshold: Option, + #[deserr(default, error = DeserrQueryParamError)] + pub locales: Option>, } #[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)] @@ -175,6 +178,7 @@ impl From for SearchQuery { attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()), hybrid, ranking_score_threshold: other.ranking_score_threshold.map(|o| o.0), + locales: other.locales.map(|o| o.into_iter().collect()), } } } diff --git a/meilisearch/src/search/federated.rs b/meilisearch/src/search/federated.rs index 0c623d9cb..58005ec53 100644 --- a/meilisearch/src/search/federated.rs +++ b/meilisearch/src/search/federated.rs @@ -380,9 +380,6 @@ pub fn perform_federated_search( let criteria = index.criteria(&rtxn)?; - // stuff we need for the hitmaker - let script_lang_map = index.script_language(&rtxn)?; - let dictionary = index.dictionary(&rtxn)?; let dictionary: Option> = dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); @@ -494,6 +491,7 @@ pub fn perform_federated_search( sort: query.sort, show_ranking_score: query.show_ranking_score, show_ranking_score_details: query.show_ranking_score_details, + locales: query.locales.map(|l| l.iter().copied().map(Into::into).collect()), }; let milli::SearchResult { @@ -509,11 +507,7 @@ pub fn perform_federated_search( degraded |= query_degraded; used_negative_operator |= query_used_negative_operator; - let tokenizer = HitMaker::tokenizer( - &script_lang_map, - dictionary.as_deref(), - separators.as_deref(), - ); + let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref()); let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); diff --git a/meilisearch/src/search/mod.rs b/meilisearch/src/search/mod.rs index 6624188ce..d28d888aa 100644 --- a/meilisearch/src/search/mod.rs +++ b/meilisearch/src/search/mod.rs @@ -1,6 +1,6 @@ use core::fmt; use std::cmp::min; -use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -15,16 +15,17 @@ use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; +use meilisearch_types::locales::Locale; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use 
meilisearch_types::{milli, Document}; -use milli::tokenizer::TokenizerBuilder; +use milli::tokenizer::{Language, TokenizerBuilder}; use milli::{ - AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds, MatcherBuilder, - SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, + AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, LocalizedAttributesRule, + MatchBounds, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; use regex::Regex; use serde::Serialize; @@ -100,6 +101,8 @@ pub struct SearchQuery { pub attributes_to_search_on: Option>, #[deserr(default, error = DeserrJsonError, default)] pub ranking_score_threshold: Option, + #[deserr(default, error = DeserrJsonError, default)] + pub locales: Option>, } #[derive(Debug, Clone, Copy, PartialEq, Deserr)] @@ -169,6 +172,7 @@ impl fmt::Debug for SearchQuery { matching_strategy, attributes_to_search_on, ranking_score_threshold, + locales, } = self; let mut debug = f.debug_struct("SearchQuery"); @@ -250,6 +254,10 @@ impl fmt::Debug for SearchQuery { debug.field("ranking_score_threshold", &ranking_score_threshold); } + if let Some(locales) = locales { + debug.field("locales", &locales); + } + debug.finish() } } @@ -425,6 +433,8 @@ pub struct SearchQueryWithIndex { pub attributes_to_search_on: Option>, #[deserr(default, error = DeserrJsonError, default)] pub ranking_score_threshold: Option, + #[deserr(default, error = DeserrJsonError, default)] + pub locales: Option>, #[deserr(default)] pub federation_options: Option, @@ -477,6 +487,7 @@ impl SearchQueryWithIndex { attributes_to_search_on, hybrid, ranking_score_threshold, + locales, } = self; ( index_uid, @@ -506,6 +517,7 @@ impl SearchQueryWithIndex { attributes_to_search_on, hybrid, ranking_score_threshold, + locales, // do not use ..Default::default() here, // rather add any missing field from `SearchQuery` to `SearchQueryWithIndex` }, @@ -866,6 +878,10 @@ fn prepare_search<'t>( search.sort_criteria(sort); } + if let Some(ref locales) = query.locales { + search.locales(locales.iter().copied().map(Into::into).collect()); + } + Ok((search, is_finite_pagination, max_total_hits, offset)) } @@ -917,6 +933,7 @@ pub fn perform_search( highlight_pre_tag, highlight_post_tag, crop_marker, + locales, // already used in prepare_search vector: _, hybrid: _, @@ -941,6 +958,7 @@ pub fn perform_search( sort, show_ranking_score, show_ranking_score_details, + locales: locales.map(|l| l.iter().copied().map(Into::into).collect()), }; let documents = make_hits( @@ -1046,6 +1064,7 @@ struct AttributesFormat { sort: Option>, show_ranking_score: bool, show_ranking_score_details: bool, + locales: Option>, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -1093,19 +1112,16 @@ struct HitMaker<'a> { show_ranking_score_details: bool, sort: Option>, show_matches_position: bool, + locales: Option>, } impl<'a> HitMaker<'a> { pub fn tokenizer<'b>( - script_lang_map: &'b HashMap>, dictionary: Option<&'b [&'b str]>, separators: Option<&'b [&'b str]>, ) -> milli::tokenizer::Tokenizer<'b> { let mut tokenizer_builder = TokenizerBuilder::default(); tokenizer_builder.create_char_map(true); - if !script_lang_map.is_empty() { - tokenizer_builder.allow_list(script_lang_map); - } if let Some(separators) = separators { tokenizer_builder.separators(separators); @@ -1218,6 +1234,7 @@ impl<'a> HitMaker<'a> { show_ranking_score_details: format.show_ranking_score_details, show_matches_position: format.show_matches_position, sort: format.sort, + locales: format.locales, }) } @@ 
-1280,6 +1297,7 @@ impl<'a> HitMaker<'a> { &self.formatted_options, self.show_matches_position, &self.displayed_ids, + self.locales.as_deref(), )?; if let Some(sort) = self.sort.as_ref() { @@ -1312,8 +1330,6 @@ fn make_hits<'a>( ) -> Result, MeilisearchHttpError> { let mut documents = Vec::new(); - let script_lang_map = index.script_language(rtxn)?; - let dictionary = index.dictionary(rtxn)?; let dictionary: Option> = dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); @@ -1321,8 +1337,7 @@ fn make_hits<'a>( let separators: Option> = separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); - let tokenizer = - HitMaker::tokenizer(&script_lang_map, dictionary.as_deref(), separators.as_deref()); + let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref()); let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); @@ -1341,6 +1356,7 @@ pub fn perform_facet_search( facet_name: String, search_kind: SearchKind, features: RoFeatures, + locales: Option>, ) -> Result { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -1363,6 +1379,10 @@ pub fn perform_facet_search( facet_search.max_values(max_facets as usize); } + if let Some(locales) = locales { + facet_search.locales(locales); + } + Ok(FacetSearchResult { facet_hits: facet_search.execute()?, facet_query, @@ -1443,6 +1463,7 @@ pub fn perform_similar( sort: None, show_ranking_score, show_ranking_score_details, + locales: None, }; let hits = make_hits( @@ -1631,6 +1652,7 @@ fn format_fields( formatted_options: &BTreeMap, compute_matches: bool, displayable_ids: &BTreeSet, + locales: Option<&[Language]>, ) -> Result<(Option, Document), MeilisearchHttpError> { let mut matches_position = compute_matches.then(BTreeMap::new); let mut document = document.clone(); @@ -1664,6 +1686,14 @@ fn format_fields( let mut infos = Vec::new(); *value = format_value(std::mem::take(value), builder, format, &mut infos, compute_matches); + *value = format_value( + std::mem::take(value), + builder, + format, + &mut infos, + compute_matches, + locales, + ); if let Some(matches) = matches_position.as_mut() { if !infos.is_empty() { @@ -1688,10 +1718,11 @@ fn format_value( format_options: Option, infos: &mut Vec, compute_matches: bool, + locales: Option<&[Language]>, ) -> Value { match value { Value::String(old_string) => { - let mut matcher = builder.build(&old_string); + let mut matcher = builder.build(&old_string, locales); if compute_matches { let matches = matcher.matches(); infos.extend_from_slice(&matches[..]); @@ -1718,6 +1749,7 @@ fn format_value( }), infos, compute_matches, + locales, ) }) .collect(), @@ -1737,6 +1769,7 @@ fn format_value( }), infos, compute_matches, + locales, ), ) }) @@ -1745,7 +1778,7 @@ fn format_value( Value::Number(number) => { let s = number.to_string(); - let mut matcher = builder.build(&s); + let mut matcher = builder.build(&s, locales); if compute_matches { let matches = matcher.matches(); infos.extend_from_slice(&matches[..]); diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 87020994a..bb374f629 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -68,6 +68,7 @@ fn main() -> Result<(), Box> { logger, TimeBudget::max(), None, + None, )?; if let Some((logger, dir)) = detailed_logger { logger.finish(&mut ctx, Path::new(dir))?; diff --git a/milli/src/search/facet/search.rs b/milli/src/search/facet/search.rs index a6756a7af..6ef62e39a 100644 --- a/milli/src/search/facet/search.rs +++ 
b/milli/src/search/facet/search.rs @@ -3,7 +3,7 @@ use std::collections::BinaryHeap; use std::ops::ControlFlow; use charabia::normalizer::NormalizerOption; -use charabia::Normalize; +use charabia::{Language, Normalize, StrDetection, Token}; use fst::automaton::{Automaton, Str}; use fst::{IntoStreamer, Streamer}; use roaring::RoaringBitmap; @@ -23,6 +23,7 @@ pub struct SearchForFacetValues<'a> { search_query: Search<'a>, max_values: usize, is_hybrid: bool, + locales: Option>, } impl<'a> SearchForFacetValues<'a> { @@ -37,6 +38,7 @@ impl<'a> SearchForFacetValues<'a> { search_query, max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, is_hybrid, + locales: None, } } @@ -50,6 +52,11 @@ impl<'a> SearchForFacetValues<'a> { self } + pub fn locales(&mut self, locales: Vec) -> &mut Self { + self.locales = Some(locales); + self + } + fn one_original_value_of( &self, field_id: FieldId, @@ -109,8 +116,7 @@ impl<'a> SearchForFacetValues<'a> { match self.query.as_ref() { Some(query) => { - let options = NormalizerOption { lossy: true, ..Default::default() }; - let query = query.normalize(&options); + let query = normalize_facet_string(query, self.locales.as_deref()); let query = query.as_ref(); let authorize_typos = self.search_query.index.authorize_typos(rtxn)?; @@ -330,3 +336,15 @@ impl ValuesCollection { } } } +fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String { + let options = NormalizerOption { lossy: true, ..Default::default() }; + let mut detection = StrDetection::new(facet_string, locales); + let token = Token { + lemma: std::borrow::Cow::Borrowed(facet_string), + script: detection.script(), + language: detection.language(), + ..Default::default() + }; + + token.normalize(&options).lemma.to_string() +} diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index 2102bf479..e08111473 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -174,6 +174,7 @@ impl<'a> Search<'a> { semantic: self.semantic.clone(), time_budget: self.time_budget.clone(), ranking_score_threshold: self.ranking_score_threshold, + locales: self.locales.clone(), }; let semantic = search.semantic.take(); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 2b2afa607..0f5eb23e1 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,6 +1,7 @@ use std::fmt; use std::sync::Arc; +use charabia::Language; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; @@ -52,6 +53,7 @@ pub struct Search<'a> { semantic: Option, time_budget: TimeBudget, ranking_score_threshold: Option, + locales: Option>, } impl<'a> Search<'a> { @@ -72,6 +74,7 @@ impl<'a> Search<'a> { rtxn, index, semantic: None, + locales: None, time_budget: TimeBudget::max(), ranking_score_threshold: None, } @@ -160,6 +163,11 @@ impl<'a> Search<'a> { self } + pub fn locales(&mut self, locales: Vec) -> &mut Search<'a> { + self.locales = Some(locales); + self + } + pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result { if has_vector_search { let ctx = SearchContext::new(self.index, self.rtxn)?; @@ -232,6 +240,7 @@ impl<'a> Search<'a> { &mut DefaultSearchLogger, self.time_budget.clone(), self.ranking_score_threshold, + self.locales.as_ref(), )?, }; @@ -272,6 +281,7 @@ impl fmt::Debug for Search<'_> { semantic, time_budget, ranking_score_threshold, + locales, } = self; f.debug_struct("Search") .field("query", query) @@ -292,6 +302,7 @@ impl fmt::Debug for Search<'_> { ) 
.field("time_budget", time_budget) .field("ranking_score_threshold", ranking_score_threshold) + .field("locales", locales) .finish() } } diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 7bc4d9c5d..4688b8f32 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; -use charabia::{SeparatorKind, Token, Tokenizer}; +use charabia::{Language, SeparatorKind, Token, Tokenizer}; pub use matching_words::MatchingWords; use matching_words::{MatchType, PartialMatch, WordId}; use serde::Serialize; @@ -46,7 +46,11 @@ impl<'m> MatcherBuilder<'m> { self } - pub fn build<'t>(&self, text: &'t str) -> Matcher<'t, 'm, '_> { + pub fn build<'t, 'lang>( + &self, + text: &'t str, + locales: Option<&'lang [Language]>, + ) -> Matcher<'t, 'm, '_, 'lang> { let crop_marker = match &self.crop_marker { Some(marker) => marker.as_str(), None => DEFAULT_CROP_MARKER, @@ -68,6 +72,7 @@ impl<'m> MatcherBuilder<'m> { highlight_prefix, highlight_suffix, matches: None, + locales, } } } @@ -107,17 +112,18 @@ pub struct MatchBounds { /// Structure used to analyze a string, compute words that match, /// and format the source string, returning a highlighted and cropped sub-string. -pub struct Matcher<'t, 'tokenizer, 'b> { +pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { text: &'t str, matching_words: &'b MatchingWords, tokenizer: &'b Tokenizer<'tokenizer>, + locales: Option<&'lang [Language]>, crop_marker: &'b str, highlight_prefix: &'b str, highlight_suffix: &'b str, matches: Option<(Vec>, Vec)>, } -impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_> { +impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { /// Iterates over tokens and save any of them that matches the query. fn compute_matches(&mut self) -> &mut Self { /// some words are counted as matches only if they are close together and in the good order, @@ -173,7 +179,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_> { false } - let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect(); + let tokens: Vec<_> = + self.tokenizer.tokenize_with_allow_list(self.text, self.locales).collect(); let mut matches = Vec::new(); let mut words_positions = tokens @@ -530,6 +537,7 @@ mod tests { &mut crate::DefaultSearchLogger, TimeBudget::max(), None, + None, ) .unwrap(); @@ -553,19 +561,19 @@ mod tests { // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no crop and no highlight should return complete text. assert_eq!(&matcher.format(format_options), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no crop and no highlight should return complete text. assert_eq!(&matcher.format(format_options), &text); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no crop and no highlight should return complete text. assert_eq!(&matcher.format(format_options), &text); } @@ -580,23 +588,23 @@ mod tests { // empty text. 
let text = ""; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no crop should return complete text, because there is no matches. assert_eq!(&matcher.format(format_options), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no crop should return complete text with highlighted matches. insta::assert_snapshot!( matcher.format(format_options), @@ -605,7 +613,7 @@ mod tests { // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no crop should return complete text with highlighted matches. insta::assert_snapshot!( matcher.format(format_options), @@ -622,7 +630,7 @@ mod tests { // Text containing prefix match. let text = "Ŵôřlḑôle"; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no crop should return complete text with highlighted matches. insta::assert_snapshot!( matcher.format(format_options), @@ -631,7 +639,7 @@ mod tests { // Text containing unicode match. let text = "Ŵôřlḑ"; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no crop should return complete text with highlighted matches. insta::assert_snapshot!( matcher.format(format_options), @@ -643,7 +651,7 @@ mod tests { // Text containing unicode match. let text = "Westfália"; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no crop should return complete text with highlighted matches. insta::assert_snapshot!( matcher.format(format_options), @@ -661,7 +669,7 @@ mod tests { // empty text. let text = ""; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), @"" @@ -669,7 +677,7 @@ mod tests { // text containing only separators. let text = ":-)"; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), @":-)" @@ -677,7 +685,7 @@ mod tests { // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no highlight should return 10 first words with a marker at the end. insta::assert_snapshot!( matcher.format(format_options), @@ -686,7 +694,7 @@ mod tests { // Text without any match starting by a separator. let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // no highlight should return 10 first words with a marker at the end. insta::assert_snapshot!( matcher.format(format_options), @@ -695,7 +703,7 @@ mod tests { // Test phrase propagation let text = "Natalie risk her future. 
         let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         // should crop the phrase instead of cropping around the match.
         insta::assert_snapshot!(
             matcher.format(format_options),
@@ -704,7 +712,7 @@ mod tests {
         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         // no highlight should return 10 last words with a marker at the start.
         insta::assert_snapshot!(
             matcher.format(format_options),
@@ -713,7 +721,7 @@ mod tests {
         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         // no highlight should return 10 last words with a marker at the start.
         insta::assert_snapshot!(
             matcher.format(format_options),
@@ -722,7 +730,7 @@ mod tests {
         // Text containing a match unordered and a match ordered.
         let text = "The world split void void void void void void void void void split the world void void";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         // crop should return 10 last words with a marker at the start.
         insta::assert_snapshot!(
             matcher.format(format_options),
@@ -731,7 +739,7 @@ mod tests {
         // Text containing matches with different density.
         let text = "split void the void void world void void void void void void void void void void split the world void void";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         // crop should return 10 last words with a marker at the start.
         insta::assert_snapshot!(
             matcher.format(format_options),
@@ -740,7 +748,7 @@ mod tests {
         // Text containing matches with same word.
         let text = "split split split split split split void void void void void void void void void void split the world void void";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         // crop should return 10 last words with a marker at the start.
         insta::assert_snapshot!(
             matcher.format(format_options),
@@ -758,7 +766,7 @@ mod tests {
         // empty text.
         let text = "";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
             @""
         );

         // text containing only separators.
         let text = ":-)";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
             @":-)"
         );

         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         // both should return 10 first words with a marker at the end.
         insta::assert_snapshot!(
             matcher.format(format_options),
@@ -783,7 +791,7 @@ mod tests {
         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         // both should return 10 last words with a marker at the start and highlighted matches.
         insta::assert_snapshot!(
             matcher.format(format_options),
@@ -792,7 +800,7 @@ mod tests {
         // Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // both should return 10 last words with a marker at the start and highlighted matches. insta::assert_snapshot!( matcher.format(format_options), @@ -801,7 +809,7 @@ mod tests { // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // crop should return 10 last words with a marker at the start. insta::assert_snapshot!( matcher.format(format_options), @@ -824,7 +832,7 @@ mod tests { let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // should return 10 words with a marker at the start as well the end, and the highlighted matches. insta::assert_snapshot!( matcher.format(format_options), @@ -832,7 +840,7 @@ mod tests { ); let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // should highlight "those" and the phrase "and those". insta::assert_snapshot!( matcher.format(format_options), @@ -851,7 +859,7 @@ mod tests { // set a smaller crop size let format_options = FormatOptions { highlight: false, crop: Some(2) }; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // because crop size < query size, partially format matches. insta::assert_snapshot!( matcher.format(format_options), @@ -860,7 +868,7 @@ mod tests { // set a smaller crop size let format_options = FormatOptions { highlight: false, crop: Some(1) }; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // because crop size < query size, partially format matches. insta::assert_snapshot!( matcher.format(format_options), @@ -869,7 +877,7 @@ mod tests { // set crop size to 0 let format_options = FormatOptions { highlight: false, crop: Some(0) }; - let mut matcher = builder.build(text); + let mut matcher = builder.build(text, None); // because crop size is 0, crop is ignored. 
         insta::assert_snapshot!(
             matcher.format(format_options),
@@ -889,7 +897,7 @@ mod tests {
         let format_options = FormatOptions { highlight: true, crop: None };
         let text = "the do or die can't be he do and or isn't he";
-        let mut matcher = builder.build(text);
+        let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
             @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 78b7a0446..577e12a39 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -24,7 +24,7 @@ mod tests;
 use std::collections::HashSet;

 use bucket_sort::{bucket_sort, BucketSortOutput};
-use charabia::TokenizerBuilder;
+use charabia::{Language, TokenizerBuilder};
 use db_cache::DatabaseCache;
 use exact_attribute::ExactAttribute;
 use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
@@ -639,6 +639,7 @@ pub fn execute_search(
     query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
     time_budget: TimeBudget,
     ranking_score_threshold: Option<f64>,
+    locales: Option<&Vec<Language>>,
 ) -> Result<PartialSearchResult> {
     check_sort_criteria(ctx, sort_criteria.as_ref())?;

@@ -670,9 +671,8 @@ pub fn execute_search(
         tokbuilder.words_dict(dictionary);
     }

-    let languages = ctx.index.languages(ctx.txn)?;
-    if !languages.is_empty() {
-        tokbuilder.allow_list(&languages);
+    if let Some(locales) = locales {
+        tokbuilder.allow_list(locales);
     }

     let tokenizer = tokbuilder.build();
diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs
index d4c1c2f95..bb98f19ce 100644
@@ -24,7 +24,7 @@ pub struct ExtractedTokens {
 #[tracing::instrument(level = "trace", skip_all, target = "search::query")]
 pub fn located_query_terms_from_tokens(
     ctx: &mut SearchContext<'_>,
-    query: NormalizedTokenIter<'_, '_>,
+    query: NormalizedTokenIter<'_, '_, '_, '_>,
     words_limit: Option<usize>,
 ) -> Result<ExtractedTokens> {
     let nbr_typos = number_of_typos_allowed(ctx)?;

From 04fa44e7eb6568cf76fe52f56c8c0c3270bf32e9 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 23 Jul 2024 14:51:36 +0200
Subject: [PATCH 4/9] Implement localized attributes settings

---
 dump/src/lib.rs                               |   1 +
 dump/src/reader/compat/v5_to_v6.rs            |   1 +
 meilisearch-types/src/error.rs                |   1 +
 meilisearch-types/src/locales.rs              |  26 ++++
 meilisearch-types/src/settings.rs             |  26 +++-
 .../src/routes/indexes/facet_search.rs        |   7 ++
 meilisearch/src/routes/indexes/settings.rs    |  23 ++++
 meilisearch/src/search/mod.rs                 |  22 +++-
 milli/src/heed_codec/mod.rs                   |   2 -
 milli/src/heed_codec/script_language_codec.rs |  39 ------
 milli/src/index.rs                            |  92 ++++----------
 milli/src/lib.rs                              |   4 +
 milli/src/localized_attributes_rules.rs       | 114 ++++++++++++++++++
 milli/src/update/clear_documents.rs           |   2 -
 .../extract/extract_docid_word_positions.rs   |  40 +++---
 .../extract/extract_facet_string_docids.rs    | 110 ++++++++++++-----
 milli/src/update/index_documents/mod.rs       |  38 ------
 milli/src/update/settings.rs                  |  66 +++++++++-
 18 files changed, 405 insertions(+), 209 deletions(-)
 delete mode 100644 milli/src/heed_codec/script_language_codec.rs
 create mode 100644 milli/src/localized_attributes_rules.rs

diff --git a/dump/src/lib.rs b/dump/src/lib.rs
index 722633ec6..a17fcf941 100644
--- a/dump/src/lib.rs
+++ b/dump/src/lib.rs
@@ -286,6 +286,7 @@ pub(crate) mod test {
             pagination: Setting::NotSet,
             embedders: Setting::NotSet,
             search_cutoff_ms: Setting::NotSet,
+            localized_attributes: Setting::NotSet,
             _kind: std::marker::PhantomData,
         };

         settings.check()
diff --git a/dump/src/reader/compat/v5_to_v6.rs b/dump/src/reader/compat/v5_to_v6.rs
index e6e030186..40a055465 100644
--- a/dump/src/reader/compat/v5_to_v6.rs
+++ b/dump/src/reader/compat/v5_to_v6.rs
@@ -379,6 +379,7 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
                 v5::Setting::NotSet => v6::Setting::NotSet,
             },
             embedders: v6::Setting::NotSet,
+            localized_attributes: v6::Setting::NotSet,
             search_cutoff_ms: v6::Setting::NotSet,
             _kind: std::marker::PhantomData,
         }
diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs
index e56949b57..4d80fe9c9 100644
--- a/meilisearch-types/src/error.rs
+++ b/meilisearch-types/src/error.rs
@@ -298,6 +298,7 @@ InvalidSettingsSeparatorTokens , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsDictionary , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsSynonyms , InvalidRequest , BAD_REQUEST ;
 InvalidSettingsTypoTolerance , InvalidRequest , BAD_REQUEST ;
+InvalidSettingsLocalizedAttributes , InvalidRequest , BAD_REQUEST ;
 InvalidState , Internal , INTERNAL_SERVER_ERROR ;
 InvalidStoreFile , Internal , INTERNAL_SERVER_ERROR ;
 InvalidSwapDuplicateIndexFound , InvalidRequest , BAD_REQUEST ;
diff --git a/meilisearch-types/src/locales.rs b/meilisearch-types/src/locales.rs
index 14972fc33..6f7fb3a40 100644
--- a/meilisearch-types/src/locales.rs
+++ b/meilisearch-types/src/locales.rs
@@ -130,3 +130,29 @@ make_locale! {
     Tgl,
     Hye
 }
+
+#[derive(Debug, Clone, PartialEq, Eq, Deserr, Serialize, Deserialize)]
+#[deserr(rename_all = camelCase)]
+#[serde(rename_all = "camelCase")]
+pub struct LocalizedAttributesRuleView {
+    pub attribute_patterns: Vec<String>,
+    pub locales: Vec<Locale>,
+}
+
+impl From<LocalizedAttributesRule> for LocalizedAttributesRuleView {
+    fn from(rule: LocalizedAttributesRule) -> Self {
+        Self {
+            attribute_patterns: rule.attribute_patterns,
+            locales: rule.locales.into_iter().map(|l| l.into()).collect(),
+        }
+    }
+}
+
+impl From<LocalizedAttributesRuleView> for LocalizedAttributesRule {
+    fn from(view: LocalizedAttributesRuleView) -> Self {
+        Self {
+            attribute_patterns: view.attribute_patterns,
+            locales: view.locales.into_iter().map(|l| l.into()).collect(),
+        }
+    }
+}
diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs
index 8a9708d29..9e7a2bc15 100644
--- a/meilisearch-types/src/settings.rs
+++ b/meilisearch-types/src/settings.rs
@@ -17,6 +17,7 @@ use serde::{Deserialize, Serialize, Serializer};
 use crate::deserr::DeserrJsonError;
 use crate::error::deserr_codes::*;
 use crate::facet_values_sort::FacetValuesSort;
+use crate::locales::LocalizedAttributesRuleView;

 /// The maximum number of results that the engine
 /// will be able to return in one search call.
@@ -198,6 +199,9 @@ pub struct Settings<T> {
     #[serde(default, skip_serializing_if = "Setting::is_not_set")]
     #[deserr(default, error = DeserrJsonError<InvalidSettingsSearchCutoffMs>)]
     pub search_cutoff_ms: Setting<u64>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[deserr(default, error = DeserrJsonError<InvalidSettingsLocalizedAttributes>)]
+    pub localized_attributes: Setting<Vec<LocalizedAttributesRuleView>>,

     #[serde(skip)]
     #[deserr(skip)]
@@ -261,6 +265,7 @@ impl Settings<Checked> {
             pagination: Setting::Reset,
             embedders: Setting::Reset,
             search_cutoff_ms: Setting::Reset,
+            localized_attributes: Setting::Reset,
             _kind: PhantomData,
         }
     }
@@ -284,7 +289,8 @@ impl Settings<Checked> {
             pagination,
             embedders,
             search_cutoff_ms,
-            ..
+            localized_attributes: localized_attributes_rules,
+            _kind,
         } = self;

         Settings {
             pagination,
             embedders,
             search_cutoff_ms,
+            localized_attributes: localized_attributes_rules,
             _kind: PhantomData,
         }
     }
@@ -352,6 +359,7 @@
             pagination: self.pagination,
             embedders: self.embedders,
             search_cutoff_ms: self.search_cutoff_ms,
+            localized_attributes: self.localized_attributes,
             _kind: PhantomData,
         }
     }
@@ -402,6 +410,7 @@ pub fn apply_settings_to_builder(
         pagination,
         embedders,
         search_cutoff_ms,
+        localized_attributes: localized_attributes_rules,
         _kind,
     } = settings;

@@ -485,6 +494,13 @@ pub fn apply_settings_to_builder(
         Setting::NotSet => (),
     }

+    match localized_attributes_rules {
+        Setting::Set(ref rules) => builder
+            .set_localized_attributes_rules(rules.iter().cloned().map(|r| r.into()).collect()),
+        Setting::Reset => builder.reset_localized_attributes_rules(),
+        Setting::NotSet => (),
+    }
+
     match typo_tolerance {
         Setting::Set(ref value) => {
             match value.enabled {
@@ -679,6 +695,8 @@ pub fn settings(

     let search_cutoff_ms = index.search_cutoff(rtxn)?;

+    let localized_attributes_rules = index.localized_attributes_rules(rtxn)?;
+
     let mut settings = Settings {
         displayed_attributes: match displayed_attributes {
             Some(attrs) => Setting::Set(attrs),
@@ -711,6 +729,10 @@ pub fn settings(
             Some(cutoff) => Setting::Set(cutoff),
             None => Setting::Reset,
         },
+        localized_attributes: match localized_attributes_rules {
+            Some(rules) => Setting::Set(rules.into_iter().map(|r| r.into()).collect()),
+            None => Setting::Reset,
+        },
         _kind: PhantomData,
     };

@@ -902,6 +924,7 @@ pub(crate) mod test {
             faceting: Setting::NotSet,
             pagination: Setting::NotSet,
             embedders: Setting::NotSet,
+            localized_attributes: Setting::NotSet,
             search_cutoff_ms: Setting::NotSet,
             _kind: PhantomData::<Unchecked>,
         };
@@ -930,6 +953,7 @@ pub(crate) mod test {
             faceting: Setting::NotSet,
             pagination: Setting::NotSet,
             embedders: Setting::NotSet,
+            localized_attributes: Setting::NotSet,
             search_cutoff_ms: Setting::NotSet,
             _kind: PhantomData::<Unchecked>,
         };
diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs
index ecb7757af..da575fdc4 100644
--- a/meilisearch/src/routes/indexes/facet_search.rs
+++ b/meilisearch/src/routes/indexes/facet_search.rs
@@ -6,6 +6,7 @@ use meilisearch_types::deserr::DeserrJsonError;
 use meilisearch_types::error::deserr_codes::*;
 use meilisearch_types::error::ResponseError;
 use meilisearch_types::index_uid::IndexUid;
+use meilisearch_types::locales::Locale;
 use serde_json::Value;
 use tracing::debug;

@@ -48,6 +49,8 @@ pub struct FacetSearchQuery {
     pub attributes_to_search_on: Option<Vec<String>>,
     #[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)]
     pub ranking_score_threshold: Option<RankingScoreThreshold>,
+    #[deserr(default, error = DeserrJsonError<InvalidSearchLocales>, default)]
+    pub locales: Option<Vec<Locale>>,
 }

 pub async fn search(
@@ -67,6 +70,7 @@ pub async fn search(
     let facet_query = query.facet_query.clone();
     let facet_name = query.facet_name.clone();
+    let locales = query.locales.clone().map(|l| l.into_iter().map(Into::into).collect());
     let mut search_query = SearchQuery::from(query);

     // Tenant token search_rules.
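The `map(Into::into)` call above is where the request's public `Locale` codes become charabia `Language` values. A minimal sketch of that conversion, assuming the `From` impls that the `make_locale!` macro (shown earlier in `locales.rs`) is expected to generate for each variant pair:

```rust
use charabia::Language;
use meilisearch_types::locales::Locale;

// Hypothetical helper mirroring what the route does inline: every public
// `Locale` converts one-to-one into the tokenizer's `Language`.
fn to_allow_list(locales: Vec<Locale>) -> Vec<Language> {
    locales.into_iter().map(Into::into).collect()
}
```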
@@ -86,6 +90,7 @@ pub async fn search(
                 facet_name,
                 search_kind,
                 index_scheduler.features(),
+                locales
             )
         })
         .await?;
@@ -113,6 +118,7 @@ impl From<FacetSearchQuery> for SearchQuery {
             attributes_to_search_on,
             hybrid,
             ranking_score_threshold,
+            locales,
         } = value;

         SearchQuery {
@@ -141,6 +147,7 @@ impl From<FacetSearchQuery> for SearchQuery {
             attributes_to_search_on,
             hybrid,
             ranking_score_threshold,
+            locales,
         }
     }
 }
diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs
index e35ebc930..b62690295 100644
--- a/meilisearch/src/routes/indexes/settings.rs
+++ b/meilisearch/src/routes/indexes/settings.rs
@@ -474,6 +474,28 @@ make_setting_route!(
     }
 );

+make_setting_route!(
+    "/localized-attributes",
+    put,
+    Vec<meilisearch_types::locales::LocalizedAttributesRuleView>,
+    meilisearch_types::deserr::DeserrJsonError<
+        meilisearch_types::error::deserr_codes::InvalidSettingsLocalizedAttributes,
+    >,
+    localized_attributes,
+    "localizedAttributes",
+    analytics,
+    |rules: &Option<Vec<meilisearch_types::locales::LocalizedAttributesRuleView>>, req: &HttpRequest| {
+        use serde_json::json;
+        analytics.publish(
+            "LocalizedAttributesRules Updated".to_string(),
+            json!({
+                "locales": rules.as_ref().map(|rules| rules.iter().map(|rule| rule.locales.iter().cloned()).flatten().collect::<BTreeSet<_>>())
+            }),
+            Some(req),
+        );
+    }
+);
+
 make_setting_route!(
     "/ranking-rules",
     put,
@@ -786,6 +808,7 @@ pub async fn update_all(
             },
             "embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set()),
             "search_cutoff_ms": new_settings.search_cutoff_ms.as_ref().set(),
+            "locales": new_settings.localized_attributes.as_ref().set().map(|rules| rules.into_iter().map(|rule| rule.locales.iter().cloned()).flatten().collect::<BTreeSet<_>>()),
         }),
         Some(&req),
     );
diff --git a/meilisearch/src/search/mod.rs b/meilisearch/src/search/mod.rs
index d28d888aa..11bf4f84e 100644
--- a/meilisearch/src/search/mod.rs
+++ b/meilisearch/src/search/mod.rs
@@ -1290,6 +1290,9 @@ impl<'a> HitMaker<'a> {
             document.insert("_vectors".into(), vectors.into());
         }

+        let localized_attributes =
+            self.index.localized_attributes_rules(self.rtxn)?.unwrap_or_default();
+
         let (matches_position, formatted) = format_fields(
             &displayed_document,
             &self.fields_ids_map,
@@ -1298,6 +1301,7 @@ impl<'a> HitMaker<'a> {
             self.show_matches_position,
             &self.displayed_ids,
             self.locales.as_deref(),
+            &localized_attributes,
         )?;

         if let Some(sort) = self.sort.as_ref() {
@@ -1365,6 +1369,14 @@ pub fn perform_facet_search(
         None => TimeBudget::default(),
     };

+    let localized_attributes = index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
+    let locales = locales.or_else(|| {
+        localized_attributes
+            .into_iter()
+            .find(|attr| attr.match_str(&facet_name))
+            .map(|attr| attr.locales)
+    });
+
     let (search, _, _, _) =
         prepare_search(index, &rtxn, &search_query, &search_kind, time_budget, features)?;
     let mut facet_search = SearchForFacetValues::new(
@@ -1653,6 +1665,7 @@ fn format_fields(
     compute_matches: bool,
     displayable_ids: &BTreeSet<FieldId>,
     locales: Option<&[Language]>,
+    localized_attributes: &[LocalizedAttributesRule],
 ) -> Result<(Option<MatchesPosition>, Document), MeilisearchHttpError> {
     let mut matches_position = compute_matches.then(BTreeMap::new);
     let mut document = document.clone();
@@ -1685,7 +1698,14 @@ fn format_fields(
             .reduce(|acc, option| acc.merge(option));
         let mut infos = Vec::new();

-        *value = format_value(std::mem::take(value), builder, format, &mut infos, compute_matches);
+        // if no locales have been provided, we try to find the locales in the localized_attributes.
+        let locales = locales.or_else(|| {
+            localized_attributes
+                .iter()
+                .find(|rule| rule.match_str(key))
+                .map(LocalizedAttributesRule::locales)
+        });
+
+        *value = format_value(
+            std::mem::take(value),
+            builder,
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index 449d1955c..575b886bd 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -7,7 +7,6 @@ mod fst_set_codec;
 mod obkv_codec;
 mod roaring_bitmap;
 mod roaring_bitmap_length;
-mod script_language_codec;
 mod str_beu32_codec;
 mod str_ref;
 mod str_str_u8_codec;
@@ -26,7 +25,6 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar
 pub use self::roaring_bitmap_length::{
     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
 };
-pub use self::script_language_codec::ScriptLanguageCodec;
 pub use self::str_beu32_codec::{StrBEU16Codec, StrBEU32Codec};
 pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs
deleted file mode 100644
index 35f7af3c7..000000000
--- a/milli/src/heed_codec/script_language_codec.rs
+++ /dev/null
@@ -1,39 +0,0 @@
-use std::borrow::Cow;
-use std::ffi::CStr;
-use std::str;
-
-use charabia::{Language, Script};
-use heed::BoxedError;
-
-pub struct ScriptLanguageCodec;
-
-impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
-    type DItem = (Script, Language);
-
-    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
-        let cstr = CStr::from_bytes_until_nul(bytes)?;
-        let script = cstr.to_str()?;
-        let script_name = Script::from_name(script);
-        // skip '\0' byte between the two strings.
-        let lan = str::from_utf8(&bytes[script.len() + 1..])?;
-        let lan_name = Language::from_name(lan);
-
-        Ok((script_name, lan_name))
-    }
-}
-
-impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
-    type EItem = (Script, Language);
-
-    fn bytes_encode((script, lan): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
-        let script_name = script.name().as_bytes();
-        let lan_name = lan.name().as_bytes();
-
-        let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len() + 1);
-        bytes.extend_from_slice(script_name);
-        bytes.push(0);
-        bytes.extend_from_slice(lan_name);
-
-        Ok(Cow::Owned(bytes))
-    }
-}
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 194f18faa..f5342f2c0 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -4,7 +4,6 @@ use std::convert::TryInto;
 use std::fs::File;
 use std::path::Path;

-use charabia::{Language, Script};
 use heed::types::*;
 use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
 use roaring::RoaringBitmap;
@@ -19,9 +18,7 @@ use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
     FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::{
-    BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec,
-};
+use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
 use crate::order_by_map::OrderByMap;
 use crate::proximity::ProximityPrecision;
 use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
@@ -29,8 +26,8 @@ use crate::vector::{Embedding, EmbeddingConfig};
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
     FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
-    FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
-    Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64,
+    FieldidsWeightsMap, GeoPoint, LocalizedAttributesRule, ObkvCodec, Result, RoaringBitmapCodec,
+    RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64,
 };

 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@@ -73,6 +70,7 @@ pub mod main_key {
     pub const PROXIMITY_PRECISION: &str = "proximity-precision";
     pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
     pub const SEARCH_CUTOFF: &str = "search_cutoff";
+    pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules";
 }

 pub mod db_name {
@@ -101,7 +99,6 @@ pub mod db_name {
     pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id";
     pub const VECTOR_ARROY: &str = "vector-arroy";
     pub const DOCUMENTS: &str = "documents";
-    pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
 }

 #[derive(Clone)]
 pub struct Index {
@@ -142,9 +139,6 @@ pub struct Index {
     /// Maps the word prefix and a field id with all the docids where the prefix appears inside the field
     pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,

-    /// Maps the script and language with all the docids that corresponds to it.
-    pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
-
     /// Maps the facet field id and the docids for which this field exists
     pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
     /// Maps the facet field id and the docids for which this field is set as null
@@ -198,8 +192,6 @@ impl Index {
             env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?;
         let word_pair_proximity_docids =
             env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
-        let script_language_docids =
-            env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?;
         let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?;
         let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?;
         let field_id_word_count_docids =
@@ -243,7 +235,6 @@ impl Index {
             word_prefix_docids,
             exact_word_prefix_docids,
             word_pair_proximity_docids,
-            script_language_docids,
             word_position_docids,
             word_fid_docids,
             word_prefix_position_docids,
@@ -1562,69 +1553,32 @@ impl Index {
         self.main.remap_key_type::<Str>().delete(txn, main_key::PROXIMITY_PRECISION)
     }

-    /* script language docids */
-    /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any.
-    pub fn script_language_documents_ids(
+    pub fn localized_attributes_rules(
         &self,
         rtxn: &RoTxn<'_>,
-        key: &(Script, Language),
-    ) -> heed::Result<Option<RoaringBitmap>> {
-        self.script_language_docids.get(rtxn, key)
+    ) -> heed::Result<Option<Vec<LocalizedAttributesRule>>> {
+        self.main
+            .remap_types::<Str, SerdeJson<Vec<LocalizedAttributesRule>>>()
+            .get(rtxn, main_key::LOCALIZED_ATTRIBUTES_RULES)
     }

-    pub fn script_language(
+    pub(crate) fn put_localized_attributes_rules(
         &self,
-        rtxn: &RoTxn<'_>,
-    ) -> heed::Result<HashMap<Script, Vec<Language>>> {
-        let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new();
-        let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new();
-        let mut total = 0;
-        for sl in self.script_language_docids.iter(rtxn)? {
-            let ((script, language), docids) = sl?;
-
-            // keep only Languages that contains at least 1 document.
-            let remaining_documents_count = docids.len();
-            total += remaining_documents_count;
-            if remaining_documents_count > 0 {
-                script_language_doc_count.push((script, language, remaining_documents_count));
-            }
-        }
-
-        let threshold = total / 20; // 5% (arbitrary)
-        for (script, language, count) in script_language_doc_count {
-            if count > threshold {
-                if let Some(languages) = script_language.get_mut(&script) {
-                    (*languages).push(language);
-                } else {
-                    script_language.insert(script, vec![language]);
-                }
-            }
-        }
-
-        Ok(script_language)
+        txn: &mut RwTxn<'_>,
+        val: Vec<LocalizedAttributesRule>,
+    ) -> heed::Result<()> {
+        self.main.remap_types::<Str, SerdeJson<Vec<LocalizedAttributesRule>>>().put(
+            txn,
+            main_key::LOCALIZED_ATTRIBUTES_RULES,
+            &val,
+        )
     }

-    pub fn languages(&self, rtxn: &RoTxn<'_>) -> heed::Result<Vec<Language>> {
-        let mut script_language_doc_count: Vec<(Language, u64)> = Vec::new();
-        let mut total = 0;
-        for sl in self.script_language_docids.iter(rtxn)? {
-            let ((_script, language), docids) = sl?;
-
-            // keep only Languages that contains at least 1 document.
-            let remaining_documents_count = docids.len();
-            total += remaining_documents_count;
-            if remaining_documents_count > 0 {
-                script_language_doc_count.push((language, remaining_documents_count));
-            }
-        }
-
-        let threshold = total / 20; // 5% (arbitrary)
-
-        Ok(script_language_doc_count
-            .into_iter()
-            .filter(|(_, count)| *count > threshold)
-            .map(|(language, _)| language)
-            .collect())
+    pub(crate) fn delete_localized_attributes_rules(
+        &self,
+        txn: &mut RwTxn<'_>,
+    ) -> heed::Result<bool> {
+        self.main.remap_key_type::<Str>().delete(txn, main_key::LOCALIZED_ATTRIBUTES_RULES)
     }

     /// Put the embedding configs:
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index fcb0da19c..461971ddf 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -16,6 +16,7 @@ pub mod facet;
 mod fields_ids_map;
 pub mod heed_codec;
 pub mod index;
+mod localized_attributes_rules;
 pub mod order_by_map;
 pub mod prompt;
 pub mod proximity;
@@ -69,6 +70,9 @@ pub use self::search::{
     Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };

+pub use self::localized_attributes_rules::LocalizedAttributesRule;
+use self::localized_attributes_rules::LocalizedFieldIds;
+
 pub type Result<O = ()> = std::result::Result<O, error::Error>;

 pub type Attribute = u32;
diff --git a/milli/src/localized_attributes_rules.rs b/milli/src/localized_attributes_rules.rs
new file mode 100644
index 000000000..a3b3e820b
--- /dev/null
+++ b/milli/src/localized_attributes_rules.rs
@@ -0,0 +1,114 @@
+use std::collections::HashMap;
+
+use charabia::Language;
+use serde::{Deserialize, Serialize};
+
+use crate::fields_ids_map::FieldsIdsMap;
+use crate::FieldId;
+
+/// A rule that defines which locales are supported for a given attribute.
+///
+/// The rule is a list of attribute patterns and a list of locales.
+/// The attribute patterns are matched against the attribute name.
+/// The pattern `*` matches any attribute name.
+/// The pattern `attribute_name*` matches any attribute name that starts with `attribute_name`.
+/// The pattern `*attribute_name` matches any attribute name that ends with `attribute_name`.
+/// The pattern `*attribute_name*` matches any attribute name that contains `attribute_name`.
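To make the pattern semantics above concrete, here is a small usage sketch (not part of the patch) against the `LocalizedAttributesRule` type defined just below; `Language::Cmn` is the charabia variant already used by the milli test suite:

```rust
use charabia::Language;
use milli::LocalizedAttributesRule;

fn main() {
    // One rule: the attribute `title` and any attribute starting with
    // `title_` are indexed and searched as Chinese only.
    let rule = LocalizedAttributesRule::new(
        vec!["title".to_string(), "title_*".to_string()],
        vec![Language::Cmn],
    );

    assert!(rule.match_str("title")); // exact pattern
    assert!(rule.match_str("title_zh")); // `title_*` prefix pattern
    assert!(!rule.match_str("subtitle")); // no pattern matches
}
```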
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct LocalizedAttributesRule {
+    pub attribute_patterns: Vec<String>,
+    pub locales: Vec<Language>,
+}
+
+impl LocalizedAttributesRule {
+    pub fn new(attribute_patterns: Vec<String>, locales: Vec<Language>) -> Self {
+        Self { attribute_patterns, locales }
+    }
+
+    pub fn match_str(&self, str: &str) -> bool {
+        self.attribute_patterns.iter().any(|pattern| match_pattern(pattern.as_str(), str))
+    }
+
+    pub fn locales(&self) -> &[Language] {
+        &self.locales
+    }
+}
+
+fn match_pattern(pattern: &str, str: &str) -> bool {
+    let res = if pattern == "*" {
+        true
+    } else if pattern.starts_with('*') && pattern.ends_with('*') {
+        str.contains(&pattern[1..pattern.len() - 1])
+    } else if pattern.ends_with('*') {
+        str.starts_with(&pattern[..pattern.len() - 1])
+    } else if pattern.starts_with('*') {
+        str.ends_with(&pattern[1..])
+    } else {
+        pattern == str
+    };
+
+    res
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct LocalizedFieldIds {
+    field_id_to_locales: HashMap<FieldId, Vec<Language>>,
+}
+
+impl LocalizedFieldIds {
+    pub fn new<I: Iterator<Item = FieldId>>(
+        rules: &Option<Vec<LocalizedAttributesRule>>,
+        fields_ids_map: &FieldsIdsMap,
+        fields_ids: I,
+    ) -> Self {
+        let mut field_id_to_locales = HashMap::new();
+
+        if let Some(rules) = rules {
+            let fields = fields_ids.filter_map(|field_id| {
+                fields_ids_map.name(field_id).map(|field_name| (field_id, field_name))
+            });
+
+            for (field_id, field_name) in fields {
+                let mut locales = Vec::new();
+                for rule in rules {
+                    if rule.match_str(field_name) {
+                        locales.extend(rule.locales.iter());
+                    }
+                }
+
+                if !locales.is_empty() {
+                    locales.sort();
+                    locales.dedup();
+                    field_id_to_locales.insert(field_id, locales);
+                }
+            }
+        }
+
+        Self { field_id_to_locales }
+    }
+
+    pub fn locales<'a>(&'a self, fields_id: FieldId) -> Option<&'a [Language]> {
+        self.field_id_to_locales.get(&fields_id).map(Vec::as_slice)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_match_pattern() {
+        assert_eq!(match_pattern("*", "test"), true);
+        assert_eq!(match_pattern("test*", "test"), true);
+        assert_eq!(match_pattern("test*", "testa"), true);
+        assert_eq!(match_pattern("*test", "test"), true);
+        assert_eq!(match_pattern("*test", "atest"), true);
+        assert_eq!(match_pattern("*test*", "test"), true);
+        assert_eq!(match_pattern("*test*", "atesta"), true);
+        assert_eq!(match_pattern("*test*", "atest"), true);
+        assert_eq!(match_pattern("*test*", "testa"), true);
+        assert_eq!(match_pattern("test*test", "test"), false);
+        assert_eq!(match_pattern("*test", "testa"), false);
+        assert_eq!(match_pattern("test*", "atest"), false);
+    }
+}
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index 9eca378a5..6c4efb859 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -36,7 +36,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
             field_id_word_count_docids,
             word_prefix_position_docids,
             word_prefix_fid_docids,
-            script_language_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
             facet_id_normalized_string_strings,
@@ -83,7 +82,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
         field_id_word_count_docids.clear(self.wtxn)?;
         word_prefix_position_docids.clear(self.wtxn)?;
         word_prefix_fid_docids.clear(self.wtxn)?;
-        script_language_docids.clear(self.wtxn)?;
         facet_id_f64_docids.clear(self.wtxn)?;
         facet_id_normalized_string_strings.clear(self.wtxn)?;
        facet_id_string_fst.clear(self.wtxn)?;
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 748a3886a..ba11ceeb3 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -3,7 +3,7 @@ use std::fs::File;
 use std::io::BufReader;
 use std::{io, mem, str};

-use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
+use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
 use obkv::{KvReader, KvWriterU16};
 use roaring::RoaringBitmap;
 use serde_json::Value;
@@ -11,7 +11,7 @@ use serde_json::Value;
 use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
 use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
-use crate::update::settings::InnerIndexSettingsDiff;
+use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
 use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};

 /// Extracts the word and positions where this word appear and
@@ -57,13 +57,9 @@ pub fn extract_docid_word_positions(
         .map(|s| s.iter().map(String::as_str).collect());
     let old_dictionary: Option<Vec<_>> =
         settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-    let mut del_builder = tokenizer_builder(
-        old_stop_words,
-        old_separators.as_deref(),
-        old_dictionary.as_deref(),
-        None,
-    );
-    let del_tokenizer = del_builder.build();
+    let del_builder =
+        tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref());
+    let del_tokenizer = del_builder.into_tokenizer();

     let new_stop_words = settings_diff.new.stop_words.as_ref();
     let new_separators: Option<Vec<_>> = settings_diff
         .new
         .allowed_separators
         .as_ref()
         .map(|s| s.iter().map(String::as_str).collect());
     let new_dictionary: Option<Vec<_>> =
         settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-    let mut add_builder = tokenizer_builder(
-        new_stop_words,
-        new_separators.as_deref(),
-        new_dictionary.as_deref(),
-        None,
-    );
-    let add_tokenizer = add_builder.build();
+    let add_builder =
+        tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref());
+    let add_tokenizer = add_builder.into_tokenizer();

     // iterate over documents.
     let mut cursor = obkv_documents.into_cursor()?;
@@ -107,7 +99,7 @@ pub fn extract_docid_word_positions(
                 // deletions
                 tokens_from_document(
                     &obkv,
-                    &settings_diff.old.searchable_fields_ids,
+                    &settings_diff.old,
                     &del_tokenizer,
                     max_positions_per_attributes,
                     DelAdd::Deletion,
@@ -118,7 +110,7 @@ pub fn extract_docid_word_positions(
                 // additions
                 tokens_from_document(
                     &obkv,
-                    &settings_diff.new.searchable_fields_ids,
+                    &settings_diff.new,
                     &add_tokenizer,
                     max_positions_per_attributes,
                     DelAdd::Addition,
@@ -180,7 +172,6 @@ fn tokenizer_builder<'a>(
     stop_words: Option<&'a fst::Set<Vec<u8>>>,
     allowed_separators: Option<&'a [&str]>,
     dictionary: Option<&'a [&str]>,
-    languages: Option<&'a Vec<Language>>,
 ) -> TokenizerBuilder<'a, Vec<u8>> {
     let mut tokenizer_builder = TokenizerBuilder::new();
     if let Some(stop_words) = stop_words {
@@ -193,17 +184,13 @@ fn tokenizer_builder<'a>(
         tokenizer_builder.separators(separators);
     }

-    if let Some(languages) = languages {
-        tokenizer_builder.allow_list(languages);
-    }
-
     tokenizer_builder
 }
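With the global allow-list removed from `tokenizer_builder`, the locales are now supplied per field at tokenization time instead. A hedged sketch of that flow, assuming the `into_tokenizer` and `tokenize_with_allow_list` APIs from the charabia branch this series pins:

```rust
use charabia::{Language, TokenizerBuilder};

// Tokenize one field value, constraining language detection to the locales
// configured for that field, or falling back to automatic detection on `None`.
fn field_lemmas(text: &str, locales: Option<&[Language]>) -> Vec<String> {
    let tokenizer = TokenizerBuilder::new().into_tokenizer();
    tokenizer
        .tokenize_with_allow_list(text, locales)
        .map(|token| token.lemma().to_string())
        .collect()
}
```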
 /// Extract words mapped with their positions of a document.
 fn tokens_from_document<'a>(
     obkv: &KvReader<'a, FieldId>,
-    searchable_fields: &[FieldId],
+    settings: &InnerIndexSettings,
     tokenizer: &Tokenizer<'_>,
     max_positions_per_attributes: u32,
     del_add: DelAdd,
@@ -213,7 +200,7 @@ fn tokens_from_document<'a>(
     let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
     for (field_id, field_bytes) in obkv.iter() {
         // if field is searchable.
-        if searchable_fields.as_ref().contains(&field_id) {
+        if settings.searchable_fields_ids.contains(&field_id) {
             // extract deletion or addition only.
             if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
                 // parse json.
@@ -228,7 +215,8 @@ fn tokens_from_document<'a>(
                 buffers.field_buffer.clear();
                 if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
                     // create an iterator of token with their positions.
-                    let tokens = process_tokens(tokenizer.tokenize(field))
+                    let locales = settings.localized_searchable_fields_ids.locales(field_id);
+                    let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales))
                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);

                     for (index, token) in tokens {
diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
index 3deace127..6452a67a1 100644
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -5,6 +5,7 @@ use std::iter::FromIterator;
 use std::{io, str};

 use charabia::normalizer::{Normalize, NormalizerOption};
+use charabia::{Language, StrDetection, Token};
 use heed::types::SerdeJson;
 use heed::BytesEncode;

@@ -26,10 +27,9 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 pub fn extract_facet_string_docids(
     docid_fid_facet_string: grenad::Reader<BufReader<File>>,
     indexer: GrenadParameters,
-    _settings_diff: &InnerIndexSettingsDiff,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
     let max_memory = indexer.max_memory_by_thread();
-    let options = NormalizerOption { lossy: true, ..Default::default() };

     let mut facet_string_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
@@ -54,12 +54,8 @@ pub fn extract_facet_string_docids(
     while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
         let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
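The hunks below also move the "delete then re-add" early exit. The reason is that with per-field locales, an unchanged raw value can still end up under a different normalized key when the locale rules differ between the old and the new settings, so the skip is only safe once both normalized forms are known. A sketch of the decision, reusing this file's own `normalize_facet_string` helper (illustrative only, not part of the patch):

```rust
use charabia::Language;

// True when the old and new locale rules normalize the same raw facet value
// to different keys, in which case the deletion must target the old key and
// the addition the new one, as two separate sorter entries.
fn facet_keys_diverge(
    normalized_value: &str,
    old_locales: Option<&[Language]>,
    new_locales: Option<&[Language]>,
) -> bool {
    normalize_facet_string(normalized_value, old_locales)
        != normalize_facet_string(normalized_value, new_locales)
}
```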
-        // nothing to do if we delete and re-add the value.
-        if deladd_reader.get(DelAdd::Deletion).is_some()
-            && deladd_reader.get(DelAdd::Addition).is_some()
-        {
-            continue;
-        }
+        let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
+            && deladd_reader.get(DelAdd::Addition).is_some();

         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
         let field_id = FieldId::from_be_bytes(field_id_bytes);
@@ -72,29 +68,66 @@ pub fn extract_facet_string_docids(

         // Facet search normalization
         {
-            let mut hyper_normalized_value = normalized_value.normalize(&options);
-            let normalized_truncated_facet: String;
-            if hyper_normalized_value.len() > MAX_FACET_VALUE_LENGTH {
-                normalized_truncated_facet = hyper_normalized_value
-                    .char_indices()
-                    .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
-                    .map(|(_, c)| c)
-                    .collect();
-                hyper_normalized_value = normalized_truncated_facet.into();
-            }
+            let locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
+            let old_hyper_normalized_value = normalize_facet_string(normalized_value, locales);
+            let locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
+            let new_hyper_normalized_value = normalize_facet_string(normalized_value, locales);
+
             let set = BTreeSet::from_iter(std::iter::once(normalized_value));

-            buffer.clear();
-            let mut obkv = KvWriterDelAdd::new(&mut buffer);
-            for (deladd_key, _) in deladd_reader.iter() {
-                let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
-                obkv.insert(deladd_key, val)?;
-            }
-            obkv.finish()?;
+            // if the facet string is the same, we can put the deletion and addition in the same obkv.
+            if old_hyper_normalized_value == new_hyper_normalized_value {
+                // nothing to do if we delete and re-add the value.
+                if is_same_value {
+                    continue;
+                }

-            let key = (field_id, hyper_normalized_value.as_ref());
-            let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
-            normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
+                buffer.clear();
+                let mut obkv = KvWriterDelAdd::new(&mut buffer);
+                for (deladd_key, _) in deladd_reader.iter() {
+                    let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
+                    obkv.insert(deladd_key, val)?;
+                }
+                obkv.finish()?;
+
+                let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref());
+                let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
+                normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
+            } else {
+                // if the facet string is different, we need to insert the deletion and addition in different obkv because the related key is different.
+                // deletion
+                if deladd_reader.get(DelAdd::Deletion).is_some() {
+                    // insert old value
+                    let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
+                    buffer.clear();
+                    let mut obkv = KvWriterDelAdd::new(&mut buffer);
+                    obkv.insert(DelAdd::Deletion, val)?;
+                    obkv.finish()?;
+                    let key: (u16, &str) = (field_id, old_hyper_normalized_value.as_ref());
+                    let key_bytes =
+                        BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
+                    normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
+                }
+
+                // addition
+                if deladd_reader.get(DelAdd::Addition).is_some() {
+                    // insert new value
+                    let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
+                    buffer.clear();
+                    let mut obkv = KvWriterDelAdd::new(&mut buffer);
+                    obkv.insert(DelAdd::Addition, val)?;
+                    obkv.finish()?;
+                    let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref());
+                    let key_bytes =
+                        BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
+                    normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
+                }
+            }
+        }
+
+        // nothing to do if we delete and re-add the value.
+        if is_same_value {
+            continue;
         }

         let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
@@ -112,3 +145,24 @@ pub fn extract_facet_string_docids(
     let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
     sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
 }
+
+/// Normalizes the facet string and truncates it to the max length.
+fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
+    let options = NormalizerOption { lossy: true, ..Default::default() };
+    let mut detection = StrDetection::new(facet_string, locales);
+    let token = Token {
+        lemma: std::borrow::Cow::Borrowed(facet_string),
+        script: detection.script(),
+        language: detection.language(),
+        ..Default::default()
+    };
+
+    // truncate the facet string to the max length
+    token
+        .normalize(&options)
+        .lemma
+        .char_indices()
+        .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
+        .map(|(_, c)| c)
+        .collect()
+}
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 2521b778f..1df31fff2 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -3388,44 +3388,6 @@ mod tests {
         wtxn.commit().unwrap();
     }

-    #[test]
-    #[cfg(feature = "all-tokenizations")]
-    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
-        use charabia::{Language, Script};
-        let index = TempIndex::new();
-        let mut wtxn = index.write_txn().unwrap();
-        index
-            .add_documents_using_wtxn(
-                &mut wtxn,
-                documents!([
-                    { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
-                    { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
-                    { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
-                    { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
-                    { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
-                    { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
-                ]))
-            .unwrap();
-
-        let key_cmn = (Script::Cj, Language::Cmn);
-        let cj_cmn_docs =
-            index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
-        let mut expected_cj_cmn_docids = RoaringBitmap::new();
-        expected_cj_cmn_docids.push(1);
-        expected_cj_cmn_docids.push(5);
-        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
-
-        delete_documents(&mut wtxn, &index, &["1"]);
-        wtxn.commit().unwrap();
-
-        let rtxn = index.read_txn().unwrap();
-        let cj_cmn_docs =
-            index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
-        let mut expected_cj_cmn_docids = RoaringBitmap::new();
-        expected_cj_cmn_docids.push(5);
-        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
-    }
-
     #[test]
     fn delete_words_exact_attributes() {
         let index = TempIndex::new();
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 448c74fd8..2cac2777d 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -28,7 +28,7 @@ use crate::vector::settings::{
     WriteBackToDocuments,
 };
 use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
-use crate::{FieldId, FieldsIdsMap, Index, Result};
+use crate::{FieldId, FieldsIdsMap, Index, LocalizedAttributesRule, LocalizedFieldIds, Result};

 #[derive(Debug, Clone, PartialEq, Eq, Copy)]
 pub enum Setting<T> {
@@ -159,6 +159,7 @@ pub struct Settings<'a, 't, 'i> {
     proximity_precision: Setting<ProximityPrecision>,
     embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>,
     search_cutoff: Setting<u64>,
+    localized_attributes_rules: Setting<Vec<LocalizedAttributesRule>>,
 }

 impl<'a, 't, 'i> Settings<'a, 't, 'i> {
@@ -193,6 +194,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             proximity_precision: Setting::NotSet,
             embedder_settings: Setting::NotSet,
             search_cutoff: Setting::NotSet,
+            localized_attributes_rules: Setting::NotSet,
             indexer_config,
         }
     }
@@ -391,6 +393,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         self.search_cutoff = Setting::Reset;
     }

+    pub fn set_localized_attributes_rules(&mut self, value: Vec<LocalizedAttributesRule>) {
+        self.localized_attributes_rules = Setting::Set(value);
+    }
+
+    pub fn reset_localized_attributes_rules(&mut self) {
+        self.localized_attributes_rules = Setting::Reset;
+    }
+
     #[tracing::instrument(
         level = "trace",
         skip(self, progress_callback, should_abort, settings_diff),
@@ -1118,6 +1128,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         Ok(changed)
     }

+    fn update_localized_attributes_rules(&mut self) -> Result<bool> {
+        let changed = match &self.localized_attributes_rules {
+            Setting::Set(new) => {
+                let old = self.index.localized_attributes_rules(self.wtxn)?;
+                if old.as_ref() == Some(new) {
+                    false
+                } else {
+                    self.index.put_localized_attributes_rules(self.wtxn, new.clone())?;
+                    true
+                }
+            }
+            Setting::Reset => self.index.delete_localized_attributes_rules(self.wtxn)?,
+            Setting::NotSet => false,
+        };
+
+        Ok(changed)
+    }
+
     pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
     where
         FP: Fn(UpdateIndexingStep) + Sync,
@@ -1151,6 +1179,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         self.update_searchable()?;
         self.update_exact_attributes()?;
         self.update_proximity_precision()?;
+        self.update_localized_attributes_rules()?;

         let embedding_config_updates = self.update_embedding_configs()?;

@@ -1229,6 +1258,8 @@ impl InnerIndexSettingsDiff {
                 || old_settings.allowed_separators != new_settings.allowed_separators
                 || old_settings.dictionary != new_settings.dictionary
                 || old_settings.proximity_precision != new_settings.proximity_precision
+                || old_settings.localized_searchable_fields_ids
+                    != new_settings.localized_searchable_fields_ids
         };

         let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
@@ -1304,6 +1335,7 @@ impl InnerIndexSettingsDiff {
         }

         (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields)
+            || self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids
     }

     pub fn reindex_vectors(&self) -> bool {
@@ -1341,6 +1373,8 @@ pub(crate) struct InnerIndexSettings {
     pub geo_fields_ids: Option<(FieldId, FieldId)>,
     pub non_searchable_fields_ids: Vec<FieldId>,
     pub non_faceted_fields_ids: Vec<FieldId>,
+    pub localized_searchable_fields_ids: LocalizedFieldIds,
+    pub localized_faceted_fields_ids: LocalizedFieldIds,
 }

 impl InnerIndexSettings {
@@ -1382,6 +1416,17 @@ impl InnerIndexSettings {
             }
             None => None,
         };
+        let localized_attributes_rules = index.localized_attributes_rules(rtxn)?;
+        let localized_searchable_fields_ids = LocalizedFieldIds::new(
+            &localized_attributes_rules,
+            &fields_ids_map,
+            searchable_fields_ids.iter().cloned(),
+        );
+        let localized_faceted_fields_ids = LocalizedFieldIds::new(
+            &localized_attributes_rules,
+            &fields_ids_map,
+            faceted_fields_ids.iter().cloned(),
+        );

         let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME);
         searchable_fields_ids.retain(|id| !vectors_fids.contains(id));
@@ -1403,6 +1448,8 @@ impl InnerIndexSettings {
             geo_fields_ids,
             non_searchable_fields_ids: vectors_fids.clone(),
             non_faceted_fields_ids: vectors_fids.clone(),
+            localized_searchable_fields_ids,
+            localized_faceted_fields_ids,
         })
     }

@@ -1418,6 +1465,12 @@ impl InnerIndexSettings {
         index.put_faceted_fields(wtxn, &new_facets)?;

         self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?;
+        let localized_attributes_rules = index.localized_attributes_rules(wtxn)?;
+        self.localized_faceted_fields_ids = LocalizedFieldIds::new(
+            &localized_attributes_rules,
+            &self.fields_ids_map,
+            self.faceted_fields_ids.iter().cloned(),
+        );
         Ok(())
     }

@@ -1441,8 +1494,13 @@ impl InnerIndexSettings {
                 &self.fields_ids_map,
             )?;
         }
-        let searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
-        self.searchable_fields_ids = searchable_fields_ids;
+        self.searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
+        let localized_attributes_rules = index.localized_attributes_rules(wtxn)?;
+        self.localized_searchable_fields_ids = LocalizedFieldIds::new(
+            &localized_attributes_rules,
+            &self.fields_ids_map,
+            self.searchable_fields_ids.iter().cloned(),
+        );

         Ok(())
     }

@@ -2573,6 +2631,7 @@ mod tests {
             proximity_precision,
             embedder_settings,
             search_cutoff,
+            localized_attributes_rules,
         } = settings;
         assert!(matches!(searchable_fields, Setting::NotSet));
         assert!(matches!(displayed_fields, Setting::NotSet));
         assert!(matches!(proximity_precision, Setting::NotSet));
         assert!(matches!(embedder_settings, Setting::NotSet));
         assert!(matches!(search_cutoff, Setting::NotSet));
+        assert!(matches!(localized_attributes_rules, Setting::NotSet));
     })
     .unwrap();
 }

From e06fbcc607f0dcd97681a51f90cb5103f7307980 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 23 Jul 2024 14:52:02 +0200
Subject: [PATCH 5/9] Update snapshots

---
 .../lib.rs/import_vectors/Intel to kefir succeeds.snap         | 2 +-
 .../src/snapshots/lib.rs/import_vectors/Intel to kefir.snap    | 2 +-
 .../snapshots/lib.rs/import_vectors/adding Intel succeeds.snap | 2 +-
 .../src/snapshots/lib.rs/import_vectors/after adding Intel.snap | 2 +-
.../import_vectors/after_registering_settings_task_vectors.snap | 2 +- .../import_vectors/settings_update_processed_vectors.snap | 2 +- .../test_settings_update/after_registering_settings_task.snap | 2 +- .../lib.rs/test_settings_update/settings_update_processed.snap | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap index 6f2da1f17..a28e85204 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, 
stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap index 569556a19..344134888 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), 
searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, 
localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap index b626d8bc5..fd8096d13 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: 
Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: 
Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap index 65f758f32..24098d658 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: 
Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap index 9c628461d..c11dfba62 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap +++ 
b/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: 
Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap index eddf6d7e8..bc87d8212 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: 
SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the 
{{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap index 7873fb6cf..e5fab9ad6 100644 --- a/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap +++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: 
"doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap b/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap index 8a4838094..b49cee730 100644 --- a/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap +++ b/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, 
non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] From 4fbe048cbfa72fae7b6913e5cd85ccb682b48148 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 23 Jul 2024 15:11:29 +0200 Subject: [PATCH 6/9] Update Cargo.lock --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 547f9c0e3..b3bc8b534 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -934,7 +934,7 @@ dependencies = [ [[package]] name = "charabia" version = "0.8.12" -source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#a95a9217265cee515708a679a2ed08ced1ac58a3" +source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#2992583137458afcebff5d44cae93fa46d9cf664" dependencies = [ "aho-corasick", "csv", @@ -2649,7 +2649,7 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" [[package]] name = "irg-kvariants" version = "0.1.1" -source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#a95a9217265cee515708a679a2ed08ced1ac58a3" +source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#2992583137458afcebff5d44cae93fa46d9cf664" dependencies = [ "csv", "once_cell", From 70d71581eefbb494b369bff07bcff78f77993815 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 23 Jul 2024 15:19:07 +0200 Subject: [PATCH 7/9] fix clippy --- meilisearch-types/src/locales.rs | 3 +- .../src/analytics/segment_analytics.rs | 2 +- .../src/routes/indexes/facet_search.rs | 2 +- meilisearch/src/routes/indexes/settings.rs | 4 +- meilisearch/src/search/mod.rs | 1 + milli/src/lib.rs | 5 +-- milli/src/localized_attributes_rules.rs | 40 +++++++++---------- 7 files changed, 27 insertions(+), 30 deletions(-) diff --git a/meilisearch-types/src/locales.rs b/meilisearch-types/src/locales.rs index 6f7fb3a40..8c15fe528 100644 --- a/meilisearch-types/src/locales.rs +++ b/meilisearch-types/src/locales.rs @@ -1,9 +1,8 @@ 
 use deserr::Deserr;
+use milli::LocalizedAttributesRule;
 use serde::{Deserialize, Serialize};
 use serde_json::json;
 
-use milli::LocalizedAttributesRule;
-
 /// Generate a Locale enum and its From and Into implementations for milli::tokenizer::Language.
 ///
 /// this enum implements `Deserr` in order to be used in the API.
diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs
index 407b90658..07350d506 100644
--- a/meilisearch/src/analytics/segment_analytics.rs
+++ b/meilisearch/src/analytics/segment_analytics.rs
@@ -780,7 +780,7 @@ impl SearchAggregator {
         ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1);
 
         if let Some(locales) = locales {
-            ret.locales = locales.into_iter().copied().collect();
+            ret.locales = locales.iter().copied().collect();
         }
 
         ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG();
diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs
index da575fdc4..a648987ca 100644
--- a/meilisearch/src/routes/indexes/facet_search.rs
+++ b/meilisearch/src/routes/indexes/facet_search.rs
@@ -90,7 +90,7 @@ pub async fn search(
             facet_name,
             search_kind,
             index_scheduler.features(),
-            locales
+            locales,
         )
     })
     .await?;
diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs
index b62690295..6f081f1c7 100644
--- a/meilisearch/src/routes/indexes/settings.rs
+++ b/meilisearch/src/routes/indexes/settings.rs
@@ -489,7 +489,7 @@ make_setting_route!(
         analytics.publish(
             "LocalizedAttributesRules Updated".to_string(),
             json!({
-                "locales": rules.as_ref().map(|rules| rules.iter().map(|rule| rule.locales.iter().cloned()).flatten().collect::<Vec<_>>())
+                "locales": rules.as_ref().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::<Vec<_>>())
             }),
             Some(req),
         );
@@ -808,7 +808,7 @@ pub async fn update_all(
             },
             "embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set()),
             "search_cutoff_ms": new_settings.search_cutoff_ms.as_ref().set(),
-            "locales": new_settings.localized_attributes.as_ref().set().map(|rules| rules.into_iter().map(|rule| rule.locales.iter().cloned()).flatten().collect::<Vec<_>>()),
+            "locales": new_settings.localized_attributes.as_ref().set().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::<Vec<_>>()),
         }),
         Some(&req),
     );
diff --git a/meilisearch/src/search/mod.rs b/meilisearch/src/search/mod.rs
index 11bf4f84e..dada9159b 100644
--- a/meilisearch/src/search/mod.rs
+++ b/meilisearch/src/search/mod.rs
@@ -1657,6 +1657,7 @@ fn make_document(
     Ok(document)
 }
 
+#[allow(clippy::too_many_arguments)]
 fn format_fields(
     document: &Document,
     field_ids_map: &FieldsIdsMap,
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 461971ddf..8008b7bd1 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -63,6 +63,8 @@ pub use self::heed_codec::{
     UncheckedU8StrStrCodec,
 };
 pub use self::index::Index;
+pub use self::localized_attributes_rules::LocalizedAttributesRule;
+use self::localized_attributes_rules::LocalizedFieldIds;
 pub use self::search::facet::{FacetValueHit, SearchForFacetValues};
 pub use self::search::similar::Similar;
 pub use self::search::{
@@ -70,9 +72,6 @@ pub use self::search::{
     Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };
 
-pub use self::localized_attributes_rules::LocalizedAttributesRule;
-use self::localized_attributes_rules::LocalizedFieldIds;
-
 pub type Result<T> = std::result::Result<T, error::Error>;
 
 pub type Attribute = u32;
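The `NotSet` markers that dominate the snapshot churn earlier in this series come from milli's three-state setting type: a settings payload distinguishes a field that carries a value, one that is explicitly cleared, and one that is simply absent. A minimal sketch of that idea (simplified; the real `Setting<T>` in milli also carries serde/deserr attributes, which is an assumption here, not shown in this patch):

    // Three-state setting value. Because the default is NotSet, adding the new
    // `localized_attributes` field makes every pre-existing settings snapshot
    // gain a `localized_attributes: NotSet` entry without changing behavior.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
    enum Setting<T> {
        Set(T),  // the payload provided a value
        Reset,   // the payload explicitly cleared the stored value
        #[default]
        NotSet,  // the field was absent, so the stored value is left untouched
    }

    fn main() {
        let s: Setting<u32> = Setting::default();
        assert_eq!(s, Setting::NotSet);
    }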
diff --git a/milli/src/localized_attributes_rules.rs b/milli/src/localized_attributes_rules.rs
index a3b3e820b..aa4eddee1 100644
--- a/milli/src/localized_attributes_rules.rs
+++ b/milli/src/localized_attributes_rules.rs
@@ -35,19 +35,17 @@ impl LocalizedAttributesRule {
     }
 
 fn match_pattern(pattern: &str, str: &str) -> bool {
-    let res = if pattern == "*" {
+    if pattern == "*" {
         true
     } else if pattern.starts_with('*') && pattern.ends_with('*') {
         str.contains(&pattern[1..pattern.len() - 1])
-    } else if pattern.ends_with('*') {
-        str.starts_with(&pattern[..pattern.len() - 1])
-    } else if pattern.starts_with('*') {
-        str.ends_with(&pattern[1..])
+    } else if let Some(pattern) = pattern.strip_prefix('*') {
+        str.ends_with(pattern)
+    } else if let Some(pattern) = pattern.strip_suffix('*') {
+        str.starts_with(pattern)
     } else {
         pattern == str
-    };
-
-    res
+    }
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -87,7 +85,7 @@ impl LocalizedFieldIds {
         Self { field_id_to_locales }
     }
 
-    pub fn locales<'a>(&'a self, fields_id: FieldId) -> Option<&'a [Language]> {
+    pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> {
         self.field_id_to_locales.get(&fields_id).map(Vec::as_slice)
     }
 }
@@ -98,17 +96,17 @@ mod tests {
 
     #[test]
     fn test_match_pattern() {
-        assert_eq!(match_pattern("*", "test"), true);
-        assert_eq!(match_pattern("test*", "test"), true);
-        assert_eq!(match_pattern("test*", "testa"), true);
-        assert_eq!(match_pattern("*test", "test"), true);
-        assert_eq!(match_pattern("*test", "atest"), true);
-        assert_eq!(match_pattern("*test*", "test"), true);
-        assert_eq!(match_pattern("*test*", "atesta"), true);
-        assert_eq!(match_pattern("*test*", "atest"), true);
-        assert_eq!(match_pattern("*test*", "testa"), true);
-        assert_eq!(match_pattern("test*test", "test"), false);
-        assert_eq!(match_pattern("*test", "testa"), false);
-        assert_eq!(match_pattern("test*", "atest"), false);
+        assert!(match_pattern("*", "test"));
+        assert!(match_pattern("test*", "test"));
+        assert!(match_pattern("test*", "testa"));
+        assert!(match_pattern("*test", "test"));
+        assert!(match_pattern("*test", "atest"));
+        assert!(match_pattern("*test*", "test"));
+        assert!(match_pattern("*test*", "atesta"));
+        assert!(match_pattern("*test*", "atest"));
+        assert!(match_pattern("*test*", "testa"));
+        assert!(!match_pattern("test*test", "test"));
+        assert!(!match_pattern("*test", "testa"));
+        assert!(!match_pattern("test*", "atest"));
     }
 }
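The refactored `match_pattern` above is the core of attribute-pattern matching: `*` alone matches everything, `*foo*` is a substring test, and a single leading or trailing `*` turns the pattern into a suffix or prefix test; anything else must match exactly. A self-contained copy of the function with illustrative assertions (the example field names are assumptions, mirroring the test documents used later in this series):

    fn match_pattern(pattern: &str, str: &str) -> bool {
        if pattern == "*" {
            true
        } else if pattern.starts_with('*') && pattern.ends_with('*') {
            str.contains(&pattern[1..pattern.len() - 1])
        } else if let Some(pattern) = pattern.strip_prefix('*') {
            str.ends_with(pattern)
        } else if let Some(pattern) = pattern.strip_suffix('*') {
            str.starts_with(pattern)
        } else {
            pattern == str
        }
    }

    fn main() {
        // Suffix pattern, as used by `attributePatterns` like "*_zh":
        assert!(match_pattern("*_zh", "name_zh"));
        // Prefix pattern for nested fields, like "document_ja.*":
        assert!(match_pattern("document_ja.*", "document_ja.name"));
        // There is no infix wildcard: "test*test" only matches itself literally.
        assert!(!match_pattern("test*test", "testXtest"));
    }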
make_locale { impl std::fmt::Display for LocaleFormatError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let valid_locales = [$(Locale::$language),+].iter().map(|l| format!("`{}`", json!(l).as_str().unwrap())).collect::>().join(", "); - write!(f, "Unknown value `{}`, expected one of {}", self.invalid_locale, valid_locales) - } - } - - impl std::error::Error for LocaleFormatError {} - - impl std::str::FromStr for Locale { - type Err = LocaleFormatError; - - fn from_str(s: &str) -> Result { - milli::tokenizer::Language::from_code(s).map(Self::from).ok_or(LocaleFormatError { - invalid_locale: s.to_string(), - }) + write!(f, "Unsupported locale `{}`, expected one of {}", self.invalid_locale, valid_locales) } } }; @@ -130,6 +118,18 @@ make_locale! { Hye } +impl std::error::Error for LocaleFormatError {} + +impl std::str::FromStr for Locale { + type Err = LocaleFormatError; + + fn from_str(s: &str) -> Result { + milli::tokenizer::Language::from_code(s) + .map(Self::from) + .ok_or(LocaleFormatError { invalid_locale: s.to_string() }) + } +} + #[derive(Debug, Clone, PartialEq, Eq, Deserr, Serialize, Deserialize)] #[deserr(rename_all = camelCase)] #[serde(rename_all = "camelCase")] diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index 92f72fe78..ea98e200f 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -78,7 +78,8 @@ async fn import_dump_v1_movie_raw() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -240,7 +241,8 @@ async fn import_dump_v1_movie_with_settings() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -388,7 +390,8 @@ async fn import_dump_v1_rubygems_with_settings() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -522,7 +525,8 @@ async fn import_dump_v2_movie_raw() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -668,7 +672,8 @@ async fn import_dump_v2_movie_with_settings() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -813,7 +818,8 @@ async fn import_dump_v2_rubygems_with_settings() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -947,7 +953,8 @@ async fn import_dump_v3_movie_raw() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -1093,7 +1100,8 @@ async fn import_dump_v3_movie_with_settings() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -1238,7 +1246,8 @@ async fn import_dump_v3_rubygems_with_settings() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -1372,7 +1381,8 @@ async fn import_dump_v4_movie_raw() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -1518,7 +1528,8 @@ async fn import_dump_v4_movie_with_settings() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null 
} "### ); @@ -1663,7 +1674,8 @@ async fn import_dump_v4_rubygems_with_settings() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "### ); @@ -1909,7 +1921,8 @@ async fn import_dump_v6_containing_experimental_features() { "pagination": { "maxTotalHits": 1000 }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "###); @@ -2087,7 +2100,8 @@ async fn generate_and_import_dump_containing_vectors() { "documentTemplate": "{{doc.doggo}}" } }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "###); diff --git a/meilisearch/tests/search/locales.rs b/meilisearch/tests/search/locales.rs index 722694ba3..9f1c22b75 100644 --- a/meilisearch/tests/search/locales.rs +++ b/meilisearch/tests/search/locales.rs @@ -103,12 +103,41 @@ async fn simple_search() { // english index - .search(json!({"q": "Atta", "attributesToRetrieve": ["id"]}), |response, code| { + .search(json!({"q": "Atta", "attributesToHighlight": ["*"]}), |response, code| { snapshot!(response, @r###" { "hits": [ { - "id": 852 + "name_en": "Attack on Titan", + "name_ja": "進撃の巨人", + "author_en": "Hajime Isayama", + "author_ja": "諫山 創", + "description_en": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "description_ja": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "id": 852, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_en": "Attack on Titan", + "name_ja": "進撃の巨人", + "author_en": "Hajime Isayama", + "author_ja": "諫山 創", + "description_en": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "description_ja": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "id": "852", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } } ], "query": "Atta", @@ -124,12 +153,35 @@ async fn simple_search() { // japanese index - .search(json!({"q": "進撃", "attributesToRetrieve": ["id"]}), |response, code| { + .search(json!({"q": "進撃", "attributesToHighlight": ["*"]}), |response, code| { snapshot!(response, @r###" { "hits": [ { - "id": 853 + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } } ], "query": "進撃", @@ -145,25 +197,77 @@ async fn simple_search() { index .search( - json!({"q": "進撃", "attributesToRetrieve": ["id"], "locales": ["jpn"]}), + json!({"q": "進撃", "locales": ["jpn"], "attributesToHighlight": ["*"]}), |response, code| { snapshot!(response, @r###" - { - "hits": [ { - "id": 852 - }, - { - "id": 853 + "hits": [ + { + "name_en": "Attack on Titan", + "name_ja": "進撃の巨人", + "author_en": "Hajime Isayama", + "author_ja": "諫山 創", + "description_en": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "description_ja": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "id": 852, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_en": "Attack on Titan", + "name_ja": "進撃の巨人", + "author_en": "Hajime Isayama", + "author_ja": "諫山 創", + "description_en": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "description_ja": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "id": "852", + "_vectors": { + "manual": [ + 
"1.0", + "2.0", + "3.0" + ] + } + } + }, + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "進撃", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2 } - ], - "query": "進撃", - "processingTimeMs": "[duration]", - "limit": 20, - "offset": 0, - "estimatedTotalHits": 2 - } - "###); + "###); snapshot!(code, @"200 OK"); }, ) @@ -171,15 +275,67 @@ async fn simple_search() { // chinese index - .search(json!({"q": "进击", "attributesToRetrieve": ["id"]}), |response, code| { + .search(json!({"q": "进击", "attributesToHighlight": ["*"]}), |response, code| { snapshot!(response, @r###" { "hits": [ { - "id": 853 + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } }, { - "id": 852 + "name_en": "Attack on Titan", + "name_ja": "進撃の巨人", + "author_en": "Hajime Isayama", + "author_ja": "諫山 創", + "description_en": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "description_ja": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "id": 852, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_en": "Attack on Titan", + "name_ja": "進撃の巨人", + "author_en": "Hajime Isayama", + "author_ja": "諫山 創", + "description_en": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "description_ja": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "id": "852", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } } ], "query": "进击", @@ -226,7 +382,7 @@ async fn force_locales() { // chinese detection index .search( - json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"]}), + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}), |response, code| { snapshot!(response, @r###" { @@ -246,13 +402,36 @@ async fn force_locales() { // force japanese index .search( - json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"], "locales": ["jpn"]}), + json!({"q": "\"进击的巨人\"", "locales": ["jpn"], "attributesToHighlight": ["*"]}), |response, code| { snapshot!(response, @r###" { "hits": [ { - "id": 853 + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } } ], "query": "\"进击的巨人\"", @@ -300,7 +479,7 @@ async fn force_locales_with_pattern() { // chinese detection index .search( - json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"]}), + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}), |response, code| { snapshot!(response, @r###" { @@ -320,13 +499,36 @@ async fn force_locales_with_pattern() { // force japanese index .search( - json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"], "locales": ["jpn"]}), + json!({"q": "\"进击的巨人\"", "locales": ["jpn"], "attributesToHighlight": ["*"]}), 
|response, code| { snapshot!(response, @r###" { "hits": [ { - "id": 853 + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } } ], "query": "\"进击的巨人\"", @@ -372,7 +574,7 @@ async fn force_locales_with_pattern_nested() { // chinese index .search( - json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"], "locales": ["cmn"]}), + json!({"q": "\"进击的巨人\"", "locales": ["cmn"], "attributesToHighlight": ["*"]}), |response, code| { snapshot!(response, @r###" { @@ -392,13 +594,60 @@ async fn force_locales_with_pattern_nested() { // force japanese index .search( - json!({"q": "\"进击的巨人\"", "attributesToRetrieve": ["id"], "locales": ["jpn"]}), + json!({"q": "\"进击的巨人\"", "locales": ["jpn"], "attributesToHighlight": ["*"]}), |response, code| { snapshot!(response, @r###" { "hits": [ { - "id": 852 + "document_en": { + "name": "Attack on Titan", + "description": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "author": "Hajime Isayama" + }, + "document_ja": { + "name": "進撃の巨人", + "description": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "author": "諫山 創" + }, + "document_zh": { + "name": "进击的巨人", + "description": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "author": "諫山創" + }, + "id": 852, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "document_en": { + "name": "Attack on Titan", + "description": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "author": "Hajime Isayama" + }, + "document_ja": { + "name": "進撃の巨人", + "description": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "author": "諫山 創" + }, + "document_zh": { + "name": "巨人", + "description": "巨人是日本的漫画系列,由諫山 創作画。", + "author": "諫山創" + }, + "id": "852", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } } ], "query": "\"进击的巨人\"", @@ -413,6 +662,357 @@ async fn force_locales_with_pattern_nested() { ) .await; } +#[actix_rt::test] +async fn force_different_locales_with_pattern() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + let (response, _) = index + .update_settings( + json!({ + "searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["*_zh"], "locales": ["jpn"]}, + // force chinese + {"attributePatterns": ["*_ja"], "locales": ["cmn"]} + ] + }), + ) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // force chinese + index + .search( + json!({"q": "\"进击的巨人\"", "locales": ["cmn"], "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // force japanese + index + .search( + json!({"q": "\"进击的巨人\"", "locales": ["jpn"], "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + 
"hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} + +#[actix_rt::test] +async fn force_different_locales_with_pattern_nested() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = NESTED_DOCUMENTS.clone(); + let (response, _) = index + .update_settings(json!({ + "searchableAttributes": ["document_en", "document_ja", "document_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["*_zh.*"], "locales": ["jpn"]}, + // force chinese + {"attributePatterns": ["document_ja.*", "document_zh.*"], "locales": ["cmn"]} + ] + })) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // chinese + index + .search( + json!({"q": "\"进击的巨人\"", "locales": ["cmn"], "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // force japanese + index + .search( + json!({"q": "\"进击的巨人\"", "locales": ["jpn"], "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "document_en": { + "name": "Attack on Titan", + "description": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "author": "Hajime Isayama" + }, + "document_ja": { + "name": "進撃の巨人", + "description": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "author": "諫山 創" + }, + "document_zh": { + "name": "进击的巨人", + "description": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "author": "諫山創" + }, + "id": 852, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "document_en": { + "name": "Attack on Titan", + "description": "Attack on Titan is a Japanese manga series written and illustrated by Hajime Isayama", + "author": "Hajime Isayama" + }, + "document_ja": { + "name": "進撃の巨人", + "description": "進撃の巨人は、日本の漫画シリーズであり、諫山 創によって作画されている。", + "author": "諫山 創" + }, + "document_zh": { + "name": "巨人", + "description": "巨人是日本的漫画系列,由諫山 創作画。", + "author": "諫山創" + }, + "id": "852", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} + +#[actix_rt::test] +async fn settings_change() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = NESTED_DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(0).await; + let (response, _) = index + .update_settings(json!({ + "searchableAttributes": ["document_en", "document_ja", "document_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["document_ja.*", "*_zh.*"], "locales": ["jpn"]} 
+ ] + })) + .await; + snapshot!(response, @r###" + { + "taskUid": 1, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.wait_task(1).await; + + // chinese + index + .search( + json!({"q": "\"进击的巨人\"", "locales": ["cmn"], "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // force japanese + index + .search( + json!({"q": "\"进击的巨人\"", "locales": ["jpn"], "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // change settings + let (response, _) = index + .update_settings(json!({ + "searchableAttributes": ["document_en", "document_ja", "document_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["*_zh.*"], "locales": ["jpn"]}, + // force chinese + {"attributePatterns": ["document_ja.*"], "locales": ["cmn"]} + ] + })) + .await; + snapshot!(response, @r###" + { + "taskUid": 2, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.wait_task(2).await; + + // chinese + index + .search( + json!({"q": "\"进击的巨人\"", "locales": ["cmn"], "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // force japanese + index + .search( + json!({"q": "\"进击的巨人\"", "locales": ["jpn"], "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} #[actix_rt::test] async fn invalid_locales() { @@ -428,9 +1028,7 @@ async fn invalid_locales() { index.add_documents(documents, None).await; index.wait_task(1).await; - let (response, code) = index - .search_post(json!({"q": "Atta", "attributesToRetrieve": ["id"], "locales": ["invalid"]})) - .await; + let (response, code) = index.search_post(json!({"q": "Atta", "locales": ["invalid"]})).await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -442,17 +1040,12 @@ async fn invalid_locales() { "###); let (response, code) = index - .search_get( - &yaup::to_string( - &json!({"q": "Atta", "attributesToRetrieve": ["id"], "locales": ["invalid"]}), - ) - .unwrap(), - ) + .search_get(&yaup::to_string(&json!({"q": "Atta", "locales": ["invalid"]})).unwrap()) .await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid value in parameter `locales`: Unknown value `invalid`, expected one of `epo`, `eng`, `rus`, `cmn`, `spa`, `por`, `ita`, `ben`, `fra`, `deu`, `ukr`, `kat`, `ara`, `hin`, `jpn`, `heb`, `yid`, `pol`, `amh`, `jav`, `kor`, `nob`, `dan`, `swe`, `fin`, `tur`, `nld`, `hun`, `ces`, `ell`, `bul`, `bel`, `mar`, `kan`, `ron`, `slv`, `hrv`, `srp`, `mkd`, `lit`, `lav`, `est`, `tam`, `vie`, `urd`, `tha`, `guj`, `uzb`, `pan`, `aze`, `ind`, `tel`, 
`pes`, `mal`, `ori`, `mya`, `nep`, `sin`, `khm`, `tuk`, `aka`, `zul`, `sna`, `afr`, `lat`, `slk`, `cat`, `tgl`, `hye`",
+      "message": "Invalid value in parameter `locales`: Unsupported locale `invalid`, expected one of `epo`, `eng`, `rus`, `cmn`, `spa`, `por`, `ita`, `ben`, `fra`, `deu`, `ukr`, `kat`, `ara`, `hin`, `jpn`, `heb`, `yid`, `pol`, `amh`, `jav`, `kor`, `nob`, `dan`, `swe`, `fin`, `tur`, `nld`, `hun`, `ces`, `ell`, `bul`, `bel`, `mar`, `kan`, `ron`, `slv`, `hrv`, `srp`, `mkd`, `lit`, `lav`, `est`, `tam`, `vie`, `urd`, `tha`, `guj`, `uzb`, `pan`, `aze`, `ind`, `tel`, `pes`, `mal`, `ori`, `mya`, `nep`, `sin`, `khm`, `tuk`, `aka`, `zul`, `sna`, `afr`, `lat`, `slk`, `cat`, `tgl`, `hye`",
       "code": "invalid_search_locales",
       "type": "invalid_request",
       "link": "https://docs.meilisearch.com/errors#invalid_search_locales"
diff --git a/milli/src/index.rs b/milli/src/index.rs
index f5342f2c0..3a2f3169c 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1558,7 +1558,7 @@ impl Index {
         rtxn: &RoTxn<'_>,
     ) -> heed::Result<Option<Vec<LocalizedAttributesRule>>> {
         self.main
-            .remap_types::<Str, SerdeBincode<Vec<LocalizedAttributesRule>>>()
+            .remap_types::<Str, SerdeJson<Vec<LocalizedAttributesRule>>>()
             .get(rtxn, main_key::LOCALIZED_ATTRIBUTES_RULES)
     }

@@ -1567,7 +1567,7 @@
         txn: &mut RwTxn<'_>,
         val: Vec<LocalizedAttributesRule>,
     ) -> heed::Result<()> {
-        self.main.remap_types::<Str, SerdeBincode<Vec<LocalizedAttributesRule>>>().put(
+        self.main.remap_types::<Str, SerdeJson<Vec<LocalizedAttributesRule>>>().put(
             txn,
             main_key::LOCALIZED_ATTRIBUTES_RULES,
             &val,
diff --git a/milli/src/localized_attributes_rules.rs b/milli/src/localized_attributes_rules.rs
index aa4eddee1..739d03043 100644
--- a/milli/src/localized_attributes_rules.rs
+++ b/milli/src/localized_attributes_rules.rs
@@ -71,6 +71,8 @@ impl LocalizedFieldIds {
         for rule in rules {
             if rule.match_str(field_name) {
                 locales.extend(rule.locales.iter());
+                // Take the first rule that matches
+                break;
             }
         }
diff --git a/milli/src/search/facet/search.rs b/milli/src/search/facet/search.rs
index 6ef62e39a..39fb7374a 100644
--- a/milli/src/search/facet/search.rs
+++ b/milli/src/search/facet/search.rs
@@ -346,5 +346,5 @@ fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> S
         ..Default::default()
     };

-    token.normalize(&options).lemma.to_string()
+    token.normalize(&options).lemma.into_owned()
 }
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 2cac2777d..e423852f1 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1128,22 +1128,21 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
         Ok(changed)
     }

-    fn update_localized_attributes_rules(&mut self) -> Result<bool> {
-        let changed = match &self.localized_attributes_rules {
+    fn update_localized_attributes_rules(&mut self) -> Result<()> {
+        match &self.localized_attributes_rules {
             Setting::Set(new) => {
                 let old = self.index.localized_attributes_rules(self.wtxn)?;
-                if old.as_ref() == Some(new) {
-                    false
-                } else {
+                if old.as_ref() != Some(new) {
                     self.index.put_localized_attributes_rules(self.wtxn, new.clone())?;
-                    true
                 }
             }
-            Setting::Reset => self.index.delete_localized_attributes_rules(self.wtxn)?,
-            Setting::NotSet => false,
-        };
+            Setting::Reset => {
+                self.index.delete_localized_attributes_rules(self.wtxn)?;
+            }
+            Setting::NotSet => (),
+        }

-        Ok(changed)
+        Ok(())
     }

     pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>

From 59115fd058332fd6dbbfb503846fa389b210780d Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Thu, 25 Jul 2024 10:50:45 +0200
Subject: [PATCH 9/9] Fix tests

---
 meilisearch/tests/settings/get_settings.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git
a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index 239151197..2f51dfb44 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -55,7 +55,7 @@ async fn get_settings() { let (response, code) = index.settings().await; assert_eq!(code, 200); let settings = response.as_object().unwrap(); - assert_eq!(settings.keys().len(), 16); + assert_eq!(settings.keys().len(), 17); assert_eq!(settings["displayedAttributes"], json!(["*"])); assert_eq!(settings["searchableAttributes"], json!(["*"])); assert_eq!(settings["filterableAttributes"], json!([])); @@ -195,7 +195,8 @@ async fn secrets_are_hidden_in_settings() { "response": "{{embedding}}" } }, - "searchCutoffMs": null + "searchCutoffMs": null, + "localizedAttributes": null } "###);
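
The `break` added to `LocalizedFieldIds` in the milli diff above makes locale resolution first-match-wins: a field takes its locales only from the first `localizedAttributes` rule whose pattern matches it, which is what the `force_different_locales_with_pattern` tests exercise when `*_zh` is forced to `jpn` before any later rule can apply. Below is a minimal, self-contained Rust sketch of that resolution order; the types and the simplified single-`*` glob matcher are hypothetical stand-ins for illustration, not milli's actual API.

/// Hypothetical stand-in for milli's rule type; only the parts needed to
/// illustrate first-match-wins resolution are modeled here.
struct LocalizedAttributesRule {
    attribute_patterns: Vec<String>,
    locales: Vec<String>,
}

impl LocalizedAttributesRule {
    /// Simplified glob match supporting a single leading or trailing `*`
    /// (e.g. `*_zh` or `document_ja.*`); an assumption, not milli's matcher.
    fn match_str(&self, field_name: &str) -> bool {
        self.attribute_patterns.iter().any(|pattern| {
            if let Some(suffix) = pattern.strip_prefix('*') {
                field_name.ends_with(suffix)
            } else if let Some(prefix) = pattern.strip_suffix('*') {
                field_name.starts_with(prefix)
            } else {
                pattern.as_str() == field_name
            }
        })
    }
}

/// Returns the locales of the first matching rule only, mirroring the
/// `break` added in `LocalizedFieldIds::new` above.
fn locales_for<'r>(rules: &'r [LocalizedAttributesRule], field: &str) -> Option<&'r [String]> {
    rules.iter().find(|rule| rule.match_str(field)).map(|rule| rule.locales.as_slice())
}

fn main() {
    let rules = vec![
        LocalizedAttributesRule {
            attribute_patterns: vec!["*_zh".to_string()],
            locales: vec!["jpn".to_string()],
        },
        LocalizedAttributesRule {
            attribute_patterns: vec!["*_ja".to_string(), "*_zh".to_string()],
            locales: vec!["cmn".to_string()],
        },
    ];

    // `name_zh` matches both rules, but only the first one contributes locales.
    assert_eq!(locales_for(&rules, "name_zh"), Some(&["jpn".to_string()][..]));
    assert_eq!(locales_for(&rules, "name_ja"), Some(&["cmn".to_string()][..]));
    // No rule matches: the field is left to automatic language detection.
    assert_eq!(locales_for(&rules, "name_en"), None);
}

Under these assumptions, `name_zh` resolves to `jpn` even though a later rule also matches it, and a field matched by no rule falls back to automatic detection, which is the behavior the locale test snapshots above rely on.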