From cc02920f2b038b05ddf99aa0ff8ef09db0b9b2b2 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 23 Jul 2024 14:59:31 +0200
Subject: [PATCH] Update charabia

---
 Cargo.lock | 37 +---
 milli/Cargo.toml | 2 +-
 milli/src/index.rs | 23 +++
 milli/src/search/new/mod.rs | 6 +-
 .../extract/extract_docid_word_positions.rs | 161 ++----------------
 .../src/update/index_documents/extract/mod.rs | 16 +-
 .../src/update/index_documents/typed_chunk.rs | 34 +---
 7 files changed, 49 insertions(+), 230 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c72053be7..547f9c0e3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -934,19 +934,15 @@ dependencies = [
 [[package]]
 name = "charabia"
 version = "0.8.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9868a22f10dee80498a8a2b6c641d80bf28ea4495fcf71c2dc4836c2dd23958c"
+source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#a95a9217265cee515708a679a2ed08ced1ac58a3"
 dependencies = [
  "aho-corasick",
- "cow-utils",
  "csv",
- "deunicode",
  "either",
  "fst",
  "irg-kvariants",
  "jieba-rs",
  "lindera",
- "litemap",
  "once_cell",
  "pinyin",
  "serde",
@@ -954,7 +950,6 @@ dependencies = [
  "unicode-normalization",
  "wana_kana",
  "whatlang",
- "zerovec",
 ]
 
 [[package]]
@@ -1145,12 +1140,6 @@ version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
 
-[[package]]
-name = "cow-utils"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
-
 [[package]]
 name = "cpufeatures"
 version = "0.2.12"
@@ -1551,12 +1540,6 @@ dependencies = [
  "syn 2.0.60",
 ]
 
-[[package]]
-name = "deunicode"
-version = "1.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00"
-
 [[package]]
 name = "digest"
 version = "0.10.7"
@@ -2666,8 +2649,7 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
 [[package]]
 name = "irg-kvariants"
 version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26"
+source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#a95a9217265cee515708a679a2ed08ced1ac58a3"
 dependencies = [
  "csv",
  "once_cell",
@@ -3278,12 +3260,6 @@ dependencies = [
  "unicode-segmentation",
 ]
 
-[[package]]
-name = "litemap"
-version = "0.7.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
-
 [[package]]
 name = "lmdb-master-sys"
 version = "0.2.2"
@@ -6506,15 +6482,6 @@ dependencies = [
  "syn 2.0.60",
 ]
 
-[[package]]
-name = "zerovec"
-version = "0.10.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
-dependencies = [
- "zerofrom",
-]
-
 [[package]]
 name = "zip"
 version = "1.1.4"
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index e635bbcf4..1a81f6f8c 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -17,7 +17,7 @@ bincode = "1.3.3"
 bstr = "1.9.1"
 bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
 byteorder = "1.5.0"
-charabia = { version = "0.8.12", default-features = false }
+charabia = { git = "https://github.com/meilisearch/charabia.git", branch = "simplify-lang-detection", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.13"
 deserr = "0.6.2"
diff --git a/milli/src/index.rs b/milli/src/index.rs
index afe212f57..194f18faa 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1604,6 +1604,29 @@ impl Index {
         Ok(script_language)
     }
 
+    pub fn languages(&self, rtxn: &RoTxn<'_>) -> heed::Result<Vec<Language>> {
+        let mut script_language_doc_count: Vec<(Language, u64)> = Vec::new();
+        let mut total = 0;
+        for sl in self.script_language_docids.iter(rtxn)? {
+            let ((_script, language), docids) = sl?;
+
+            // keep only Languages that contains at least 1 document.
+            let remaining_documents_count = docids.len();
+            total += remaining_documents_count;
+            if remaining_documents_count > 0 {
+                script_language_doc_count.push((language, remaining_documents_count));
+            }
+        }
+
+        let threshold = total / 20; // 5% (arbitrary)
+
+        Ok(script_language_doc_count
+            .into_iter()
+            .filter(|(_, count)| *count > threshold)
+            .map(|(language, _)| language)
+            .collect())
+    }
+
     /// Put the embedding configs:
     /// 1. The name of the embedder
     /// 2. The configuration option for this embedder
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index f6a4a802c..78b7a0446 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -670,9 +670,9 @@ pub fn execute_search(
         tokbuilder.words_dict(dictionary);
     }
 
-    let script_lang_map = ctx.index.script_language(ctx.txn)?;
-    if !script_lang_map.is_empty() {
-        tokbuilder.allow_list(&script_lang_map);
+    let languages = ctx.index.languages(ctx.txn)?;
+    if !languages.is_empty() {
+        tokbuilder.allow_list(&languages);
     }
 
     let tokenizer = tokbuilder.build();
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 721d67e96..748a3886a 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -1,10 +1,9 @@
-use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::BufReader;
 use std::{io, mem, str};
 
-use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
+use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
 use obkv::{KvReader, KvWriterU16};
 use roaring::RoaringBitmap;
 use serde_json::Value;
@@ -12,11 +11,9 @@ use serde_json::Value;
 use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
 use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
-use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
 
-pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
-
 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
 ///
@@ -28,7 +25,7 @@ pub fn extract_docid_word_positions(
     indexer: GrenadParameters,
     settings_diff: &InnerIndexSettingsDiff,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
+) -> Result<grenad::Reader<BufReader<File>>> {
     let max_positions_per_attributes = max_positions_per_attributes
         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
@@ -36,7 +33,6 @@ pub fn extract_docid_word_positions(
 
     // initialize destination values.
     let mut documents_ids = RoaringBitmap::new();
-    let mut script_language_docids = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
         keep_latest_obkv,
@@ -109,9 +105,9 @@
             let (del, add): (Result<_>, Result<_>) = rayon::join(
                 || {
                     // deletions
-                    lang_safe_tokens_from_document(
+                    tokens_from_document(
                         &obkv,
-                        &settings_diff.old,
+                        &settings_diff.old.searchable_fields_ids,
                         &del_tokenizer,
                         max_positions_per_attributes,
                         DelAdd::Deletion,
@@ -120,9 +116,9 @@
                 },
                 || {
                     // additions
-                    lang_safe_tokens_from_document(
+                    tokens_from_document(
                         &obkv,
-                        &settings_diff.new,
+                        &settings_diff.new.searchable_fields_ids,
                         &add_tokenizer,
                         max_positions_per_attributes,
                         DelAdd::Addition,
@@ -131,8 +127,8 @@
                 },
             );
 
-            let (del_obkv, del_script_language_word_count) = del?;
-            let (add_obkv, add_script_language_word_count) = add?;
+            let del_obkv = del?;
+            let add_obkv = add?;
 
             // merge deletions and additions.
             // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
@@ -150,31 +146,10 @@
                 key_buffer.extend_from_slice(&field_id.to_be_bytes());
                 docid_word_positions_sorter.insert(&key_buffer, value)?;
             }
-
-            // update script_language_docids deletions.
-            for (script, languages_frequency) in del_script_language_word_count {
-                for (language, _) in languages_frequency {
-                    let entry = script_language_docids
-                        .entry((script, language))
-                        .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
-                    entry.0.push(document_id);
-                }
-            }
-
-            // update script_language_docids additions.
-            for (script, languages_frequency) in add_script_language_word_count {
-                for (language, _) in languages_frequency {
-                    let entry = script_language_docids
-                        .entry((script, language))
-                        .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
-                    entry.1.push(document_id);
-                }
-            }
         }
 
     // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
     sorter_into_reader(docid_word_positions_sorter, indexer)
-        .map(|reader| (reader, script_language_docids))
 }
 
 /// Check if any searchable fields of a document changed.
@@ -205,7 +180,7 @@ fn tokenizer_builder<'a>(
     stop_words: Option<&'a fst::Set<Vec<u8>>>,
     allowed_separators: Option<&'a [&str]>,
     dictionary: Option<&'a [&str]>,
-    script_language: Option<&'a HashMap<Script, Vec<Language>>>,
+    languages: Option<&'a Vec<Language>>,
 ) -> TokenizerBuilder<'a, Vec<u8>> {
     let mut tokenizer_builder = TokenizerBuilder::new();
     if let Some(stop_words) = stop_words {
@@ -218,81 +193,13 @@ fn tokenizer_builder<'a>(
         tokenizer_builder.separators(separators);
     }
 
-    if let Some(script_language) = script_language {
-        tokenizer_builder.allow_list(script_language);
+    if let Some(languages) = languages {
+        tokenizer_builder.allow_list(languages);
     }
 
     tokenizer_builder
 }
 
-/// Extract words mapped with their positions of a document,
-/// ensuring no Language detection mistakes was made.
-fn lang_safe_tokens_from_document<'a>(
-    obkv: &KvReader<'_, FieldId>,
-    settings: &InnerIndexSettings,
-    tokenizer: &Tokenizer<'_>,
-    max_positions_per_attributes: u32,
-    del_add: DelAdd,
-    buffers: &'a mut Buffers,
-) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
-    let mut script_language_word_count = HashMap::new();
-
-    tokens_from_document(
-        obkv,
-        &settings.searchable_fields_ids,
-        tokenizer,
-        max_positions_per_attributes,
-        del_add,
-        buffers,
-        &mut script_language_word_count,
-    )?;
-
-    // if we detect a potetial mistake in the language detection,
-    // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
-    // context: https://github.com/meilisearch/meilisearch/issues/3565
-    if script_language_word_count
-        .values()
-        .map(Vec::as_slice)
-        .any(potential_language_detection_error)
-    {
-        // build an allow list with the most frequent detected languages in the document.
-        let script_language: HashMap<_, _> =
-            script_language_word_count.iter().filter_map(most_frequent_languages).collect();
-
-        // if the allow list is empty, meaning that no Language is considered frequent,
-        // then we don't rerun the extraction.
-        if !script_language.is_empty() {
-            // build a new temporary tokenizer including the allow list.
-            let stop_words = settings.stop_words.as_ref();
-            let separators: Option<Vec<&str>> = settings
-                .allowed_separators
-                .as_ref()
-                .map(|s| s.iter().map(String::as_str).collect());
-            let dictionary: Option<Vec<&str>> =
-                settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-            let mut builder =
-                tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
-            let tokenizer = builder.build();
-
-            script_language_word_count.clear();
-
-            // rerun the extraction.
-            tokens_from_document(
-                obkv,
-                &settings.searchable_fields_ids,
-                &tokenizer,
-                max_positions_per_attributes,
-                del_add,
-                buffers,
-                &mut script_language_word_count,
-            )?;
-        }
-    }
-
-    // returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
-    Ok((&buffers.obkv_buffer, script_language_word_count))
-}
-
 /// Extract words mapped with their positions of a document.
 fn tokens_from_document<'a>(
     obkv: &KvReader<'a, FieldId>,
@@ -301,7 +208,6 @@
     max_positions_per_attributes: u32,
     del_add: DelAdd,
     buffers: &'a mut Buffers,
-    script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
 ) -> Result<&'a [u8]> {
     buffers.obkv_buffer.clear();
     let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
@@ -326,16 +232,6 @@ fn tokens_from_document<'a>(
             .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
         for (index, token) in tokens {
-            // if a language has been detected for the token, we update the counter.
-            if let Some(language) = token.language {
-                let script = token.script;
-                let entry = script_language_word_count.entry(script).or_default();
-                match entry.iter_mut().find(|(l, _)| *l == language) {
-                    Some((_, n)) => *n += 1,
-                    None => entry.push((language, 1)),
-                }
-            }
-
             // keep a word only if it is not empty and fit in a LMDB key.
             let token = token.lemma().trim();
             if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
@@ -423,39 +319,6 @@ fn process_tokens<'a>(
         .filter(|(_, t)| t.is_word())
 }
 
-fn potential_language_detection_error(languages_frequency: &[(Language, usize)]) -> bool {
-    if languages_frequency.len() > 1 {
-        let threshold = compute_language_frequency_threshold(languages_frequency);
-        languages_frequency.iter().any(|(_, c)| *c <= threshold)
-    } else {
-        false
-    }
-}
-
-fn most_frequent_languages(
-    (script, languages_frequency): (&Script, &Vec<(Language, usize)>),
-) -> Option<(Script, Vec<Language>)> {
-    if languages_frequency.len() > 1 {
-        let threshold = compute_language_frequency_threshold(languages_frequency);
-
-        let languages: Vec<_> =
-            languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect();
-
-        if languages.is_empty() {
-            None
-        } else {
-            Some((*script, languages))
-        }
-    } else {
-        None
-    }
-}
-
-fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
-    let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
-    total / 10 // 10% is a completely arbitrary value.
-}
-
 #[derive(Default)]
 struct Buffers {
     // the field buffer for each fields desserialization, and must be cleared between each field.
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 57d9d5e42..6c23a8da9 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -345,21 +345,17 @@ fn send_and_extract_flattened_documents_data(
     let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
-                let (docid_word_positions_chunk, script_language_pair) =
-                    extract_docid_word_positions(
-                        flattened_documents_chunk.clone(),
-                        indexer,
-                        &settings_diff,
-                        max_positions_per_attributes,
-                    )?;
+                let docid_word_positions_chunk = extract_docid_word_positions(
+                    flattened_documents_chunk.clone(),
+                    indexer,
+                    &settings_diff,
+                    max_positions_per_attributes,
+                )?;
 
                 // send docid_word_positions_chunk to DB writer
                 let docid_word_positions_chunk =
                     unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
 
-                let _ =
-                    lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
-
                 Ok(docid_word_positions_chunk)
             },
             || {
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index c5cf35ca8..9de95778b 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -1,10 +1,9 @@
-use std::collections::{BTreeSet, HashMap};
+use std::collections::BTreeSet;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::{self, BufReader};
 
 use bytemuck::allocation::pod_collect_to_vec;
-use charabia::{Language, Script};
 use grenad::{Merger, MergerBuilder};
 use heed::types::Bytes;
 use heed::{BytesDecode, RwTxn};
@@ -94,7 +93,6 @@ pub(crate) enum TypedChunk {
         add_to_user_provided: RoaringBitmap,
         remove_from_user_provided: RoaringBitmap,
     },
-    ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
 }
 
 impl TypedChunk {
@@ -113,8 +111,7 @@ impl TypedChunk {
             | (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_))
             | (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_))
             | (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_))
-            | (GeoPoints(_), GeoPoints(_))
-            | (ScriptLanguageDocids(_), ScriptLanguageDocids(_)) => true,
+            | (GeoPoints(_), GeoPoints(_)) => true,
             (
                 VectorPoints { embedder_name: left, expected_dimension: left_dim, .. },
                 VectorPoints { embedder_name: right, expected_dimension: right_dim, .. },
@@ -775,33 +772,6 @@ pub(crate) fn write_typed_chunk_into_index(
 
             tracing::debug!("Finished vector chunk for {}", embedder_name);
         }
-        TypedChunk::ScriptLanguageDocids(_) => {
-            let span = tracing::trace_span!(target: "indexing::write_db", "script_language_docids");
-            let _entered = span.enter();
-
-            for typed_chunk in typed_chunks {
-                let TypedChunk::ScriptLanguageDocids(sl_map) = typed_chunk else { unreachable!() };
-                for (key, (deletion, addition)) in sl_map {
-                    let mut db_key_exists = false;
-                    let final_value = match index.script_language_docids.get(wtxn, &key)? {
-                        Some(db_values) => {
-                            db_key_exists = true;
-                            (db_values - deletion) | addition
-                        }
-                        None => addition,
-                    };
-
-                    if final_value.is_empty() {
-                        // If the database entry exists, delete it.
-                        if db_key_exists {
-                            index.script_language_docids.delete(wtxn, &key)?;
-                        }
-                    } else {
-                        index.script_language_docids.put(wtxn, &key, &final_value)?;
-                    }
-                }
-            }
-        }
     }
 
     Ok((RoaringBitmap::new(), is_merged_database))
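
Note for reviewers: the core of this change is the new Index::languages helper, which replaces the per-document language-detection retry logic deleted from extract_docid_word_positions.rs. Below is a minimal, self-contained sketch of its 5% thresholding heuristic, using a stand-in Language enum and hypothetical document counts instead of charabia's types and the script_language_docids database; it illustrates the heuristic only and is not code from this patch.

/// Stand-in for charabia's `Language` type, used only for this sketch.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Language {
    Eng,
    Fra,
    Cmn,
}

/// Mirrors the heuristic of `Index::languages`: keep only the languages that
/// cover more than 5% of the counted documents, so a handful of misdetected
/// documents cannot force a language into the allow list.
fn frequent_languages(per_language_doc_count: &[(Language, u64)]) -> Vec<Language> {
    let total: u64 = per_language_doc_count.iter().map(|(_, count)| count).sum();
    let threshold = total / 20; // 5% (arbitrary), same constant as in the patch

    per_language_doc_count
        .iter()
        .filter(|(_, count)| *count > threshold)
        .map(|(language, _)| *language)
        .collect()
}

fn main() {
    // Hypothetical counts: 990 English and 950 French documents, plus 3
    // documents misdetected as Mandarin.
    let counts = [(Language::Eng, 990), (Language::Fra, 950), (Language::Cmn, 3)];
    // Only Eng and Fra pass the threshold, so the misdetection never reaches
    // the allow list handed to the tokenizer at search time.
    assert_eq!(frequent_languages(&counts), vec![Language::Eng, Language::Fra]);
}

At search time, execute_search now feeds the resulting Vec<Language> to tokbuilder.allow_list(&languages); this assumes the simplify-lang-detection branch of charabia accepts a list of Language values directly, in place of the HashMap<Script, Vec<Language>> required by charabia 0.8.12.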