Update charabia

2025-06-25 07:58:30 +02:00 · 2024-07-23 14:59:31 +02:00 · 2024-07-23 14:59:31 +02:00 · cc02920f2b
commit cc02920f2b
parent c26bd68de5
7 changed files with 49 additions and 230 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -934,19 +934,15 @@ dependencies = [
 [[package]]
 name = "charabia"
 version = "0.8.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
+source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#a95a9217265cee515708a679a2ed08ced1ac58a3"
 checksum = "9868a22f10dee80498a8a2b6c641d80bf28ea4495fcf71c2dc4836c2dd23958c"
 dependencies = [
 "aho-corasick",
 "cow-utils",
 "csv",
 "deunicode",
 "either",
 "fst",
 "irg-kvariants",
 "jieba-rs",
 "lindera",
 "litemap",
 "once_cell",
 "pinyin",
 "serde",
@ -954,7 +950,6 @@ dependencies = [
 "unicode-normalization",
 "wana_kana",
 "whatlang",
 "zerovec",
 ]
 [[package]]
@ -1145,12 +1140,6 @@ version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
 [[package]]
 name = "cow-utils"
 version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
 [[package]]
 name = "cpufeatures"
 version = "0.2.12"
@ -1551,12 +1540,6 @@ dependencies = [
 "syn 2.0.60",
 ]
 [[package]]
 name = "deunicode"
 version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00"
 [[package]]
 name = "digest"
 version = "0.10.7"
@ -2666,8 +2649,7 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
 [[package]]
 name = "irg-kvariants"
 version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
+source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#a95a9217265cee515708a679a2ed08ced1ac58a3"
 checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26"
 dependencies = [
 "csv",
 "once_cell",
@ -3278,12 +3260,6 @@ dependencies = [
 "unicode-segmentation",
 ]
 [[package]]
 name = "litemap"
 version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
 [[package]]
 name = "lmdb-master-sys"
 version = "0.2.2"
@ -6506,15 +6482,6 @@ dependencies = [
 "syn 2.0.60",
 ]
 [[package]]
 name = "zerovec"
 version = "0.10.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
 dependencies = [
 "zerofrom",
 ]
 [[package]]
 name = "zip"
 version = "1.1.4"
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@ -17,7 +17,7 @@ bincode = "1.3.3"
 bstr = "1.9.1"
 bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
 byteorder = "1.5.0"
-charabia = { version = "0.8.12", default-features = false }
+charabia = { git = "https://github.com/meilisearch/charabia.git", branch = "simplify-lang-detection", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.13"
 deserr = "0.6.2"
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -1604,6 +1604,29 @@ impl Index {
        Ok(script_language)
    }
    /// Lists the languages that account for a meaningful share of the indexed documents.
    ///
    /// Every `(script, language)` entry of `script_language_docids` adds its number of
    /// document ids to a running total; the language of an entry is returned only when
    /// that entry's count is strictly greater than 5% (`total / 20`) of the total.
    /// Entries are evaluated independently, so a language stored under several scripts
    /// can be returned more than once.
    pub fn languages(&self, rtxn: &RoTxn<'_>) -> heed::Result<Vec<Language>> {
        let mut total_docs = 0;
        let mut counts_by_language: Vec<(Language, u64)> = Vec::new();

        for entry in self.script_language_docids.iter(rtxn)? {
            let ((_script, language), docids) = entry?;
            let doc_count = docids.len();
            total_docs += doc_count;
            // entries without any document can never pass the threshold, skip them early.
            if doc_count != 0 {
                counts_by_language.push((language, doc_count));
            }
        }

        // 5% of the summed counts (arbitrary).
        let threshold = total_docs / 20;
        let languages = counts_by_language
            .into_iter()
            .filter_map(|(language, doc_count)| (doc_count > threshold).then_some(language))
            .collect();

        Ok(languages)
    }
    /// Put the embedding configs:
    /// 1. The name of the embedder
    /// 2. The configuration option for this embedder
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -670,9 +670,9 @@ pub fn execute_search(
            tokbuilder.words_dict(dictionary);
        }
-        let script_lang_map = ctx.index.script_language(ctx.txn)?;
+        let languages = ctx.index.languages(ctx.txn)?;
-        if !script_lang_map.is_empty() {
+        if !languages.is_empty() {
-            tokbuilder.allow_list(&script_lang_map);
+            tokbuilder.allow_list(&languages);
        }
        let tokenizer = tokbuilder.build();
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@ -1,10 +1,9 @@
 use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::BufReader;
 use std::{io, mem, str};
-use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
+use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
 use obkv::{KvReader, KvWriterU16};
 use roaring::RoaringBitmap;
 use serde_json::Value;
@ -12,11 +11,9 @@ use serde_json::Value;
 use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
 use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
-use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
 pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
 ///
@ -28,7 +25,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    indexer: GrenadParameters,
    settings_diff: &InnerIndexSettingsDiff,
    max_positions_per_attributes: Option<u32>,
-) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
+) -> Result<grenad::Reader<BufReader<File>>> {
    let max_positions_per_attributes = max_positions_per_attributes
        .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
    let max_memory = indexer.max_memory_by_thread();
@ -36,7 +33,6 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    // initialize destination values.
    let mut documents_ids = RoaringBitmap::new();
    let mut script_language_docids = HashMap::new();
    let mut docid_word_positions_sorter = create_sorter(
        grenad::SortAlgorithm::Stable,
        keep_latest_obkv,
@ -109,9 +105,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
        let (del, add): (Result<_>, Result<_>) = rayon::join(
            || {
                // deletions
-                lang_safe_tokens_from_document(
+                tokens_from_document(
                    &obkv,
-                    &settings_diff.old,
+                    &settings_diff.old.searchable_fields_ids,
                    &del_tokenizer,
                    max_positions_per_attributes,
                    DelAdd::Deletion,
@ -120,9 +116,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
            },
            || {
                // additions
-                lang_safe_tokens_from_document(
+                tokens_from_document(
                    &obkv,
-                    &settings_diff.new,
+                    &settings_diff.new.searchable_fields_ids,
                    &add_tokenizer,
                    max_positions_per_attributes,
                    DelAdd::Addition,
@ -131,8 +127,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
            },
        );
-        let (del_obkv, del_script_language_word_count) = del?;
+        let del_obkv = del?;
-        let (add_obkv, add_script_language_word_count) = add?;
+        let add_obkv = add?;
        // merge deletions and additions.
        // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
@ -150,31 +146,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
            key_buffer.extend_from_slice(&field_id.to_be_bytes());
            docid_word_positions_sorter.insert(&key_buffer, value)?;
        }
        // update script_language_docids deletions.
        for (script, languages_frequency) in del_script_language_word_count {
            for (language, _) in languages_frequency {
                let entry = script_language_docids
                    .entry((script, language))
                    .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
                entry.0.push(document_id);
            }
        }
        // update script_language_docids additions.
        for (script, languages_frequency) in add_script_language_word_count {
            for (language, _) in languages_frequency {
                let entry = script_language_docids
                    .entry((script, language))
                    .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
                entry.1.push(document_id);
            }
        }
    }
    // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
    sorter_into_reader(docid_word_positions_sorter, indexer)
        .map(|reader| (reader, script_language_docids))
 }
 /// Check if any searchable fields of a document changed.
@ -205,7 +180,7 @@ fn tokenizer_builder<'a>(
    stop_words: Option<&'a fst::Set<Vec<u8>>>,
    allowed_separators: Option<&'a [&str]>,
    dictionary: Option<&'a [&str]>,
-    script_language: Option<&'a HashMap<Script, Vec<Language>>>,
+    languages: Option<&'a Vec<Language>>,
 ) -> TokenizerBuilder<'a, Vec<u8>> {
    let mut tokenizer_builder = TokenizerBuilder::new();
    if let Some(stop_words) = stop_words {
@ -218,81 +193,13 @@ fn tokenizer_builder<'a>(
        tokenizer_builder.separators(separators);
    }
-    if let Some(script_language) = script_language {
+    if let Some(languages) = languages {
-        tokenizer_builder.allow_list(script_language);
+        tokenizer_builder.allow_list(languages);
    }
    tokenizer_builder
 }
 /// Extract words mapped with their positions of a document,
 /// ensuring no Language detection mistakes were made.
 ///
 /// The document is first tokenized with the provided `tokenizer` while counting,
 /// for each detected script, how many tokens were attributed to each language.
 /// If any script shows a potential detection error (see
 /// `potential_language_detection_error`), the counters are cleared and the
 /// extraction is rerun with a tokenizer rebuilt from `settings`.
 ///
 /// Returns the content of `buffers.obkv_buffer` together with the
 /// per-script language counters of the last extraction that was run.
 fn lang_safe_tokens_from_document<'a>(
    obkv: &KvReader<'_, FieldId>,
    settings: &InnerIndexSettings,
    tokenizer: &Tokenizer<'_>,
    max_positions_per_attributes: u32,
    del_add: DelAdd,
    buffers: &'a mut Buffers,
 ) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
    // number of tokens seen per (script, language), filled by `tokens_from_document`.
    let mut script_language_word_count = HashMap::new();
    // first pass, using the tokenizer provided by the caller.
    tokens_from_document(
        obkv,
        &settings.searchable_fields_ids,
        tokenizer,
        max_positions_per_attributes,
        del_add,
        buffers,
        &mut script_language_word_count,
    )?;
    // if we detect a potential mistake in the language detection,
    // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
    // context: https://github.com/meilisearch/meilisearch/issues/3565
    if script_language_word_count
        .values()
        .map(Vec::as_slice)
        .any(potential_language_detection_error)
    {
        // build an allow list with the most frequent detected languages in the document.
        let script_language: HashMap<_, _> =
            script_language_word_count.iter().filter_map(most_frequent_languages).collect();
        // if the allow list is empty, meaning that no Language is considered frequent,
        // then we don't rerun the extraction.
        if !script_language.is_empty() {
            // build a new temporary tokenizer including the allow list.
            let stop_words = settings.stop_words.as_ref();
            let separators: Option<Vec<_>> = settings
                .allowed_separators
                .as_ref()
                .map(|s| s.iter().map(String::as_str).collect());
            let dictionary: Option<Vec<_>> =
                settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
            // NOTE(review): the last argument is `None`, so the `script_language` allow list
            // built above is only used for the emptiness check and is not handed to the
            // tokenizer builder; confirm whether this is intended.
            let mut builder =
                tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
            let tokenizer = builder.build();
            // drop the counters of the first pass so they only reflect the rerun.
            script_language_word_count.clear();
            // rerun the extraction.
            tokens_from_document(
                obkv,
                &settings.searchable_fields_ids,
                &tokenizer,
                max_positions_per_attributes,
                del_add,
                buffers,
                &mut script_language_word_count,
            )?;
        }
    }
    // returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
    Ok((&buffers.obkv_buffer, script_language_word_count))
 }
 /// Extract words mapped with their positions of a document.
 fn tokens_from_document<'a>(
    obkv: &KvReader<'a, FieldId>,
@ -301,7 +208,6 @@ fn tokens_from_document<'a>(
    max_positions_per_attributes: u32,
    del_add: DelAdd,
    buffers: &'a mut Buffers,
    script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
 ) -> Result<&'a [u8]> {
    buffers.obkv_buffer.clear();
    let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
@ -326,16 +232,6 @@ fn tokens_from_document<'a>(
                        .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
                    for (index, token) in tokens {
                        // if a language has been detected for the token, we update the counter.
                        if let Some(language) = token.language {
                            let script = token.script;
                            let entry = script_language_word_count.entry(script).or_default();
                            match entry.iter_mut().find(|(l, _)| *l == language) {
                                Some((_, n)) => *n += 1,
                                None => entry.push((language, 1)),
                            }
                        }
                        // keep a word only if it is not empty and fit in a LMDB key.
                        let token = token.lemma().trim();
                        if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
@ -423,39 +319,6 @@ fn process_tokens<'a>(
        .filter(|(_, t)| t.is_word())
 }
 /// Tells whether a list of language frequencies looks like a detection mistake.
 ///
 /// Returns `true` when more than one language is listed and at least one of them
 /// has a count lower than or equal to the threshold computed by
 /// `compute_language_frequency_threshold`; returns `false` otherwise.
 fn potential_language_detection_error(languages_frequency: &[(Language, usize)]) -> bool {
    // with zero or one language there is nothing to compare against.
    if languages_frequency.len() <= 1 {
        return false;
    }

    let threshold = compute_language_frequency_threshold(languages_frequency);
    languages_frequency.iter().any(|&(_, count)| count <= threshold)
 }
 /// Keeps, for one script, the languages whose count is strictly above the
 /// threshold computed by `compute_language_frequency_threshold`.
 ///
 /// Returns `None` when the script has at most one language or when no language
 /// passes the threshold; otherwise returns the script paired with the kept languages.
 fn most_frequent_languages(
    (script, languages_frequency): (&Script, &Vec<(Language, usize)>),
 ) -> Option<(Script, Vec<Language>)> {
    if languages_frequency.len() <= 1 {
        return None;
    }

    let threshold = compute_language_frequency_threshold(languages_frequency);
    let mut frequent = Vec::new();
    for (language, count) in languages_frequency {
        if *count > threshold {
            frequent.push(*language);
        }
    }

    // an empty selection means no language is frequent enough to be kept.
    (!frequent.is_empty()).then(|| (*script, frequent))
 }
 /// Computes the frequency threshold of a list of language counts:
 /// one tenth (integer division) of the sum of all counts.
 fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
    let total = languages_frequency.iter().fold(0usize, |acc, (_, count)| acc + count);
    // 10% is a completely arbitrary value.
    total / 10
 }
 #[derive(Default)]
 struct Buffers {
    // the field buffer used for each field's deserialization; it must be cleared between each field.
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@ -345,8 +345,7 @@ fn send_and_extract_flattened_documents_data(
    let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
        rayon::join(
            || {
-                let (docid_word_positions_chunk, script_language_pair) =
+                let docid_word_positions_chunk = extract_docid_word_positions(
                    extract_docid_word_positions(
                    flattened_documents_chunk.clone(),
                    indexer,
                    &settings_diff,
@ -357,9 +356,6 @@ fn send_and_extract_flattened_documents_data(
                let docid_word_positions_chunk =
                    unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
                let _ =
                    lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
                Ok(docid_word_positions_chunk)
            },
            || {
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@ -1,10 +1,9 @@
-use std::collections::{BTreeSet, HashMap};
+use std::collections::BTreeSet;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::{self, BufReader};
 use bytemuck::allocation::pod_collect_to_vec;
 use charabia::{Language, Script};
 use grenad::{Merger, MergerBuilder};
 use heed::types::Bytes;
 use heed::{BytesDecode, RwTxn};
@ -94,7 +93,6 @@ pub(crate) enum TypedChunk {
        add_to_user_provided: RoaringBitmap,
        remove_from_user_provided: RoaringBitmap,
    },
    ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
 }
 impl TypedChunk {
@ -113,8 +111,7 @@ impl TypedChunk {
            | (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_))
            | (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_))
            | (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_))
-            | (GeoPoints(_), GeoPoints(_))
+            | (GeoPoints(_), GeoPoints(_)) => true,
            | (ScriptLanguageDocids(_), ScriptLanguageDocids(_)) => true,
            (
                VectorPoints { embedder_name: left, expected_dimension: left_dim, .. },
                VectorPoints { embedder_name: right, expected_dimension: right_dim, .. },
@ -775,33 +772,6 @@ pub(crate) fn write_typed_chunk_into_index(
            tracing::debug!("Finished vector chunk for {}", embedder_name);
        }
        TypedChunk::ScriptLanguageDocids(_) => {
            let span = tracing::trace_span!(target: "indexing::write_db", "script_language_docids");
            let _entered = span.enter();
            for typed_chunk in typed_chunks {
                let TypedChunk::ScriptLanguageDocids(sl_map) = typed_chunk else { unreachable!() };
                for (key, (deletion, addition)) in sl_map {
                    let mut db_key_exists = false;
                    let final_value = match index.script_language_docids.get(wtxn, &key)? {
                        Some(db_values) => {
                            db_key_exists = true;
                            (db_values - deletion) | addition
                        }
                        None => addition,
                    };
                    if final_value.is_empty() {
                        // If the database entry exists, delete it.
                        if db_key_exists {
                            index.script_language_docids.delete(wtxn, &key)?;
                        }
                    } else {
                        index.script_language_docids.put(wtxn, &key, &final_value)?;
                    }
                }
            }
        }
    }
    Ok((RoaringBitmap::new(), is_merged_database))