From c45d1e3610731746eb1ade02652d18580847e89e Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Wed, 12 Oct 2022 06:21:35 -0500
Subject: [PATCH 01/21] Create a new database on index and add a specialized
 codec for it

---
 milli/src/heed_codec/mod.rs                   |  2 +
 milli/src/heed_codec/script_language_codec.rs | 43 +++++++++++++++++++
 milli/src/index.rs                            |  7 +++
 3 files changed, 52 insertions(+)
 create mode 100644 milli/src/heed_codec/script_language_codec.rs
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index 702dcf661..2ac130f48 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -8,6 +8,7 @@ mod roaring_bitmap_length;
 mod str_beu32_codec;
 mod str_ref;
 mod str_str_u8_codec;
+mod script_language_codec;
 
 pub use byte_slice_ref::ByteSliceRefCodec;
 pub use str_ref::StrRefCodec;
@@ -21,3 +22,4 @@ pub use self::roaring_bitmap_length::{
 };
 pub use self::str_beu32_codec::StrBEU32Codec;
 pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
+pub use self::script_language_codec::ScriptLanguageCodec;
diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs
new file mode 100644
index 000000000..af15990ea
--- /dev/null
+++ b/milli/src/heed_codec/script_language_codec.rs
@@ -0,0 +1,43 @@
+use std::borrow::Cow;
+
+use std::mem::size_of;
+use std::str;
+
+use charabia::{Language, Script};
+
+pub struct ScriptLanguageCodec;
+
+impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
+    type DItem = (Script, Language);
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let footer_len = size_of::<u32>();
+
+        if bytes.len() < footer_len {
+            return None;
+        }
+
+        let (script, bytes) = bytes.split_at(bytes.len() - footer_len);
+        let script = str::from_utf8(script).ok()?;
+        let script_name = Script::from_name(script);
+        let lan = str::from_utf8(bytes).ok()?;
+        let lan_name = Language::from_name(lan);
+
+        Some((script_name, lan_name))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
+    type EItem = (Script, Language);
+
+    fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> {
+        let script_name = script.name();
+        let lan_name = lan.name();
+
+        let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len());
+        bytes.extend_from_slice(script_name.as_bytes());
+        bytes.extend_from_slice(lan_name.as_bytes());
+
+        Some(Cow::Owned(bytes))
+    }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 3f7ef14e6..76bc273d9 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -14,6 +14,7 @@ use time::OffsetDateTime;
 use crate::error::{InternalError, UserError};
 use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
+use crate::heed_codec::ScriptLanguageCodec;
 use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
     FieldIdCodec, OrderedF64Codec,
@@ -83,6 +84,7 @@ pub mod db_name {
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
     pub const DOCUMENTS: &str = "documents";
+    pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
 }
 
 #[derive(Clone)]
@@ -122,6 +124,9 @@ pub struct Index {
     /// Maps the position of a word prefix with all the docids where this prefix appears.
     pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
 
+    /// Maps the script and language with all the docids that corresponds to it. 
+    pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
+
     /// Maps the facet field id and the docids for which this field exists
     pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
 
@@ -159,6 +164,7 @@ impl Index {
         let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?;
         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
+        let script_language_docids = env.create_database(Some(SCRIPT_LANGUAGE_DOCIDS))?;
         let word_prefix_pair_proximity_docids =
             env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
         let prefix_word_pair_proximity_docids =
@@ -186,6 +192,7 @@ impl Index {
             exact_word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
+            script_language_docids,
             word_prefix_pair_proximity_docids,
             prefix_word_pair_proximity_docids,
             word_position_docids,

From d97fb6117ea086080bde016115f2acdd46c46702 Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Wed, 12 Oct 2022 06:24:56 -0500
Subject: [PATCH 02/21] Extract and index data

---
 .../extract/extract_docid_word_positions.rs   | 15 ++++++++---
 .../src/update/index_documents/extract/mod.rs |  4 ++-
 .../src/update/index_documents/typed_chunk.rs | 27 ++++++++++++++++---
 3 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index be9b479bb..66b2c768b 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -1,9 +1,9 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
 use std::{io, mem, str};
 
-use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
+use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder};
 use roaring::RoaringBitmap;
 use serde_json::Value;
 
@@ -25,12 +25,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     searchable_fields: &Option<HashSet<FieldId>>,
     stop_words: Option<&fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
+) -> Result<(RoaringBitmap, grenad::Reader<File>, HashMap<(Script, Language), RoaringBitmap>)> {
     let max_positions_per_attributes = max_positions_per_attributes
         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
 
     let mut documents_ids = RoaringBitmap::new();
+    let mut script_language_pair = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
         concat_u32s_array,
@@ -70,6 +71,12 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
                     for (index, token) in tokens {
+                        let script = token.script;
+                        let language = token.language.unwrap_or_default();
+                        let entry = script_language_pair
+                            .entry((script, language))
+                            .or_insert_with(RoaringBitmap::new);
+                        entry.push(document_id);
                         let token = token.lemma().trim();
                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                             key_buffer.truncate(mem::size_of::<u32>());
@@ -88,7 +95,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         }
     }
 
-    sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader))
+    sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader, script_language_pair))
 }
 
 /// Transform a JSON value into a string that can be indexed.
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index e696ed44b..540b8993b 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -257,7 +257,7 @@ fn send_and_extract_flattened_documents_data(
     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
-                let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions(
+                let (documents_ids, docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions(
                     flattened_documents_chunk.clone(),
                     indexer,
                     searchable_fields,
@@ -274,6 +274,8 @@ fn send_and_extract_flattened_documents_data(
                 let _ = lmdb_writer_sx
                     .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
 
+                let _ = lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
+
                 Ok(docid_word_positions_chunk)
             },
             || {
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 16784bd92..920971eec 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -1,8 +1,10 @@
 use std::borrow::Cow;
+use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io;
 
+use charabia::{Language, Script};
 use grenad::MergerBuilder;
 use heed::types::ByteSlice;
 use heed::{BytesDecode, RwTxn};
@@ -16,10 +18,7 @@ use super::{ClonableMmap, MergeFn};
 use crate::facet::FacetType;
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::as_cloneable_grenad;
-use crate::{
-    lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
-    Result,
-};
+use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, lat_lng_to_xyz};
 
 pub(crate) enum TypedChunk {
     DocidWordPositions(grenad::Reader<CursorClonableMmap>),
@@ -38,6 +37,7 @@ pub(crate) enum TypedChunk {
     FieldIdFacetNumberDocids(grenad::Reader<File>),
     FieldIdFacetExistsDocids(grenad::Reader<File>),
     GeoPoints(grenad::Reader<File>),
+    ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>)
 }
 
 /// Write typed chunk in the corresponding LMDB database of the provided index.
@@ -210,6 +210,25 @@ pub(crate) fn write_typed_chunk_into_index(
             index.put_geo_rtree(wtxn, &rtree)?;
             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
         }
+        TypedChunk::ScriptLanguageDocids(hash_pair) => {
+            let mut buffer = Vec::new();
+            for (key, value) in hash_pair {
+                buffer.clear();
+                let final_value = match index.script_language_docids.get(wtxn, &key)? {
+                    Some(db_values) => {
+                        let mut db_value_buffer = Vec::new();
+                        serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
+                        let mut new_value_buffer = Vec::new();
+                        serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
+                        merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
+                        let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?;
+                        merged_db_values
+                    }
+                    None => value
+                };
+                index.script_language_docids.put(wtxn, &key, &final_value)?;
+            }
+        } 
     }
 
     Ok((RoaringBitmap::new(), is_merged_database))

From b216ddba63144ca0aba71d8d1707ea1109b96f12 Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Wed, 12 Oct 2022 06:28:36 -0500
Subject: [PATCH 03/21] Delete and clear data from the new database

---
 milli/src/update/clear_documents.rs  | 2 ++
 milli/src/update/delete_documents.rs | 1 +
 2 files changed, 3 insertions(+)

diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index adeea11fa..0296bc192 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             word_position_docids,
             field_id_word_count_docids,
             word_prefix_position_docids,
+            script_language_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
             facet_id_exists_docids,
@@ -82,6 +83,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_position_docids.clear(self.wtxn)?;
         field_id_word_count_docids.clear(self.wtxn)?;
         word_prefix_position_docids.clear(self.wtxn)?;
+        script_language_docids.clear(self.wtxn)?;
         facet_id_f64_docids.clear(self.wtxn)?;
         facet_id_exists_docids.clear(self.wtxn)?;
         facet_id_string_docids.clear(self.wtxn)?;
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index f4a6d396e..d4fff68c2 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -243,6 +243,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             facet_id_string_docids: _,
             field_id_docid_facet_f64s: _,
             field_id_docid_facet_strings: _,
+            script_language_docids,
             facet_id_exists_docids,
             documents,
         } = self.index;

From a27f329e3a86d438b47d229aaeb1362092887fbe Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Fri, 14 Oct 2022 14:05:53 -0500
Subject: [PATCH 04/21] Add tests for checking that detected script and
 language associated with document(s) were stored during indexing

---
 milli/src/index.rs                      |  7 +++++
 milli/src/update/index_documents/mod.rs | 35 +++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 76bc273d9..03f16a126 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -4,6 +4,7 @@ use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
 
+use charabia::{Language, Script};
 use heed::flags::Flags;
 use heed::types::*;
 use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
@@ -1194,6 +1195,12 @@ impl Index {
     pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result<bool> {
         self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS)
     }
+
+    /* script  language docids */
+    /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any.
+    pub fn script_language_documents_ids(&self, rtxn: &RoTxn, key: &(Script, Language)) -> heed::Result<Option<RoaringBitmap>> {
+        self.script_language_docids.get(rtxn, key)
+    }
 }
 
 #[cfg(test)]
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index be97defbd..f736cb55e 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1907,4 +1907,39 @@ mod tests {
 
         index.add_documents(doc1).unwrap();
     }
+
+    #[cfg(feature = "default")]
+    #[test]
+    fn store_detected_script_and_language_per_document_during_indexing() {
+        use charabia::{Language, Script};
+        let index = TempIndex::new();
+        index
+            .add_documents(documents!([
+                { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+                { "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
+                { "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
+                { "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" },
+                { "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" },
+                { "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" },
+            ]))
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let key_thai = (Script::Thai, Language::Other);
+        let key_jpn = (Script::Cj, Language::Jpn);
+        let key_cmn = (Script::Cj, Language::Cmn);
+        let thai_docs = index.script_language_documents_ids(&rtxn, &key_thai).unwrap().unwrap();
+        let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap();
+        let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap();
+        let mut expected_thai_docids = RoaringBitmap::new();
+        expected_thai_docids.push(4);
+        assert_eq!(thai_docs, expected_thai_docids);
+        let mut expected_cj_jpn_docids = RoaringBitmap::new();
+        expected_cj_jpn_docids.push(3);
+        assert_eq!(cj_jpn_docs, expected_cj_jpn_docids);
+        let mut expected_cj_cmn_docids = RoaringBitmap::new();
+        expected_cj_cmn_docids.push(1);
+        expected_cj_cmn_docids.push(5);
+        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
+    }
 }

From 34d04f3d3f94adb51aecca199f2084f30a3128d0 Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Fri, 14 Oct 2022 16:22:42 -0500
Subject: [PATCH 05/21] Filter soft-deleted documents out of
 script_language_docids lookups

---
 milli/src/index.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 03f16a126..dc9cb7994 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1199,7 +1199,9 @@ impl Index {
     /* script  language docids */
     /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any.
     pub fn script_language_documents_ids(&self, rtxn: &RoTxn, key: &(Script, Language)) -> heed::Result<Option<RoaringBitmap>> {
-        self.script_language_docids.get(rtxn, key)
+        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
+        let doc_ids = self.script_language_docids.get(rtxn, key)?;
+        Ok(doc_ids.map(|ids| ids - soft_deleted_documents))
     }
 }
 

From 369c05732e79dbd602c408b7947672bbc8e23b46 Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Fri, 14 Oct 2022 16:25:09 -0500
Subject: [PATCH 06/21] Add test checking that deleted docids are removed
 from the script_language_docids database

---
 milli/src/update/delete_documents.rs | 45 ++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index d4fff68c2..7e9b11592 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -500,6 +500,22 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             .execute(self.wtxn)?;
         }
 
+        // Remove the documents ids from the script language database.
+        let mut iter = script_language_docids.iter_mut(self.wtxn)?;
+        while let Some((key, mut docids)) = iter.next().transpose()? {
+            let previous_len = docids.len();
+            docids -= &self.to_delete_docids;
+            if docids.is_empty() {
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.del_current()? };
+            } else if docids.len() != previous_len {
+                let key = key.to_owned();
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.put_current(&key, &docids)? };
+            }
+        }
+
+        drop(iter);
         // We delete the documents ids that are under the facet field id values.
         remove_docids_from_facet_id_exists_docids(
             self.wtxn,
@@ -1167,4 +1183,33 @@ mod tests {
         stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard);
         stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft);
     }
+
+    #[test]
+    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
+        use charabia::{Language, Script};
+        let index = TempIndex::new();
+        let mut wtxn = index.write_txn().unwrap();
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+                { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
+                { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
+                { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
+                { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
+                { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
+            ]))
+            .unwrap();
+
+        delete_documents(&mut wtxn, &index, &["1"]);
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let key_cmn = (Script::Cj, Language::Cmn);
+        let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap();
+        let mut expected_cj_cmn_docids = RoaringBitmap::new();
+        expected_cj_cmn_docids.push(5);
+        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
+    }
 }

From fd60a39f1c4d56941cb3cd3f58d8590096458545 Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Mon, 17 Oct 2022 06:51:04 -0500
Subject: [PATCH 07/21] Format code

---
 milli/src/heed_codec/mod.rs                    |  4 ++--
 milli/src/index.rs                             | 10 +++++++---
 .../extract/extract_docid_word_positions.rs    |  3 ++-
 .../src/update/index_documents/extract/mod.rs  | 18 ++++++++++--------
 .../src/update/index_documents/typed_chunk.rs  | 11 +++++++----
 5 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index 2ac130f48..f3ca5f0d1 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -5,10 +5,10 @@ mod field_id_word_count_codec;
 mod obkv_codec;
 mod roaring_bitmap;
 mod roaring_bitmap_length;
+mod script_language_codec;
 mod str_beu32_codec;
 mod str_ref;
 mod str_str_u8_codec;
-mod script_language_codec;
 
 pub use byte_slice_ref::ByteSliceRefCodec;
 pub use str_ref::StrRefCodec;
@@ -20,6 +20,6 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar
 pub use self::roaring_bitmap_length::{
     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
 };
+pub use self::script_language_codec::ScriptLanguageCodec;
 pub use self::str_beu32_codec::StrBEU32Codec;
 pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
-pub use self::script_language_codec::ScriptLanguageCodec;
diff --git a/milli/src/index.rs b/milli/src/index.rs
index dc9cb7994..ef26fc305 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -15,12 +15,12 @@ use time::OffsetDateTime;
 use crate::error::{InternalError, UserError};
 use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
-use crate::heed_codec::ScriptLanguageCodec;
 use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
     FieldIdCodec, OrderedF64Codec,
 };
 use crate::heed_codec::StrRefCodec;
+use crate::heed_codec::ScriptLanguageCodec;
 use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
@@ -125,7 +125,7 @@ pub struct Index {
     /// Maps the position of a word prefix with all the docids where this prefix appears.
     pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
 
-    /// Maps the script and language with all the docids that corresponds to it. 
+    /// Maps the script and language with all the docids that corresponds to it.
     pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
 
     /// Maps the facet field id and the docids for which this field exists
@@ -1198,7 +1198,11 @@ impl Index {
 
     /* script  language docids */
     /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any.
-    pub fn script_language_documents_ids(&self, rtxn: &RoTxn, key: &(Script, Language)) -> heed::Result<Option<RoaringBitmap>> {
+    pub fn script_language_documents_ids(
+        &self,
+        rtxn: &RoTxn,
+        key: &(Script, Language),
+    ) -> heed::Result<Option<RoaringBitmap>> {
         let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
         let doc_ids = self.script_language_docids.get(rtxn, key)?;
         Ok(doc_ids.map(|ids| ids - soft_deleted_documents))
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 66b2c768b..8a9f7e04f 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -95,7 +95,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         }
     }
 
-    sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader, script_language_pair))
+    sorter_into_reader(docid_word_positions_sorter, indexer)
+        .map(|reader| (documents_ids, reader, script_language_pair))
 }
 
 /// Transform a JSON value into a string that can be indexed.
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 540b8993b..f38bdd497 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -257,13 +257,14 @@ fn send_and_extract_flattened_documents_data(
     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
-                let (documents_ids, docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions(
-                    flattened_documents_chunk.clone(),
-                    indexer,
-                    searchable_fields,
-                    stop_words.as_ref(),
-                    max_positions_per_attributes,
-                )?;
+                let (documents_ids, docid_word_positions_chunk, script_language_pair) =
+                    extract_docid_word_positions(
+                        flattened_documents_chunk.clone(),
+                        indexer.clone(),
+                        searchable_fields,
+                        stop_words.as_ref(),
+                        max_positions_per_attributes,
+                    )?;
 
                 // send documents_ids to DB writer
                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));
@@ -274,7 +275,8 @@ fn send_and_extract_flattened_documents_data(
                 let _ = lmdb_writer_sx
                     .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
 
-                let _ = lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
+                let _ =
+                    lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
 
                 Ok(docid_word_positions_chunk)
             },
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 920971eec..35f09c051 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -18,7 +18,10 @@ use super::{ClonableMmap, MergeFn};
 use crate::facet::FacetType;
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::as_cloneable_grenad;
-use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, lat_lng_to_xyz};
+use crate::{
+    lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
+    Result,
+};
 
 pub(crate) enum TypedChunk {
     DocidWordPositions(grenad::Reader<CursorClonableMmap>),
@@ -37,7 +40,7 @@ pub(crate) enum TypedChunk {
     FieldIdFacetNumberDocids(grenad::Reader<File>),
     FieldIdFacetExistsDocids(grenad::Reader<File>),
     GeoPoints(grenad::Reader<File>),
-    ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>)
+    ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
 }
 
 /// Write typed chunk in the corresponding LMDB database of the provided index.
@@ -224,11 +227,11 @@ pub(crate) fn write_typed_chunk_into_index(
                         let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?;
                         merged_db_values
                     }
-                    None => value
+                    None => value,
                 };
                 index.script_language_docids.put(wtxn, &key, &final_value)?;
             }
-        } 
+        }
     }
 
     Ok((RoaringBitmap::new(), is_merged_database))

From 2d58b28f43ab45d6d0cd1e9c04f9450ec256f1d0 Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Wed, 19 Oct 2022 07:03:46 -0500
Subject: [PATCH 08/21] Improve script language codec

---
 milli/src/heed_codec/script_language_codec.rs | 25 ++++++++-----------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs
index af15990ea..7e150723a 100644
--- a/milli/src/heed_codec/script_language_codec.rs
+++ b/milli/src/heed_codec/script_language_codec.rs
@@ -1,6 +1,5 @@
 use std::borrow::Cow;
 
-use std::mem::size_of;
 use std::str;
 
 use charabia::{Language, Script};
@@ -11,16 +10,11 @@ impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
     type DItem = (Script, Language);
 
     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
-        let footer_len = size_of::<u32>();
-
-        if bytes.len() < footer_len {
-            return None;
-        }
-
-        let (script, bytes) = bytes.split_at(bytes.len() - footer_len);
-        let script = str::from_utf8(script).ok()?;
+        let sep = bytes.iter().position(|b| *b == 0)?;
+        let (s_bytes, l_bytes) = bytes.split_at(sep);
+        let script = str::from_utf8(s_bytes).ok()?;
         let script_name = Script::from_name(script);
-        let lan = str::from_utf8(bytes).ok()?;
+        let lan = str::from_utf8(&l_bytes[1..]).ok()?; // skip the '\0' separator
         let lan_name = Language::from_name(lan);
 
         Some((script_name, lan_name))
@@ -31,12 +25,13 @@ impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
     type EItem = (Script, Language);
 
     fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> {
-        let script_name = script.name();
-        let lan_name = lan.name();
+        let script_name = script.name().as_bytes();
+        let lan_name = lan.name().as_bytes();
 
-        let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len());
-        bytes.extend_from_slice(script_name.as_bytes());
-        bytes.extend_from_slice(lan_name.as_bytes());
+        let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len() + 1);
+        bytes.extend_from_slice(script_name);
+        bytes.push(0);
+        bytes.extend_from_slice(lan_name);
 
         Some(Cow::Owned(bytes))
     }

From d8207356f416e018e4e3bf7969c9e27120884335 Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Wed, 19 Oct 2022 07:09:05 -0500
Subject: [PATCH 09/21] Skip script,language insertion if language is
 undetected

---
 .../extract/extract_docid_word_positions.rs         | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 8a9f7e04f..e091df6b8 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -71,12 +71,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
                     for (index, token) in tokens {
-                        let script = token.script;
-                        let language = token.language.unwrap_or_default();
-                        let entry = script_language_pair
-                            .entry((script, language))
-                            .or_insert_with(RoaringBitmap::new);
-                        entry.push(document_id);
+                        if let Some(language) = token.language {
+                            let script = token.script;
+                            let entry = script_language_pair
+                                .entry((script, language))
+                                .or_insert_with(RoaringBitmap::new);
+                            entry.push(document_id);
+                        }
                         let token = token.lemma().trim();
                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                             key_buffer.truncate(mem::size_of::<u32>());

From 50bc1562571d2e3710ad9495e3b74ddb1de0e3da Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Wed, 19 Oct 2022 07:13:10 -0500
Subject: [PATCH 10/21] Fix tests

---
 milli/src/update/index_documents/mod.rs | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index f736cb55e..2f06558f2 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1925,21 +1925,13 @@ mod tests {
             .unwrap();
 
         let rtxn = index.read_txn().unwrap();
-        let key_thai = (Script::Thai, Language::Other);
         let key_jpn = (Script::Cj, Language::Jpn);
         let key_cmn = (Script::Cj, Language::Cmn);
-        let thai_docs = index.script_language_documents_ids(&rtxn, &key_thai).unwrap().unwrap();
         let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap();
         let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap();
-        let mut expected_thai_docids = RoaringBitmap::new();
-        expected_thai_docids.push(4);
-        assert_eq!(thai_docs, expected_thai_docids);
-        let mut expected_cj_jpn_docids = RoaringBitmap::new();
-        expected_cj_jpn_docids.push(3);
+        let expected_cj_jpn_docids = [3].iter().collect();
         assert_eq!(cj_jpn_docs, expected_cj_jpn_docids);
-        let mut expected_cj_cmn_docids = RoaringBitmap::new();
-        expected_cj_cmn_docids.push(1);
-        expected_cj_cmn_docids.push(5);
+        let expected_cj_cmn_docids = [1,5].iter().collect();
         assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
     }
 }

From 7681be5367b99b59cfde97d0b9809d077cbf6b67 Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Wed, 19 Oct 2022 07:18:11 -0500
Subject: [PATCH 11/21] Format code

---
 milli/src/update/index_documents/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 2f06558f2..b41892f7d 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1931,7 +1931,7 @@ mod tests {
         let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap();
         let expected_cj_jpn_docids = [3].iter().collect();
         assert_eq!(cj_jpn_docs, expected_cj_jpn_docids);
-        let expected_cj_cmn_docids = [1,5].iter().collect();
+        let expected_cj_cmn_docids = [1, 5].iter().collect();
         assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
     }
 }

From 2922c5c8997e305b0370e006f12c78b8a5161993 Mon Sep 17 00:00:00 2001
From: f3r10 <frledesma@outlook.com>
Date: Wed, 19 Oct 2022 10:32:22 -0500
Subject: [PATCH 12/21] Fix code format

---
 milli/src/heed_codec/script_language_codec.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs
index 7e150723a..b79ba4897 100644
--- a/milli/src/heed_codec/script_language_codec.rs
+++ b/milli/src/heed_codec/script_language_codec.rs
@@ -1,5 +1,4 @@
 use std::borrow::Cow;
-
 use std::str;
 
 use charabia::{Language, Script};

From f4569b04ad086479c557a116f90b4edc87f1d86a Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 1 Feb 2023 15:23:25 +0100
Subject: [PATCH 13/21] Update Charabia version

---
 Cargo.lock       | 14 ++++++++++++--
 milli/Cargo.toml |  3 ++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index dd7d828da..a894cb8c6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -666,14 +666,14 @@ dependencies = [
 [[package]]
 name = "charabia"
 version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b57f9571f611796ea38e5a9c12e5ce37476f70397b032757f8dfe0c7b9bc5637"
+source = "git+https://github.com/meilisearch/charabia?branch=fix-script-lang-serialization#c5efba56d433ff783e162009e020baba322afde0"
 dependencies = [
  "cow-utils",
  "csv",
  "deunicode",
  "fst",
  "jieba-rs",
+ "kvariants",
  "lindera",
  "once_cell",
  "pinyin",
@@ -2074,6 +2074,16 @@ dependencies = [
  "simple_asn1",
 ]
 
+[[package]]
+name = "kvariants"
+version = "0.1.0"
+source = "git+https://github.com/meilisearch/charabia?branch=fix-script-lang-serialization#c5efba56d433ff783e162009e020baba322afde0"
+dependencies = [
+ "csv",
+ "once_cell",
+ "serde",
+]
+
 [[package]]
 name = "language-tags"
 version = "0.3.2"
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index b32592ab9..b6449c5db 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -9,7 +9,8 @@ bimap = { version = "0.6.2", features = ["serde"] }
 bincode = "1.3.3"
 bstr = "1.0.1"
 byteorder = "1.4.3"
-charabia = { version = "0.7.0", default-features = false }
+# charabia = { version = "0.7.0", default-features = false }
+charabia = { git = "https://github.com/meilisearch/charabia", branch = "fix-script-lang-serialization", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.6"
 deserr = "0.1.4"

From 77d32d0ee811a19ca0d9be962ca981ef8278b4ca Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 1 Feb 2023 15:24:49 +0100
Subject: [PATCH 14/21] Fix codec deserialization

---
 milli/src/heed_codec/script_language_codec.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs
index b79ba4897..83e8a7241 100644
--- a/milli/src/heed_codec/script_language_codec.rs
+++ b/milli/src/heed_codec/script_language_codec.rs
@@ -14,7 +14,8 @@ impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
         let script = str::from_utf8(s_bytes).ok()?;
         let script_name = Script::from_name(script);
         let lan = str::from_utf8(l_bytes).ok()?;
-        let lan_name = Language::from_name(lan);
+        // skip '\0' byte between the two strings.
+        let lan_name = Language::from_name(&lan[1..]);
 
         Some((script_name, lan_name))
     }

From 064158e4e29077749b8eb18dd87a706ec71908e5 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 1 Feb 2023 15:34:01 +0100
Subject: [PATCH 15/21] Update test

---
 milli/src/heed_codec/mod.rs          |  1 -
 milli/src/index.rs                   |  5 ++---
 milli/src/update/delete_documents.rs | 29 +++++++++++++++++++++++-----
 3 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index f3ca5f0d1..a4df63e22 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -23,4 +23,3 @@ pub use self::roaring_bitmap_length::{
 pub use self::script_language_codec::ScriptLanguageCodec;
 pub use self::str_beu32_codec::StrBEU32Codec;
 pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
-pub use self::script_language_codec::ScriptLanguageCodec;
diff --git a/milli/src/index.rs b/milli/src/index.rs
index ef26fc305..803c04a50 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -19,8 +19,7 @@ use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
     FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::StrRefCodec;
-use crate::heed_codec::ScriptLanguageCodec;
+use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec};
 use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
@@ -154,7 +153,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(18);
+        options.max_dbs(19);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 7e9b11592..0fbf53f74 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -1184,8 +1184,9 @@ mod tests {
         stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft);
     }
 
-    #[test]
-    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
+    fn stored_detected_script_and_language_should_not_return_deleted_documents_(
+        deletion_strategy: DeletionStrategy,
+    ) {
         use charabia::{Language, Script};
         let index = TempIndex::new();
         let mut wtxn = index.write_txn().unwrap();
@@ -1202,14 +1203,32 @@ mod tests {
             ]))
             .unwrap();
 
-        delete_documents(&mut wtxn, &index, &["1"]);
+        let key_cmn = (Script::Cj, Language::Cmn);
+        let cj_cmn_docs =
+            index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
+        let mut expected_cj_cmn_docids = RoaringBitmap::new();
+        expected_cj_cmn_docids.push(1);
+        expected_cj_cmn_docids.push(5);
+        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
+
+        delete_documents(&mut wtxn, &index, &["1"], deletion_strategy);
         wtxn.commit().unwrap();
 
         let rtxn = index.read_txn().unwrap();
-        let key_cmn = (Script::Cj, Language::Cmn);
-        let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap();
+        let cj_cmn_docs =
+            index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
         let mut expected_cj_cmn_docids = RoaringBitmap::new();
         expected_cj_cmn_docids.push(5);
         assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
     }
+
+    #[test]
+    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
+        stored_detected_script_and_language_should_not_return_deleted_documents_(
+            DeletionStrategy::AlwaysHard,
+        );
+        stored_detected_script_and_language_should_not_return_deleted_documents_(
+            DeletionStrategy::AlwaysSoft,
+        );
+    }
 }

From 643d99e0f988e4c7f95cd7cf0666df92cb39a003 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 1 Feb 2023 18:39:54 +0100
Subject: [PATCH 16/21] Add kanji language detection test

---
 milli/src/search/mod.rs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index df59634bb..b5274599c 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -446,6 +446,27 @@ mod test {
     use super::*;
     use crate::index::tests::TempIndex;
 
+    #[test]
+    fn test_kanji_language_detection() {
+        let index = TempIndex::new();
+
+        index
+            .add_documents(documents!([
+                { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+                { "id": 1, "title": "東京のお寿司。" },
+                { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
+            ]))
+            .unwrap();
+
+        let txn = index.write_txn().unwrap();
+        let mut search = Search::new(&txn, &index);
+
+        search.query("東京");
+        let SearchResult { documents_ids, .. } = search.execute().unwrap();
+
+        assert_eq!(documents_ids, vec![1]);
+    }
+
     #[test]
     fn test_is_authorized_typos() {
         let index = TempIndex::new();

From 0bc1a18f524377ef9e8596367dfb17478502f5cd Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 1 Feb 2023 18:57:43 +0100
Subject: [PATCH 17/21] Use Languages list detected during indexing at search
 time

---
 milli/src/index.rs      | 20 ++++++++++++++++++++
 milli/src/search/mod.rs |  5 +++++
 2 files changed, 25 insertions(+)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 803c04a50..c14d131a6 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1206,6 +1206,26 @@ impl Index {
         let doc_ids = self.script_language_docids.get(rtxn, key)?;
         Ok(doc_ids.map(|ids| ids - soft_deleted_documents))
     }
+
+    pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Script, Vec<Language>>> {
+        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
+
+        let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new();
+        for sl in self.script_language_docids.iter(rtxn)? {
+            let ((script, language), docids) = sl?;
+
+            // keep only Languages that contain at least 1 document that is not soft-deleted.
+            if !soft_deleted_documents.is_superset(&docids) {
+                if let Some(languages) = script_language.get_mut(&script) {
+                    (*languages).push(language);
+                } else {
+                    script_language.insert(script, vec![language]);
+                }
+            }
+        }
+
+        Ok(script_language)
+    }
 }
 
 #[cfg(test)]
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index b5274599c..f6970fcd1 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -152,6 +152,11 @@ impl<'a> Search<'a> {
                     tokbuilder.stop_words(stop_words);
                 }
 
+                let script_lang_map = self.index.script_language(self.rtxn)?;
+                if !script_lang_map.is_empty() {
+                    tokbuilder.allow_list(&script_lang_map);
+                }
+
                 let tokenizer = tokbuilder.build();
                 let tokens = tokenizer.tokenize(query);
                 builder

From cb8d5f2d4bf4c537a6b91e235d75e0cc66a4d5a3 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 20 Feb 2023 14:00:31 +0100
Subject: [PATCH 18/21] Update Charabia to 0.7.1

---
 Cargo.lock       | 174 ++++++++++++++++++++++++-----------------------
 milli/Cargo.toml |   3 +-
 2 files changed, 91 insertions(+), 86 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a894cb8c6..b607263ee 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -523,12 +523,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "build_const"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4ae4235e6dac0694637c763029ecea1a2ec9e4e06ec2729bd21ba4d9c863eb7"
-
 [[package]]
 name = "bumpalo"
 version = "3.11.1"
@@ -665,16 +659,19 @@ dependencies = [
 
 [[package]]
 name = "charabia"
-version = "0.7.0"
-source = "git+https://github.com/meilisearch/charabia?branch=fix-script-lang-serialization#c5efba56d433ff783e162009e020baba322afde0"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ad3d9667a6b4e03813162c22c4d58235c2dc25d580d60837ce29199038341c9"
 dependencies = [
  "cow-utils",
  "csv",
  "deunicode",
  "fst",
+ "irg-kvariants",
  "jieba-rs",
- "kvariants",
  "lindera",
+ "lindera-ipadic",
+ "lindera-ko-dic",
  "once_cell",
  "pinyin",
  "serde",
@@ -727,14 +724,9 @@ version = "3.2.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5"
 dependencies = [
- "atty",
  "bitflags",
- "clap_derive 3.2.18",
  "clap_lex 0.2.4",
  "indexmap",
- "once_cell",
- "strsim",
- "termcolor",
  "textwrap",
 ]
 
@@ -745,7 +737,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39"
 dependencies = [
  "bitflags",
- "clap_derive 4.0.21",
+ "clap_derive",
  "clap_lex 0.3.0",
  "is-terminal",
  "once_cell",
@@ -753,19 +745,6 @@ dependencies = [
  "termcolor",
 ]
 
-[[package]]
-name = "clap_derive"
-version = "3.2.18"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65"
-dependencies = [
- "heck",
- "proc-macro-error",
- "proc-macro2 1.0.49",
- "quote 1.0.23",
- "syn 1.0.107",
-]
-
 [[package]]
 name = "clap_derive"
 version = "4.0.21"
@@ -879,15 +858,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "crc"
-version = "1.8.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
-dependencies = [
- "build_const",
-]
-
 [[package]]
 name = "crc32fast"
 version = "1.3.2"
@@ -1333,6 +1303,19 @@ dependencies = [
  "termcolor",
 ]
 
+[[package]]
+name = "env_logger"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
+dependencies = [
+ "humantime",
+ "is-terminal",
+ "log",
+ "regex",
+ "termcolor",
+]
+
 [[package]]
 name = "errno"
 version = "0.2.8"
@@ -1986,6 +1969,17 @@ version = "2.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146"
 
+[[package]]
+name = "irg-kvariants"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c73214298363629cf9dbfc93b426808865ee3c121029778cb31b1284104fdf78"
+dependencies = [
+ "csv",
+ "once_cell",
+ "serde",
+]
+
 [[package]]
 name = "is-terminal"
 version = "0.4.2"
@@ -2075,13 +2069,12 @@ dependencies = [
 ]
 
 [[package]]
-name = "kvariants"
-version = "0.1.0"
-source = "git+https://github.com/meilisearch/charabia?branch=fix-script-lang-serialization#c5efba56d433ff783e162009e020baba322afde0"
+name = "kanaria"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff"
 dependencies = [
- "csv",
- "once_cell",
- "serde",
+ "bitflags",
 ]
 
 [[package]]
@@ -2153,14 +2146,15 @@ dependencies = [
 
 [[package]]
 name = "lindera"
-version = "0.17.0"
+version = "0.21.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "082ca91ac4d1557028ace9bfb8cee1500d156a4574dda93cfcdcf4caaebb9bd7"
+checksum = "0f33a20bb9cbf95572b2d2f40d7040c8d8c7ad09ae20e1f6513db6ef2564dfc5"
 dependencies = [
  "anyhow",
  "bincode",
  "byteorder",
  "encoding",
+ "kanaria",
  "lindera-cc-cedict-builder",
  "lindera-core",
  "lindera-dictionary",
@@ -2169,24 +2163,27 @@ dependencies = [
  "lindera-ko-dic",
  "lindera-ko-dic-builder",
  "lindera-unidic-builder",
+ "regex",
  "serde",
  "serde_json",
  "thiserror",
+ "unicode-blocks",
+ "unicode-normalization",
+ "yada",
 ]
 
 [[package]]
 name = "lindera-cc-cedict-builder"
-version = "0.17.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8967615a6d85320ec2755e1435c36165467ba01a79026adc3f86dad1b668df3"
+checksum = "60c3b379251edadbac7a5fdb31e482274e11dae6ab6cc789d0d86cf34369cf49"
 dependencies = [
  "anyhow",
  "bincode",
  "byteorder",
- "clap 3.2.23",
  "csv",
  "encoding",
- "env_logger",
+ "env_logger 0.10.0",
  "glob",
  "lindera-core",
  "lindera-decompress",
@@ -2195,16 +2192,28 @@ dependencies = [
 ]
 
 [[package]]
-name = "lindera-core"
-version = "0.17.0"
+name = "lindera-compress"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0e8ed3cea13f73557a4574a179b1518670a3b70bfdad120521313b03cc89380e"
+checksum = "a8d0ea3de5625e2381cac94e518d3b56103fde56bc0dce840fe875c1e871b125"
+dependencies = [
+ "anyhow",
+ "flate2",
+ "lindera-decompress",
+]
+
+[[package]]
+name = "lindera-core"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2281747b98fdd46bcc54ce7fdb6870dad9f67ddb3dc086c47b6704f3e1178cd5"
 dependencies = [
  "anyhow",
  "bincode",
  "byteorder",
  "encoding_rs",
  "log",
+ "once_cell",
  "serde",
  "thiserror",
  "yada",
@@ -2212,20 +2221,20 @@ dependencies = [
 
 [[package]]
 name = "lindera-decompress"
-version = "0.17.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2badb41828f89cfa6452db0a66da77897c0a04478304de26c8b2b36613e08d43"
+checksum = "52101bd454754c506305ab897af5ac2ae41fe91e3272c1ff5c6a02a089dfaefd"
 dependencies = [
  "anyhow",
- "lzma-rs",
+ "flate2",
  "serde",
 ]
 
 [[package]]
 name = "lindera-dictionary"
-version = "0.17.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e219722c9f56b920c231210e7c25d8b5d35b508e7a2fd69d368916c4b1c926f6"
+checksum = "af1c6668848f1d30d216c99093a3ed3fe125c105fa12a4aeed5a1861dc01dd52"
 dependencies = [
  "anyhow",
  "bincode",
@@ -2235,15 +2244,16 @@ dependencies = [
 
 [[package]]
 name = "lindera-ipadic"
-version = "0.17.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c8e87c8362c724e8188fb7d9b6d184cac15d01369295e9bff7812b630d57e3b"
+checksum = "693098007200fa43fd5cdc9ca8740f371327369672ce812cd87a1f6344971e31"
 dependencies = [
  "bincode",
  "byteorder",
  "encoding",
  "flate2",
  "lindera-core",
+ "lindera-decompress",
  "lindera-ipadic-builder",
  "once_cell",
  "tar",
@@ -2251,19 +2261,19 @@ dependencies = [
 
 [[package]]
 name = "lindera-ipadic-builder"
-version = "0.17.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1439e95852e444a116424086dc64d709c90e8af269ff7d2c2c4020f666f8dfab"
+checksum = "7b6b7240d097a8fc37ee8f90ebff02c4db0ba5325ecb0dacb6da3724596798c9"
 dependencies = [
  "anyhow",
  "bincode",
  "byteorder",
- "clap 3.2.23",
  "csv",
  "encoding_rs",
  "encoding_rs_io",
- "env_logger",
+ "env_logger 0.10.0",
  "glob",
+ "lindera-compress",
  "lindera-core",
  "lindera-decompress",
  "log",
@@ -2273,15 +2283,16 @@ dependencies = [
 
 [[package]]
 name = "lindera-ko-dic"
-version = "0.17.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb15f949220da45872d774b7831bb030855ec083435c907499782f8558c8a203"
+checksum = "abd3c5a4addeb61ca66788a3dd1fd51093e6cd8fea1d997042ada5aa60e8cc5e"
 dependencies = [
  "bincode",
  "byteorder",
  "encoding",
  "flate2",
  "lindera-core",
+ "lindera-decompress",
  "lindera-ko-dic-builder",
  "once_cell",
  "tar",
@@ -2289,18 +2300,18 @@ dependencies = [
 
 [[package]]
 name = "lindera-ko-dic-builder"
-version = "0.17.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fde5a7352f4754be4f741e90bf4dff38a12a6572ab3880d0cf688e1166b8d82b"
+checksum = "512bb1393a9281e0b13704319d1343b7931416865852d9d7b7c0178431518326"
 dependencies = [
  "anyhow",
  "bincode",
  "byteorder",
- "clap 3.2.23",
  "csv",
  "encoding",
- "env_logger",
+ "env_logger 0.10.0",
  "glob",
+ "lindera-compress",
  "lindera-core",
  "lindera-decompress",
  "log",
@@ -2309,17 +2320,16 @@ dependencies = [
 
 [[package]]
 name = "lindera-unidic-builder"
-version = "0.17.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f1451b2ed8a7184a5f815d84f99d358c1d67297305831453dfdc0eb5d08e22b5"
+checksum = "7f575a27f8ba67c15fe16ebf7d277a0ac04e8c8a0f72670ebc2443da9d41c450"
 dependencies = [
  "anyhow",
  "bincode",
  "byteorder",
- "clap 3.2.23",
  "csv",
  "encoding",
- "env_logger",
+ "env_logger 0.10.0",
  "glob",
  "lindera-core",
  "lindera-decompress",
@@ -2408,16 +2418,6 @@ dependencies = [
  "syn 1.0.107",
 ]
 
-[[package]]
-name = "lzma-rs"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aba8ecb0450dfabce4ad72085eed0a75dffe8f21f7ada05638564ea9db2d7fb1"
-dependencies = [
- "byteorder",
- "crc",
-]
-
 [[package]]
 name = "manifest-dir-macros"
 version = "0.1.16"
@@ -2475,7 +2475,7 @@ dependencies = [
  "deserr",
  "dump",
  "either",
- "env_logger",
+ "env_logger 0.9.3",
  "file-store",
  "flate2",
  "fst",
@@ -4113,6 +4113,12 @@ version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992"
 
+[[package]]
+name = "unicode-blocks"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9de2be6bad6f56ce8373d377e611cbb2265de3a656138065609ce82e217aad70"
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.6"
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index b6449c5db..2e5d3f376 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -9,8 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] }
 bincode = "1.3.3"
 bstr = "1.0.1"
 byteorder = "1.4.3"
-# charabia = { version = "0.7.0", default-features = false }
-charabia = { git = "https://github.com/meilisearch/charabia", branch = "fix-script-lang-serialization", default-features = false }
+charabia = { version = "0.7.1", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.6"
 deserr = "0.1.4"

From 119e6d8811c83dd50ccca9a7bdb45e047b63569c Mon Sep 17 00:00:00 2001
From: Many the fish <many@meilisearch.com>
Date: Mon, 20 Feb 2023 15:33:10 +0100
Subject: [PATCH 19/21] Update milli/src/search/mod.rs

Co-authored-by: Tamo <tamo@meilisearch.com>
---
 milli/src/search/mod.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index f6970fcd1..451a3d56e 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -451,6 +451,7 @@ mod test {
     use super::*;
     use crate::index::tests::TempIndex;
 
+    #[cfg(feature = "default")]
     #[test]
     fn test_kanji_language_detection() {
         let index = TempIndex::new();

From 23f4e82b53b1d93944a949be2770fe2b6d271cd5 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 20 Feb 2023 15:43:29 +0100
Subject: [PATCH 20/21] Add test ensuring that Meilisearch works on kanji-only
 requests

---
 meilisearch/tests/search/mod.rs | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs
index 60ffa6cee..91ff64d37 100644
--- a/meilisearch/tests/search/mod.rs
+++ b/meilisearch/tests/search/mod.rs
@@ -148,6 +148,28 @@ async fn simple_search() {
         .await;
 }
 
+#[cfg(feature = "default")]
+#[actix_rt::test]
+async fn test_kanji_language_detection() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let documents = json!([
+        { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+        { "id": 1, "title": "東京のお寿司。" },
+        { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
+    ]);
+    index.add_documents(documents, None).await;
+    index.wait_task(0).await;
+
+    index
+        .search(json!({"q": "東京"}), |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            assert_eq!(response["hits"].as_array().unwrap().len(), 1);
+        })
+        .await;
+}
+
 #[actix_rt::test]
 async fn search_multiple_params() {
     let server = Server::new().await;

From bbecab8948748908b730a197823c499f07205986 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 21 Feb 2023 10:18:44 +0100
Subject: [PATCH 21/21] fix clippy

---
 .../index_documents/extract/extract_docid_word_positions.rs   | 4 +++-
 milli/src/update/index_documents/extract/mod.rs               | 2 +-
 milli/src/update/index_documents/typed_chunk.rs               | 3 +--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index e091df6b8..2d51fcc1a 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -13,6 +13,8 @@ use crate::{
     absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
 };
 
+pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
+
 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
 ///
@@ -25,7 +27,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     searchable_fields: &Option<HashSet<FieldId>>,
     stop_words: Option<&fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(RoaringBitmap, grenad::Reader<File>, HashMap<(Script, Language), RoaringBitmap>)> {
+) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
     let max_positions_per_attributes = max_positions_per_attributes
         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index f38bdd497..c0f07cf79 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -260,7 +260,7 @@ fn send_and_extract_flattened_documents_data(
                 let (documents_ids, docid_word_positions_chunk, script_language_pair) =
                     extract_docid_word_positions(
                         flattened_documents_chunk.clone(),
-                        indexer.clone(),
+                        indexer,
                         searchable_fields,
                         stop_words.as_ref(),
                         max_positions_per_attributes,
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 35f09c051..b9b11cfa8 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -224,8 +224,7 @@ pub(crate) fn write_typed_chunk_into_index(
                         let mut new_value_buffer = Vec::new();
                         serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
                         merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
-                        let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?;
-                        merged_db_values
+                        RoaringBitmap::deserialize_from(&buffer[..])?
                     }
                     None => value,
                 };