From c45d1e3610731746eb1ade02652d18580847e89e Mon Sep 17 00:00:00 2001 From: f3r10 Date: Wed, 12 Oct 2022 06:21:35 -0500 Subject: [PATCH 01/21] Create a new database on index and add a specialized codec for it --- milli/src/heed_codec/mod.rs | 2 + milli/src/heed_codec/script_language_codec.rs | 43 +++++++++++++++++++ milli/src/index.rs | 7 +++ 3 files changed, 52 insertions(+) create mode 100644 milli/src/heed_codec/script_language_codec.rs diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 702dcf661..2ac130f48 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -8,6 +8,7 @@ mod roaring_bitmap_length; mod str_beu32_codec; mod str_ref; mod str_str_u8_codec; +mod script_language_codec; pub use byte_slice_ref::ByteSliceRefCodec; pub use str_ref::StrRefCodec; @@ -21,3 +22,4 @@ pub use self::roaring_bitmap_length::{ }; pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; +pub use self::script_language_codec::ScriptLanguageCodec; diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs new file mode 100644 index 000000000..af15990ea --- /dev/null +++ b/milli/src/heed_codec/script_language_codec.rs @@ -0,0 +1,43 @@ +use std::borrow::Cow; + +use std::mem::size_of; +use std::str; + +use charabia::{Language, Script}; + +pub struct ScriptLanguageCodec; + +impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec { + type DItem = (Script, Language); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let footer_len = size_of::(); + + if bytes.len() < footer_len { + return None; + } + + let (script, bytes) = bytes.split_at(bytes.len() - footer_len); + let script = str::from_utf8(script).ok()?; + let script_name = Script::from_name(script); + let lan = str::from_utf8(bytes).ok()?; + let lan_name = Language::from_name(lan); + + Some((script_name, lan_name)) + } +} + +impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec { + type EItem = (Script, Language); + + fn bytes_encode((script, lan): &Self::EItem) -> Option> { + let script_name = script.name(); + let lan_name = lan.name(); + + let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len()); + bytes.extend_from_slice(script_name.as_bytes()); + bytes.extend_from_slice(lan_name.as_bytes()); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 3f7ef14e6..76bc273d9 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -14,6 +14,7 @@ use time::OffsetDateTime; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; +use crate::heed_codec::ScriptLanguageCodec; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, OrderedF64Codec, @@ -83,6 +84,7 @@ pub mod db_name { pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; pub const DOCUMENTS: &str = "documents"; + pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids"; } #[derive(Clone)] @@ -122,6 +124,9 @@ pub struct Index { /// Maps the position of a word prefix with all the docids where this prefix appears. pub word_prefix_position_docids: Database, + /// Maps the script and language with all the docids that corresponds to it. + pub script_language_docids: Database, + /// Maps the facet field id and the docids for which this field exists pub facet_id_exists_docids: Database, @@ -159,6 +164,7 @@ impl Index { let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; + let script_language_docids = env.create_database(Some(SCRIPT_LANGUAGE_DOCIDS))?; let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; let prefix_word_pair_proximity_docids = @@ -186,6 +192,7 @@ impl Index { exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + script_language_docids, word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids, word_position_docids, From d97fb6117ea086080bde016115f2acdd46c46702 Mon Sep 17 00:00:00 2001 From: f3r10 Date: Wed, 12 Oct 2022 06:24:56 -0500 Subject: [PATCH 02/21] Extract and index data --- .../extract/extract_docid_word_positions.rs | 15 ++++++++--- .../src/update/index_documents/extract/mod.rs | 4 ++- .../src/update/index_documents/typed_chunk.rs | 27 ++++++++++++++++--- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index be9b479bb..66b2c768b 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -1,9 +1,9 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::{io, mem, str}; -use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; +use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder}; use roaring::RoaringBitmap; use serde_json::Value; @@ -25,12 +25,13 @@ pub fn extract_docid_word_positions( searchable_fields: &Option>, stop_words: Option<&fst::Set<&[u8]>>, max_positions_per_attributes: Option, -) -> Result<(RoaringBitmap, grenad::Reader)> { +) -> Result<(RoaringBitmap, grenad::Reader, HashMap<(Script, Language), RoaringBitmap>)> { let max_positions_per_attributes = max_positions_per_attributes .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); let mut documents_ids = RoaringBitmap::new(); + let mut script_language_pair = HashMap::new(); let mut docid_word_positions_sorter = create_sorter( grenad::SortAlgorithm::Stable, concat_u32s_array, @@ -70,6 +71,12 @@ pub fn extract_docid_word_positions( .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); for (index, token) in tokens { + let script = token.script; + let language = token.language.unwrap_or_default(); + let entry = script_language_pair + .entry((script, language)) + .or_insert_with(RoaringBitmap::new); + entry.push(document_id); let token = token.lemma().trim(); if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { key_buffer.truncate(mem::size_of::()); @@ -88,7 +95,7 @@ pub fn extract_docid_word_positions( } } - sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader)) + sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader, script_language_pair)) } /// Transform a JSON value into a string that can be indexed. diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index e696ed44b..540b8993b 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -257,7 +257,7 @@ fn send_and_extract_flattened_documents_data( let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { - let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( + let (documents_ids, docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions( flattened_documents_chunk.clone(), indexer, searchable_fields, @@ -274,6 +274,8 @@ fn send_and_extract_flattened_documents_data( let _ = lmdb_writer_sx .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); + let _ = lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair))); + Ok(docid_word_positions_chunk) }, || { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 16784bd92..920971eec 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,8 +1,10 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::convert::TryInto; use std::fs::File; use std::io; +use charabia::{Language, Script}; use grenad::MergerBuilder; use heed::types::ByteSlice; use heed::{BytesDecode, RwTxn}; @@ -16,10 +18,7 @@ use super::{ClonableMmap, MergeFn}; use crate::facet::FacetType; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::as_cloneable_grenad; -use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, - Result, -}; +use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, lat_lng_to_xyz}; pub(crate) enum TypedChunk { DocidWordPositions(grenad::Reader), @@ -38,6 +37,7 @@ pub(crate) enum TypedChunk { FieldIdFacetNumberDocids(grenad::Reader), FieldIdFacetExistsDocids(grenad::Reader), GeoPoints(grenad::Reader), + ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>) } /// Write typed chunk in the corresponding LMDB database of the provided index. @@ -210,6 +210,25 @@ pub(crate) fn write_typed_chunk_into_index( index.put_geo_rtree(wtxn, &rtree)?; index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; } + TypedChunk::ScriptLanguageDocids(hash_pair) => { + let mut buffer = Vec::new(); + for (key, value) in hash_pair { + buffer.clear(); + let final_value = match index.script_language_docids.get(wtxn, &key)? { + Some(db_values) => { + let mut db_value_buffer = Vec::new(); + serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?; + let mut new_value_buffer = Vec::new(); + serialize_roaring_bitmap(&value, &mut new_value_buffer)?; + merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?; + let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?; + merged_db_values + } + None => value + }; + index.script_language_docids.put(wtxn, &key, &final_value)?; + } + } } Ok((RoaringBitmap::new(), is_merged_database)) From b216ddba63144ca0aba71d8d1707ea1109b96f12 Mon Sep 17 00:00:00 2001 From: f3r10 Date: Wed, 12 Oct 2022 06:28:36 -0500 Subject: [PATCH 03/21] Delete and clear data from the new database --- milli/src/update/clear_documents.rs | 2 ++ milli/src/update/delete_documents.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index adeea11fa..0296bc192 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_position_docids, field_id_word_count_docids, word_prefix_position_docids, + script_language_docids, facet_id_f64_docids, facet_id_string_docids, facet_id_exists_docids, @@ -82,6 +83,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_position_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; word_prefix_position_docids.clear(self.wtxn)?; + script_language_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; facet_id_exists_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index f4a6d396e..d4fff68c2 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -243,6 +243,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { facet_id_string_docids: _, field_id_docid_facet_f64s: _, field_id_docid_facet_strings: _, + script_language_docids, facet_id_exists_docids, documents, } = self.index; From a27f329e3a86d438b47d229aaeb1362092887fbe Mon Sep 17 00:00:00 2001 From: f3r10 Date: Fri, 14 Oct 2022 14:05:53 -0500 Subject: [PATCH 04/21] Add tests for checking that detected script and language associated with document(s) were stored during indexing --- milli/src/index.rs | 7 +++++ milli/src/update/index_documents/mod.rs | 35 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index 76bc273d9..03f16a126 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -4,6 +4,7 @@ use std::fs::File; use std::mem::size_of; use std::path::Path; +use charabia::{Language, Script}; use heed::flags::Flags; use heed::types::*; use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn}; @@ -1194,6 +1195,12 @@ impl Index { pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result { self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS) } + + /* script language docids */ + /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any. + pub fn script_language_documents_ids(&self, rtxn: &RoTxn, key: &(Script, Language)) -> heed::Result> { + self.script_language_docids.get(rtxn, key) + } } #[cfg(test)] diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index be97defbd..f736cb55e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1907,4 +1907,39 @@ mod tests { index.add_documents(doc1).unwrap(); } + + #[cfg(feature = "default")] + #[test] + fn store_detected_script_and_language_per_document_during_indexing() { + use charabia::{Language, Script}; + let index = TempIndex::new(); + index + .add_documents(documents!([ + { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, + { "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }, + { "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" }, + { "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" }, + { "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let key_thai = (Script::Thai, Language::Other); + let key_jpn = (Script::Cj, Language::Jpn); + let key_cmn = (Script::Cj, Language::Cmn); + let thai_docs = index.script_language_documents_ids(&rtxn, &key_thai).unwrap().unwrap(); + let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap(); + let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap(); + let mut expected_thai_docids = RoaringBitmap::new(); + expected_thai_docids.push(4); + assert_eq!(thai_docs, expected_thai_docids); + let mut expected_cj_jpn_docids = RoaringBitmap::new(); + expected_cj_jpn_docids.push(3); + assert_eq!(cj_jpn_docs, expected_cj_jpn_docids); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(1); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + } } From 34d04f3d3f94adb51aecca199f2084f30a3128d0 Mon Sep 17 00:00:00 2001 From: f3r10 Date: Fri, 14 Oct 2022 16:22:42 -0500 Subject: [PATCH 05/21] Filter from script_language_docids database soft deleted documents --- milli/src/index.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 03f16a126..dc9cb7994 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1199,7 +1199,9 @@ impl Index { /* script language docids */ /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any. pub fn script_language_documents_ids(&self, rtxn: &RoTxn, key: &(Script, Language)) -> heed::Result> { - self.script_language_docids.get(rtxn, key) + let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; + let doc_ids = self.script_language_docids.get(rtxn, key)?; + Ok(doc_ids.map(|ids| ids - soft_deleted_documents)) } } From 369c05732e79dbd602c408b7947672bbc8e23b46 Mon Sep 17 00:00:00 2001 From: f3r10 Date: Fri, 14 Oct 2022 16:25:09 -0500 Subject: [PATCH 06/21] Add test checking if from script_language_docids database were removed deleted docids --- milli/src/update/delete_documents.rs | 45 ++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index d4fff68c2..7e9b11592 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -500,6 +500,22 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { .execute(self.wtxn)?; } + // Remove the documents ids from the script language database. + let mut iter = script_language_docids.iter_mut(self.wtxn)?; + while let Some((key, mut docids)) = iter.next().transpose()? { + let previous_len = docids.len(); + docids -= &self.to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } + } + + drop(iter); // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_id_exists_docids( self.wtxn, @@ -1167,4 +1183,33 @@ mod tests { stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); } + + #[test] + fn stored_detected_script_and_language_should_not_return_deleted_documents() { + use charabia::{Language, Script}; + let index = TempIndex::new(); + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, + { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }, + { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" }, + { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" }, + { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" }, + ])) + .unwrap(); + + delete_documents(&mut wtxn, &index, &["1"]); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let key_cmn = (Script::Cj, Language::Cmn); + let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap(); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + } } From fd60a39f1c4d56941cb3cd3f58d8590096458545 Mon Sep 17 00:00:00 2001 From: f3r10 Date: Mon, 17 Oct 2022 06:51:04 -0500 Subject: [PATCH 07/21] Format code --- milli/src/heed_codec/mod.rs | 3 ++- milli/src/index.rs | 10 +++++++--- .../extract/extract_docid_word_positions.rs | 3 ++- .../src/update/index_documents/extract/mod.rs | 18 ++++++++++-------- .../src/update/index_documents/typed_chunk.rs | 11 +++++++---- 5 files changed, 28 insertions(+), 17 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 2ac130f48..f3ca5f0d1 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -5,10 +5,10 @@ mod field_id_word_count_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; +mod script_language_codec; mod str_beu32_codec; mod str_ref; mod str_str_u8_codec; -mod script_language_codec; pub use byte_slice_ref::ByteSliceRefCodec; pub use str_ref::StrRefCodec; @@ -20,6 +20,7 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; +pub use self::script_language_codec::ScriptLanguageCodec; pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; pub use self::script_language_codec::ScriptLanguageCodec; diff --git a/milli/src/index.rs b/milli/src/index.rs index dc9cb7994..ef26fc305 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -15,12 +15,12 @@ use time::OffsetDateTime; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::heed_codec::ScriptLanguageCodec; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, OrderedF64Codec, }; use crate::heed_codec::StrRefCodec; +use crate::heed_codec::ScriptLanguageCodec; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, @@ -125,7 +125,7 @@ pub struct Index { /// Maps the position of a word prefix with all the docids where this prefix appears. pub word_prefix_position_docids: Database, - /// Maps the script and language with all the docids that corresponds to it. + /// Maps the script and language with all the docids that corresponds to it. pub script_language_docids: Database, /// Maps the facet field id and the docids for which this field exists @@ -1198,7 +1198,11 @@ impl Index { /* script language docids */ /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any. - pub fn script_language_documents_ids(&self, rtxn: &RoTxn, key: &(Script, Language)) -> heed::Result> { + pub fn script_language_documents_ids( + &self, + rtxn: &RoTxn, + key: &(Script, Language), + ) -> heed::Result> { let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; let doc_ids = self.script_language_docids.get(rtxn, key)?; Ok(doc_ids.map(|ids| ids - soft_deleted_documents)) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 66b2c768b..8a9f7e04f 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -95,7 +95,8 @@ pub fn extract_docid_word_positions( } } - sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader, script_language_pair)) + sorter_into_reader(docid_word_positions_sorter, indexer) + .map(|reader| (documents_ids, reader, script_language_pair)) } /// Transform a JSON value into a string that can be indexed. diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 540b8993b..f38bdd497 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -257,13 +257,14 @@ fn send_and_extract_flattened_documents_data( let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { - let (documents_ids, docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions( - flattened_documents_chunk.clone(), - indexer, - searchable_fields, - stop_words.as_ref(), - max_positions_per_attributes, - )?; + let (documents_ids, docid_word_positions_chunk, script_language_pair) = + extract_docid_word_positions( + flattened_documents_chunk.clone(), + indexer.clone(), + searchable_fields, + stop_words.as_ref(), + max_positions_per_attributes, + )?; // send documents_ids to DB writer let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))); @@ -274,7 +275,8 @@ fn send_and_extract_flattened_documents_data( let _ = lmdb_writer_sx .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); - let _ = lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair))); + let _ = + lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair))); Ok(docid_word_positions_chunk) }, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 920971eec..35f09c051 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -18,7 +18,10 @@ use super::{ClonableMmap, MergeFn}; use crate::facet::FacetType; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::as_cloneable_grenad; -use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, lat_lng_to_xyz}; +use crate::{ + lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, + Result, +}; pub(crate) enum TypedChunk { DocidWordPositions(grenad::Reader), @@ -37,7 +40,7 @@ pub(crate) enum TypedChunk { FieldIdFacetNumberDocids(grenad::Reader), FieldIdFacetExistsDocids(grenad::Reader), GeoPoints(grenad::Reader), - ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>) + ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), } /// Write typed chunk in the corresponding LMDB database of the provided index. @@ -224,11 +227,11 @@ pub(crate) fn write_typed_chunk_into_index( let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?; merged_db_values } - None => value + None => value, }; index.script_language_docids.put(wtxn, &key, &final_value)?; } - } + } } Ok((RoaringBitmap::new(), is_merged_database)) From 2d58b28f43ab45d6d0cd1e9c04f9450ec256f1d0 Mon Sep 17 00:00:00 2001 From: f3r10 Date: Wed, 19 Oct 2022 07:03:46 -0500 Subject: [PATCH 08/21] Improve script language codec --- milli/src/heed_codec/script_language_codec.rs | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs index af15990ea..7e150723a 100644 --- a/milli/src/heed_codec/script_language_codec.rs +++ b/milli/src/heed_codec/script_language_codec.rs @@ -1,6 +1,5 @@ use std::borrow::Cow; -use std::mem::size_of; use std::str; use charabia::{Language, Script}; @@ -11,16 +10,11 @@ impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec { type DItem = (Script, Language); fn bytes_decode(bytes: &'a [u8]) -> Option { - let footer_len = size_of::(); - - if bytes.len() < footer_len { - return None; - } - - let (script, bytes) = bytes.split_at(bytes.len() - footer_len); - let script = str::from_utf8(script).ok()?; + let sep = bytes.iter().position(|b| *b == 0)?; + let (s_bytes, l_bytes) = bytes.split_at(sep); + let script = str::from_utf8(s_bytes).ok()?; let script_name = Script::from_name(script); - let lan = str::from_utf8(bytes).ok()?; + let lan = str::from_utf8(l_bytes).ok()?; let lan_name = Language::from_name(lan); Some((script_name, lan_name)) @@ -31,12 +25,13 @@ impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec { type EItem = (Script, Language); fn bytes_encode((script, lan): &Self::EItem) -> Option> { - let script_name = script.name(); - let lan_name = lan.name(); + let script_name = script.name().as_bytes(); + let lan_name = lan.name().as_bytes(); - let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len()); - bytes.extend_from_slice(script_name.as_bytes()); - bytes.extend_from_slice(lan_name.as_bytes()); + let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len() + 1); + bytes.extend_from_slice(script_name); + bytes.push(0); + bytes.extend_from_slice(lan_name); Some(Cow::Owned(bytes)) } From d8207356f416e018e4e3bf7969c9e27120884335 Mon Sep 17 00:00:00 2001 From: f3r10 Date: Wed, 19 Oct 2022 07:09:05 -0500 Subject: [PATCH 09/21] Skip script,language insertion if language is undetected --- .../extract/extract_docid_word_positions.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 8a9f7e04f..e091df6b8 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -71,12 +71,13 @@ pub fn extract_docid_word_positions( .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); for (index, token) in tokens { - let script = token.script; - let language = token.language.unwrap_or_default(); - let entry = script_language_pair - .entry((script, language)) - .or_insert_with(RoaringBitmap::new); - entry.push(document_id); + if let Some(language) = token.language { + let script = token.script; + let entry = script_language_pair + .entry((script, language)) + .or_insert_with(RoaringBitmap::new); + entry.push(document_id); + } let token = token.lemma().trim(); if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { key_buffer.truncate(mem::size_of::()); From 50bc1562571d2e3710ad9495e3b74ddb1de0e3da Mon Sep 17 00:00:00 2001 From: f3r10 Date: Wed, 19 Oct 2022 07:13:10 -0500 Subject: [PATCH 10/21] Fix tests --- milli/src/update/index_documents/mod.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index f736cb55e..2f06558f2 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1925,21 +1925,13 @@ mod tests { .unwrap(); let rtxn = index.read_txn().unwrap(); - let key_thai = (Script::Thai, Language::Other); let key_jpn = (Script::Cj, Language::Jpn); let key_cmn = (Script::Cj, Language::Cmn); - let thai_docs = index.script_language_documents_ids(&rtxn, &key_thai).unwrap().unwrap(); let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap(); let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap(); - let mut expected_thai_docids = RoaringBitmap::new(); - expected_thai_docids.push(4); - assert_eq!(thai_docs, expected_thai_docids); - let mut expected_cj_jpn_docids = RoaringBitmap::new(); - expected_cj_jpn_docids.push(3); + let expected_cj_jpn_docids = [3].iter().collect(); assert_eq!(cj_jpn_docs, expected_cj_jpn_docids); - let mut expected_cj_cmn_docids = RoaringBitmap::new(); - expected_cj_cmn_docids.push(1); - expected_cj_cmn_docids.push(5); + let expected_cj_cmn_docids = [1,5].iter().collect(); assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); } } From 7681be5367b99b59cfde97d0b9809d077cbf6b67 Mon Sep 17 00:00:00 2001 From: f3r10 Date: Wed, 19 Oct 2022 07:18:11 -0500 Subject: [PATCH 11/21] Format code --- milli/src/update/index_documents/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2f06558f2..b41892f7d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1931,7 +1931,7 @@ mod tests { let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap(); let expected_cj_jpn_docids = [3].iter().collect(); assert_eq!(cj_jpn_docs, expected_cj_jpn_docids); - let expected_cj_cmn_docids = [1,5].iter().collect(); + let expected_cj_cmn_docids = [1, 5].iter().collect(); assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); } } From 2922c5c8997e305b0370e006f12c78b8a5161993 Mon Sep 17 00:00:00 2001 From: f3r10 Date: Wed, 19 Oct 2022 10:32:22 -0500 Subject: [PATCH 12/21] Fix code format --- milli/src/heed_codec/script_language_codec.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs index 7e150723a..b79ba4897 100644 --- a/milli/src/heed_codec/script_language_codec.rs +++ b/milli/src/heed_codec/script_language_codec.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; - use std::str; use charabia::{Language, Script}; From f4569b04ad086479c557a116f90b4edc87f1d86a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 1 Feb 2023 15:23:25 +0100 Subject: [PATCH 13/21] Update Charabia version --- Cargo.lock | 14 ++++++++++++-- milli/Cargo.toml | 3 ++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dd7d828da..a894cb8c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -666,14 +666,14 @@ dependencies = [ [[package]] name = "charabia" version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b57f9571f611796ea38e5a9c12e5ce37476f70397b032757f8dfe0c7b9bc5637" +source = "git+https://github.com/meilisearch/charabia?branch=fix-script-lang-serialization#c5efba56d433ff783e162009e020baba322afde0" dependencies = [ "cow-utils", "csv", "deunicode", "fst", "jieba-rs", + "kvariants", "lindera", "once_cell", "pinyin", @@ -2074,6 +2074,16 @@ dependencies = [ "simple_asn1", ] +[[package]] +name = "kvariants" +version = "0.1.0" +source = "git+https://github.com/meilisearch/charabia?branch=fix-script-lang-serialization#c5efba56d433ff783e162009e020baba322afde0" +dependencies = [ + "csv", + "once_cell", + "serde", +] + [[package]] name = "language-tags" version = "0.3.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b32592ab9..b6449c5db 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -9,7 +9,8 @@ bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" bstr = "1.0.1" byteorder = "1.4.3" -charabia = { version = "0.7.0", default-features = false } +# charabia = { version = "0.7.0", default-features = false } +charabia = { git = "https://github.com/meilisearch/charabia", branch = "fix-script-lang-serialization", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.6" deserr = "0.1.4" From 77d32d0ee811a19ca0d9be962ca981ef8278b4ca Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 1 Feb 2023 15:24:49 +0100 Subject: [PATCH 14/21] Fix codec deserialization --- milli/src/heed_codec/script_language_codec.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs index b79ba4897..83e8a7241 100644 --- a/milli/src/heed_codec/script_language_codec.rs +++ b/milli/src/heed_codec/script_language_codec.rs @@ -14,7 +14,8 @@ impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec { let script = str::from_utf8(s_bytes).ok()?; let script_name = Script::from_name(script); let lan = str::from_utf8(l_bytes).ok()?; - let lan_name = Language::from_name(lan); + // skip '\0' byte between the two strings. + let lan_name = Language::from_name(&lan[1..]); Some((script_name, lan_name)) } From 064158e4e29077749b8eb18dd87a706ec71908e5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 1 Feb 2023 15:34:01 +0100 Subject: [PATCH 15/21] Update test --- milli/src/heed_codec/mod.rs | 1 - milli/src/index.rs | 5 ++--- milli/src/update/delete_documents.rs | 29 +++++++++++++++++++++++----- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index f3ca5f0d1..a4df63e22 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -23,4 +23,3 @@ pub use self::roaring_bitmap_length::{ pub use self::script_language_codec::ScriptLanguageCodec; pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; -pub use self::script_language_codec::ScriptLanguageCodec; diff --git a/milli/src/index.rs b/milli/src/index.rs index ef26fc305..803c04a50 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -19,8 +19,7 @@ use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, OrderedF64Codec, }; -use crate::heed_codec::StrRefCodec; -use crate::heed_codec::ScriptLanguageCodec; +use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec}; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, @@ -154,7 +153,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(18); + options.max_dbs(19); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 7e9b11592..0fbf53f74 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1184,8 +1184,9 @@ mod tests { stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); } - #[test] - fn stored_detected_script_and_language_should_not_return_deleted_documents() { + fn stored_detected_script_and_language_should_not_return_deleted_documents_( + deletion_strategy: DeletionStrategy, + ) { use charabia::{Language, Script}; let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -1202,14 +1203,32 @@ mod tests { ])) .unwrap(); - delete_documents(&mut wtxn, &index, &["1"]); + let key_cmn = (Script::Cj, Language::Cmn); + let cj_cmn_docs = + index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default(); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(1); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + + delete_documents(&mut wtxn, &index, &["1"], deletion_strategy); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); - let key_cmn = (Script::Cj, Language::Cmn); - let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap(); + let cj_cmn_docs = + index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default(); let mut expected_cj_cmn_docids = RoaringBitmap::new(); expected_cj_cmn_docids.push(5); assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); } + + #[test] + fn stored_detected_script_and_language_should_not_return_deleted_documents() { + stored_detected_script_and_language_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysHard, + ); + stored_detected_script_and_language_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysSoft, + ); + } } From 643d99e0f988e4c7f95cd7cf0666df92cb39a003 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 1 Feb 2023 18:39:54 +0100 Subject: [PATCH 16/21] Add expectancy test --- milli/src/search/mod.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index df59634bb..b5274599c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -446,6 +446,27 @@ mod test { use super::*; use crate::index::tests::TempIndex; + #[test] + fn test_kanji_language_detection() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": 1, "title": "東京のお寿司。" }, + { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" } + ])) + .unwrap(); + + let txn = index.write_txn().unwrap(); + let mut search = Search::new(&txn, &index); + + search.query("東京"); + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + + assert_eq!(documents_ids, vec![1]); + } + #[test] fn test_is_authorized_typos() { let index = TempIndex::new(); From 0bc1a18f524377ef9e8596367dfb17478502f5cd Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 1 Feb 2023 18:57:43 +0100 Subject: [PATCH 17/21] Use Languages list detected during indexing at search time --- milli/src/index.rs | 20 ++++++++++++++++++++ milli/src/search/mod.rs | 5 +++++ 2 files changed, 25 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index 803c04a50..c14d131a6 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1206,6 +1206,26 @@ impl Index { let doc_ids = self.script_language_docids.get(rtxn, key)?; Ok(doc_ids.map(|ids| ids - soft_deleted_documents)) } + + pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result>> { + let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; + + let mut script_language: HashMap> = HashMap::new(); + for sl in self.script_language_docids.iter(rtxn)? { + let ((script, language), docids) = sl?; + + // keep only Languages that contains at least 1 document. + if !soft_deleted_documents.is_superset(&docids) { + if let Some(languages) = script_language.get_mut(&script) { + (*languages).push(language); + } else { + script_language.insert(script, vec![language]); + } + } + } + + Ok(script_language) + } } #[cfg(test)] diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index b5274599c..f6970fcd1 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -152,6 +152,11 @@ impl<'a> Search<'a> { tokbuilder.stop_words(stop_words); } + let script_lang_map = self.index.script_language(self.rtxn)?; + if !script_lang_map.is_empty() { + tokbuilder.allow_list(&script_lang_map); + } + let tokenizer = tokbuilder.build(); let tokens = tokenizer.tokenize(query); builder From cb8d5f2d4bf4c537a6b91e235d75e0cc66a4d5a3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 20 Feb 2023 14:00:31 +0100 Subject: [PATCH 18/21] Update Charabia to 0.7.1 --- Cargo.lock | 174 ++++++++++++++++++++++++----------------------- milli/Cargo.toml | 3 +- 2 files changed, 91 insertions(+), 86 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a894cb8c6..b607263ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -523,12 +523,6 @@ dependencies = [ "serde", ] -[[package]] -name = "build_const" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ae4235e6dac0694637c763029ecea1a2ec9e4e06ec2729bd21ba4d9c863eb7" - [[package]] name = "bumpalo" version = "3.11.1" @@ -665,16 +659,19 @@ dependencies = [ [[package]] name = "charabia" -version = "0.7.0" -source = "git+https://github.com/meilisearch/charabia?branch=fix-script-lang-serialization#c5efba56d433ff783e162009e020baba322afde0" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad3d9667a6b4e03813162c22c4d58235c2dc25d580d60837ce29199038341c9" dependencies = [ "cow-utils", "csv", "deunicode", "fst", + "irg-kvariants", "jieba-rs", - "kvariants", "lindera", + "lindera-ipadic", + "lindera-ko-dic", "once_cell", "pinyin", "serde", @@ -727,14 +724,9 @@ version = "3.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5" dependencies = [ - "atty", "bitflags", - "clap_derive 3.2.18", "clap_lex 0.2.4", "indexmap", - "once_cell", - "strsim", - "termcolor", "textwrap", ] @@ -745,7 +737,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39" dependencies = [ "bitflags", - "clap_derive 4.0.21", + "clap_derive", "clap_lex 0.3.0", "is-terminal", "once_cell", @@ -753,19 +745,6 @@ dependencies = [ "termcolor", ] -[[package]] -name = "clap_derive" -version = "3.2.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65" -dependencies = [ - "heck", - "proc-macro-error", - "proc-macro2 1.0.49", - "quote 1.0.23", - "syn 1.0.107", -] - [[package]] name = "clap_derive" version = "4.0.21" @@ -879,15 +858,6 @@ dependencies = [ "libc", ] -[[package]] -name = "crc" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" -dependencies = [ - "build_const", -] - [[package]] name = "crc32fast" version = "1.3.2" @@ -1333,6 +1303,19 @@ dependencies = [ "termcolor", ] +[[package]] +name = "env_logger" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + [[package]] name = "errno" version = "0.2.8" @@ -1986,6 +1969,17 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146" +[[package]] +name = "irg-kvariants" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c73214298363629cf9dbfc93b426808865ee3c121029778cb31b1284104fdf78" +dependencies = [ + "csv", + "once_cell", + "serde", +] + [[package]] name = "is-terminal" version = "0.4.2" @@ -2075,13 +2069,12 @@ dependencies = [ ] [[package]] -name = "kvariants" -version = "0.1.0" -source = "git+https://github.com/meilisearch/charabia?branch=fix-script-lang-serialization#c5efba56d433ff783e162009e020baba322afde0" +name = "kanaria" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" dependencies = [ - "csv", - "once_cell", - "serde", + "bitflags", ] [[package]] @@ -2153,14 +2146,15 @@ dependencies = [ [[package]] name = "lindera" -version = "0.17.0" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "082ca91ac4d1557028ace9bfb8cee1500d156a4574dda93cfcdcf4caaebb9bd7" +checksum = "0f33a20bb9cbf95572b2d2f40d7040c8d8c7ad09ae20e1f6513db6ef2564dfc5" dependencies = [ "anyhow", "bincode", "byteorder", "encoding", + "kanaria", "lindera-cc-cedict-builder", "lindera-core", "lindera-dictionary", @@ -2169,24 +2163,27 @@ dependencies = [ "lindera-ko-dic", "lindera-ko-dic-builder", "lindera-unidic-builder", + "regex", "serde", "serde_json", "thiserror", + "unicode-blocks", + "unicode-normalization", + "yada", ] [[package]] name = "lindera-cc-cedict-builder" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8967615a6d85320ec2755e1435c36165467ba01a79026adc3f86dad1b668df3" +checksum = "60c3b379251edadbac7a5fdb31e482274e11dae6ab6cc789d0d86cf34369cf49" dependencies = [ "anyhow", "bincode", "byteorder", - "clap 3.2.23", "csv", "encoding", - "env_logger", + "env_logger 0.10.0", "glob", "lindera-core", "lindera-decompress", @@ -2195,16 +2192,28 @@ dependencies = [ ] [[package]] -name = "lindera-core" -version = "0.17.0" +name = "lindera-compress" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e8ed3cea13f73557a4574a179b1518670a3b70bfdad120521313b03cc89380e" +checksum = "a8d0ea3de5625e2381cac94e518d3b56103fde56bc0dce840fe875c1e871b125" +dependencies = [ + "anyhow", + "flate2", + "lindera-decompress", +] + +[[package]] +name = "lindera-core" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2281747b98fdd46bcc54ce7fdb6870dad9f67ddb3dc086c47b6704f3e1178cd5" dependencies = [ "anyhow", "bincode", "byteorder", "encoding_rs", "log", + "once_cell", "serde", "thiserror", "yada", @@ -2212,20 +2221,20 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2badb41828f89cfa6452db0a66da77897c0a04478304de26c8b2b36613e08d43" +checksum = "52101bd454754c506305ab897af5ac2ae41fe91e3272c1ff5c6a02a089dfaefd" dependencies = [ "anyhow", - "lzma-rs", + "flate2", "serde", ] [[package]] name = "lindera-dictionary" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e219722c9f56b920c231210e7c25d8b5d35b508e7a2fd69d368916c4b1c926f6" +checksum = "af1c6668848f1d30d216c99093a3ed3fe125c105fa12a4aeed5a1861dc01dd52" dependencies = [ "anyhow", "bincode", @@ -2235,15 +2244,16 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c8e87c8362c724e8188fb7d9b6d184cac15d01369295e9bff7812b630d57e3b" +checksum = "693098007200fa43fd5cdc9ca8740f371327369672ce812cd87a1f6344971e31" dependencies = [ "bincode", "byteorder", "encoding", "flate2", "lindera-core", + "lindera-decompress", "lindera-ipadic-builder", "once_cell", "tar", @@ -2251,19 +2261,19 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1439e95852e444a116424086dc64d709c90e8af269ff7d2c2c4020f666f8dfab" +checksum = "7b6b7240d097a8fc37ee8f90ebff02c4db0ba5325ecb0dacb6da3724596798c9" dependencies = [ "anyhow", "bincode", "byteorder", - "clap 3.2.23", "csv", "encoding_rs", "encoding_rs_io", - "env_logger", + "env_logger 0.10.0", "glob", + "lindera-compress", "lindera-core", "lindera-decompress", "log", @@ -2273,15 +2283,16 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb15f949220da45872d774b7831bb030855ec083435c907499782f8558c8a203" +checksum = "abd3c5a4addeb61ca66788a3dd1fd51093e6cd8fea1d997042ada5aa60e8cc5e" dependencies = [ "bincode", "byteorder", "encoding", "flate2", "lindera-core", + "lindera-decompress", "lindera-ko-dic-builder", "once_cell", "tar", @@ -2289,18 +2300,18 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde5a7352f4754be4f741e90bf4dff38a12a6572ab3880d0cf688e1166b8d82b" +checksum = "512bb1393a9281e0b13704319d1343b7931416865852d9d7b7c0178431518326" dependencies = [ "anyhow", "bincode", "byteorder", - "clap 3.2.23", "csv", "encoding", - "env_logger", + "env_logger 0.10.0", "glob", + "lindera-compress", "lindera-core", "lindera-decompress", "log", @@ -2309,17 +2320,16 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1451b2ed8a7184a5f815d84f99d358c1d67297305831453dfdc0eb5d08e22b5" +checksum = "7f575a27f8ba67c15fe16ebf7d277a0ac04e8c8a0f72670ebc2443da9d41c450" dependencies = [ "anyhow", "bincode", "byteorder", - "clap 3.2.23", "csv", "encoding", - "env_logger", + "env_logger 0.10.0", "glob", "lindera-core", "lindera-decompress", @@ -2408,16 +2418,6 @@ dependencies = [ "syn 1.0.107", ] -[[package]] -name = "lzma-rs" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba8ecb0450dfabce4ad72085eed0a75dffe8f21f7ada05638564ea9db2d7fb1" -dependencies = [ - "byteorder", - "crc", -] - [[package]] name = "manifest-dir-macros" version = "0.1.16" @@ -2475,7 +2475,7 @@ dependencies = [ "deserr", "dump", "either", - "env_logger", + "env_logger 0.9.3", "file-store", "flate2", "fst", @@ -4113,6 +4113,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" +[[package]] +name = "unicode-blocks" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9de2be6bad6f56ce8373d377e611cbb2265de3a656138065609ce82e217aad70" + [[package]] name = "unicode-ident" version = "1.0.6" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b6449c5db..2e5d3f376 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -9,8 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" bstr = "1.0.1" byteorder = "1.4.3" -# charabia = { version = "0.7.0", default-features = false } -charabia = { git = "https://github.com/meilisearch/charabia", branch = "fix-script-lang-serialization", default-features = false } +charabia = { version = "0.7.1", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.6" deserr = "0.1.4" From 119e6d8811c83dd50ccca9a7bdb45e047b63569c Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 20 Feb 2023 15:33:10 +0100 Subject: [PATCH 19/21] Update milli/src/search/mod.rs Co-authored-by: Tamo --- milli/src/search/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f6970fcd1..451a3d56e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -451,6 +451,7 @@ mod test { use super::*; use crate::index::tests::TempIndex; + #[cfg(feature = "default")] #[test] fn test_kanji_language_detection() { let index = TempIndex::new(); From 23f4e82b53b1d93944a949be2770fe2b6d271cd5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 20 Feb 2023 15:43:29 +0100 Subject: [PATCH 20/21] Add test ensuring that Meilisearch works on kanji only requests --- meilisearch/tests/search/mod.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 60ffa6cee..91ff64d37 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -148,6 +148,28 @@ async fn simple_search() { .await; } +#[cfg(feature = "default")] +#[actix_rt::test] +async fn test_kanji_language_detection() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = json!([ + { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": 1, "title": "東京のお寿司。" }, + { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" } + ]); + index.add_documents(documents, None).await; + index.wait_task(0).await; + + index + .search(json!({"q": "東京"}), |response, code| { + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + }) + .await; +} + #[actix_rt::test] async fn search_multiple_params() { let server = Server::new().await; From bbecab8948748908b730a197823c499f07205986 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 21 Feb 2023 10:18:44 +0100 Subject: [PATCH 21/21] fix clippy --- .../index_documents/extract/extract_docid_word_positions.rs | 4 +++- milli/src/update/index_documents/extract/mod.rs | 2 +- milli/src/update/index_documents/typed_chunk.rs | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index e091df6b8..2d51fcc1a 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -13,6 +13,8 @@ use crate::{ absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, }; +pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>; + /// Extracts the word and positions where this word appear and /// prefixes it by the document id. /// @@ -25,7 +27,7 @@ pub fn extract_docid_word_positions( searchable_fields: &Option>, stop_words: Option<&fst::Set<&[u8]>>, max_positions_per_attributes: Option, -) -> Result<(RoaringBitmap, grenad::Reader, HashMap<(Script, Language), RoaringBitmap>)> { +) -> Result<(RoaringBitmap, grenad::Reader, ScriptLanguageDocidsMap)> { let max_positions_per_attributes = max_positions_per_attributes .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index f38bdd497..c0f07cf79 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -260,7 +260,7 @@ fn send_and_extract_flattened_documents_data( let (documents_ids, docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions( flattened_documents_chunk.clone(), - indexer.clone(), + indexer, searchable_fields, stop_words.as_ref(), max_positions_per_attributes, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 35f09c051..b9b11cfa8 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -224,8 +224,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut new_value_buffer = Vec::new(); serialize_roaring_bitmap(&value, &mut new_value_buffer)?; merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?; - let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?; - merged_db_values + RoaringBitmap::deserialize_from(&buffer[..])? } None => value, };