diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index be9b479bb..66b2c768b 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -1,9 +1,9 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
 use std::{io, mem, str};
 
-use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
+use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder};
 use roaring::RoaringBitmap;
 use serde_json::Value;
 
@@ -25,12 +25,13 @@ pub fn extract_docid_word_positions(
     searchable_fields: &Option<HashSet<FieldId>>,
     stop_words: Option<&fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(RoaringBitmap, grenad::Reader<File>)> {
+) -> Result<(RoaringBitmap, grenad::Reader<File>, HashMap<(Script, Language), RoaringBitmap>)> {
     let max_positions_per_attributes = max_positions_per_attributes
         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
 
     let mut documents_ids = RoaringBitmap::new();
+    let mut script_language_pair = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
         concat_u32s_array,
@@ -70,6 +71,12 @@ pub fn extract_docid_word_positions(
             .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
         for (index, token) in tokens {
+            let script = token.script;
+            let language = token.language.unwrap_or_default();
+            let entry = script_language_pair
+                .entry((script, language))
+                .or_insert_with(RoaringBitmap::new);
+            entry.push(document_id);
             let token = token.lemma().trim();
             if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                 key_buffer.truncate(mem::size_of::<u32>());
@@ -88,7 +95,7 @@
         }
     }
 
-    sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader))
+    sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader, script_language_pair))
 }
 
 /// Transform a JSON value into a string that can be indexed.
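Note on the accumulation above: `RoaringBitmap::push` only appends a value strictly greater than the bitmap's current maximum (returning `false` otherwise), so this relies on document ids arriving in ascending order from the sorted chunk; the repeated pushes of the same id, one per token of a document, are harmless no-ops. A minimal, self-contained sketch of the grouping pattern, using plain string pairs in place of charabia's `(Script, Language)` and made-up document ids:

```rust
use std::collections::HashMap;

use roaring::RoaringBitmap;

fn main() {
    // Each tuple mimics one token: (script, language, docid of the document
    // being tokenized). The values are illustrative only.
    let tokens = [("Latn", "eng", 0u32), ("Latn", "eng", 0), ("Cyrl", "rus", 1)];

    let mut script_language_pair: HashMap<(&str, &str), RoaringBitmap> = HashMap::new();
    for (script, language, document_id) in tokens {
        let entry = script_language_pair
            .entry((script, language))
            .or_insert_with(RoaringBitmap::new);
        // `push` appends only when document_id exceeds the bitmap's current
        // max, so consecutive tokens of one document insert its docid once.
        entry.push(document_id);
    }

    assert_eq!(script_language_pair[&("Latn", "eng")].len(), 1);
    assert_eq!(script_language_pair.len(), 2);
}
```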
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index e696ed44b..540b8993b 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -257,7 +257,7 @@ fn send_and_extract_flattened_documents_data(
     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
-                let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions(
+                let (documents_ids, docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions(
                     flattened_documents_chunk.clone(),
                     indexer,
                     searchable_fields,
@@ -274,6 +274,8 @@
                 let _ = lmdb_writer_sx
                     .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
 
+                let _ = lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
+
                 Ok(docid_word_positions_chunk)
             },
             || {
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 16784bd92..920971eec 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -1,8 +1,10 @@
 use std::borrow::Cow;
+use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io;
 
+use charabia::{Language, Script};
 use grenad::MergerBuilder;
 use heed::types::ByteSlice;
 use heed::{BytesDecode, RwTxn};
@@ -16,10 +18,7 @@ use super::{ClonableMmap, MergeFn};
 use crate::facet::FacetType;
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::as_cloneable_grenad;
-use crate::{
-    lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
-    Result,
-};
+use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, lat_lng_to_xyz};
 
 pub(crate) enum TypedChunk {
     DocidWordPositions(grenad::Reader<CursorClonableMmap>),
@@ -38,6 +37,7 @@ pub(crate) enum TypedChunk {
     FieldIdFacetNumberDocids(grenad::Reader<File>),
     FieldIdFacetExistsDocids(grenad::Reader<File>),
     GeoPoints(grenad::Reader<File>),
+    ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
 }
 
 /// Write typed chunk in the corresponding LMDB database of the provided index.
@@ -210,6 +210,25 @@ pub(crate) fn write_typed_chunk_into_index(
             index.put_geo_rtree(wtxn, &rtree)?;
             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
         }
+        TypedChunk::ScriptLanguageDocids(hash_pair) => {
+            let mut buffer = Vec::new();
+            for (key, value) in hash_pair {
+                buffer.clear();
+                let final_value = match index.script_language_docids.get(wtxn, &key)? {
+                    Some(db_values) => {
+                        let mut db_value_buffer = Vec::new();
+                        serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
+                        let mut new_value_buffer = Vec::new();
+                        serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
+                        merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
+                        let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?;
+                        merged_db_values
+                    }
+                    None => value,
+                };
+                index.script_language_docids.put(wtxn, &key, &final_value)?;
+            }
+        }
     }
 
     Ok((RoaringBitmap::new(), is_merged_database))
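The `ScriptLanguageDocids` arm above performs a read-merge-write upsert: it serializes both the stored and the incoming bitmap so it can reuse the `merge_roaring_bitmaps` helper, then deserializes the merged bytes back. Assuming that helper amounts to a set union, the same upsert can be sketched directly with roaring's `|` operator over a plain `HashMap` standing in for the LMDB database (the function name and key type here are illustrative, not milli's):

```rust
use std::collections::HashMap;

use roaring::RoaringBitmap;

// Stand-in for index.script_language_docids: a (script, language) key mapped
// to the docids stored so far. The real code reads and writes through an
// LMDB write transaction instead.
type Store = HashMap<(u8, u8), RoaringBitmap>;

fn upsert_script_language_docids(store: &mut Store, incoming: HashMap<(u8, u8), RoaringBitmap>) {
    for (key, value) in incoming {
        let final_value = match store.get(&key) {
            // Union the already-stored docids with the new chunk's docids,
            // which is what the serialize/merge/deserialize round-trip computes.
            Some(db_values) => db_values | &value,
            None => value,
        };
        store.insert(key, final_value);
    }
}
```

If `merge_roaring_bitmaps` does more than a union, this shortcut would not apply; the sketch only captures the shape of the upsert, not the helper's exact semantics.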