diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index e02e492d2..36258b275 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -14,7 +14,7 @@ use crate::error::{InternalError, SerializationError};
 use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
 use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
 
-pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
+pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
 
 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
@@ -30,11 +30,7 @@ pub fn extract_docid_word_positions(
     allowed_separators: Option<&[&str]>,
     dictionary: Option<&[&str]>,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(
-    RoaringBitmap,
-    grenad::Reader<BufReader<File>>,
-    (ScriptLanguageDocidsMap, ScriptLanguageDocidsMap),
-)> {
+) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
     puffin::profile_function!();
 
     let max_positions_per_attributes = max_positions_per_attributes
@@ -43,8 +39,7 @@ pub fn extract_docid_word_positions(
 
     // initialize destination values.
     let mut documents_ids = RoaringBitmap::new();
-    let mut del_script_language_docids = HashMap::new();
-    let mut add_script_language_docids = HashMap::new();
+    let mut script_language_docids = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
         keep_latest_obkv,
@@ -138,25 +133,24 @@ pub fn extract_docid_word_positions(
         // update script_language_docids deletions.
         for (script, languages_frequency) in del_script_language_word_count {
             for (language, _) in languages_frequency {
-                let entry = del_script_language_docids
+                let entry = script_language_docids
                     .entry((script, language))
-                    .or_insert_with(RoaringBitmap::new);
-                entry.push(document_id);
+                    .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
+                entry.0.push(document_id);
             }
         }
 
         // update script_language_docids additions.
         for (script, languages_frequency) in add_script_language_word_count {
             for (language, _) in languages_frequency {
-                let entry = add_script_language_docids
+                let entry = script_language_docids
                     .entry((script, language))
-                    .or_insert_with(RoaringBitmap::new);
-                entry.push(document_id);
+                    .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
+                entry.1.push(document_id);
             }
         }
     }
 
-    let script_language_docids = (del_script_language_docids, add_script_language_docids);
     sorter_into_reader(docid_word_positions_sorter, indexer)
         .map(|reader| (documents_ids, reader, script_language_docids))
 }
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index f2dc7d336..e3ff9b253 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -43,9 +43,7 @@ pub(crate) enum TypedChunk {
     FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
     GeoPoints(grenad::Reader<BufReader<File>>),
     VectorPoints(grenad::Reader<BufReader<File>>),
-    ScriptLanguageDocids(
-        (HashMap<(Script, Language), RoaringBitmap>, HashMap<(Script, Language), RoaringBitmap>),
-    ),
+    ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
 }
 
 impl TypedChunk {
@@ -103,8 +101,8 @@ impl TypedChunk {
             TypedChunk::VectorPoints(grenad) => {
                 format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
             }
-            TypedChunk::ScriptLanguageDocids((_, addition)) => {
-                format!("ScriptLanguageDocids {{ number_of_entries: {} }}", addition.len())
+            TypedChunk::ScriptLanguageDocids(sl_map) => {
+                format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
             }
         }
     }
@@ -346,24 +344,25 @@ pub(crate) fn write_typed_chunk_into_index(
             log::debug!("There are {} entries in the HNSW so far", hnsw_length);
             index.put_vector_hnsw(wtxn, &new_hnsw)?;
         }
-        TypedChunk::ScriptLanguageDocids((deletion, addition)) => {
-            for (key, value) in deletion {
-                if let Some(mut db_values) = index.script_language_docids.get(wtxn, &key)? {
-                    db_values -= value;
-                    if db_values.is_empty() {
-                        index.script_language_docids.delete(wtxn, &key)?;
-                    } else {
-                        index.script_language_docids.put(wtxn, &key, &db_values)?;
-                    }
-                }
-            }
-
-            for (key, value) in addition {
+        TypedChunk::ScriptLanguageDocids(sl_map) => {
+            for (key, (deletion, addition)) in sl_map {
+                let mut db_key_exists = false;
                 let final_value = match index.script_language_docids.get(wtxn, &key)? {
-                    Some(mut db_values) => db_values | value,
-                    None => value,
+                    Some(db_values) => {
+                        db_key_exists = true;
+                        (db_values - deletion) | addition
+                    }
+                    None => addition,
                 };
-                index.script_language_docids.put(wtxn, &key, &final_value)?;
+
+                if final_value.is_empty() {
+                    // If the database entry exists, delete it.
+                    if db_key_exists == true {
+                        index.script_language_docids.delete(wtxn, &key)?;
+                    }
+                } else {
+                    index.script_language_docids.put(wtxn, &key, &final_value)?;
+                }
             }
         }
     }
@@ -388,13 +387,6 @@ fn merge_word_docids_reader_into_fst(
     Ok(builder.into_set())
 }
 
-fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
-    let new_value = RoaringBitmap::deserialize_from(new_value)?;
-    let db_value = RoaringBitmap::deserialize_from(db_value)?;
-    let value = new_value | db_value;
-    Ok(serialize_roaring_bitmap(&value, buffer)?)
-}
-
 fn merge_cbo_roaring_bitmaps(
     new_value: &[u8],
     db_value: &[u8],
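Note on the new merge rule in write_typed_chunk_into_index: each (Script, Language) key now carries a (deletions, additions) pair, and the database value is updated in one pass as (db_values - deletion) | addition, with the key removed when the result is empty. Below is a minimal, self-contained sketch of that rule using only the roaring crate; the helper name apply_del_add, the FakeDb alias, the string keys standing in for (Script, Language), and the in-memory HashMap standing in for the heed database are illustrative assumptions, not part of the patch.

use std::collections::HashMap;

use roaring::RoaringBitmap;

// Illustrative stand-in for the heed database: keys are simplified to strings
// instead of (Script, Language), values are the stored docid bitmaps.
type FakeDb = HashMap<String, RoaringBitmap>;

// Apply one (deletion, addition) pair to the stored bitmap, mirroring the
// patch: subtract the deleted docids, union the added ones, and drop the
// entry entirely when nothing remains.
fn apply_del_add(db: &mut FakeDb, key: &str, deletion: RoaringBitmap, addition: RoaringBitmap) {
    let db_key_exists = db.contains_key(key);
    let final_value = match db.get(key).cloned() {
        Some(db_values) => (db_values - deletion) | addition,
        None => addition,
    };

    if final_value.is_empty() {
        // Only issue a delete when the key was actually stored.
        if db_key_exists {
            db.remove(key);
        }
    } else {
        db.insert(key.to_string(), final_value);
    }
}

fn main() {
    let mut db = FakeDb::new();
    db.insert("Latin/English".to_string(), [1u32, 2, 3].into_iter().collect());

    // Document 2 no longer contains Latin/English words; document 4 now does.
    apply_del_add(
        &mut db,
        "Latin/English",
        [2u32].into_iter().collect(),
        [4u32].into_iter().collect(),
    );
    let expected: RoaringBitmap = [1u32, 3, 4].into_iter().collect();
    assert_eq!(db["Latin/English"], expected);

    // Deleting every remaining docid removes the entry instead of keeping an
    // empty bitmap around.
    apply_del_add(
        &mut db,
        "Latin/English",
        [1u32, 3, 4].into_iter().collect(),
        RoaringBitmap::new(),
    );
    assert!(!db.contains_key("Latin/English"));
}

Keeping both bitmaps under one key is what lets the single loop over sl_map in the patch touch each (Script, Language) entry once per chunk, instead of the previous two passes over separate deletion and addition maps.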