mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
Make the script language docids map take a tuple of roaring bitmaps expressing the deletions and the additions
This commit is contained in:
parent
e2bc054604
commit
2597bbd107
@ -14,7 +14,7 @@ use crate::error::{InternalError, SerializationError};
|
|||||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||||
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||||
|
|
||||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
|
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
|
||||||
|
|
||||||
/// Extracts the word and positions where this word appear and
|
/// Extracts the word and positions where this word appear and
|
||||||
/// prefixes it by the document id.
|
/// prefixes it by the document id.
|
||||||
@ -30,11 +30,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
allowed_separators: Option<&[&str]>,
|
allowed_separators: Option<&[&str]>,
|
||||||
dictionary: Option<&[&str]>,
|
dictionary: Option<&[&str]>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(
|
) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||||
RoaringBitmap,
|
|
||||||
grenad::Reader<BufReader<File>>,
|
|
||||||
(ScriptLanguageDocidsMap, ScriptLanguageDocidsMap),
|
|
||||||
)> {
|
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_positions_per_attributes = max_positions_per_attributes
|
let max_positions_per_attributes = max_positions_per_attributes
|
||||||
@ -43,8 +39,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
// initialize destination values.
|
// initialize destination values.
|
||||||
let mut documents_ids = RoaringBitmap::new();
|
let mut documents_ids = RoaringBitmap::new();
|
||||||
let mut del_script_language_docids = HashMap::new();
|
let mut script_language_docids = HashMap::new();
|
||||||
let mut add_script_language_docids = HashMap::new();
|
|
||||||
let mut docid_word_positions_sorter = create_sorter(
|
let mut docid_word_positions_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Stable,
|
grenad::SortAlgorithm::Stable,
|
||||||
keep_latest_obkv,
|
keep_latest_obkv,
|
||||||
@ -138,25 +133,24 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
// update script_language_docids deletions.
|
// update script_language_docids deletions.
|
||||||
for (script, languages_frequency) in del_script_language_word_count {
|
for (script, languages_frequency) in del_script_language_word_count {
|
||||||
for (language, _) in languages_frequency {
|
for (language, _) in languages_frequency {
|
||||||
let entry = del_script_language_docids
|
let entry = script_language_docids
|
||||||
.entry((script, language))
|
.entry((script, language))
|
||||||
.or_insert_with(RoaringBitmap::new);
|
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
||||||
entry.push(document_id);
|
entry.0.push(document_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// update script_language_docids additions.
|
// update script_language_docids additions.
|
||||||
for (script, languages_frequency) in add_script_language_word_count {
|
for (script, languages_frequency) in add_script_language_word_count {
|
||||||
for (language, _) in languages_frequency {
|
for (language, _) in languages_frequency {
|
||||||
let entry = add_script_language_docids
|
let entry = script_language_docids
|
||||||
.entry((script, language))
|
.entry((script, language))
|
||||||
.or_insert_with(RoaringBitmap::new);
|
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
||||||
entry.push(document_id);
|
entry.1.push(document_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let script_language_docids = (del_script_language_docids, add_script_language_docids);
|
|
||||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||||
.map(|reader| (documents_ids, reader, script_language_docids))
|
.map(|reader| (documents_ids, reader, script_language_docids))
|
||||||
}
|
}
|
||||||
|
@ -43,9 +43,7 @@ pub(crate) enum TypedChunk {
|
|||||||
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
||||||
GeoPoints(grenad::Reader<BufReader<File>>),
|
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||||
VectorPoints(grenad::Reader<BufReader<File>>),
|
VectorPoints(grenad::Reader<BufReader<File>>),
|
||||||
ScriptLanguageDocids(
|
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
||||||
(HashMap<(Script, Language), RoaringBitmap>, HashMap<(Script, Language), RoaringBitmap>),
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TypedChunk {
|
impl TypedChunk {
|
||||||
@ -103,8 +101,8 @@ impl TypedChunk {
|
|||||||
TypedChunk::VectorPoints(grenad) => {
|
TypedChunk::VectorPoints(grenad) => {
|
||||||
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::ScriptLanguageDocids((_, addition)) => {
|
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||||
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", addition.len())
|
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -346,24 +344,25 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
|
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
|
||||||
index.put_vector_hnsw(wtxn, &new_hnsw)?;
|
index.put_vector_hnsw(wtxn, &new_hnsw)?;
|
||||||
}
|
}
|
||||||
TypedChunk::ScriptLanguageDocids((deletion, addition)) => {
|
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||||
for (key, value) in deletion {
|
for (key, (deletion, addition)) in sl_map {
|
||||||
if let Some(mut db_values) = index.script_language_docids.get(wtxn, &key)? {
|
let mut db_key_exists = false;
|
||||||
db_values -= value;
|
|
||||||
if db_values.is_empty() {
|
|
||||||
index.script_language_docids.delete(wtxn, &key)?;
|
|
||||||
} else {
|
|
||||||
index.script_language_docids.put(wtxn, &key, &db_values)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (key, value) in addition {
|
|
||||||
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
||||||
Some(mut db_values) => db_values | value,
|
Some(db_values) => {
|
||||||
None => value,
|
db_key_exists = true;
|
||||||
|
(db_values - deletion) | addition
|
||||||
|
}
|
||||||
|
None => addition,
|
||||||
};
|
};
|
||||||
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
|
||||||
|
if final_value.is_empty() {
|
||||||
|
// If the database entry exists, delete it.
|
||||||
|
if db_key_exists == true {
|
||||||
|
index.script_language_docids.delete(wtxn, &key)?;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -388,13 +387,6 @@ fn merge_word_docids_reader_into_fst(
|
|||||||
Ok(builder.into_set())
|
Ok(builder.into_set())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
|
|
||||||
let new_value = RoaringBitmap::deserialize_from(new_value)?;
|
|
||||||
let db_value = RoaringBitmap::deserialize_from(db_value)?;
|
|
||||||
let value = new_value | db_value;
|
|
||||||
Ok(serialize_roaring_bitmap(&value, buffer)?)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_cbo_roaring_bitmaps(
|
fn merge_cbo_roaring_bitmaps(
|
||||||
new_value: &[u8],
|
new_value: &[u8],
|
||||||
db_value: &[u8],
|
db_value: &[u8],
|
||||||
|
Loading…
Reference in New Issue
Block a user