Update extract_facet_string_docids to support deladd obkvs

This commit is contained in:
Clément Renault 2023-10-18 18:06:41 +02:00
parent 5c43ff72c1
commit 061f490204
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
2 changed files with 20 additions and 23 deletions

View File

@ -1,13 +1,14 @@
use std::fs::File; use std::fs::File;
use std::io; use std::{io, str};
use heed::BytesEncode; use heed::BytesEncode;
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::StrRefCodec; use crate::heed_codec::StrRefCodec;
use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
use crate::{FieldId, Result};
/// Extracts the facet string and the document ids where this facet string appears. /// Extracts the facet string and the document ids where this facet string appears.
/// ///
@ -15,7 +16,6 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
/// documents ids from the given chunk of docid facet string positions. /// documents ids from the given chunk of docid facet string positions.
#[logging_timer::time] #[logging_timer::time]
pub fn extract_facet_string_docids<R: io::Read + io::Seek>( pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
// TODO Reader<Key, Obkv<DelAdd, OriginalString>>
docid_fid_facet_string: grenad::Reader<R>, docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> { ) -> Result<grenad::Reader<File>> {
@ -25,17 +25,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
let mut facet_string_docids_sorter = create_sorter( let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable, grenad::SortAlgorithm::Stable,
// TODO We must modify the merger to do unions of Del and Add separately merge_deladd_cbo_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
indexer.max_nb_chunks, indexer.max_nb_chunks,
max_memory, max_memory,
); );
let mut buffer = Vec::new();
let mut cursor = docid_fid_facet_string.into_cursor()?; let mut cursor = docid_fid_facet_string.into_cursor()?;
while let Some((key, _original_value_bytes)) = cursor.move_on_next()? { while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
// TODO the value is a Obkv<DelAdd, OriginalString> and must be taken into account
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes); let field_id = FieldId::from_be_bytes(field_id_bytes);
@ -43,22 +42,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
try_split_array_at::<_, 4>(bytes).unwrap(); try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes); let document_id = u32::from_be_bytes(document_id_bytes);
let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?; let normalized_value = str::from_utf8(normalized_value_bytes)?;
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
let normalised_truncated_value: String;
if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
normalised_truncated_value = normalised_value
.char_indices()
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
normalised_value = normalised_truncated_value.as_str();
}
let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap(); let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
// document id is encoded in native-endian because of the CBO roaring bitmap codec
// TODO Reader<KeyBytes, Obkv<DelAdd, RoaringBitmap>> buffer.clear();
facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
}
obkv.finish()?;
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
} }
sorter_into_reader(facet_string_docids_sorter, indexer) sorter_into_reader(facet_string_docids_sorter, indexer)

View File

@ -193,6 +193,7 @@ pub fn obkvs_keep_last_addition_merge_deletions<'a>(
inner_merge_del_add_obkvs(obkvs, false) inner_merge_del_add_obkvs(obkvs, false)
} }
/// Do a union of all the CboRoaringBitmaps in the values.
pub fn merge_cbo_roaring_bitmaps<'a>( pub fn merge_cbo_roaring_bitmaps<'a>(
_key: &[u8], _key: &[u8],
values: &[Cow<'a, [u8]>], values: &[Cow<'a, [u8]>],
@ -206,6 +207,8 @@ pub fn merge_cbo_roaring_bitmaps<'a>(
} }
} }
/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
/// separately and output a new DelAdd with both unions.
pub fn merge_deladd_cbo_roaring_bitmaps<'a>( pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
_key: &[u8], _key: &[u8],
values: &[Cow<'a, [u8]>], values: &[Cow<'a, [u8]>],