Split position DB into fid and relative position DB

This commit is contained in:
Loïc Lecrenier 2023-03-23 09:22:01 +01:00
parent 56b7209f26
commit 9b2653427d
11 changed files with 162 additions and 135 deletions

View file

@ -7,14 +7,17 @@ use super::helpers::{
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{DocumentId, Result};
use crate::{
absolute_from_relative_position, bucketed_position, relative_from_absolute_position,
DocumentId, Result,
};
/// Extracts the word positions and the documents ids where this word appear.
///
/// Returns a grenad reader with the list of extracted words at positions and
/// documents ids from the given chunk of docid word positions.
#[logging_timer::time]
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
@ -39,11 +42,15 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
for position in read_u32_ne_bytes(value) {
key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
let (fid, position) = relative_from_absolute_position(position);
let position = bucketed_position(position);
let position = absolute_from_relative_position(fid, position);
key_buffer.extend_from_slice(&position.to_be_bytes());
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
}
}
sorter_into_reader(word_position_docids_sorter, indexer)
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
Ok(word_position_docids_reader)
}

View file

@ -23,7 +23,7 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
use self::extract_geo_points::extract_geo_points;
use self::extract_word_docids::extract_word_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
use self::extract_word_position_docids::extract_word_position_docids;
use self::extract_word_position_docids::extract_word_fid_and_position_docids;
use super::helpers::{
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
GrenadParameters, MergeFn, MergeableReader,
@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents(
docid_word_positions_chunks,
indexer,
lmdb_writer_sx.clone(),
extract_word_position_docids,
extract_word_fid_and_position_docids,
merge_cbo_roaring_bitmaps,
TypedChunk::WordPositionDocids,
"word-position-docids",