factorize word fst building

This commit is contained in:
ManyTheFish 2025-04-10 17:33:45 +02:00
parent ed826a8c8b
commit b6e0235abe
4 changed files with 19 additions and 37 deletions

View File

@ -27,6 +27,7 @@ pub use self::enrich::{extract_finite_float_from_value, DocumentId};
pub use self::helpers::*;
pub use self::transform::{Transform, TransformOutput};
use super::facet::clear_facet_levels_based_on_settings_diff;
use super::new::indexer::post_processing::compute_word_fst_no_progress;
use super::new::StdResult;
use crate::database_stats::DatabaseStats;
use crate::documents::{obkv_to_object, DocumentsBatchReader};
@ -472,6 +473,9 @@ where
clear_facet_levels_based_on_settings_diff(self.wtxn, self.index, &settings_diff)?;
}
// compute the word fst
compute_word_fst_no_progress(self.index, self.wtxn)?;
Ok(())
}).map_err(InternalError::from)??;

View File

@ -13,7 +13,6 @@ use roaring::RoaringBitmap;
use super::helpers::{
self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
CursorClonableMmap, KeepFirst, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
MergeIgnoreValues,
};
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::facet::FacetType;
@ -22,9 +21,7 @@ use crate::index::IndexEmbeddingConfig;
use crate::proximity::MAX_DISTANCE;
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::{
as_cloneable_grenad, try_split_array_at, KeepLatestObkv,
};
use crate::update::index_documents::helpers::{try_split_array_at, KeepLatestObkv};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::ArroyWrapper;
use crate::{
@ -262,7 +259,6 @@ pub(crate) fn write_typed_chunk_into_index(
let mut word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
let mut exact_word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
let mut word_fid_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
let mut fst_merger_builder = MergerBuilder::new(MergeIgnoreValues);
for typed_chunk in typed_chunks {
let TypedChunk::WordDocids {
word_docids_reader,
@ -272,12 +268,10 @@ pub(crate) fn write_typed_chunk_into_index(
else {
unreachable!();
};
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
word_docids_builder.push(word_docids_reader.into_cursor()?);
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
}
let word_docids_merger = word_docids_builder.build();
@ -307,17 +301,6 @@ pub(crate) fn write_typed_chunk_into_index(
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
// create fst from word docids
let fst_merger = fst_merger_builder.build();
let fst = merge_word_docids_reader_into_fst(fst_merger)?;
let db_fst = index.words_fst(wtxn)?;
// merge new fst with database fst
let union_stream = fst.op().add(db_fst.stream()).union();
let mut builder = fst::SetBuilder::memory();
builder.extend_stream(union_stream)?;
let fst = builder.into_set();
index.put_words_fst(wtxn, &fst)?;
is_merged_database = true;
}
TypedChunk::WordPositionDocids(_) => {
@ -744,23 +727,6 @@ pub fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
GeoPoint::new(xyz_point, (docid, point))
}
/// Drains `merger` and collects every key it yields into an in-memory
/// `fst::Set`; the value attached to each key is ignored.
///
/// Any error raised while creating or advancing the merged stream, or while
/// inserting a key into the set builder, is propagated to the caller.
///
/// NOTE(review): this presumably relies on the merger yielding keys in
/// lexicographic order, as the fst set builder expects — confirm against the
/// merger's guarantees.
fn merge_word_docids_reader_into_fst<MF>(
    merger: Merger<CursorClonableMmap, MF>,
) -> Result<fst::Set<Vec<u8>>>
where
    MF: MergeFunction,
    crate::Error: From<MF::Error>,
{
    let mut entries = merger.into_stream_merger_iter()?;
    let mut set_builder = fst::SetBuilder::memory();
    loop {
        // Stop as soon as the merged stream is exhausted.
        let Some((word, _)) = entries.next()? else { break };
        set_builder.insert(word)?;
    }
    let word_set = set_builder.into_set();
    Ok(word_set)
}
/// Write provided entries in database using serialize_value function.
/// merge_values function is used if an entry already exist in the database.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]

View File

@ -30,7 +30,7 @@ mod document_operation;
mod extract;
mod guess_primary_key;
mod partial_dump;
mod post_processing;
pub(crate) mod post_processing;
mod update_by_function;
mod write;

View File

@ -84,8 +84,20 @@ fn compute_word_fst(
wtxn: &mut RwTxn,
progress: &Progress,
) -> Result<Option<PrefixDelta>> {
let rtxn = index.read_txn()?;
progress.update_progress(PostProcessingWords::WordFst);
compute_word_fst_no_progress(index, wtxn)
}
/// Compute the word fst without updating the progress.
///
/// This is used by the old indexer.
///
/// TODO: remove this function once the old indexer is removed.
pub fn compute_word_fst_no_progress(
index: &Index,
wtxn: &mut RwTxn,
) -> Result<Option<PrefixDelta>> {
let rtxn = index.read_txn()?;
let words_fst = index.words_fst(&rtxn)?;
let mut word_fst_builder = WordFstBuilder::new(&words_fst)?;