mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-04-13 10:01:41 +02:00
factorize word fst building
This commit is contained in:
parent
ed826a8c8b
commit
b6e0235abe
@ -27,6 +27,7 @@ pub use self::enrich::{extract_finite_float_from_value, DocumentId};
|
||||
pub use self::helpers::*;
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
use super::facet::clear_facet_levels_based_on_settings_diff;
|
||||
use super::new::indexer::post_processing::compute_word_fst_no_progress;
|
||||
use super::new::StdResult;
|
||||
use crate::database_stats::DatabaseStats;
|
||||
use crate::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
@ -472,6 +473,9 @@ where
|
||||
clear_facet_levels_based_on_settings_diff(self.wtxn, self.index, &settings_diff)?;
|
||||
}
|
||||
|
||||
// compute the word fst
|
||||
compute_word_fst_no_progress(self.index, self.wtxn)?;
|
||||
|
||||
Ok(())
|
||||
}).map_err(InternalError::from)??;
|
||||
|
||||
|
@ -13,7 +13,6 @@ use roaring::RoaringBitmap;
|
||||
use super::helpers::{
|
||||
self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
|
||||
CursorClonableMmap, KeepFirst, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
|
||||
MergeIgnoreValues,
|
||||
};
|
||||
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||
use crate::facet::FacetType;
|
||||
@ -22,9 +21,7 @@ use crate::index::IndexEmbeddingConfig;
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
use crate::update::index_documents::helpers::{
|
||||
as_cloneable_grenad, try_split_array_at, KeepLatestObkv,
|
||||
};
|
||||
use crate::update::index_documents::helpers::{try_split_array_at, KeepLatestObkv};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::ArroyWrapper;
|
||||
use crate::{
|
||||
@ -262,7 +259,6 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let mut word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut exact_word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut word_fid_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut fst_merger_builder = MergerBuilder::new(MergeIgnoreValues);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
@ -272,12 +268,10 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||
|
||||
word_docids_builder.push(word_docids_reader.into_cursor()?);
|
||||
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
|
||||
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
|
||||
}
|
||||
|
||||
let word_docids_merger = word_docids_builder.build();
|
||||
@ -307,17 +301,6 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
// create fst from word docids
|
||||
let fst_merger = fst_merger_builder.build();
|
||||
let fst = merge_word_docids_reader_into_fst(fst_merger)?;
|
||||
let db_fst = index.words_fst(wtxn)?;
|
||||
|
||||
// merge new fst with database fst
|
||||
let union_stream = fst.op().add(db_fst.stream()).union();
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.extend_stream(union_stream)?;
|
||||
let fst = builder.into_set();
|
||||
index.put_words_fst(wtxn, &fst)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPositionDocids(_) => {
|
||||
@ -744,23 +727,6 @@ pub fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
|
||||
GeoPoint::new(xyz_point, (docid, point))
|
||||
}
|
||||
|
||||
/// Builds an in-memory `fst::Set` from the keys produced by `merger`.
///
/// Only the key of each merged entry is inserted into the set; the associated
/// value is ignored.
///
/// # Errors
///
/// Propagates (via `?`) any error raised while creating the merger iterator,
/// while advancing it, or while inserting a key into the set builder.
///
/// NOTE(review): `fst` set builders normally expect keys to be inserted in
/// strictly increasing lexicographic order — presumably the merger yields
/// sorted, de-duplicated keys; confirm against the `Merger` implementation.
fn merge_word_docids_reader_into_fst<MF>(
|
||||
merger: Merger<CursorClonableMmap, MF>,
|
||||
) -> Result<fst::Set<Vec<u8>>>
|
||||
where
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
// The set is accumulated in memory rather than written to a file.
let mut builder = fst::SetBuilder::memory();
|
||||
|
|
||||
// `next()` is fallible here, so the loop is driven manually with `while let`
// and each step is unwrapped with `?`; only the key `k` is kept.
while let Some((k, _)) = iter.next()? {
|
||||
builder.insert(k)?;
|
||||
}
|
||||
|
|
||||
Ok(builder.into_set())
|
||||
}
|
||||
|
||||
/// Write provided entries in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exists in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
|
@ -30,7 +30,7 @@ mod document_operation;
|
||||
mod extract;
|
||||
mod guess_primary_key;
|
||||
mod partial_dump;
|
||||
mod post_processing;
|
||||
pub(crate) mod post_processing;
|
||||
mod update_by_function;
|
||||
mod write;
|
||||
|
||||
|
@ -84,8 +84,20 @@ fn compute_word_fst(
|
||||
wtxn: &mut RwTxn,
|
||||
progress: &Progress,
|
||||
) -> Result<Option<PrefixDelta>> {
|
||||
let rtxn = index.read_txn()?;
|
||||
progress.update_progress(PostProcessingWords::WordFst);
|
||||
compute_word_fst_no_progress(index, wtxn)
|
||||
}
|
||||
|
||||
/// Compute the word fst without updating the progress.
|
||||
///
|
||||
/// This is used by the old indexer.
|
||||
///
|
||||
/// TODO: remove this function once the old indexer is removed.
|
||||
pub fn compute_word_fst_no_progress(
|
||||
index: &Index,
|
||||
wtxn: &mut RwTxn,
|
||||
) -> Result<Option<PrefixDelta>> {
|
||||
let rtxn = index.read_txn()?;
|
||||
|
||||
let words_fst = index.words_fst(&rtxn)?;
|
||||
let mut word_fst_builder = WordFstBuilder::new(&words_fst)?;
|
||||
|
Loading…
x
Reference in New Issue
Block a user