Lazily compute the FSTs during indexing

This commit is contained in:
ManyTheFish 2025-04-03 16:04:35 +02:00
parent 418fa47963
commit 6a5a834f27
4 changed files with 63 additions and 34 deletions

View File

@ -144,7 +144,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
let mut merger_iter = builder.build().into_stream_merger_iter()?;
let mut current_field_id = None;
let mut fst;
let mut fst_merger_builder: Option<FstMergerBuilder> = None;
let mut fst_merger_builder: Option<FstMergerBuilder<_>> = None;
while let Some((key, deladd)) = merger_iter.next()? {
let (field_id, normalized_facet_string) =
BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?;
@ -153,13 +153,14 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
if let (Some(current_field_id), Some(fst_merger_builder)) =
(current_field_id, fst_merger_builder)
{
let mmap = fst_merger_builder.build(&mut callback)?;
if let Some(mmap) = fst_merger_builder.build(&mut callback)? {
index.facet_id_string_fst.remap_data_type::<Bytes>().put(
wtxn,
&current_field_id,
&mmap,
)?;
}
}
fst = index.facet_id_string_fst.get(rtxn, &field_id)?;
fst_merger_builder = Some(FstMergerBuilder::new(fst.as_ref())?);
@ -209,9 +210,10 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
}
if let (Some(field_id), Some(fst_merger_builder)) = (current_field_id, fst_merger_builder) {
let mmap = fst_merger_builder.build(&mut callback)?;
if let Some(mmap) = fst_merger_builder.build(&mut callback)? {
index.facet_id_string_fst.remap_data_type::<Bytes>().put(wtxn, &field_id, &mmap)?;
}
}
Ok(())
}

View File

@ -1,25 +1,27 @@
use std::fs::File;
use std::io::BufWriter;
use fst::{Set, SetBuilder, Streamer};
use fst::{IntoStreamer, Set, SetBuilder, Streamer};
use memmap2::Mmap;
use tempfile::tempfile;
use crate::update::del_add::DelAdd;
use crate::{InternalError, Result};
pub struct FstMergerBuilder<'a> {
pub struct FstMergerBuilder<'a, D: AsRef<[u8]>> {
fst: Option<&'a Set<D>>,
stream: Option<fst::set::Stream<'a>>,
fst_builder: SetBuilder<BufWriter<File>>,
fst_builder: Option<SetBuilder<BufWriter<File>>>,
last: Option<Vec<u8>>,
inserted_words: usize,
}
impl<'a> FstMergerBuilder<'a> {
pub fn new<D: AsRef<[u8]>>(fst: Option<&'a Set<D>>) -> Result<Self> {
impl<'a, D: AsRef<[u8]>> FstMergerBuilder<'a, D> {
pub fn new(fst: Option<&'a Set<D>>) -> Result<Self> {
Ok(Self {
fst,
stream: fst.map(|fst| fst.stream()),
fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?,
fst_builder: None,
last: None,
inserted_words: 0,
})
@ -110,11 +112,17 @@ impl<'a> FstMergerBuilder<'a> {
is_modified: bool,
insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
) -> Result<()> {
if is_modified && self.fst_builder.is_none() {
self.build_new_fst(bytes)?;
}
if let Some(fst_builder) = self.fst_builder.as_mut() {
// Addition: We insert the word
// Deletion: We delete the word by not inserting it
if deladd == DelAdd::Addition {
self.inserted_words += 1;
self.fst_builder.insert(bytes)?;
fst_builder.insert(bytes)?;
}
}
insertion_callback(bytes, deladd, is_modified)?;
@ -122,6 +130,19 @@ impl<'a> FstMergerBuilder<'a> {
Ok(())
}
/// Lazily creates the output FST builder.
///
/// The builder writes to a buffered temporary file. When an existing FST is
/// attached to `self`, every key strictly lower than `bytes` is copied into the
/// new builder first, so the output already holds the entries that precede
/// `bytes` in the existing set.
///
/// # Errors
///
/// Propagates any failure from creating the temporary file, initialising the
/// set builder, or streaming the existing keys into it.
fn build_new_fst(&mut self, bytes: &[u8]) -> Result<()> {
    let backing_file = tempfile()?;
    let mut new_builder = SetBuilder::new(BufWriter::new(backing_file))?;

    // Seed the fresh builder with the part of the previous set that sorts
    // before `bytes`.
    if let Some(previous) = self.fst {
        let preceding_keys = previous.range().lt(bytes).into_stream();
        new_builder.extend_stream(preceding_keys)?;
    }

    self.fst_builder = Some(new_builder);
    Ok(())
}
fn drain_stream(
&mut self,
insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
@ -142,16 +163,20 @@ impl<'a> FstMergerBuilder<'a> {
pub fn build(
mut self,
insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
) -> Result<Mmap> {
) -> Result<Option<Mmap>> {
self.drain_stream(insertion_callback)?;
let fst_file = self
.fst_builder
match self.fst_builder {
Some(fst_builder) => {
let fst_file = fst_builder
.into_inner()?
.into_inner()
.map_err(|_| InternalError::IndexingMergingKeys { process: "building-fst" })?;
let fst_mmap = unsafe { Mmap::map(&fst_file)? };
Ok(fst_mmap)
Ok(Some(fst_mmap))
}
None => Ok(None),
}
}
}

View File

@ -118,7 +118,9 @@ fn compute_word_fst(
}
let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, &rtxn)?;
if let Some(word_fst_mmap) = word_fst_mmap {
index.main.remap_types::<Str, Bytes>().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?;
}
if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data {
index.main.remap_types::<Str, Bytes>().put(
wtxn,

View File

@ -10,14 +10,14 @@ use crate::index::PrefixSettings;
use crate::update::del_add::DelAdd;
use crate::{InternalError, Prefix, Result};
pub struct WordFstBuilder<'a> {
word_fst_builder: FstMergerBuilder<'a>,
pub struct WordFstBuilder<'a, D: AsRef<[u8]>> {
word_fst_builder: FstMergerBuilder<'a, D>,
prefix_fst_builder: Option<PrefixFstBuilder>,
registered_words: usize,
}
impl<'a> WordFstBuilder<'a> {
pub fn new(words_fst: &'a Set<std::borrow::Cow<'a, [u8]>>) -> Result<Self> {
impl<'a, D: AsRef<[u8]>> WordFstBuilder<'a, D> {
pub fn new(words_fst: &'a Set<D>) -> Result<Self> {
Ok(Self {
word_fst_builder: FstMergerBuilder::new(Some(words_fst))?,
prefix_fst_builder: None,
@ -50,7 +50,7 @@ impl<'a> WordFstBuilder<'a> {
mut self,
index: &crate::Index,
rtxn: &heed::RoTxn,
) -> Result<(Mmap, Option<PrefixData>)> {
) -> Result<(Option<Mmap>, Option<PrefixData>)> {
let words_fst_mmap = self.word_fst_builder.build(&mut |bytes, deladd, is_modified| {
if let Some(prefix_fst_builder) = &mut self.prefix_fst_builder {
prefix_fst_builder.insert_word(bytes, deladd, is_modified)