Simplify facet update after removing Index::faceted_documents_ids

This commit is contained in:
Louis Dureuil 2023-10-23 15:19:33 +02:00
parent 14832cb324
commit 59f88c14b3
No known key found for this signature in database
6 changed files with 13 additions and 43 deletions

View File

@ -1,7 +1,6 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fs::File; use std::fs::File;
use std::mem::size_of;
use std::path::Path; use std::path::Path;
use charabia::{Language, Script}; use charabia::{Language, Script};
@ -14,7 +13,6 @@ use time::OffsetDateTime;
use crate::distance::NDotProductPoint; use crate::distance::NDotProductPoint;
use crate::error::{InternalError, UserError}; use crate::error::{InternalError, UserError};
use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap; use crate::fields_ids_map::FieldsIdsMap;
use crate::heed_codec::facet::{ use crate::heed_codec::facet::{
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,

View File

@ -1,7 +1,6 @@
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use time::OffsetDateTime; use time::OffsetDateTime;
use crate::facet::FacetType;
use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};
pub struct ClearDocuments<'t, 'u, 'i> { pub struct ClearDocuments<'t, 'u, 'i> {
@ -51,7 +50,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
// We retrieve the number of documents ids that we are deleting. // We retrieve the number of documents ids that we are deleting.
let number_of_documents = self.index.number_of_documents(self.wtxn)?; let number_of_documents = self.index.number_of_documents(self.wtxn)?;
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
// We clean some of the main engine datastructures. // We clean some of the main engine datastructures.
self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_words_fst(self.wtxn, &fst::Set::default())?;

View File

@ -1,8 +1,7 @@
use std::borrow::Cow;
use std::fs::File; use std::fs::File;
use std::io::BufReader; use std::io::BufReader;
use grenad::{CompressionType, Reader}; use grenad::CompressionType;
use heed::types::ByteSlice; use heed::types::ByteSlice;
use heed::{BytesEncode, Error, RoTxn, RwTxn}; use heed::{BytesEncode, Error, RoTxn, RwTxn};
use obkv::KvReader; use obkv::KvReader;
@ -82,10 +81,7 @@ impl<'i> FacetsUpdateBulk<'i> {
let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size }; let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { inner.update(wtxn, &field_ids)?;
// TODO: remove the lambda altogether
Ok(())
})?;
Ok(()) Ok(())
} }
@ -99,21 +95,14 @@ pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
pub min_level_size: u8, pub min_level_size: u8,
} }
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
pub fn update( pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> {
mut self,
wtxn: &mut RwTxn,
field_ids: &[u16],
mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>,
) -> Result<()> {
self.update_level0(wtxn)?; self.update_level0(wtxn)?;
for &field_id in field_ids.iter() { for &field_id in field_ids.iter() {
self.clear_levels(wtxn, field_id)?; self.clear_levels(wtxn, field_id)?;
} }
for &field_id in field_ids.iter() { for &field_id in field_ids.iter() {
let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?; let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
handle_all_docids(wtxn, field_id, all_docids)?;
for level_reader in level_readers { for level_reader in level_readers {
let mut cursor = level_reader.into_cursor()?; let mut cursor = level_reader.into_cursor()?;
@ -201,16 +190,10 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
&self, &self,
field_id: FieldId, field_id: FieldId,
txn: &RoTxn, txn: &RoTxn,
) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> { ) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
let mut all_docids = RoaringBitmap::new(); let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
for bitmap in bitmaps {
all_docids |= bitmap;
}
Ok(())
})?;
Ok((subwriters, all_docids)) Ok(subwriters)
} }
#[allow(clippy::type_complexity)] #[allow(clippy::type_complexity)]
fn read_level_0<'t>( fn read_level_0<'t>(

View File

@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::fs::File; use std::fs::File;
use std::io::BufReader; use std::io::BufReader;
@ -15,7 +14,7 @@ use crate::heed_codec::ByteSliceRefCodec;
use crate::search::facet::get_highest_level; use crate::search::facet::get_highest_level;
use crate::update::del_add::DelAdd; use crate::update::del_add::DelAdd;
use crate::update::index_documents::valid_lmdb_key; use crate::update::index_documents::valid_lmdb_key;
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; use crate::{CboRoaringBitmapCodec, Index, Result};
enum InsertionResult { enum InsertionResult {
InPlace, InPlace,
@ -30,16 +29,14 @@ enum DeletionResult {
/// Algorithm to incrementally insert and delete elements into the /// Algorithm to incrementally insert and delete elements into the
/// `facet_id_(string/f64)_docids` databases. /// `facet_id_(string/f64)_docids` databases.
pub struct FacetsUpdateIncremental<'i> { pub struct FacetsUpdateIncremental {
index: &'i Index,
inner: FacetsUpdateIncrementalInner, inner: FacetsUpdateIncrementalInner,
facet_type: FacetType,
delta_data: grenad::Reader<BufReader<File>>, delta_data: grenad::Reader<BufReader<File>>,
} }
impl<'i> FacetsUpdateIncremental<'i> { impl FacetsUpdateIncremental {
pub fn new( pub fn new(
index: &'i Index, index: &Index,
facet_type: FacetType, facet_type: FacetType,
delta_data: grenad::Reader<BufReader<File>>, delta_data: grenad::Reader<BufReader<File>>,
group_size: u8, group_size: u8,
@ -47,7 +44,6 @@ impl<'i> FacetsUpdateIncremental<'i> {
max_group_size: u8, max_group_size: u8,
) -> Self { ) -> Self {
FacetsUpdateIncremental { FacetsUpdateIncremental {
index,
inner: FacetsUpdateIncrementalInner { inner: FacetsUpdateIncrementalInner {
db: match facet_type { db: match facet_type {
FacetType::String => index FacetType::String => index
@ -61,12 +57,11 @@ impl<'i> FacetsUpdateIncremental<'i> {
max_group_size, max_group_size,
min_level_size, min_level_size,
}, },
facet_type,
delta_data, delta_data,
} }
} }
pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
let mut cursor = self.delta_data.into_cursor()?; let mut cursor = self.delta_data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? { while let Some((key, value)) = cursor.move_on_next()? {
if !valid_lmdb_key(key) { if !valid_lmdb_key(key) {

View File

@ -115,7 +115,6 @@ pub struct FacetsUpdate<'i> {
min_level_size: u8, min_level_size: u8,
} }
impl<'i> FacetsUpdate<'i> { impl<'i> FacetsUpdate<'i> {
// TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
pub fn new( pub fn new(
index: &'i Index, index: &'i Index,
facet_type: FacetType, facet_type: FacetType,

View File

@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::collections::HashMap; use std::collections::HashMap;
use std::convert::TryInto; use std::convert::TryInto;
use std::fs::File; use std::fs::File;
@ -11,9 +10,7 @@ use heed::types::ByteSlice;
use heed::RwTxn; use heed::RwTxn;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::helpers::{ use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap};
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
};
use super::{ClonableMmap, MergeFn}; use super::{ClonableMmap, MergeFn};
use crate::distance::NDotProductPoint; use crate::distance::NDotProductPoint;
use crate::error::UserError; use crate::error::UserError;