Use an LMDB database to store the external documents ids

This commit is contained in:
Clément Renault 2023-10-28 12:56:46 +02:00 committed by Louis Dureuil
parent fdf3f7f627
commit dfab6293c9
No known key found for this signature in database
7 changed files with 79 additions and 141 deletions

View file

@ -1,7 +1,7 @@
use roaring::RoaringBitmap;
use time::OffsetDateTime;
use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};
use crate::{FieldDistribution, Index, Result};
pub struct ClearDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -20,6 +20,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
let Index {
env: _env,
main: _main,
external_documents_ids,
word_docids,
exact_word_docids,
word_prefix_docids,
@ -54,7 +55,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
// We clean some of the main engine data structures.
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
@ -62,6 +62,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.delete_vector_hnsw(self.wtxn)?;
// Clear the other databases.
external_documents_ids.clear(self.wtxn)?;
word_docids.clear(self.wtxn)?;
exact_word_docids.clear(self.wtxn)?;
word_prefix_docids.clear(self.wtxn)?;

View file

@ -162,7 +162,7 @@ impl<'a, 'i> Transform<'a, 'i> {
FA: Fn() -> bool + Sync,
{
let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
let external_documents_ids = self.index.external_documents_ids();
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
let primary_key = cursor.primary_key().to_string();
@ -221,7 +221,7 @@ impl<'a, 'i> Transform<'a, 'i> {
let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
Entry::Occupied(entry) => *entry.get() as u32,
Entry::Vacant(entry) => {
let docid = match external_documents_ids.get(entry.key()) {
let docid = match external_documents_ids.get(wtxn, entry.key())? {
Some(docid) => {
// If it was already in the list of replaced documents it means it was deleted
// by the remove_document method. We should start as if it never existed.
@ -373,7 +373,7 @@ impl<'a, 'i> Transform<'a, 'i> {
to_remove.sort_unstable();
to_remove.dedup();
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
let external_documents_ids = self.index.external_documents_ids();
let mut documents_deleted = 0;
let mut document_sorter_buffer = Vec::new();
@ -410,7 +410,7 @@ impl<'a, 'i> Transform<'a, 'i> {
// If the document was already in the db we mark it as a `to_delete` document.
// Then we push the document in sorters in deletion mode.
let deleted_from_db = match external_documents_ids.get(&to_remove) {
let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? {
Some(docid) => {
self.replaced_documents_ids.insert(docid);

View file

@ -194,10 +194,8 @@ pub(crate) fn write_typed_chunk_into_index(
db.delete(wtxn, &BEU32::new(docid))?;
}
}
let mut external_documents_docids = index.external_documents_ids(wtxn)?.into_static();
external_documents_docids.apply(operations);
index.put_external_documents_ids(wtxn, &external_documents_docids)?;
let external_documents_docids = index.external_documents_ids();
external_documents_docids.apply(wtxn, operations)?;
index.put_documents_ids(wtxn, &docids)?;
}
TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {