expose the size methods

This commit is contained in:
Irevoire 2022-08-11 11:15:46 +02:00
parent e96b852107
commit 4aae07d5f5
No known key found for this signature in database
GPG Key ID: 7A6A970C96104F1B
14 changed files with 39 additions and 17 deletions

View File

@ -5,7 +5,7 @@ use std::fs::{create_dir_all, remove_dir_all};
use std::path::Path;
use criterion::{criterion_group, criterion_main, Criterion};
use heed::{EnvOpenOptions, RwTxn};
use milli::heed::{EnvOpenOptions, RwTxn};
use milli::update::{
DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
};

View File

@ -6,8 +6,8 @@ use std::num::ParseFloatError;
use std::path::Path;
use criterion::BenchmarkId;
use heed::EnvOpenOptions;
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::heed::EnvOpenOptions;
use milli::update::{
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
};

View File

@ -11,7 +11,6 @@ byte-unit = { version = "4.0.14", features = ["serde"] }
color-eyre = "0.6.1"
csv = "1.1.6"
eyre = "0.6.7"
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
indicatif = "0.16.2"
milli = { path = "../milli" }
mimalloc = { version = "0.1.29", default-features = false }

View File

@ -13,7 +13,7 @@ use milli::update::UpdateIndexingStep::{
ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
};
use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig};
use milli::{Index, Object};
use milli::{heed, Index, Object};
use structopt::StructOpt;
#[global_allocator]

View File

@ -9,7 +9,6 @@ publish = false
[dependencies]
anyhow = "1.0.56"
byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
milli = { path = "../milli" }
mimalloc = { version = "0.1.29", default-features = false }
stderrlog = "0.5.1"

View File

@ -1,7 +1,7 @@
use std::path::PathBuf;
use byte_unit::Byte;
use heed::{CompactionOption, Env, EnvOpenOptions};
use milli::heed::{CompactionOption, Env, EnvOpenOptions};
use structopt::StructOpt;
use Command::*;

View File

@ -10,7 +10,6 @@ publish = false
anyhow = "1.0.56"
byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
crossbeam-channel = "0.5.2"
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
memmap2 = "0.5.3"
milli = { path = "../milli" }
mimalloc = { version = "0.1.29", default-features = false }

View File

@ -17,8 +17,8 @@ use byte_unit::Byte;
use either::Either;
use flate2::read::GzDecoder;
use futures::{stream, FutureExt, StreamExt};
use heed::EnvOpenOptions;
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::heed::EnvOpenOptions;
use milli::tokenizer::TokenizerBuilder;
use milli::update::UpdateIndexingStep::*;
use milli::update::{

View File

@ -6,6 +6,7 @@ use std::sync::Arc;
use crossbeam_channel::Sender;
use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson};
use heed::{Database, Env, EnvOpenOptions};
use milli::heed;
use serde::{Deserialize, Serialize};
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;

View File

@ -9,7 +9,6 @@ publish = false
anyhow = "1.0.56"
byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
csv = "1.1.6"
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
milli = { path = "../milli" }
mimalloc = { version = "0.1.29", default-features = false }
roaring = "0.9.0"

View File

@ -7,7 +7,7 @@ use byte_unit::Byte;
use heed::EnvOpenOptions;
use milli::facet::FacetType;
use milli::index::db_name::*;
use milli::{FieldId, Index};
use milli::{heed, FieldId, Index};
use structopt::StructOpt;
use Command::*;

View File

@ -18,8 +18,7 @@ fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.4.1"
grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] }
# heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
heed = { git = "https://github.com/meilisearch/heed", branch = "compute_size", default-features = false, features = ["lmdb", "sync-read-txn"] }
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memmap2 = "0.5.3"

View File

@ -223,6 +223,16 @@ impl Index {
self.env.path()
}
/// Size used by the index, not counting the cached pages.
///
/// The value comes straight from the environment's `non_free_pages_size`.
///
/// # Errors
///
/// Any error raised by the environment is converted into this crate's error type.
pub fn used_size(&self) -> Result<u64> {
    self.env.non_free_pages_size().map_err(Into::into)
}
/// Real size used by the index.
///
/// The value comes straight from the environment's `real_disk_size`.
///
/// # Errors
///
/// Any error raised by the environment is converted into this crate's error type.
pub fn on_disk_size(&self) -> Result<u64> {
    self.env.real_disk_size().map_err(Into::into)
}
/// Copies the index environment to `path`, forwarding `option` to the
/// environment's own `copy_to_path`, and returns the `File` it hands back.
///
/// # Errors
///
/// Any error raised by the environment is converted into this crate's error type.
pub fn copy_to_path<P: AsRef<Path>>(&self, path: P, option: CompactionOption) -> Result<File> {
    let copied = self.env.copy_to_path(path, option)?;
    Ok(copied)
}

View File

@ -20,10 +20,6 @@ use crate::{
RoaringBitmapCodec, SmallString32, BEU32,
};
/// Number of soft-deleted documents below which a deletion is only recorded in the
/// `soft_deleted_documents_ids` bitmap. Once this count is reached, the soft-deleted
/// database is cleared and the documents are deleted for real.
const DELETE_DOCUMENTS_THRESHOLD: u64 = 10_000;
pub struct DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
@ -129,7 +125,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
// if we have less documents to delete than the threshold we simply save them in
// the `soft_deleted_documents_ids` bitmap and early exit.
if soft_deleted_docids.len() < DELETE_DOCUMENTS_THRESHOLD {
let size_used = self.index.used_size()?;
let map_size = self.index.env.map_size()? as u64;
let nb_documents = self.index.number_of_documents(&self.wtxn)?;
let nb_soft_deleted = soft_deleted_docids.len();
let percentage_available = 100 - (size_used * 100 / map_size);
let estimated_document_size = size_used / (nb_documents + nb_soft_deleted);
let estimated_size_used_by_soft_deleted = estimated_document_size * nb_soft_deleted;
let percentage_used_by_soft_deleted_documents =
estimated_size_used_by_soft_deleted * 100 / map_size;
// If we have more than 10% of disk space available and the soft-deleted
// documents use less than 10% of the total space available,
// we skip the deletion. E.g.
// - With 100GB of disk and 20GB used including 5GB of soft-deleted documents
// We don't delete anything.
// - With 100GB of disk and 95GB used including 1MB of soft-deleted documents
// We run the deletion.
// - With 100GB of disk and 50GB used including 15GB of soft-deleted documents
// We run the deletion.
if percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 {
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
return Ok(DocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(),