mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-12 06:24:29 +01:00
Merge #607
607: Better threshold r=Kerollmops a=irevoire # Pull Request ## What does this PR do? Fixes #570 This PR tries to improve the threshold used to trigger the real deletion of documents. The deletion is now triggered in two cases: - 10% of the total available space is used by soft-deleted documents - 90% of the total available space is used. In this context, « total available space » means the `map_size` of LMDB. The size used by the soft-deleted documents is actually an estimation. We can't determine precisely the size used by one document, so what we do is: take the total space used and divide it by the number of documents + soft-deleted documents to estimate the size of one average document, then multiply the size of one average document by the number of soft-deleted documents. -------- <img width="808" alt="image" src="https://user-images.githubusercontent.com/7032172/185083075-92cf379e-8ae1-4bfc-9ca6-93b54e6ab4e9.png"> Here we can see we have a ~10GB drift in the end between the estimated space used by the soft-deleted documents and the real space used by the documents. Personally I don’t think that's a big issue because once the red line reaches 90GB everything will be freed, but now you know. If you have an idea on how to improve this estimation I would love to hear it. It looks like the difference is linear, so maybe we could simply multiply the current estimation by two? Co-authored-by: Irevoire <tamo@meilisearch.com>
This commit is contained in:
commit
79094bcbcf
@ -5,7 +5,7 @@ use std::fs::{create_dir_all, remove_dir_all};
|
|||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use criterion::{criterion_group, criterion_main, Criterion};
|
use criterion::{criterion_group, criterion_main, Criterion};
|
||||||
use heed::{EnvOpenOptions, RwTxn};
|
use milli::heed::{EnvOpenOptions, RwTxn};
|
||||||
use milli::update::{
|
use milli::update::{
|
||||||
DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
|
DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
|
||||||
};
|
};
|
||||||
|
@ -6,8 +6,8 @@ use std::num::ParseFloatError;
|
|||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use criterion::BenchmarkId;
|
use criterion::BenchmarkId;
|
||||||
use heed::EnvOpenOptions;
|
|
||||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
|
use milli::heed::EnvOpenOptions;
|
||||||
use milli::update::{
|
use milli::update::{
|
||||||
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
||||||
};
|
};
|
||||||
|
@ -11,7 +11,6 @@ byte-unit = { version = "4.0.14", features = ["serde"] }
|
|||||||
color-eyre = "0.6.1"
|
color-eyre = "0.6.1"
|
||||||
csv = "1.1.6"
|
csv = "1.1.6"
|
||||||
eyre = "0.6.7"
|
eyre = "0.6.7"
|
||||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
|
||||||
indicatif = "0.16.2"
|
indicatif = "0.16.2"
|
||||||
milli = { path = "../milli" }
|
milli = { path = "../milli" }
|
||||||
mimalloc = { version = "0.1.29", default-features = false }
|
mimalloc = { version = "0.1.29", default-features = false }
|
||||||
|
@ -13,7 +13,7 @@ use milli::update::UpdateIndexingStep::{
|
|||||||
ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
|
ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
|
||||||
};
|
};
|
||||||
use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig};
|
use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig};
|
||||||
use milli::{Index, Object};
|
use milli::{heed, Index, Object};
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
#[global_allocator]
|
#[global_allocator]
|
||||||
|
@ -9,7 +9,6 @@ publish = false
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow = "1.0.56"
|
anyhow = "1.0.56"
|
||||||
byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
|
byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
|
||||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
|
|
||||||
milli = { path = "../milli" }
|
milli = { path = "../milli" }
|
||||||
mimalloc = { version = "0.1.29", default-features = false }
|
mimalloc = { version = "0.1.29", default-features = false }
|
||||||
stderrlog = "0.5.1"
|
stderrlog = "0.5.1"
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use byte_unit::Byte;
|
use byte_unit::Byte;
|
||||||
use heed::{CompactionOption, Env, EnvOpenOptions};
|
use milli::heed::{CompactionOption, Env, EnvOpenOptions};
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
use Command::*;
|
use Command::*;
|
||||||
|
|
||||||
|
@ -10,7 +10,6 @@ publish = false
|
|||||||
anyhow = "1.0.56"
|
anyhow = "1.0.56"
|
||||||
byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
|
byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
|
||||||
crossbeam-channel = "0.5.2"
|
crossbeam-channel = "0.5.2"
|
||||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
|
|
||||||
memmap2 = "0.5.3"
|
memmap2 = "0.5.3"
|
||||||
milli = { path = "../milli" }
|
milli = { path = "../milli" }
|
||||||
mimalloc = { version = "0.1.29", default-features = false }
|
mimalloc = { version = "0.1.29", default-features = false }
|
||||||
|
@ -17,8 +17,8 @@ use byte_unit::Byte;
|
|||||||
use either::Either;
|
use either::Either;
|
||||||
use flate2::read::GzDecoder;
|
use flate2::read::GzDecoder;
|
||||||
use futures::{stream, FutureExt, StreamExt};
|
use futures::{stream, FutureExt, StreamExt};
|
||||||
use heed::EnvOpenOptions;
|
|
||||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
|
use milli::heed::EnvOpenOptions;
|
||||||
use milli::tokenizer::TokenizerBuilder;
|
use milli::tokenizer::TokenizerBuilder;
|
||||||
use milli::update::UpdateIndexingStep::*;
|
use milli::update::UpdateIndexingStep::*;
|
||||||
use milli::update::{
|
use milli::update::{
|
||||||
|
@ -6,6 +6,7 @@ use std::sync::Arc;
|
|||||||
use crossbeam_channel::Sender;
|
use crossbeam_channel::Sender;
|
||||||
use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson};
|
use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson};
|
||||||
use heed::{Database, Env, EnvOpenOptions};
|
use heed::{Database, Env, EnvOpenOptions};
|
||||||
|
use milli::heed;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
|
pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
|
||||||
|
@ -9,7 +9,6 @@ publish = false
|
|||||||
anyhow = "1.0.56"
|
anyhow = "1.0.56"
|
||||||
byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
|
byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
|
||||||
csv = "1.1.6"
|
csv = "1.1.6"
|
||||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
|
|
||||||
milli = { path = "../milli" }
|
milli = { path = "../milli" }
|
||||||
mimalloc = { version = "0.1.29", default-features = false }
|
mimalloc = { version = "0.1.29", default-features = false }
|
||||||
roaring = "0.9.0"
|
roaring = "0.9.0"
|
||||||
|
@ -7,7 +7,7 @@ use byte_unit::Byte;
|
|||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use milli::facet::FacetType;
|
use milli::facet::FacetType;
|
||||||
use milli::index::db_name::*;
|
use milli::index::db_name::*;
|
||||||
use milli::{FieldId, Index};
|
use milli::{heed, FieldId, Index};
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
use Command::*;
|
use Command::*;
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ fst = "0.4.7"
|
|||||||
fxhash = "0.2.1"
|
fxhash = "0.2.1"
|
||||||
geoutils = "0.4.1"
|
geoutils = "0.4.1"
|
||||||
grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] }
|
grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] }
|
||||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
||||||
json-depth-checker = { path = "../json-depth-checker" }
|
json-depth-checker = { path = "../json-depth-checker" }
|
||||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||||
memmap2 = "0.5.3"
|
memmap2 = "0.5.3"
|
||||||
|
@ -116,6 +116,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
|||||||
}
|
}
|
||||||
)]
|
)]
|
||||||
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
|
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
|
||||||
|
#[error("{}", HeedError::BadOpenOptions)]
|
||||||
|
InvalidLmdbOpenOptions,
|
||||||
#[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")]
|
#[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")]
|
||||||
SortRankingRuleMissing,
|
SortRankingRuleMissing,
|
||||||
#[error("The database file is in an invalid state.")]
|
#[error("The database file is in an invalid state.")]
|
||||||
@ -244,6 +246,7 @@ impl From<HeedError> for Error {
|
|||||||
HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })),
|
HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })),
|
||||||
HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
|
HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
|
||||||
HeedError::DatabaseClosing => InternalError(DatabaseClosing),
|
HeedError::DatabaseClosing => InternalError(DatabaseClosing),
|
||||||
|
HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -223,6 +223,16 @@ impl Index {
|
|||||||
self.env.path()
|
self.env.path()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the size used by the index without the cached pages.
|
||||||
|
pub fn used_size(&self) -> Result<u64> {
|
||||||
|
Ok(self.env.non_free_pages_size()?)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the real size used by the index.
|
||||||
|
pub fn on_disk_size(&self) -> Result<u64> {
|
||||||
|
Ok(self.env.real_disk_size()?)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn copy_to_path<P: AsRef<Path>>(&self, path: P, option: CompactionOption) -> Result<File> {
|
pub fn copy_to_path<P: AsRef<Path>>(&self, path: P, option: CompactionOption) -> Result<File> {
|
||||||
self.env.copy_to_path(path, option).map_err(Into::into)
|
self.env.copy_to_path(path, option).map_err(Into::into)
|
||||||
}
|
}
|
||||||
|
@ -20,10 +20,6 @@ use crate::{
|
|||||||
RoaringBitmapCodec, SmallString32, BEU32,
|
RoaringBitmapCodec, SmallString32, BEU32,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// The threshold we use to determine after which number of documents we want to clear the
|
|
||||||
/// soft-deleted database and delete documents for real.
|
|
||||||
const DELETE_DOCUMENTS_THRESHOLD: u64 = 10_000;
|
|
||||||
|
|
||||||
pub struct DeleteDocuments<'t, 'u, 'i> {
|
pub struct DeleteDocuments<'t, 'u, 'i> {
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
@ -129,7 +125,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
|
|
||||||
// if we have less documents to delete than the threshold we simply save them in
|
// if we have less documents to delete than the threshold we simply save them in
|
||||||
// the `soft_deleted_documents_ids` bitmap and early exit.
|
// the `soft_deleted_documents_ids` bitmap and early exit.
|
||||||
if soft_deleted_docids.len() < DELETE_DOCUMENTS_THRESHOLD {
|
let size_used = self.index.used_size()?;
|
||||||
|
let map_size = self.index.env.map_size()? as u64;
|
||||||
|
let nb_documents = self.index.number_of_documents(&self.wtxn)?;
|
||||||
|
let nb_soft_deleted = soft_deleted_docids.len();
|
||||||
|
|
||||||
|
let percentage_available = 100 - (size_used * 100 / map_size);
|
||||||
|
let estimated_document_size = size_used / (nb_documents + nb_soft_deleted);
|
||||||
|
let estimated_size_used_by_soft_deleted = estimated_document_size * nb_soft_deleted;
|
||||||
|
let percentage_used_by_soft_deleted_documents =
|
||||||
|
estimated_size_used_by_soft_deleted * 100 / map_size;
|
||||||
|
|
||||||
|
// if we have more than 10% of disk space available and the soft deleted
|
||||||
|
// documents use less than 10% of the total space available,
|
||||||
|
// we skip the deletion. Eg.
|
||||||
|
// - With 100Go of disk and 20Go used including 5Go of soft-deleted documents
|
||||||
|
// We don’t delete anything.
|
||||||
|
// - With 100Go of disk and 95Go used including 1Mo of soft-deleted documents
|
||||||
|
// We run the deletion.
|
||||||
|
// - With 100Go of disk and 50Go used including 15Go of soft-deleted documents
|
||||||
|
// We run the deletion.
|
||||||
|
if percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 {
|
||||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
|
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
|
||||||
return Ok(DocumentDeletionResult {
|
return Ok(DocumentDeletionResult {
|
||||||
deleted_documents: self.to_delete_docids.len(),
|
deleted_documents: self.to_delete_docids.len(),
|
||||||
|
@ -278,27 +278,30 @@ where
|
|||||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||||
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
|
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
|
||||||
|
|
||||||
|
let pool_params = GrenadParameters {
|
||||||
|
chunk_compression_type: self.indexer_config.chunk_compression_type,
|
||||||
|
chunk_compression_level: self.indexer_config.chunk_compression_level,
|
||||||
|
max_memory: self.indexer_config.max_memory,
|
||||||
|
max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
|
||||||
|
};
|
||||||
|
let documents_chunk_size =
|
||||||
|
self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB
|
||||||
|
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
|
||||||
|
|
||||||
// Run extraction pipeline in parallel.
|
// Run extraction pipeline in parallel.
|
||||||
pool.install(|| {
|
pool.install(|| {
|
||||||
let params = GrenadParameters {
|
|
||||||
chunk_compression_type: self.indexer_config.chunk_compression_type,
|
|
||||||
chunk_compression_level: self.indexer_config.chunk_compression_level,
|
|
||||||
max_memory: self.indexer_config.max_memory,
|
|
||||||
max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
|
|
||||||
};
|
|
||||||
|
|
||||||
// split obkv file into several chunks
|
// split obkv file into several chunks
|
||||||
let original_chunk_iter = grenad_obkv_into_chunks(
|
let original_chunk_iter = grenad_obkv_into_chunks(
|
||||||
original_documents,
|
original_documents,
|
||||||
params.clone(),
|
pool_params.clone(),
|
||||||
self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB
|
documents_chunk_size,
|
||||||
);
|
);
|
||||||
|
|
||||||
// split obkv file into several chunks
|
// split obkv file into several chunks
|
||||||
let flattened_chunk_iter = grenad_obkv_into_chunks(
|
let flattened_chunk_iter = grenad_obkv_into_chunks(
|
||||||
flattened_documents,
|
flattened_documents,
|
||||||
params.clone(),
|
pool_params.clone(),
|
||||||
self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB
|
documents_chunk_size,
|
||||||
);
|
);
|
||||||
|
|
||||||
let result = original_chunk_iter
|
let result = original_chunk_iter
|
||||||
@ -308,14 +311,14 @@ where
|
|||||||
extract::data_from_obkv_documents(
|
extract::data_from_obkv_documents(
|
||||||
original_chunk,
|
original_chunk,
|
||||||
flattened_chunk,
|
flattened_chunk,
|
||||||
params,
|
pool_params,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
searchable_fields,
|
searchable_fields,
|
||||||
faceted_fields,
|
faceted_fields,
|
||||||
primary_key_id,
|
primary_key_id,
|
||||||
geo_fields_ids,
|
geo_fields_ids,
|
||||||
stop_words,
|
stop_words,
|
||||||
self.indexer_config.max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
exact_attributes,
|
exact_attributes,
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
|
Loading…
x
Reference in New Issue
Block a user