Create a small tool to measure the size of internal databases

This commit is contained in:
Kerollmops 2023-06-23 22:57:57 +02:00
parent 040b5a5b6f
commit ef9875256b
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
5 changed files with 252 additions and 11 deletions

44
Cargo.lock generated
View File

@ -309,6 +309,15 @@ version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
[[package]]
name = "ansi_term"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
"winapi",
]
[[package]]
name = "anstream"
version = "0.3.2"
@ -778,9 +787,9 @@ dependencies = [
[[package]]
name = "clap"
version = "4.3.0"
version = "4.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc"
checksum = "6320c6d1c98b6981da7bb2dcecbd0be9dc98d42165fa8326b21000f7dbfde6d0"
dependencies = [
"clap_builder",
"clap_derive",
@ -789,9 +798,9 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.3.0"
version = "4.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990"
checksum = "2e53afce1efce6ed1f633cf0e57612fe51db54a1ee4fd8f8503d078fe02d69ae"
dependencies = [
"anstream",
"anstyle",
@ -802,9 +811,9 @@ dependencies = [
[[package]]
name = "clap_derive"
version = "4.3.0"
version = "4.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b"
checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f"
dependencies = [
"heck",
"proc-macro2",
@ -1599,7 +1608,7 @@ name = "fuzzers"
version = "1.2.0"
dependencies = [
"arbitrary",
"clap 4.3.0",
"clap 4.3.6",
"fastrand",
"milli",
"serde",
@ -1949,6 +1958,16 @@ dependencies = [
"uuid 1.3.3",
]
[[package]]
name = "index-stats"
version = "0.1.0"
dependencies = [
"anyhow",
"clap 4.3.6",
"milli",
"piechart",
]
[[package]]
name = "indexmap"
version = "1.9.3"
@ -2534,7 +2553,7 @@ dependencies = [
"byte-unit",
"bytes",
"cargo_toml",
"clap 4.3.0",
"clap 4.3.6",
"crossbeam-channel",
"deserr",
"dump",
@ -3097,6 +3116,15 @@ dependencies = [
"siphasher",
]
[[package]]
name = "piechart"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79730372879e285c066c9289e164f4033ff665a866396dfa478f58f5adcd4089"
dependencies = [
"ansi_term",
]
[[package]]
name = "pin-project-lite"
version = "0.2.9"

View File

@ -10,6 +10,7 @@ members = [
"file-store",
"permissive-json-pointer",
"milli",
"index-stats",
"filter-parser",
"flatten-serde-json",
"json-depth-checker",

12
index-stats/Cargo.toml Normal file
View File

@ -0,0 +1,12 @@
[package]
name = "index-stats"
description = "A small program that computes internal stats of a Meilisearch index"
version = "0.1.0"
edition = "2021"
publish = false
[dependencies]
anyhow = "1.0.71"
clap = { version = "4.3.5", features = ["derive"] }
milli = { path = "../milli" }
piechart = "1.0.0"

200
index-stats/src/main.rs Normal file
View File

@ -0,0 +1,200 @@
use std::cmp::Reverse;
use std::path::PathBuf;
use clap::Parser;
use milli::heed::{types::ByteSlice, EnvOpenOptions, PolyDatabase, RoTxn};
use milli::index::db_name::*;
use milli::index::Index;
use piechart::{Chart, Color, Data};
/// Command-line arguments of the `index-stats` tool, a small program that
/// computes internal stats of a Meilisearch index.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// The path to the LMDB Meilisearch index database.
path: PathBuf,
}
fn main() -> anyhow::Result<()> {
let Args { path } = Args::parse();
let env = EnvOpenOptions::new().max_dbs(24).open(path)?;
// TODO not sure to keep that...
// if removed put the pub(crate) back in the Index struct
matches!(
Option::<Index>::None,
Some(Index {
env: _,
main: _,
word_docids: _,
exact_word_docids: _,
word_prefix_docids: _,
exact_word_prefix_docids: _,
word_pair_proximity_docids: _,
word_prefix_pair_proximity_docids: _,
prefix_word_pair_proximity_docids: _,
word_position_docids: _,
word_fid_docids: _,
field_id_word_count_docids: _,
word_prefix_position_docids: _,
word_prefix_fid_docids: _,
script_language_docids: _,
facet_id_exists_docids: _,
facet_id_is_null_docids: _,
facet_id_is_empty_docids: _,
facet_id_f64_docids: _,
facet_id_string_docids: _,
field_id_docid_facet_f64s: _,
field_id_docid_facet_strings: _,
documents: _,
})
);
let mut wtxn = env.write_txn()?;
let main = env.create_poly_database(&mut wtxn, Some(MAIN))?;
let word_docids = env.create_poly_database(&mut wtxn, Some(WORD_DOCIDS))?;
let exact_word_docids = env.create_poly_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?;
let word_prefix_docids = env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
let exact_word_prefix_docids =
env.create_poly_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?;
let word_pair_proximity_docids =
env.create_poly_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
let script_language_docids =
env.create_poly_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?;
let word_prefix_pair_proximity_docids =
env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
let prefix_word_pair_proximity_docids =
env.create_poly_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
let word_position_docids = env.create_poly_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?;
let word_fid_docids = env.create_poly_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?;
let field_id_word_count_docids =
env.create_poly_database(&mut wtxn, Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
let word_prefix_position_docids =
env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_POSITION_DOCIDS))?;
let word_prefix_fid_docids =
env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_FIELD_ID_DOCIDS))?;
let facet_id_f64_docids = env.create_poly_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
let facet_id_string_docids =
env.create_poly_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
let facet_id_exists_docids =
env.create_poly_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
let facet_id_is_null_docids =
env.create_poly_database(&mut wtxn, Some(FACET_ID_IS_NULL_DOCIDS))?;
let facet_id_is_empty_docids =
env.create_poly_database(&mut wtxn, Some(FACET_ID_IS_EMPTY_DOCIDS))?;
let field_id_docid_facet_f64s =
env.create_poly_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings =
env.create_poly_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
let documents = env.create_poly_database(&mut wtxn, Some(DOCUMENTS))?;
wtxn.commit()?;
let list = [
(main, MAIN),
(word_docids, WORD_DOCIDS),
(exact_word_docids, EXACT_WORD_DOCIDS),
(word_prefix_docids, WORD_PREFIX_DOCIDS),
(exact_word_prefix_docids, EXACT_WORD_PREFIX_DOCIDS),
(word_pair_proximity_docids, WORD_PAIR_PROXIMITY_DOCIDS),
(script_language_docids, SCRIPT_LANGUAGE_DOCIDS),
(word_prefix_pair_proximity_docids, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS),
(prefix_word_pair_proximity_docids, PREFIX_WORD_PAIR_PROXIMITY_DOCIDS),
(word_position_docids, WORD_POSITION_DOCIDS),
(word_fid_docids, WORD_FIELD_ID_DOCIDS),
(field_id_word_count_docids, FIELD_ID_WORD_COUNT_DOCIDS),
(word_prefix_position_docids, WORD_PREFIX_POSITION_DOCIDS),
(word_prefix_fid_docids, WORD_PREFIX_FIELD_ID_DOCIDS),
(facet_id_f64_docids, FACET_ID_F64_DOCIDS),
(facet_id_string_docids, FACET_ID_STRING_DOCIDS),
(facet_id_exists_docids, FACET_ID_EXISTS_DOCIDS),
(facet_id_is_null_docids, FACET_ID_IS_NULL_DOCIDS),
(facet_id_is_empty_docids, FACET_ID_IS_EMPTY_DOCIDS),
(field_id_docid_facet_f64s, FIELD_ID_DOCID_FACET_F64S),
(field_id_docid_facet_strings, FIELD_ID_DOCID_FACET_STRINGS),
(documents, DOCUMENTS),
];
let rtxn = env.read_txn()?;
let result: Result<Vec<_>, _> =
list.into_iter().map(|(db, name)| compute_stats(&rtxn, db).map(|s| (s, name))).collect();
let mut stats = result?;
println!("{:>30} Number of Entries", "");
stats.sort_by_key(|(s, _)| Reverse(s.number_of_entries));
let data = compute_graph_data(stats.iter().map(|(s, n)| (s.number_of_entries as f32, *n)));
Chart::new().radius(20).aspect_ratio(6).legend(true).draw(&data);
print!("\r\n\r\n\r\n");
println!("{:>30} Size of Entries", "");
stats.sort_by_key(|(s, _)| Reverse(s.size_of_entries));
let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_entries as f32, *n)));
Chart::new().radius(20).aspect_ratio(6).legend(true).draw(&data);
print!("\r\n\r\n\r\n");
println!("{:>30} Size of Data", "");
stats.sort_by_key(|(s, _)| Reverse(s.size_of_data));
let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_data as f32, *n)));
Chart::new().radius(20).aspect_ratio(6).legend(true).draw(&data);
print!("\r\n\r\n\r\n");
println!("{:>30} Size of Keys", "");
stats.sort_by_key(|(s, _)| Reverse(s.size_of_keys));
let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_keys as f32, *n)));
Chart::new().radius(20).aspect_ratio(6).legend(true).draw(&data);
Ok(())
}
/// Converts `(value, label)` pairs into chart slices.
///
/// Each slice is given a color and a fill character taken, in order, from two
/// fixed lists; both lists start over once exhausted, so any number of slices
/// can be produced.
fn compute_graph_data<'a>(stats: impl IntoIterator<Item = (f32, &'a str)>) -> Vec<Data> {
    let palette = [
        Color::Red,
        Color::Green,
        Color::Yellow,
        Color::Blue,
        Color::Purple,
        Color::Cyan,
        Color::White,
    ];
    let fills = ['▴', '▵', '▾', '▿', '▪', '▫', '•', '◦'];

    // The two lists have different lengths (7 and 8), so they are cycled
    // independently rather than paired up once.
    let styles = palette.into_iter().cycle().zip(fills.into_iter().cycle());

    stats
        .into_iter()
        .zip(styles)
        .map(|((value, name), (color, fill))| Data {
            label: name.into(),
            value,
            color: Some(color.into()),
            fill,
        })
        .collect()
}
/// Statistics gathered over the entries of a single database
/// (see `compute_stats`).
#[derive(Debug)]
pub struct Stats {
/// Number of key/value entries found in the database.
pub number_of_entries: u64,
/// Sum of the lengths, in bytes, of all the keys.
pub size_of_keys: u64,
/// Sum of the lengths, in bytes, of all the values.
pub size_of_data: u64,
/// Sum of `size_of_keys` and `size_of_data`.
pub size_of_entries: u64,
}
/// Walks every entry of `db` within the `rtxn` read transaction and returns
/// the number of entries along with the cumulated byte lengths of their keys
/// and values.
///
/// # Errors
///
/// Returns the first error raised while creating the iterator or while
/// reading an entry.
fn compute_stats(rtxn: &RoTxn, db: PolyDatabase) -> anyhow::Result<Stats> {
    let mut stats =
        Stats { number_of_entries: 0, size_of_keys: 0, size_of_data: 0, size_of_entries: 0 };

    // Keys and values are read as raw bytes: only their lengths matter here.
    for entry in db.iter::<_, ByteSlice, ByteSlice>(rtxn)? {
        let (key, value) = entry?;
        stats.number_of_entries += 1;
        stats.size_of_keys += key.len() as u64;
        stats.size_of_data += value.len() as u64;
    }

    stats.size_of_entries = stats.size_of_keys + stats.size_of_data;
    Ok(stats)
}

View File

@ -93,10 +93,10 @@ pub mod db_name {
#[derive(Clone)]
pub struct Index {
/// The LMDB environment which this index is associated with.
pub(crate) env: heed::Env,
pub env: heed::Env,
/// Contains many different types (e.g. the fields ids map).
pub(crate) main: PolyDatabase,
pub main: PolyDatabase,
/// A word and all the documents ids containing the word.
pub word_docids: Database<Str, RoaringBitmapCodec>,
@ -150,7 +150,7 @@ pub struct Index {
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
/// Maps the document id to the document as an obkv store.
pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>,
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
}
impl Index {