From ef9875256b400c3b4cd551b09a74594f2477f1c1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 23 Jun 2023 22:57:57 +0200 Subject: [PATCH] Create a small tool to measure the size of internal databases --- Cargo.lock | 44 +++++++-- Cargo.toml | 1 + index-stats/Cargo.toml | 12 +++ index-stats/src/main.rs | 200 ++++++++++++++++++++++++++++++++++++++++ milli/src/index.rs | 6 +- 5 files changed, 252 insertions(+), 11 deletions(-) create mode 100644 index-stats/Cargo.toml create mode 100644 index-stats/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 46218fc34..5196adf13 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -309,6 +309,15 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anstream" version = "0.3.2" @@ -778,9 +787,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.0" +version = "4.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc" +checksum = "6320c6d1c98b6981da7bb2dcecbd0be9dc98d42165fa8326b21000f7dbfde6d0" dependencies = [ "clap_builder", "clap_derive", @@ -789,9 +798,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.3.0" +version = "4.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990" +checksum = "2e53afce1efce6ed1f633cf0e57612fe51db54a1ee4fd8f8503d078fe02d69ae" dependencies = [ "anstream", "anstyle", @@ -802,9 +811,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.3.0" +version = "4.3.2" source =
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" +checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" dependencies = [ "heck", "proc-macro2", @@ -1599,7 +1608,7 @@ name = "fuzzers" version = "1.2.0" dependencies = [ "arbitrary", - "clap 4.3.0", + "clap 4.3.6", "fastrand", "milli", "serde", @@ -1949,6 +1958,16 @@ dependencies = [ "uuid 1.3.3", ] +[[package]] +name = "index-stats" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap 4.3.6", + "milli", + "piechart", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -2534,7 +2553,7 @@ dependencies = [ "byte-unit", "bytes", "cargo_toml", - "clap 4.3.0", + "clap 4.3.6", "crossbeam-channel", "deserr", "dump", @@ -3097,6 +3116,15 @@ dependencies = [ "siphasher", ] +[[package]] +name = "piechart" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79730372879e285c066c9289e164f4033ff665a866396dfa478f58f5adcd4089" +dependencies = [ + "ansi_term", +] + [[package]] name = "pin-project-lite" version = "0.2.9" diff --git a/Cargo.toml b/Cargo.toml index f7e5758d5..d1778b654 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ members = [ "file-store", "permissive-json-pointer", "milli", + "index-stats", "filter-parser", "flatten-serde-json", "json-depth-checker", diff --git a/index-stats/Cargo.toml b/index-stats/Cargo.toml new file mode 100644 index 000000000..901cf52ad --- /dev/null +++ b/index-stats/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "index-stats" +description = "A small program that computes internal stats of a Meilisearch index" +version = "0.1.0" +edition = "2021" +publish = false + +[dependencies] +anyhow = "1.0.71" +clap = { version = "4.3.5", features = ["derive"] } +milli = { path = "../milli" } +piechart = "1.0.0" diff --git a/index-stats/src/main.rs b/index-stats/src/main.rs new file mode 100644 index 000000000..3232580e2 --- /dev/null 
+++ b/index-stats/src/main.rs @@ -0,0 +1,200 @@ +use std::cmp::Reverse; +use std::path::PathBuf; + +use clap::Parser; +use milli::heed::{types::ByteSlice, EnvOpenOptions, PolyDatabase, RoTxn}; +use milli::index::db_name::*; +use milli::index::Index; +use piechart::{Chart, Color, Data}; + +/// A small program that computes internal stats of a Meilisearch index +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + /// The path to the LMDB Meilisearch index database. + path: PathBuf, +} + +fn main() -> anyhow::Result<()> { + let Args { path } = Args::parse(); + let env = EnvOpenOptions::new().max_dbs(24).open(path)?; + + // TODO not sure to keep that... + // if removed put the pub(crate) back in the Index struct + matches!( + Option::::None, + Some(Index { + env: _, + main: _, + word_docids: _, + exact_word_docids: _, + word_prefix_docids: _, + exact_word_prefix_docids: _, + word_pair_proximity_docids: _, + word_prefix_pair_proximity_docids: _, + prefix_word_pair_proximity_docids: _, + word_position_docids: _, + word_fid_docids: _, + field_id_word_count_docids: _, + word_prefix_position_docids: _, + word_prefix_fid_docids: _, + script_language_docids: _, + facet_id_exists_docids: _, + facet_id_is_null_docids: _, + facet_id_is_empty_docids: _, + facet_id_f64_docids: _, + facet_id_string_docids: _, + field_id_docid_facet_f64s: _, + field_id_docid_facet_strings: _, + documents: _, + }) + ); + + let mut wtxn = env.write_txn()?; + let main = env.create_poly_database(&mut wtxn, Some(MAIN))?; + let word_docids = env.create_poly_database(&mut wtxn, Some(WORD_DOCIDS))?; + let exact_word_docids = env.create_poly_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?; + let word_prefix_docids = env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?; + let exact_word_prefix_docids = + env.create_poly_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?; + let word_pair_proximity_docids = + env.create_poly_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; + let
script_language_docids = + env.create_poly_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?; + let word_prefix_pair_proximity_docids = + env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; + let prefix_word_pair_proximity_docids = + env.create_poly_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; + let word_position_docids = env.create_poly_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?; + let word_fid_docids = env.create_poly_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?; + let field_id_word_count_docids = + env.create_poly_database(&mut wtxn, Some(FIELD_ID_WORD_COUNT_DOCIDS))?; + let word_prefix_position_docids = + env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_POSITION_DOCIDS))?; + let word_prefix_fid_docids = + env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_FIELD_ID_DOCIDS))?; + let facet_id_f64_docids = env.create_poly_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?; + let facet_id_string_docids = + env.create_poly_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?; + let facet_id_exists_docids = + env.create_poly_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?; + let facet_id_is_null_docids = + env.create_poly_database(&mut wtxn, Some(FACET_ID_IS_NULL_DOCIDS))?; + let facet_id_is_empty_docids = + env.create_poly_database(&mut wtxn, Some(FACET_ID_IS_EMPTY_DOCIDS))?; + let field_id_docid_facet_f64s = + env.create_poly_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?; + let field_id_docid_facet_strings = + env.create_poly_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?; + let documents = env.create_poly_database(&mut wtxn, Some(DOCUMENTS))?; + wtxn.commit()?; + + let list = [ + (main, MAIN), + (word_docids, WORD_DOCIDS), + (exact_word_docids, EXACT_WORD_DOCIDS), + (word_prefix_docids, WORD_PREFIX_DOCIDS), + (exact_word_prefix_docids, EXACT_WORD_PREFIX_DOCIDS), + (word_pair_proximity_docids, WORD_PAIR_PROXIMITY_DOCIDS), + (script_language_docids, SCRIPT_LANGUAGE_DOCIDS), + 
(word_prefix_pair_proximity_docids, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS), + (prefix_word_pair_proximity_docids, PREFIX_WORD_PAIR_PROXIMITY_DOCIDS), + (word_position_docids, WORD_POSITION_DOCIDS), + (word_fid_docids, WORD_FIELD_ID_DOCIDS), + (field_id_word_count_docids, FIELD_ID_WORD_COUNT_DOCIDS), + (word_prefix_position_docids, WORD_PREFIX_POSITION_DOCIDS), + (word_prefix_fid_docids, WORD_PREFIX_FIELD_ID_DOCIDS), + (facet_id_f64_docids, FACET_ID_F64_DOCIDS), + (facet_id_string_docids, FACET_ID_STRING_DOCIDS), + (facet_id_exists_docids, FACET_ID_EXISTS_DOCIDS), + (facet_id_is_null_docids, FACET_ID_IS_NULL_DOCIDS), + (facet_id_is_empty_docids, FACET_ID_IS_EMPTY_DOCIDS), + (field_id_docid_facet_f64s, FIELD_ID_DOCID_FACET_F64S), + (field_id_docid_facet_strings, FIELD_ID_DOCID_FACET_STRINGS), + (documents, DOCUMENTS), + ]; + + let rtxn = env.read_txn()?; + let result: Result, _> = + list.into_iter().map(|(db, name)| compute_stats(&rtxn, db).map(|s| (s, name))).collect(); + let mut stats = result?; + + println!("{:>30} Number of Entries", ""); + stats.sort_by_key(|(s, _)| Reverse(s.number_of_entries)); + let data = compute_graph_data(stats.iter().map(|(s, n)| (s.number_of_entries as f32, *n))); + Chart::new().radius(20).aspect_ratio(6).legend(true).draw(&data); + print!("\r\n\r\n\r\n"); + + println!("{:>30} Size of Entries", ""); + stats.sort_by_key(|(s, _)| Reverse(s.size_of_entries)); + let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_entries as f32, *n))); + Chart::new().radius(20).aspect_ratio(6).legend(true).draw(&data); + print!("\r\n\r\n\r\n"); + + println!("{:>30} Size of Data", ""); + stats.sort_by_key(|(s, _)| Reverse(s.size_of_data)); + let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_data as f32, *n))); + Chart::new().radius(20).aspect_ratio(6).legend(true).draw(&data); + print!("\r\n\r\n\r\n"); + + println!("{:>30} Size of Keys", ""); + stats.sort_by_key(|(s, _)| Reverse(s.size_of_keys)); + let data = 
compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_keys as f32, *n))); + Chart::new().radius(20).aspect_ratio(6).legend(true).draw(&data); + + Ok(()) +} + +fn compute_graph_data<'a>(stats: impl IntoIterator) -> Vec { + let mut colors = [ + Color::Red, + Color::Green, + Color::Yellow, + Color::Blue, + Color::Purple, + Color::Cyan, + Color::White, + ] + .into_iter() + .cycle(); + + let mut characters = ['▴', '▵', '▾', '▿', '▪', '▫', '•', '◦'].into_iter().cycle(); + + stats + .into_iter() + .map(|(value, name)| Data { + label: (*name).into(), + value, + color: Some(colors.next().unwrap().into()), + fill: characters.next().unwrap(), + }) + .collect() +} + +#[derive(Debug)] +pub struct Stats { + pub number_of_entries: u64, + pub size_of_keys: u64, + pub size_of_data: u64, + pub size_of_entries: u64, +} + +fn compute_stats(rtxn: &RoTxn, db: PolyDatabase) -> anyhow::Result { + let mut number_of_entries = 0; + let mut size_of_keys = 0; + let mut size_of_data = 0; + + for result in db.iter::<_, ByteSlice, ByteSlice>(rtxn)? { + let (key, data) = result?; + number_of_entries += 1; + size_of_keys += key.len() as u64; + size_of_data += data.len() as u64; + } + + Ok(Stats { + number_of_entries, + size_of_keys, + size_of_data, + size_of_entries: size_of_keys + size_of_data, + }) +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 1ccef13dd..c4898f5cf 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -93,10 +93,10 @@ pub mod db_name { #[derive(Clone)] pub struct Index { /// The LMDB environment which this index is associated with. - pub(crate) env: heed::Env, + pub env: heed::Env, /// Contains many different types (e.g. the fields ids map). - pub(crate) main: PolyDatabase, + pub main: PolyDatabase, /// A word and all the documents ids containing the word. pub word_docids: Database, @@ -150,7 +150,7 @@ pub struct Index { pub field_id_docid_facet_strings: Database, /// Maps the document id to the document as an obkv store. 
- pub(crate) documents: Database, ObkvCodec>, + pub documents: Database, ObkvCodec>, } impl Index {