Introduce the database-stats infos subcommand

This commit is contained in:
Kerollmops 2020-10-01 11:39:30 +02:00 committed by Clément Renault
parent 079742b4d3
commit 891e0188dd
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -18,7 +18,7 @@ const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions";
const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
const DOCUMENTS_DB_NAME: &str = "documents"; const DOCUMENTS_DB_NAME: &str = "documents";
const DATABASE_NAMES: &[&str] = &[ const ALL_DATABASE_NAMES: &[&str] = &[
MAIN_DB_NAME, MAIN_DB_NAME,
WORD_DOCIDS_DB_NAME, WORD_DOCIDS_DB_NAME,
DOCID_WORD_POSITIONS_DB_NAME, DOCID_WORD_POSITIONS_DB_NAME,
@ -26,6 +26,12 @@ const DATABASE_NAMES: &[&str] = &[
DOCUMENTS_DB_NAME, DOCUMENTS_DB_NAME,
]; ];
/// Names of the databases accepted by the `DatabaseStats` subcommand
/// (used as the `possible_values` of its `database` argument).
///
/// `database_stats` decodes the values of each of these databases into a
/// `RoaringBitmap` to compute statistics on their lengths, so only databases
/// that it knows how to decode this way are listed here.
const POSTINGS_DATABASE_NAMES: &[&str] = &[
WORD_DOCIDS_DB_NAME,
DOCID_WORD_POSITIONS_DB_NAME,
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME,
];
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
#[structopt(name = "milli-info", about = "A stats crawler for milli.")] #[structopt(name = "milli-info", about = "A stats crawler for milli.")]
struct Opt { struct Opt {
@ -85,13 +91,16 @@ enum Command {
/// Outputs the average number of positions for each document words. /// Outputs the average number of positions for each document words.
AverageNumberOfPositionsByWord, AverageNumberOfPositionsByWord,
/// Outputs some statistics about the words pairs proximities /// Outputs some statistics about the given database (e.g. median, quartiles,
/// (median, quartiles, percentiles, minimum, maximum, average). /// percentiles, minimum, maximum, average, key size, value size).
WordPairProximityStats, DatabaseStats {
#[structopt(possible_values = POSTINGS_DATABASE_NAMES)]
database: String,
},
/// Outputs the size in bytes of the specified database. /// Outputs the size in bytes of the specified database.
SizeOfDatabase { SizeOfDatabase {
#[structopt(possible_values = DATABASE_NAMES)] #[structopt(possible_values = ALL_DATABASE_NAMES)]
database: String, database: String,
}, },
@ -152,7 +161,7 @@ fn main() -> anyhow::Result<()> {
average_number_of_positions_by_word(&index, &rtxn) average_number_of_positions_by_word(&index, &rtxn)
}, },
SizeOfDatabase { database } => size_of_database(&index, &rtxn, &database), SizeOfDatabase { database } => size_of_database(&index, &rtxn, &database),
WordPairProximityStats => word_pair_proximity_stats(&index, &rtxn), DatabaseStats { database } => database_stats(&index, &rtxn, &database),
WordPairProximitiesDocids { full_display, word1, word2 } => { WordPairProximitiesDocids { full_display, word1, word2 } => {
word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2)
}, },
@ -384,21 +393,27 @@ fn size_of_database(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Re
Ok(()) Ok(())
} }
fn word_pair_proximity_stats(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> {
use heed::types::ByteSlice; use heed::types::ByteSlice;
use heed::{Error, BytesDecode}; use heed::{Error, BytesDecode};
use milli::CboRoaringBitmapCodec; use roaring::RoaringBitmap;
use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>(
db: heed::PolyDatabase,
rtxn: &'a heed::RoTxn,
name: &str,
) -> anyhow::Result<()>
{
let mut key_size = 0u64; let mut key_size = 0u64;
let mut val_size = 0u64; let mut val_size = 0u64;
let mut values_length = Vec::new(); let mut values_length = Vec::new();
let db = index.word_pair_proximity_docids.as_polymorph();
for result in db.iter::<_, ByteSlice, ByteSlice>(rtxn)? { for result in db.iter::<_, ByteSlice, ByteSlice>(rtxn)? {
let (key, val) = result?; let (key, val) = result?;
key_size += key.len() as u64; key_size += key.len() as u64;
val_size += val.len() as u64; val_size += val.len() as u64;
let val = CboRoaringBitmapCodec::bytes_decode(val).ok_or(Error::Decoding)?; let val = DC::bytes_decode(val).ok_or(Error::Decoding)?;
values_length.push(val.len() as u32); values_length.push(val.len() as u32);
} }
@ -415,7 +430,7 @@ fn word_pair_proximity_stats(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Resul
let count = values_length.len(); let count = values_length.len();
let sum = values_length.iter().map(|l| *l as u64).sum::<u64>(); let sum = values_length.iter().map(|l| *l as u64).sum::<u64>();
println!("word-pair-proximity-docids stats on the lengths"); println!("The {} database stats on the lengths", name);
println!("\tnumber of proximity pairs: {}", count); println!("\tnumber of proximity pairs: {}", count);
println!("\tfirst quartile: {}", first_quartile); println!("\tfirst quartile: {}", first_quartile);
println!("\tmedian: {}", median); println!("\tmedian: {}", median);
@ -426,7 +441,6 @@ fn word_pair_proximity_stats(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Resul
println!("\tminimum: {}", minimum); println!("\tminimum: {}", minimum);
println!("\tmaximum: {}", maximum); println!("\tmaximum: {}", maximum);
println!("\taverage: {}", sum as f64 / count as f64); println!("\taverage: {}", sum as f64 / count as f64);
println!();
println!("\ttotal key size: {} bytes", key_size); println!("\ttotal key size: {} bytes", key_size);
println!("\ttotal val size: {} bytes", val_size); println!("\ttotal val size: {} bytes", val_size);
println!("\ttotal size: {} bytes", key_size + val_size); println!("\ttotal size: {} bytes", key_size + val_size);
@ -434,6 +448,23 @@ fn word_pair_proximity_stats(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Resul
Ok(()) Ok(())
} }
match name {
WORD_DOCIDS_DB_NAME => {
let db = index.word_docids.as_polymorph();
compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
},
DOCID_WORD_POSITIONS_DB_NAME => {
let db = index.docid_word_positions.as_polymorph();
compute_stats::<BoRoaringBitmapCodec>(*db, rtxn, name)
},
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => {
let db = index.word_pair_proximity_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
},
unknown => anyhow::bail!("unknown database {:?}", unknown),
}
}
fn word_pair_proximities_docids( fn word_pair_proximities_docids(
index: &Index, index: &Index,
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,