diff --git a/src/bin/infos.rs b/src/bin/infos.rs index 7150e5962..4529e6621 100644 --- a/src/bin/infos.rs +++ b/src/bin/infos.rs @@ -12,6 +12,20 @@ use Command::*; #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +const MAIN_DB_NAME: &str = "main"; +const WORD_DOCIDS_DB_NAME: &str = "word-docids"; +const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; +const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; +const DOCUMENTS_DB_NAME: &str = "documents"; + +const DATABASE_NAMES: &[&str] = &[ + MAIN_DB_NAME, + WORD_DOCIDS_DB_NAME, + DOCID_WORD_POSITIONS_DB_NAME, + WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, + DOCUMENTS_DB_NAME, +]; + #[derive(Debug, StructOpt)] #[structopt(name = "milli-info", about = "A stats crawler for milli.")] struct Opt { @@ -74,6 +88,12 @@ enum Command { /// Outputs the average number of documents for each words pair. AverageNumberOfDocumentByWordPairProximity, + /// Outputs the size in bytes of the specified database. + SizeOfDatabase { + #[structopt(possible_values = DATABASE_NAMES)] + database: String, + }, + /// Outputs a CSV with the proximities for the two specidied words and /// the documents ids where these relations appears. 
///
@@ -130,6 +150,7 @@ fn main() -> anyhow::Result<()> {
         AverageNumberOfPositionsByWord => {
             average_number_of_positions_by_word(&index, &rtxn)
         },
+        SizeOfDatabase { database } => size_of_database(&index, &rtxn, &database),
         AverageNumberOfDocumentByWordPairProximity => {
             average_number_of_document_by_word_pair_proximity(&index, &rtxn)
         }
@@ -336,6 +357,45 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any
     Ok(())
 }
 
+/// Prints the cumulated size, in bytes, of the keys and of the values stored
+/// in the database designated by `name`.
+///
+/// `name` is matched against the `*_DB_NAME` constants; any other value makes
+/// the function return an "unknown database" error. Errors raised while
+/// creating the iterator or while reading an entry are propagated to the
+/// caller.
+fn size_of_database(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> {
+    use heed::types::ByteSlice;
+
+    let database = match name {
+        MAIN_DB_NAME => &index.main,
+        WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(),
+        DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(),
+        WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(),
+        DOCUMENTS_DB_NAME => index.documents.as_polymorph(),
+        otherwise => anyhow::bail!("unknown database {:?}", otherwise),
+    };
+
+    // Keys and values are both read as raw byte slices: only their lengths
+    // are needed, so no typed decoding is requested from the iterator.
+    let mut key_size: u64 = 0;
+    let mut val_size: u64 = 0;
+    for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? {
+        let (k, v) = result?;
+        key_size += k.len() as u64;
+        val_size += v.len() as u64;
+    }
+
+    // NOTE(review): the report is written to stderr, not stdout; confirm this
+    // is intended for a command documented as "Outputs the size in bytes".
+    eprintln!(
+        "The {} database weighs {} bytes in terms of keys and {} bytes in terms of values.",
+        name, key_size, val_size,
+    );
+
+    Ok(())
+}
+
 fn average_number_of_document_by_word_pair_proximity(
     index: &Index,
     rtxn: &heed::RoTxn,
diff --git a/src/lib.rs b/src/lib.rs
index 0e94fcef7..850869064 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,10 +43,10 @@ pub struct Index {
     pub word_docids: Database,
     /// Maps a word and a document id (u32) to all the positions where the given word appears.
     pub docid_word_positions: Database,
-    /// Maps the document id to the document as a CSV line.
-    pub documents: Database, ByteSlice>,
     /// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database, + /// Maps the document id to the document as a CSV line. + pub documents: Database, ByteSlice>, } impl Index { @@ -55,8 +55,8 @@ impl Index { main: env.create_poly_database(None)?, word_docids: env.create_database(Some("word-docids"))?, docid_word_positions: env.create_database(Some("docid-word-positions"))?, - documents: env.create_database(Some("documents"))?, word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?, + documents: env.create_database(Some("documents"))?, }) }