From bd63da0a0edad945644fbd874e22e22509b3e5ae Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 28 Dec 2020 16:46:16 +0100 Subject: [PATCH 1/3] Add missing databases to the infos subcommand --- infos/src/main.rs | 84 +++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 51 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 0d2b7abb5..9f16c7c0e 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -19,6 +19,8 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids"; const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; +const FACET_FIELD_ID_VALUE_DOCIDS_NAME: &str = "facet-field-id-value-docids"; +const FIELD_ID_DOCID_FACET_VALUES_NAME: &str = "field-id-docid-facet-values"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const DOCUMENTS_DB_NAME: &str = "documents"; @@ -28,6 +30,8 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PREFIX_DOCIDS_DB_NAME, DOCID_WORD_POSITIONS_DB_NAME, WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, + FACET_FIELD_ID_VALUE_DOCIDS_NAME, + FIELD_ID_DOCID_FACET_VALUES_NAME, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, DOCUMENTS_DB_NAME, ]; @@ -116,9 +120,6 @@ enum Command { field_name: String, }, - /// Outputs the total size of all the docid-word-positions keys and values. - TotalDocidWordPositionsSize, - /// Outputs the average number of *different* words by document. AverageNumberOfWordsByDoc, @@ -132,10 +133,10 @@ enum Command { database: String, }, - /// Outputs the size in bytes of the specified database. + /// Outputs the size in bytes of the specified databases names. 
SizeOfDatabase { #[structopt(possible_values = ALL_DATABASE_NAMES)] - database: String, + databases: Vec, }, /// Outputs a CSV with the proximities for the two specidied words and @@ -209,12 +210,11 @@ fn main() -> anyhow::Result<()> { facet_values_docids(&index, &rtxn, !full_display, field_name) }, FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), - TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositionsByWord => { average_number_of_positions_by_word(&index, &rtxn) }, - SizeOfDatabase { database } => size_of_database(&index, &rtxn, &database), + SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases), DatabaseStats { database } => database_stats(&index, &rtxn, &database), WordPairProximitiesDocids { full_display, word1, word2 } => { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) @@ -620,28 +620,6 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) - Ok(()) } -fn total_docid_word_positions_size(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { - use heed::types::ByteSlice; - - let mut total_key_size = 0; - let mut total_val_size = 0; - let mut count = 0; - - let iter = index.docid_word_positions.as_polymorph().iter::<_, ByteSlice, ByteSlice>(rtxn)?; - for result in iter { - let (key, val) = result?; - total_key_size += key.len(); - total_val_size += val.len(); - count += 1; - } - - println!("number of keys: {}", count); - println!("total key size: {}", total_key_size); - println!("total value size: {}", total_val_size); - - Ok(()) -} - fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use heed::types::DecodeIgnore; use milli::{DocumentId, BEU32StrCodec}; @@ -703,33 +681,37 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any Ok(()) } -fn size_of_database(index: &Index, 
rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> { +fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> anyhow::Result<()> { use heed::types::ByteSlice; - let database = match name { - MAIN_DB_NAME => &index.main, - WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(), - WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(), - DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), - DOCUMENTS_DB_NAME => index.documents.as_polymorph(), - unknown => anyhow::bail!("unknown database {:?}", unknown), - }; + for name in names { + let database = match name.as_str() { + MAIN_DB_NAME => &index.main, + WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(), + WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(), + DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), + WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), + FACET_FIELD_ID_VALUE_DOCIDS_NAME => index.facet_field_id_value_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_VALUES_NAME => index.field_id_docid_facet_values.as_polymorph(), + DOCUMENTS_DB_NAME => index.documents.as_polymorph(), + unknown => anyhow::bail!("unknown database {:?}", unknown), + }; - let mut key_size: u64 = 0; - let mut val_size: u64 = 0; - for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? { - let (k, v) = result?; - key_size += k.len() as u64; - val_size += v.len() as u64; + let mut key_size: u64 = 0; + let mut val_size: u64 = 0; + for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? 
{ + let (k, v) = result?; + key_size += k.len() as u64; + val_size += v.len() as u64; + } + + println!("The {} database weigh:", name); + println!("\ttotal key size: {} bytes", key_size); + println!("\ttotal val size: {} bytes", val_size); + println!("\ttotal size: {} bytes", key_size + val_size); } - println!("The {} database weigh:", name); - println!("\ttotal key size: {} bytes", key_size); - println!("\ttotal val size: {} bytes", val_size); - println!("\ttotal size: {} bytes", key_size + val_size); - Ok(()) } From 3d02b19fbd9a88f51c9358e583afcda9ba9e3eef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 5 Mar 2021 16:13:21 +0100 Subject: [PATCH 2/3] Introduce the docids-words-positions subcommand to the infos crate --- infos/src/main.rs | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/infos/src/main.rs b/infos/src/main.rs index 9f16c7c0e..c966a0143 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -114,6 +114,16 @@ enum Command { field_name: String, }, + /// Outputs a CSV with the document ids, words and the positions where each word appears. + DocidsWordsPositions { + /// Display the whole positions in detail. + #[structopt(long)] + full_display: bool, + + /// If defined, only retrieve the documents that correspond to these internal ids. + internal_documents_ids: Vec, + }, + /// Outputs some facets statistics for the given facet name. FacetStats { /// The field name in the document. 
@@ -209,6 +219,9 @@ fn main() -> anyhow::Result<()> { FacetValuesDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, field_name) }, + DocidsWordsPositions { full_display, internal_documents_ids } => { + docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) + }, FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositionsByWord => { @@ -525,6 +538,39 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam Ok(wtr.flush()?) } +fn docids_words_positions( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + internal_ids: Vec, +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["document_id", "word", "positions"])?; + + let iter: Box> = if internal_ids.is_empty() { + Box::new(index.docid_word_positions.iter(rtxn)?) + } else { + let vec: heed::Result> = internal_ids.into_iter().map(|id| { + index.docid_word_positions.prefix_iter(rtxn, &(id, "")) + }).collect(); + Box::new(vec?.into_iter().flatten()) + }; + + for result in iter { + let ((id, word), positions) = result?; + let positions = if debug { + format!("{:?}", positions) + } else { + format!("{:?}", positions.iter().collect::>()) + }; + wtr.write_record(&[&id.to_string(), word, &positions])?; + } + + Ok(wtr.flush()?) 
+} + fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; From 18844d60b569268d7eef3d76a5533823a65e858d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 5 Mar 2021 16:37:18 +0100 Subject: [PATCH 3/3] Simplify the output of database sizes in the infos crate --- infos/src/main.rs | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index c966a0143..376679656 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -30,9 +30,9 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PREFIX_DOCIDS_DB_NAME, DOCID_WORD_POSITIONS_DB_NAME, WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, FACET_FIELD_ID_VALUE_DOCIDS_NAME, FIELD_ID_DOCID_FACET_VALUES_NAME, - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, DOCUMENTS_DB_NAME, ]; @@ -145,6 +145,8 @@ enum Command { /// Outputs the size in bytes of the specified databases names. SizeOfDatabase { + /// The names of the databases to measure the size of; if none are specified it is equivalent + /// to specifying all the database names. 
#[structopt(possible_values = ALL_DATABASE_NAMES)] databases: Vec, }, @@ -730,6 +732,12 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> anyhow::Result<()> { use heed::types::ByteSlice; + let names = if names.is_empty() { + ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect() + } else { + names + }; + for name in names { let database = match name.as_str() { MAIN_DB_NAME => &index.main, @@ -753,9 +761,9 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a } println!("The {} database weigh:", name); - println!("\ttotal key size: {} bytes", key_size); - println!("\ttotal val size: {} bytes", val_size); - println!("\ttotal size: {} bytes", key_size + val_size); + println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true)); + println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true)); + println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true)); } Ok(()) @@ -810,9 +818,9 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu println!("\tminimum: {}", minimum); println!("\tmaximum: {}", maximum); println!("\taverage: {}", sum as f64 / count as f64); - println!("\ttotal key size: {} bytes", key_size); - println!("\ttotal val size: {} bytes", val_size); - println!("\ttotal size: {} bytes", key_size + val_size); + println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true)); + println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true)); + println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true)); Ok(()) }