From 58237bd67f727a68f6128c4b926eaeacbc70e90f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 29 Sep 2020 18:32:48 +0200 Subject: [PATCH] Introduce the average-number-of-document-by-word-pair-proximity infos subcommand --- src/bin/infos.rs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/bin/infos.rs b/src/bin/infos.rs index 74db42419..7150e5962 100644 --- a/src/bin/infos.rs +++ b/src/bin/infos.rs @@ -71,6 +71,8 @@ enum Command { /// Outputs the average number of positions for each document words. AverageNumberOfPositionsByWord, + /// Outputs the average number of documents for each words pair. + AverageNumberOfDocumentByWordPairProximity, /// Outputs a CSV with the proximities for the two specidied words and /// the documents ids where these relations appears. @@ -128,6 +130,9 @@ fn main() -> anyhow::Result<()> { AverageNumberOfPositionsByWord => { average_number_of_positions_by_word(&index, &rtxn) }, + AverageNumberOfDocumentByWordPairProximity => { + average_number_of_document_by_word_pair_proximity(&index, &rtxn) + } WordPairProximitiesDocids { full_display, word1, word2 } => { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) }, @@ -331,6 +336,32 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any Ok(()) } +fn average_number_of_document_by_word_pair_proximity( + index: &Index, + rtxn: &heed::RoTxn, +) -> anyhow::Result<()> +{ + use heed::types::DecodeIgnore; + use milli::RoaringBitmapCodec; + + let mut values_length = Vec::new(); + let mut count = 0; + + let db = index.word_pair_proximity_docids.as_polymorph(); + for result in db.iter::<_, DecodeIgnore, RoaringBitmapCodec>(rtxn)? 
{ + let ((), val) = result?; + values_length.push(val.len() as u32); + count += 1; + } + + let values_length_sum = values_length.into_iter().map(|c| c as usize).sum::<usize>() as f64; + let count = count as f64; + + println!("average number of documents by words pairs proximities: {}", values_length_sum / count); + + Ok(()) +} + fn word_pair_proximities_docids( index: &Index, rtxn: &heed::RoTxn,