Introduce the average-number-of-document-by-word-pair-proximity infos subcommand

This commit is contained in:
Kerollmops 2020-09-29 18:32:48 +02:00
parent 991be8950e
commit 58237bd67f
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -71,6 +71,8 @@ enum Command {
/// Outputs the average number of positions for each document words.
AverageNumberOfPositionsByWord,
/// Outputs the average number of documents for each words pair.
AverageNumberOfDocumentByWordPairProximity,
/// Outputs a CSV with the proximities for the two specified words and
/// the document ids where these relations appear.
@ -128,6 +130,9 @@ fn main() -> anyhow::Result<()> {
AverageNumberOfPositionsByWord => {
average_number_of_positions_by_word(&index, &rtxn)
},
AverageNumberOfDocumentByWordPairProximity => {
average_number_of_document_by_word_pair_proximity(&index, &rtxn)
}
WordPairProximitiesDocids { full_display, word1, word2 } => {
word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2)
},
@ -331,6 +336,32 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any
Ok(())
}
/// Prints the average number of document ids stored per entry of the
/// `word_pair_proximity_docids` database.
///
/// Every entry of the database is visited once: its key is skipped
/// (`DecodeIgnore`) and its value is decoded as a roaring bitmap whose
/// cardinality is added to a running total. The total divided by the number
/// of visited entries is then written to stdout.
///
/// When the database contains no entry the division is `0.0 / 0.0`, so the
/// printed average is `NaN`.
///
/// # Errors
///
/// Returns an error if the iterator cannot be opened on the database or if
/// reading/decoding any entry fails.
fn average_number_of_document_by_word_pair_proximity(
    index: &Index,
    rtxn: &heed::RoTxn,
) -> anyhow::Result<()>
{
    use heed::types::DecodeIgnore;
    use milli::RoaringBitmapCodec;

    // Running accumulators instead of a per-entry buffer: memory usage stays
    // constant however many entries the database holds. Both are `u64` so
    // neither the bitmap cardinality nor the entry counter can be truncated
    // or overflow on large indexes.
    let mut docids_total: u64 = 0;
    let mut entries: u64 = 0;

    let db = index.word_pair_proximity_docids.as_polymorph();
    for result in db.iter::<_, DecodeIgnore, RoaringBitmapCodec>(rtxn)? {
        let ((), docids) = result?;
        docids_total += docids.len();
        entries += 1;
    }

    let average = docids_total as f64 / entries as f64;
    println!("average number of documents by words pairs proximities: {}", average);

    Ok(())
}
fn word_pair_proximities_docids(
index: &Index,
rtxn: &heed::RoTxn,