Introduce the word-pair-proximities-docids infos subcommand

This commit is contained in:
Clément Renault 2020-09-22 13:52:24 +02:00
parent d6fa9c0414
commit a58ae5eb2a
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -71,6 +71,24 @@ enum Command {
/// Outputs the average number of positions for each document words.
AverageNumberOfPositions,
/// Outputs a CSV with the proximities for the two specified words and
/// the documents ids where these relations appear.
///
/// `word1`, `word2` define the word pair specified and sorted.
/// `proximity` defines the proximity between the two specified words.
/// `documents_ids` defines the documents ids where the relation appears.
WordPairProximitiesDocids {
/// Display the whole documents ids in details.
#[structopt(long)]
full_display: bool,
/// First word of the word pair.
word1: String,
/// Second word of the word pair.
word2: String,
},
/// Outputs the words FST to disk.
///
/// One can use the FST binary helper to dissect and analyze it,
@ -107,6 +125,9 @@ fn main() -> anyhow::Result<()> {
TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn),
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
AverageNumberOfPositions => average_number_of_positions(&index, &rtxn),
WordPairProximitiesDocids { full_display, word1, word2 } => {
word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2)
},
ExportWordsFst { output } => export_words_fst(&index, &rtxn, output),
}
}
@ -306,3 +327,47 @@ fn average_number_of_positions(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Res
Ok(())
}
fn word_pair_proximities_docids(
index: &Index,
rtxn: &heed::RoTxn,
debug: bool,
word1: String,
word2: String,
) -> anyhow::Result<()>
{
use heed::types::ByteSlice;
use milli::RoaringBitmapCodec;
let (w1, w2) = if word1 > word2 { (word2, word1) } else { (word1, word2) };
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["word1", "word2", "proximity", "documents_ids"])?;
// Create the prefix key with only the pair of words.
let mut prefix = Vec::with_capacity(w1.len() + w2.len() + 1);
prefix.extend_from_slice(w1.as_bytes());
prefix.push(0);
prefix.extend_from_slice(w2.as_bytes());
let db = index.word_pair_proximity_docids.as_polymorph();
let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?;
for result in iter {
let (key, docids) = result?;
// Skip keys that are longer than the requested one,
// a longer key means that the second word is a prefix of the request word.
if key.len() != prefix.len() + 1 { continue; }
let proximity = key.last().unwrap();
let docids = if debug {
format!("{:?}", docids)
} else {
format!("{:?}", docids.iter().collect::<Vec<_>>())
};
wtr.write_record(&[&w1, &w2, &proximity.to_string(), &docids])?;
}
Ok(wtr.flush()?)
}