From a58ae5eb2a5be72f3298b1acc7cf0421033e4bdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 22 Sep 2020 13:52:24 +0200 Subject: [PATCH] Introduce the word-pair-proximities-docids infos subcommand --- src/bin/infos.rs | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/src/bin/infos.rs b/src/bin/infos.rs index 6ae3bda68..1b8eb5d24 100644 --- a/src/bin/infos.rs +++ b/src/bin/infos.rs @@ -71,6 +71,24 @@ enum Command { /// Outputs the average number of positions for each document words. AverageNumberOfPositions, + /// Outputs a CSV with the proximities for the two specified words and + /// the document ids where these relations appear. + /// + /// `word1`, `word2` define the specified word pair, sorted. + /// `proximity` defines the proximity between the two specified words. + /// `documents_ids` defines the document ids where the relation appears. + WordPairProximitiesDocids { + /// Display all the document ids in detail. + #[structopt(long)] + full_display: bool, + + /// First word of the word pair. + word1: String, + + /// Second word of the word pair. + word2: String, + }, + /// Outputs the words FST to disk. 
/// /// One can use the FST binary helper to dissect and analyze it, @@ -107,6 +125,9 @@ fn main() -> anyhow::Result<()> { TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositions => average_number_of_positions(&index, &rtxn), + WordPairProximitiesDocids { full_display, word1, word2 } => { + word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) + }, ExportWordsFst { output } => export_words_fst(&index, &rtxn, output), } } @@ -306,3 +327,47 @@ fn average_number_of_positions(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Res Ok(()) } + +fn word_pair_proximities_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + word1: String, + word2: String, +) -> anyhow::Result<()> +{ + use heed::types::ByteSlice; + use milli::RoaringBitmapCodec; + + let (w1, w2) = if word1 > word2 { (word2, word1) } else { (word1, word2) }; + + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["word1", "word2", "proximity", "documents_ids"])?; + + // Create the prefix key with only the pair of words. + let mut prefix = Vec::with_capacity(w1.len() + w2.len() + 1); + prefix.extend_from_slice(w1.as_bytes()); + prefix.push(0); + prefix.extend_from_slice(w2.as_bytes()); + + let db = index.word_pair_proximity_docids.as_polymorph(); + let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?; + for result in iter { + let (key, docids) = result?; + + // Skip keys that are longer than the requested one, + // a longer key means that the requested second word is only a prefix of the stored word. + if key.len() != prefix.len() + 1 { continue; } + + let proximity = key.last().unwrap(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::<Vec<_>>()) + }; + wtr.write_record(&[&w1, &w2, &proximity.to_string(), &docids])?; + } + + Ok(wtr.flush()?) +}