Introduce the word-pair-proximities-docids infos subcommand

2025-07-04 20:37:15 +02:00 · 2020-09-22 13:52:24 +02:00 · 2020-09-22 13:52:24 +02:00 · a58ae5eb2a
commit a58ae5eb2a
parent d6fa9c0414
1 changed files with 65 additions and 0 deletions
--- a/src/bin/infos.rs
+++ b/src/bin/infos.rs
@ -71,6 +71,24 @@ enum Command {
    /// Outputs the average number of positions for each document words.
    AverageNumberOfPositions,

+    /// Outputs a CSV with the proximities for the two specified words and
+    /// the documents ids where these relations appear.
+    ///
+    /// `word1`, `word2` define the word pair specified and sorted.
+    /// `proximity` defines the proximity between the two specified words.
+    /// `documents_ids` defines the documents ids where the relation appears.
+    WordPairProximitiesDocids {
+        /// Display the whole documents ids in details.
+        #[structopt(long)]
+        full_display: bool,
+
+        /// First word of the word pair.
+        word1: String,
+
+        /// Second word of the word pair.
+        word2: String,
+    },
+
    /// Outputs the words FST to disk.
    ///
    /// One can use the FST binary helper to dissect and analyze it,
@ -107,6 +125,9 @@ fn main() -> anyhow::Result<()> {
        TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn),
        AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
        AverageNumberOfPositions => average_number_of_positions(&index, &rtxn),
+        WordPairProximitiesDocids { full_display, word1, word2 } => {
+            word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2)
+        },
        ExportWordsFst { output } => export_words_fst(&index, &rtxn, output),
    }
 }
@ -306,3 +327,47 @@ fn average_number_of_positions(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Res

    Ok(())
 }
+
+fn word_pair_proximities_docids(
+    index: &Index,
+    rtxn: &heed::RoTxn,
+    debug: bool, // when true, docids are printed with the bitmap's own Debug output instead of the full id list
+    word1: String,
+    word2: String,
+) -> anyhow::Result<()> // writes to stdout one CSV row per proximity entry found for the word pair
+{
+    use heed::types::ByteSlice;
+    use milli::RoaringBitmapCodec;
+
+    let (w1, w2) = if word1 > word2 { (word2, word1) } else { (word1, word2) }; // order the pair so that w1 <= w2
+
+    let stdout = io::stdout();
+    let mut wtr = csv::Writer::from_writer(stdout.lock());
+    wtr.write_record(&["word1", "word2", "proximity", "documents_ids"])?; // CSV header row
+
+    // Build the lookup prefix from the pair of words only: the bytes of w1, a 0 byte separator, then the bytes of w2.
+    let mut prefix = Vec::with_capacity(w1.len() + w2.len() + 1);
+    prefix.extend_from_slice(w1.as_bytes());
+    prefix.push(0);
+    prefix.extend_from_slice(w2.as_bytes());
+
+    let db = index.word_pair_proximity_docids.as_polymorph();
+    let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?;
+    for result in iter {
+        let (key, docids) = result?;
+
+        // Only keep keys that are exactly one byte longer than the prefix (that last byte is read as the proximity below);
+        // a longer key means that the requested second word is only a prefix of the second word stored in the key.
+        if key.len() != prefix.len() + 1 { continue; }
+
+        let proximity = key.last().unwrap(); // cannot panic: the length check above guarantees a non-empty key
+        let docids = if debug {
+            format!("{:?}", docids) // Debug output of the decoded bitmap value itself
+        } else {
+            format!("{:?}", docids.iter().collect::<Vec<_>>()) // every document id, listed explicitly
+        };
+        wtr.write_record(&[&w1, &w2, &proximity.to_string(), &docids])?;
+    }
+
+    Ok(wtr.flush()?)
+}