Fix the infos binary and add commands

2024-12-25 06:00:08 +01:00 · 2020-09-06 17:14:20 +02:00 · 2020-09-06 17:14:20 +02:00 · dec460ce52
commit dec460ce52
parent daa3673c1c
1 changed file with 38 additions and 168 deletions
--- a/src/bin/infos.rs
+++ b/src/bin/infos.rs
@ -39,45 +39,27 @@ enum Command {
    ///
    /// `word` are displayed and ordered by frequency.
    /// `document_frequency` defines the number of documents which contain the word.
-    /// `frequency` defines the number times the word appears in all the documents.
    MostCommonWords {
        /// The maximum number of frequencies to return.
        #[structopt(default_value = "10")]
        limit: usize,
    },

-    /// Outputs a CSV with the frequencies of the specified words.
-    ///
-    /// Read the documentation of the `most-common-words` command
-    /// for more information about the CSV headers.
-    WordsFrequencies {
-        /// The words you want to retrieve frequencies of.
-        words: Vec<String>,
-    },
-
    /// Outputs a CSV with the biggest entries of the database.
-    BiggestValueSizes {
+    BiggestValues {
        /// The maximum number of sizes to return.
        #[structopt(default_value = "10")]
        limit: usize,
    },

-    /// Outputs a CSV with the document ids for all the positions of the given words.
-    WordPositionDocIds {
-        /// Show the value entirely, not just the debug version.
-        #[structopt(long)]
-        full_display: bool,
-        /// The words you want to display the values of.
-        words: Vec<String>,
-    },
-
-    /// Outputs a CSV with all the positions of the given words.
-    WordPositions {
-        /// Show the value entirely, not just the debug version.
-        #[structopt(long)]
-        full_display: bool,
-        /// The words you want to display the values of.
-        words: Vec<String>,
+    /// Outputs the words FST to disk.
+    ///
+    /// One can use the FST binary helper to dissect and analyze it;
+    /// you can install it using `cargo install fst-bin`.
+    ExportWordsFst {
+        /// The path where the FST will be written.
+        #[structopt(short, long, default_value = "words.fst")]
+        output: PathBuf,
    },
 }

@ -101,80 +83,29 @@ fn main() -> anyhow::Result<()> {

    match opt.command {
        MostCommonWords { limit } => most_common_words(&index, &rtxn, limit),
-        WordsFrequencies { words } => words_frequencies(&index, &rtxn, words),
-        BiggestValueSizes { limit } => biggest_value_sizes(&index, &rtxn, limit),
-        WordPositionDocIds { full_display, words } => word_position_doc_ids(&index, &rtxn, !full_display, words),
-        WordPositions { full_display, words } => word_positions(&index, &rtxn, !full_display, words),
+        BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit),
+        ExportWordsFst { output } => export_words_fst(&index, &rtxn, output),
    }
 }

 fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
    use std::collections::BinaryHeap;
    use std::cmp::Reverse;
-    use roaring::RoaringBitmap;

    let mut heap = BinaryHeap::with_capacity(limit + 1);
-    let mut prev = None as Option<(String, u64, RoaringBitmap)>;
-    for result in index.word_position_docids.iter(rtxn)? {
+    for result in index.word_docids.iter(rtxn)? {
        if limit == 0 { break }
-
-        let ((word, _position), postings) = result?;
-        match prev.as_mut() {
-            Some((prev_word, freq, docids)) if prev_word == word => {
-                *freq += postings.len();
-                docids.union_with(&postings);
-            },
-            Some((prev_word, freq, docids)) => {
-                heap.push(Reverse((docids.len(), *freq, prev_word.to_string())));
-                if heap.len() > limit { heap.pop(); }
-                prev = Some((word.to_string(), postings.len(), postings))
-            },
-            None => prev = Some((word.to_string(), postings.len(), postings)),
-        }
-    }
-
-    if let Some((prev_word, freq, docids)) = prev {
-        heap.push(Reverse((docids.len(), freq, prev_word.to_string())));
+        let (word, docids) = result?;
+        heap.push((Reverse(docids.len()), word));
        if heap.len() > limit { heap.pop(); }
    }

    let stdout = io::stdout();
    let mut wtr = csv::Writer::from_writer(stdout.lock());
-    wtr.write_record(&["word", "document_frequency", "frequency"])?;
+    wtr.write_record(&["word", "document_frequency"])?;

-    for Reverse((document_frequency, frequency, word)) in heap.into_sorted_vec() {
-        wtr.write_record(&[word, document_frequency.to_string(), frequency.to_string()])?;
-    }
-
-    Ok(wtr.flush()?)
-}
-
-fn words_frequencies(index: &Index, rtxn: &heed::RoTxn, words: Vec<String>) -> anyhow::Result<()> {
-    use heed::BytesDecode;
-    use heed::types::ByteSlice;
-    use milli::heed_codec::{RoaringBitmapCodec, StrBEU32Codec};
-    use roaring::RoaringBitmap;
-
-    let stdout = io::stdout();
-    let mut wtr = csv::Writer::from_writer(stdout.lock());
-    wtr.write_record(&["word", "document_frequency", "frequency"])?;
-
-    for word in words {
-        let mut document_frequency = RoaringBitmap::new();
-        let mut frequency = 0;
-        let db = index.word_position_docids.as_polymorph();
-        for result in db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, word.as_bytes())? {
-            let (bytes, postings) = result?;
-            let (w, _position) = StrBEU32Codec::bytes_decode(bytes).unwrap();
-
-            // if the word is not exactly the word we requested then it means
-            // we found a word that *starts with* the requested word and we must stop.
-            if word != w { break }
-
-            document_frequency.union_with(&postings);
-            frequency += postings.len();
-        }
-        wtr.write_record(&[word, document_frequency.len().to_string(), frequency.to_string()])?;
+    for (Reverse(document_frequency), word) in heap.into_sorted_vec() {
+        wtr.write_record(&[word, &document_frequency.to_string()])?;
    }

    Ok(wtr.flush()?)
@ -183,15 +114,12 @@ fn words_frequencies(index: &Index, rtxn: &heed::RoTxn, words: Vec<String>) -> a
 fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
    use std::cmp::Reverse;
    use std::collections::BinaryHeap;
-    use heed::BytesDecode;
    use heed::types::{Str, ByteSlice};
-    use milli::heed_codec::StrBEU32Codec;
+    use milli::heed_codec::BEU32StrCodec;

    let main_name = "main";
-    let word_positions_name = "word_positions";
-    let word_position_docids_name = "word_position_docids";
-    let word_four_positions_docids_name = "word_four_positions_docids";
-    let word_attribute_docids_name = "word_attribute_docids";
+    let word_docids_name = "word_docids";
+    let docid_word_positions_name = "docid_word_positions";

    let mut heap = BinaryHeap::with_capacity(limit + 1);

@ -211,33 +139,16 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
            if heap.len() > limit { heap.pop(); }
        }

-        for result in index.word_positions.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? {
+        for result in index.word_docids.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? {
            let (word, value) = result?;
-            heap.push(Reverse((value.len(), word.to_string(), word_positions_name)));
+            heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
            if heap.len() > limit { heap.pop(); }
        }

-        for result in index.word_position_docids.as_polymorph().iter::<_, ByteSlice, ByteSlice>(rtxn)? {
-            let (key_bytes, value) = result?;
-            let (word, position) = StrBEU32Codec::bytes_decode(key_bytes).unwrap();
-            let key = format!("{} {}", word, position);
-            heap.push(Reverse((value.len(), key, word_position_docids_name)));
-            if heap.len() > limit { heap.pop(); }
-        }
-
-        for result in index.word_four_positions_docids.as_polymorph().iter::<_, ByteSlice, ByteSlice>(rtxn)? {
-            let (key_bytes, value) = result?;
-            let (word, lower_position) = StrBEU32Codec::bytes_decode(key_bytes).unwrap();
-            let key = format!("{} {}..{}", word, lower_position, lower_position + 4);
-            heap.push(Reverse((value.len(), key, word_four_positions_docids_name)));
-            if heap.len() > limit { heap.pop(); }
-        }
-
-        for result in index.word_attribute_docids.as_polymorph().iter::<_, ByteSlice, ByteSlice>(rtxn)? {
-            let (key_bytes, value) = result?;
-            let (word, attribute) = StrBEU32Codec::bytes_decode(key_bytes).unwrap();
-            let key = format!("{} {}", word, attribute);
-            heap.push(Reverse((value.len(), key, word_attribute_docids_name)));
+        for result in index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, ByteSlice>(rtxn)? {
+            let ((docid, word), value) = result?;
+            let key = format!("{} {}", docid, word);
+            heap.push(Reverse((value.len(), key, docid_word_positions_name)));
            if heap.len() > limit { heap.pop(); }
        }
    }
@ -253,61 +164,20 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
    Ok(wtr.flush()?)
 }

-fn word_position_doc_ids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>) -> anyhow::Result<()> {
-    use heed::BytesDecode;
-    use heed::types::ByteSlice;
-    use milli::heed_codec::{RoaringBitmapCodec, StrBEU32Codec};
+fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> {
+    use std::fs::File;
+    use std::io::Write as _;

-    let stdout = io::stdout();
-    let mut wtr = csv::Writer::from_writer(stdout.lock());
-    wtr.write_record(&["word", "position", "document_ids"])?;
+    let mut output = File::create(&output)
+        .with_context(|| format!("failed to create {} file", output.display()))?;

-    let mut non_debug = Vec::new();
-    for word in words {
-        let db = index.word_position_docids.as_polymorph();
-        for result in db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, word.as_bytes())? {
-            let (bytes, postings) = result?;
-            let (w, position) = StrBEU32Codec::bytes_decode(bytes).unwrap();
-
-            // if the word is not exactly the word we requested then it means
-            // we found a word that *starts with* the requested word and we must stop.
-            if word != w { break }
-
-            let postings_string = if debug {
-                format!("{:?}", postings)
-            } else {
-                non_debug.clear();
-                non_debug.extend(postings);
-                format!("{:?}", non_debug)
-            };
-
-            wtr.write_record(&[&word, &position.to_string(), &postings_string])?;
-        }
+    match index.fst(rtxn)? {
+        Some(fst) =>  output.write_all(fst.as_fst().as_bytes())?,
+        None => {
+            let fst = fst::Set::default();
+            output.write_all(fst.as_fst().as_bytes())?;
+        },
    }

-    Ok(wtr.flush()?)
-}
-
-fn word_positions(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>) -> anyhow::Result<()> {
-    let stdout = io::stdout();
-    let mut wtr = csv::Writer::from_writer(stdout.lock());
-    wtr.write_record(&["word", "positions"])?;
-
-    let mut non_debug = Vec::new();
-    for word in words {
-        let postings = index.word_positions.get(rtxn, &word)?
-            .with_context(|| format!("could not find word {:?}", &word))?;
-
-        let postings_string = if debug {
-            format!("{:?}", postings)
-        } else {
-            non_debug.clear();
-            non_debug.extend(postings);
-            format!("{:?}", non_debug)
-        };
-
-        wtr.write_record(&[word, postings_string])?;
-    }
-
-    Ok(wtr.flush()?)
+    Ok(())
 }