From ad11c5fb3ff14dbe95dc473de94d35e8157f030e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 7 Sep 2020 22:36:35 +0200 Subject: [PATCH] Introduce the words-docids command for the infos binary --- .gitignore | 1 + src/bin/infos.rs | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/.gitignore b/.gitignore index 43ec51292..0ffc4c5eb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target *.csv *.mmdb +*.svg diff --git a/src/bin/infos.rs b/src/bin/infos.rs index 7fce7d853..6ae3bda68 100644 --- a/src/bin/infos.rs +++ b/src/bin/infos.rs @@ -52,6 +52,16 @@ enum Command { limit: usize, }, + /// Outputs a CSV with the documents ids where the given words appears. + WordsDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// The words to display the documents ids of. + words: Vec<String>, + }, + /// Outputs the total size of all the docid-word-positions keys and values. TotalDocidWordPositionsSize, @@ -93,6 +103,7 @@ fn main() -> anyhow::Result<()> { match opt.command { MostCommonWords { limit } => most_common_words(&index, &rtxn, limit), BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit), + WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words), TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositions => average_number_of_positions(&index, &rtxn), @@ -176,6 +187,25 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho Ok(wtr.flush()?) } +fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>) -> anyhow::Result<()> { + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["word", "documents_ids"])?; + + for word in words { + if let Some(docids) = index.word_docids.get(rtxn, &word)?
{ + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::<Vec<_>>()) + }; + wtr.write_record(&[word, docids])?; + } + } + + Ok(wtr.flush()?) +} + fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> { use std::fs::File; use std::io::Write as _;