From 3d02b19fbd9a88f51c9358e583afcda9ba9e3eef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Fri, 5 Mar 2021 16:13:21 +0100
Subject: [PATCH] Introduce the docids-words-positions subcommand to the infos crate

---
Review notes (text between the `---` line and the diff is ignored by `git am`):

  * `docids_words_positions` writes a CSV to stdout with the columns
    `document_id`, `word` and `positions`.
  * With no internal ids it iterates the whole `docid_word_positions`
    database; otherwise it runs one `prefix_iter` per id, using `(id, "")`
    as the key prefix, and chains the results in the order the ids were given.
  * `main` passes `!full_display` as `debug`: when `debug` is true the stored
    positions value is printed with its own `Debug` output, otherwise every
    position is collected into a `Vec` and that is printed.

 infos/src/main.rs | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/infos/src/main.rs b/infos/src/main.rs
index 9f16c7c0e..c966a0143 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -114,6 +114,16 @@ enum Command {
         field_name: String,
     },
 
+    /// Outputs a CSV with the documents ids, words and the positions where this word appears.
+    DocidsWordsPositions {
+        /// Display the whole positions in detail.
+        #[structopt(long)]
+        full_display: bool,
+
+        /// If defined, only retrieve the documents that corresponds to these internal ids.
+        internal_documents_ids: Vec<u32>,
+    },
+
     /// Outputs some facets statistics for the given facet name.
     FacetStats {
         /// The field name in the document.
@@ -209,6 +219,9 @@ fn main() -> anyhow::Result<()> {
         FacetValuesDocids { full_display, field_name } => {
             facet_values_docids(&index, &rtxn, !full_display, field_name)
         },
+        DocidsWordsPositions { full_display, internal_documents_ids } => {
+            docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
+        },
         FacetStats { field_name } => facet_stats(&index, &rtxn, field_name),
         AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
         AverageNumberOfPositionsByWord => {
@@ -525,6 +538,39 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam
     Ok(wtr.flush()?)
 }
 
+fn docids_words_positions(
+    index: &Index,
+    rtxn: &heed::RoTxn,
+    debug: bool,
+    internal_ids: Vec<u32>,
+) -> anyhow::Result<()>
+{
+    let stdout = io::stdout();
+    let mut wtr = csv::Writer::from_writer(stdout.lock());
+    wtr.write_record(&["document_id", "word", "positions"])?;
+
+    let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() {
+        Box::new(index.docid_word_positions.iter(rtxn)?)
+    } else {
+        let vec: heed::Result<Vec<_>> = internal_ids.into_iter().map(|id| {
+            index.docid_word_positions.prefix_iter(rtxn, &(id, ""))
+        }).collect();
+        Box::new(vec?.into_iter().flatten())
+    };
+
+    for result in iter {
+        let ((id, word), positions) = result?;
+        let positions = if debug {
+            format!("{:?}", positions)
+        } else {
+            format!("{:?}", positions.iter().collect::<Vec<_>>())
+        };
+        wtr.write_record(&[&id.to_string(), word, &positions])?;
+    }
+
+    Ok(wtr.flush()?)
+}
+
 fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> {
     let fields_ids_map = index.fields_ids_map(&rtxn)?;
     let faceted_fields = index.faceted_fields_ids(&rtxn)?;