From 85d51ab2287337ea8540d1171b0cd60a8d25641a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 2 Dec 2020 10:30:28 +0100 Subject: [PATCH 1/2] Introduce an infos subcommand to export documents from an index --- src/subcommand/infos.rs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index f8138660b..1093c0f48 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -145,6 +145,11 @@ enum Command { output: PathBuf, }, + /// Outputs the documents as JSON lines to the standard output. + /// + /// All of the fields are extracted, not just the displayed ones. + ExportDocuments, + /// A command that patches the old external ids /// into the new external ids format. PatchToNewExternalIds, @@ -183,13 +188,14 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) }, ExportWordsFst { output } => export_words_fst(&index, &rtxn, output), + ExportDocuments => export_documents(&index, &rtxn), PatchToNewExternalIds => { drop(rtxn); let mut wtxn = index.write_txn()?; let result = patch_to_new_external_ids(&index, &mut wtxn); wtxn.commit()?; result - } + }, } } @@ -489,6 +495,25 @@ fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyho let words_fst = index.words_fst(rtxn)?; output.write_all(words_fst.as_fst().as_bytes())?; +fn export_documents(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { + use std::io::{BufWriter, Write as _}; + use crate::obkv_to_json; + + let stdout = io::stdout(); + let mut out = BufWriter::new(stdout); + + let fields_ids_map = index.fields_ids_map(rtxn)?; + let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); + + for result in index.documents.iter(rtxn)? { + let (_id, obkv) = result?; + let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv)?; + serde_json::to_writer(&mut out, &document)?; + writeln!(&mut out)?; + } + + out.into_inner()?; + Ok(()) } From 16755b26e2540aa7c26ca07ba8b8b44acb90f12f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 2 Dec 2020 10:43:22 +0100 Subject: [PATCH 2/2] Make the export words FST export infos subcommand outputs to stdout --- src/subcommand/infos.rs | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 1093c0f48..54ce620cb 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -135,15 +135,11 @@ enum Command { word2: String, }, - /// Outputs the words FST to disk. + /// Outputs the words FST to standard output. /// /// One can use the FST binary helper to dissect and analyze it, /// you can install it using `cargo install fst-bin`. - ExportWordsFst { - /// The path where the FST will be written. - #[structopt(short, long, default_value = "words.fst")] - output: PathBuf, - }, + ExportWordsFst, /// Outputs the documents as JSON lines to the standard output. /// @@ -187,7 +183,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { WordPairProximitiesDocids { full_display, word1, word2 } => { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) }, - ExportWordsFst { output } => export_words_fst(&index, &rtxn, output), + ExportWordsFst => export_words_fst(&index, &rtxn), ExportDocuments => export_documents(&index, &rtxn), PatchToNewExternalIds => { drop(rtxn); @@ -485,15 +481,15 @@ fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow: Ok(()) } -fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> { - use std::fs::File; +fn export_words_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use std::io::Write as _; - let mut output = File::create(&output) - .with_context(|| format!("failed to create {} file", output.display()))?; - + let mut stdout = io::stdout(); let words_fst = index.words_fst(rtxn)?; - output.write_all(words_fst.as_fst().as_bytes())?; + stdout.write_all(words_fst.as_fst().as_bytes())?; + + Ok(()) +} fn export_documents(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use std::io::{BufWriter, Write as _};