From 45330a5e479d921e150c2a92c3929e78e2694235 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 16:19:27 +0100 Subject: [PATCH 1/5] Avoid creating a default empty database in the infos crate --- infos/src/main.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/infos/src/main.rs b/infos/src/main.rs index 3f41b7d42..1ebf39969 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -200,6 +200,11 @@ fn run(opt: Opt) -> anyhow::Result<()> { let mut options = EnvOpenOptions::new(); options.map_size(opt.database_size.get_bytes() as usize); + // Return an error if the database does not exist. + if !opt.database.exists() { + anyhow::bail!("The database ({}) does not exist.", opt.database.display()); + } + // Open the LMDB database. let index = Index::new(options, opt.database)?; let rtxn = index.read_txn()?; From b59fe77ec75c098b2c3c3599acd5a9a59432013b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 16:19:52 +0100 Subject: [PATCH 2/5] Avoid creating a default empty database in the search crate --- search/src/main.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/search/src/main.rs b/search/src/main.rs index d2e727417..7e9443e5f 100644 --- a/search/src/main.rs +++ b/search/src/main.rs @@ -57,7 +57,11 @@ fn run(opt: Opt) -> anyhow::Result<()> { .timestamp(stderrlog::Timestamp::Off) .init()?; - std::fs::create_dir_all(&opt.database)?; + // Return an error if the database does not exist. 
+ if !opt.database.exists() { + anyhow::bail!("The database ({}) does not exist.", opt.database.display()); + } + let mut options = EnvOpenOptions::new(); options.map_size(opt.database_size.get_bytes() as usize); From 78bede1ffbc11a2123ce9d1dc9e7a52e3290d022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 16:21:44 +0100 Subject: [PATCH 3/5] Fix error displaying of the workspace members --- infos/src/main.rs | 11 +---------- search/src/main.rs | 11 +---------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 1ebf39969..c627edad8 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -179,18 +179,9 @@ enum Command { PatchToNewExternalIds, } -fn main() -> Result<(), ()> { +fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); - match run(opt) { - Ok(()) => Ok(()), - Err(e) => { - eprintln!("{}", e); - Err(()) - }, - } -} -fn run(opt: Opt) -> anyhow::Result<()> { stderrlog::new() .verbosity(opt.verbose) .show_level(false) diff --git a/search/src/main.rs b/search/src/main.rs index 7e9443e5f..f7f95b730 100644 --- a/search/src/main.rs +++ b/search/src/main.rs @@ -39,18 +39,9 @@ pub struct Opt { print_facet_distribution: bool, } -fn main() -> Result<(), ()> { +fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); - match run(opt) { - Ok(()) => Ok(()), - Err(e) => { - eprintln!("{}", e); - Err(()) - }, - } -} -fn run(opt: Opt) -> anyhow::Result<()> { stderrlog::new() .verbosity(opt.verbose) .show_level(false) From 4884b324e63262dcfbfbf6ee2308678177cd547d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 16:24:12 +0100 Subject: [PATCH 4/5] Remove the useless external ids patch method in the infos crate --- infos/src/main.rs | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index c627edad8..92eeebf83 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs 
@@ -21,7 +21,6 @@ const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const DOCUMENTS_DB_NAME: &str = "documents"; -const USERS_IDS_DOCUMENTS_IDS: &[u8] = b"users-ids-documents-ids"; const ALL_DATABASE_NAMES: &[&str] = &[ MAIN_DB_NAME, @@ -173,10 +172,6 @@ enum Command { /// /// All of the fields are extracted, not just the displayed ones. ExportDocuments, - - /// A command that patches the old external ids - /// into the new external ids format. - PatchToNewExternalIds, } fn main() -> anyhow::Result<()> { @@ -224,32 +219,9 @@ fn main() -> anyhow::Result<()> { ExportWordsFst => export_words_fst(&index, &rtxn), ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), ExportDocuments => export_documents(&index, &rtxn), - PatchToNewExternalIds => { - drop(rtxn); - let mut wtxn = index.write_txn()?; - let result = patch_to_new_external_ids(&index, &mut wtxn); - wtxn.commit()?; - result - }, } } -fn patch_to_new_external_ids(index: &Index, wtxn: &mut heed::RwTxn) -> anyhow::Result<()> { - use heed::types::ByteSlice; - - if let Some(documents_ids) = index.main.get::<_, ByteSlice, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS)? 
{ - let documents_ids = documents_ids.to_owned(); - index.main.put::<_, ByteSlice, ByteSlice>( - wtxn, - milli::index::HARD_EXTERNAL_DOCUMENTS_IDS_KEY.as_bytes(), - &documents_ids, - )?; - index.main.delete::<_, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS)?; - } - - Ok(()) -} - fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { use std::collections::BinaryHeap; use std::cmp::Reverse; From 1eb7ce5cdbad9274673b64ad8f051f7ce98bce02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 23 Feb 2021 21:08:52 +0100 Subject: [PATCH 5/5] Improve the export-documents infos command by accepting internal ids --- infos/src/main.rs | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 92eeebf83..91157aaad 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -171,7 +171,10 @@ enum Command { /// Outputs the documents as JSON lines to the standard output. /// /// All of the fields are extracted, not just the displayed ones. - ExportDocuments, + ExportDocuments { + /// If defined, only retrieve the documents that corresponds to these internal ids. 
+ internal_documents_ids: Vec<u32>, }, } fn main() -> anyhow::Result<()> { @@ -218,7 +221,9 @@ fn main() -> anyhow::Result<()> { }, ExportWordsFst => export_words_fst(&index, &rtxn), ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), - ExportDocuments => export_documents(&index, &rtxn), + ExportDocuments { internal_documents_ids } => { + export_documents(&index, &rtxn, internal_documents_ids) + }, } } @@ -583,9 +588,9 @@ fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result< Ok(()) } -fn export_documents(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { +fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -> anyhow::Result<()> { use std::io::{BufWriter, Write as _}; - use milli::obkv_to_json; + use milli::{BEU32, obkv_to_json}; let stdout = io::stdout(); let mut out = BufWriter::new(stdout); @@ -593,8 +598,18 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { let fields_ids_map = index.fields_ids_map(rtxn)?; let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); - for result in index.documents.iter(rtxn)? { - let (_id, obkv) = result?; + let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() { + Box::new(index.documents.iter(rtxn)?.map(|result| { + result.map(|(_id, obkv)| obkv) + })) + } else { + Box::new(internal_ids.into_iter().flat_map(|id| { + index.documents.get(rtxn, &BEU32::new(id)).transpose() + })) + }; + + for result in iter { + let obkv = result?; let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv)?; serde_json::to_writer(&mut out, &document)?; writeln!(&mut out)?;