Introduce an infos subcommand that patches the external documents ids

This commit is contained in:
Clément Renault 2020-11-22 18:21:22 +01:00
parent 27f3ef5f7a
commit 05c95dfdc6
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -13,6 +13,7 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids";
const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions";
const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
const DOCUMENTS_DB_NAME: &str = "documents"; const DOCUMENTS_DB_NAME: &str = "documents";
// Key of the old external documents ids entry in the `main` database.
// `patch_to_new_external_ids` reads the value stored under this key, rewrites it
// under `crate::index::HARD_EXTERNAL_DOCUMENTS_IDS_KEY`, then deletes this entry.
const USERS_IDS_DOCUMENTS_IDS: &[u8] = b"users-ids-documents-ids";
const ALL_DATABASE_NAMES: &[&str] = &[ const ALL_DATABASE_NAMES: &[&str] = &[
MAIN_DB_NAME, MAIN_DB_NAME,
@ -137,6 +138,10 @@ enum Command {
#[structopt(short, long, default_value = "words.fst")] #[structopt(short, long, default_value = "words.fst")]
output: PathBuf, output: PathBuf,
}, },
/// A command that patches the old external ids
/// into the new external ids format.
PatchToNewExternalIds,
} }
pub fn run(opt: Opt) -> anyhow::Result<()> { pub fn run(opt: Opt) -> anyhow::Result<()> {
@ -171,8 +176,31 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2)
}, },
ExportWordsFst { output } => export_words_fst(&index, &rtxn, output), ExportWordsFst { output } => export_words_fst(&index, &rtxn, output),
PatchToNewExternalIds => {
drop(rtxn);
let mut wtxn = index.write_txn()?;
let result = patch_to_new_external_ids(&index, &mut wtxn);
wtxn.commit()?;
result
} }
} }
}
/// Moves the external documents ids from the old `main` database key
/// (`USERS_IDS_DOCUMENTS_IDS`) to `crate::index::HARD_EXTERNAL_DOCUMENTS_IDS_KEY`.
///
/// The raw bytes are copied verbatim under the new key and the old entry is
/// then deleted. If no entry exists under the old key, nothing is written and
/// `Ok(())` is returned.
///
/// All operations go through `wtxn`; committing it is left to the caller.
///
/// # Errors
///
/// Propagates any error returned by the `get`, `put` or `delete` calls on
/// `index.main`.
fn patch_to_new_external_ids(index: &Index, wtxn: &mut heed::RwTxn) -> anyhow::Result<()> {
    use heed::types::ByteSlice;

    // Nothing to migrate when the old entry is absent. Otherwise copy the value
    // into an owned buffer so it no longer borrows from the transaction, which
    // is needed mutably for the writes below.
    let old_entry = index.main.get::<_, ByteSlice, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS)?;
    let external_ids_bytes = match old_entry {
        Some(bytes) => bytes.to_owned(),
        None => return Ok(()),
    };

    // Write under the new key first, then drop the old entry.
    let new_key = crate::index::HARD_EXTERNAL_DOCUMENTS_IDS_KEY.as_bytes();
    index.main.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, &external_ids_bytes)?;
    index.main.delete::<_, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS)?;

    Ok(())
}
fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
use std::collections::BinaryHeap; use std::collections::BinaryHeap;