Add database in infos crate

This commit is contained in:
many 2021-05-31 16:14:51 +02:00
parent c701f8bf36
commit b8e6db0feb
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA

View File

@ -23,6 +23,7 @@ const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids";
const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids";
const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids";
const FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME: &str = "field-id-word-count-docids";
const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids";
const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids";
const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s";
@ -39,6 +40,7 @@ const ALL_DATABASE_NAMES: &[&str] = &[
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
WORD_LEVEL_POSITION_DOCIDS_DB_NAME,
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME,
FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME,
FACET_ID_F64_DOCIDS_DB_NAME,
FACET_ID_STRING_DOCIDS_DB_NAME,
FIELD_ID_DOCID_FACET_F64S_DB_NAME,
@ -155,6 +157,15 @@ enum Command {
prefixes: Vec<String>,
},
FieldIdWordCountDocids {
/// Display the whole documents ids in details.
#[structopt(long)]
full_display: bool,
/// The field name in the document.
field_name: String,
},
/// Outputs a CSV with the documents ids, words and the positions where this word appears.
DocidsWordsPositions {
/// Display the whole positions in detail.
@ -271,6 +282,9 @@ fn main() -> anyhow::Result<()> {
WordPrefixesLevelPositionsDocids { full_display, prefixes } => {
word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes)
},
FieldIdWordCountDocids { full_display, field_name } => {
field_id_word_count_docids(&index, &rtxn, !full_display, field_name)
},
DocidsWordsPositions { full_display, internal_documents_ids } => {
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
},
@ -357,6 +371,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
field_id_word_count_docids,
facet_id_f64_docids,
facet_id_string_docids,
field_id_docid_facet_f64s: _,
@ -372,6 +387,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
let word_level_position_docids_name = "word_level_position_docids";
let word_prefix_level_position_docids_name = "word_prefix_level_position_docids";
let field_id_word_count_docids_name = "field_id_word_count_docids";
let facet_id_f64_docids_name = "facet_id_f64_docids";
let facet_id_string_docids_name = "facet_id_string_docids";
let documents_name = "documents";
@ -443,6 +459,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
if heap.len() > limit { heap.pop(); }
}
for result in field_id_word_count_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((field_id, word_count), docids) = result?;
let key = format!("{} {}", field_id, word_count);
heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name)));
if heap.len() > limit { heap.pop(); }
}
let faceted_fields = index.faceted_fields_ids(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?;
@ -676,6 +699,39 @@ fn word_prefixes_level_positions_docids(
Ok(wtr.flush()?)
}
fn field_id_word_count_docids(
index: &Index,
rtxn: &heed::RoTxn,
debug: bool,
field_name: String
) -> anyhow::Result<()>
{
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["field_name", "word_count", "docids"])?;
let field_id = index.fields_ids_map(rtxn)?
.id(&field_name)
.with_context(|| format!("unknown field name: {}", &field_name))?;
let left = (field_id, 1);
let right = (field_id, 11);
let iter = index.field_id_word_count_docids
.range(rtxn, &(left..=right))?;
for result in iter {
let ((_, word_count), docids) = result?;
let docids = if debug {
format!("{:?}", docids)
} else {
format!("{:?}", docids.iter().collect::<Vec<_>>())
};
wtr.write_record(&[&field_name, &format!("{}", word_count), &docids])?;
}
Ok(wtr.flush()?)
}
fn docids_words_positions(
index: &Index,
rtxn: &heed::RoTxn,
@ -870,6 +926,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
word_prefix_pair_proximity_docids,
word_level_position_docids,
word_prefix_level_position_docids,
field_id_word_count_docids,
facet_id_f64_docids,
facet_id_string_docids,
field_id_docid_facet_f64s,
@ -893,6 +950,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(),
WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(),
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(),
FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => field_id_word_count_docids.as_polymorph(),
FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(),
FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(),
FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(),
@ -999,6 +1057,10 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu
let db = index.word_prefix_pair_proximity_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
},
FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => {
let db = index.field_id_word_count_docids.as_polymorph();
compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
},
unknown => anyhow::bail!("unknown database {:?}", unknown),
}
}