Introduce a facets stats infos subcommand

This commit is contained in:
Kerollmops 2020-11-17 14:50:32 +01:00 committed by Clément Renault
parent 9ec95679e1
commit 7a6e6eb5e2
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -89,6 +89,12 @@ enum Command {
field_name: String, field_name: String,
}, },
/// Outputs some facets statistics for the given facet name.
FacetStats {
/// The field name in the document.
field_name: String,
},
/// Outputs the total size of all the docid-word-positions keys and values. /// Outputs the total size of all the docid-word-positions keys and values.
TotalDocidWordPositionsSize, TotalDocidWordPositionsSize,
@ -165,6 +171,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
FacetValuesDocids { full_display, field_name } => { FacetValuesDocids { full_display, field_name } => {
facet_values_docids(&index, &rtxn, !full_display, field_name) facet_values_docids(&index, &rtxn, !full_display, field_name)
}, },
FacetStats { field_name } => facet_stats(&index, &rtxn, field_name),
TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn),
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
AverageNumberOfPositionsByWord => { AverageNumberOfPositionsByWord => {
@ -399,6 +406,62 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam
Ok(wtr.flush()?) Ok(wtr.flush()?)
} }
/// Prints, for the faceted field `field_name`, the number of database entries
/// ("groups") found at each facet level.
///
/// The entries stored under the field id prefix are walked in iteration order and
/// consecutive entries sharing the same level are counted together; one line is
/// printed per run. String facets are all reported as level 0, while float and
/// integer facets use the level component decoded from their key.
///
/// Returns an error when the field is absent from the fields ids map, when it is
/// not one of the faceted fields, or when reading/decoding an entry fails.
fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> {
    use crate::facet::FacetType;
    use crate::heed_codec::facet::{
        FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec,
    };

    // Resolve the field name into its id and its facet type.
    let fields_ids_map = index.fields_ids_map(&rtxn)?;
    let faceted_fields = index.faceted_fields(&rtxn)?;
    let field_id = fields_ids_map.id(&field_name)
        .with_context(|| format!("field {} not found", field_name))?;
    let field_type = faceted_fields.get(&field_id)
        .with_context(|| format!("field {} is not faceted", field_name))?;

    // Turn every entry of this field into the level it belongs to; the three arms
    // yield different concrete iterator types, hence the boxed trait object.
    let entries = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?;
    let levels: Box<dyn Iterator<Item = _>> = match field_type {
        // The string arm maps every entry to level 0.
        FacetType::String => Box::new(
            entries
                .remap_key_type::<FacetValueStringCodec>()
                .map(|entry| entry.map(|_| 0u8)),
        ),
        FacetType::Float => Box::new(
            entries
                .remap_key_type::<FacetLevelValueF64Codec>()
                .map(|entry| entry.map(|((_, level, _, _), _)| level)),
        ),
        FacetType::Integer => Box::new(
            entries
                .remap_key_type::<FacetLevelValueI64Codec>()
                .map(|entry| entry.map(|((_, level, _, _), _)| level)),
        ),
    };

    println!("The database {:?} facet stats", field_name);

    // Run-length count of consecutive entries sharing the same level.
    // NOTE(review): presumably the key order keeps each level contiguous — confirm.
    let mut group_count = 0;
    let mut previous_level = None;
    for entry in levels {
        let level = entry?;
        match previous_level {
            // The level changed: report the finished run and start counting the new one.
            Some(previous) if previous != level => {
                println!("\tnumber of groups at level {}: {}", previous, group_count);
                group_count = 1;
            },
            // First entry, or same level as the previous entry.
            _ => group_count += 1,
        }
        previous_level = Some(level);
    }

    // Report the last run, if at least one entry was seen.
    if let Some(previous) = previous_level {
        println!("\tnumber of groups at level {}: {}", previous, group_count);
    }

    Ok(())
}
fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> { fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> {
use std::fs::File; use std::fs::File;
use std::io::Write as _; use std::io::Write as _;