Compute the biggest values of the word_level_position_docids database

This commit is contained in:
Kerollmops 2021-03-17 16:09:18 +01:00 committed by many
parent f713828406
commit 8bd4f5d93e
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
2 changed files with 20 additions and 8 deletions

View File

@ -346,6 +346,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let docid_word_positions_name = "docid_word_positions";
let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids";
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
let word_level_position_docids_name = "word_level_position_docids";
let facet_field_id_value_docids_name = "facet_field_id_value_docids";
let documents_name = "documents";
@ -402,6 +403,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
if heap.len() > limit { heap.pop(); }
}
for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((word, level, left, right), value) = result?;
let key = format!("{} {} {:?}", word, level, left..=right);
heap.push(Reverse((value.len(), key, word_level_position_docids_name)));
if heap.len() > limit { heap.pop(); }
}
let faceted_fields = index.faceted_fields_ids(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?;
for (field_id, field_type) in faceted_fields {
@ -549,7 +557,7 @@ fn words_level_positions_docids(
{
let stdout = io::stdout();
let mut wtr = csv::Writer::from_writer(stdout.lock());
wtr.write_record(&["word", "level", "position_range", "documents_count", "documents_ids"])?;
wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?;
for word in words.iter().map(AsRef::as_ref) {
let range = {
@ -561,14 +569,18 @@ fn words_level_positions_docids(
let ((w, level, left, right), docids) = result?;
if word != w { break }
let level = level.to_string();
let count = docids.len().to_string();
let docids = if debug {
format!("{:?}", docids)
} else {
format!("{:?}", docids.iter().collect::<Vec<_>>())
};
let position_range = format!("{:?}", left..=right);
let position_range = if level == 0 {
format!("{:?}", left)
} else {
format!("{:?}", left..=right)
};
let level = level.to_string();
wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
}
}

View File

@ -104,16 +104,16 @@ fn compute_positions_levels(
for result in words_db.iter(rtxn)? {
let (word, ()) = result?;
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
.prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))?
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
let level_0_range = {
let left = (word, 0, u32::min_value(), u32::min_value());
let right = (word, 0, u32::max_value(), u32::max_value());
left..=right
};
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
.range(rtxn, &level_0_range)?
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
// Groups sizes are always a power of the original level_group_size and therefore a group
// always maps groups of the previous level and never splits previous levels groups in half.
let group_size_iter = (1u8..)
@ -132,7 +132,7 @@ fn compute_positions_levels(
let mut group_docids = RoaringBitmap::new();
for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() {
let ((_field_id, _level, value, _right), docids) = result?;
let ((_word, _level, value, _right), docids) = result?;
if i == 0 {
left = value;