Take the words-prefixes into account while computing the biggest values

This commit is contained in:
Clément Renault 2021-02-03 10:35:19 +01:00
parent b3a21d5a50
commit 5e7b26791b
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -321,6 +321,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let main_name = "main"; let main_name = "main";
let word_docids_name = "word_docids"; let word_docids_name = "word_docids";
let word_prefix_docids_name = "word_prefix_docids";
let docid_word_positions_name = "docid_word_positions"; let docid_word_positions_name = "docid_word_positions";
let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids";
let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids";
@ -329,8 +330,16 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
let mut heap = BinaryHeap::with_capacity(limit + 1); let mut heap = BinaryHeap::with_capacity(limit + 1);
if limit > 0 { if limit > 0 {
// Fetch the words FST
let words_fst = index.words_fst(rtxn)?; let words_fst = index.words_fst(rtxn)?;
heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name))); let length = words_fst.as_fst().as_bytes().len();
heap.push(Reverse((length, format!("words-fst"), main_name)));
if heap.len() > limit { heap.pop(); }
// Fetch the word prefix FST
let words_prefixes_fst = index.words_prefixes_fst(rtxn)?;
let length = words_prefixes_fst.as_fst().as_bytes().len();
heap.push(Reverse((length, format!("words-prefixes-fst"), main_name)));
if heap.len() > limit { heap.pop(); } if heap.len() > limit { heap.pop(); }
if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
@ -344,6 +353,12 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
if heap.len() > limit { heap.pop(); } if heap.len() > limit { heap.pop(); }
} }
for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
let (word, value) = result?;
heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name)));
if heap.len() > limit { heap.pop(); }
}
for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? { for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? {
let ((docid, word), value) = result?; let ((docid, word), value) = result?;
let key = format!("{} {}", docid, word); let key = format!("{} {}", docid, word);