From 3069bf4f4a3ad50a89a1573b49dec92c61107678 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 15:40:38 +0100 Subject: [PATCH] Fix and improve the words-level-positions computation --- infos/src/main.rs | 6 ++-- milli/src/update/index_documents/store.rs | 2 +- milli/src/update/words_level_positions.rs | 42 ++++++++--------------- 3 files changed, 20 insertions(+), 30 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index e4d59c641..c219c5758 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -558,7 +558,9 @@ fn words_level_positions_docids( left..=right }; for result in index.word_level_position_docids.range(rtxn, &range)? { - let ((word, level, left, right), docids) = result?; + let ((w, level, left, right), docids) = result?; + if word != w { break } + let level = level.to_string(); let count = docids.len().to_string(); let docids = if debug { @@ -567,7 +569,7 @@ fn words_level_positions_docids( format!("{:?}", docids.iter().collect::>()) }; let position_range = format!("{:?}", left..=right); - wtr.write_record(&[word, &level, &position_range, &count, &docids])?; + wtr.write_record(&[w, &level, &position_range, &count, &docids])?; } } diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 358552768..0f97476d9 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -388,7 +388,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { key_buffer.push(0); // level 0 for position in positions { - key_buffer.truncate(word.len()); + key_buffer.truncate(word.len() + 1); let position_bytes = position.to_be_bytes(); key_buffer.extend_from_slice(position_bytes.as_bytes()); key_buffer.extend_from_slice(position_bytes.as_bytes()); diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 0a7bc484d..77cec246a 100644 --- a/milli/src/update/words_level_positions.rs +++ 
b/milli/src/update/words_level_positions.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::num::NonZeroUsize; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::DecodeIgnore; +use heed::types::{DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; @@ -56,10 +56,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { pub fn execute(self) -> anyhow::Result<()> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); - clear_non_zero_levels_positions(self.wtxn, self.index.word_level_position_docids)?; - let entries = compute_positions_levels( self.wtxn, + self.index.word_docids.remap_data_type::(), self.index.word_level_position_docids, self.chunk_compression_type, self.chunk_compression_level, @@ -74,7 +73,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { write_into_lmdb_database( self.wtxn, - *self.index.facet_field_id_value_docids.as_polymorph(), + *self.index.word_level_position_docids.as_polymorph(), entries, |_, _| anyhow::bail!("invalid facet level merging"), WriteMethod::Append, @@ -84,25 +83,11 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { } } -fn clear_non_zero_levels_positions( - wtxn: &mut heed::RwTxn, - db: heed::Database, -) -> heed::Result<()> -{ - let mut iter = db.iter_mut(wtxn)?.lazily_decode_data(); - while let Some(result) = iter.next() { - let ((_, level, _, _), _) = result?; - if level != 0 { - iter.del_current()?; - } - } - Ok(()) -} - -/// Generates all the words positions levels (including the level zero). +/// Generates all the words positions levels based on the level zero entries (including the level zero). 
fn compute_positions_levels( rtxn: &heed::RoTxn, - db: heed::Database, + words_db: heed::Database, + words_positions_db: heed::Database, compression_type: CompressionType, compression_level: Option, shrink_size: Option, @@ -116,11 +101,11 @@ fn compute_positions_levels( create_writer(compression_type, compression_level, file) })?; - for result in db.iter(rtxn)? { - let ((word, level, left, right), docids) = result?; + for result in words_db.iter(rtxn)? { + let (word, ()) = result?; - let first_level_size = db.remap_data_type::() - .prefix_iter(rtxn, &(word, level, u32::min_value(), u32::min_value()))? + let first_level_size = words_positions_db.remap_data_type::() + .prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))? .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; let level_0_range = { @@ -136,14 +121,17 @@ fn compute_positions_levels( .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); // As specified in the documentation, we also write the level 0 entries. - write_level_entry(&mut writer, word, level, left, right, &docids)?; + for result in words_positions_db.range(rtxn, &level_0_range)? { + let ((word, level, left, right), docids) = result?; + write_level_entry(&mut writer, word, level, left, right, &docids)?; + } for (level, group_size) in group_size_iter { let mut left = 0; let mut right = 0; let mut group_docids = RoaringBitmap::new(); - for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { let ((_field_id, _level, value, _right), docids) = result?; if i == 0 {