Remove cached Allocations struct from wpppd (word prefix pair proximity docids) indexing

This commit is contained in:
Loïc Lecrenier 2022-08-10 12:47:07 +02:00
parent ef75a77464
commit 1bc4788e59

View File

@ -265,9 +265,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
) -> Result<()> { ) -> Result<()> {
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
// This is an optimisation, to reuse allocations between loop iterations
let mut allocations = Allocations::default();
// Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
let prefixes = PrefixTrieNode::from_sorted_prefixes( let prefixes = PrefixTrieNode::from_sorted_prefixes(
common_prefix_fst_words common_prefix_fst_words
@ -297,7 +294,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
} }
}, },
&prefixes, &prefixes,
&mut allocations,
self.max_proximity, self.max_proximity,
// and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap)
|key, value| { |key, value| {
@ -340,7 +336,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
&mut db_iter, &mut db_iter,
|db_iter| db_iter.next().transpose().map_err(|e| e.into()), |db_iter| db_iter.next().transpose().map_err(|e| e.into()),
&prefixes, &prefixes,
&mut allocations,
self.max_proximity, self.max_proximity,
|key, value| writer.insert(key, value).map_err(|e| e.into()), |key, value| writer.insert(key, value).map_err(|e| e.into()),
)?; )?;
@ -393,7 +388,6 @@ fn execute_on_word_pairs_and_prefixes<Iter>(
Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, Option<((&'a [u8], &'a [u8], u8), &'a [u8])>,
>, >,
prefixes: &PrefixTrieNode, prefixes: &PrefixTrieNode,
allocations: &mut Allocations,
max_proximity: u8, max_proximity: u8,
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
) -> Result<()> { ) -> Result<()> {
@ -406,8 +400,8 @@ fn execute_on_word_pairs_and_prefixes<Iter>(
// Optimisation: true if there are no potential prefixes for the current word2 based on its first letter // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter
let mut empty_prefixes = false; let mut empty_prefixes = false;
let mut prefix_buffer = allocations.take_byte_vector(); let mut prefix_buffer = Vec::with_capacity(8);
let mut merge_buffer = allocations.take_byte_vector(); let mut merge_buffer = Vec::with_capacity(65_536);
while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? {
// skip this iteration if the proximity is over the threshold // skip this iteration if the proximity is over the threshold
@ -426,7 +420,7 @@ fn execute_on_word_pairs_and_prefixes<Iter>(
// than the previous start of word2, then we'll need to flush the batch // than the previous start of word2, then we'll need to flush the batch
let word1_different_than_prev = word1 != batch.word1; let word1_different_than_prev = word1 != batch.word1;
if word1_different_than_prev || word2_start_different_than_prev { if word1_different_than_prev || word2_start_different_than_prev {
batch.flush(allocations, &mut merge_buffer, &mut insert)?; batch.flush(&mut merge_buffer, &mut insert)?;
// don't forget to reset the value of batch.word1 and prev_word2_start // don't forget to reset the value of batch.word1 and prev_word2_start
if word1_different_than_prev { if word1_different_than_prev {
prefix_search_start.0 = 0; prefix_search_start.0 = 0;
@ -448,19 +442,17 @@ fn execute_on_word_pairs_and_prefixes<Iter>(
&mut prefix_buffer, &mut prefix_buffer,
&prefix_search_start, &prefix_search_start,
|prefix_buffer| { |prefix_buffer| {
let mut value = allocations.take_byte_vector();
value.extend_from_slice(&data);
let prefix_len = prefix_buffer.len(); let prefix_len = prefix_buffer.len();
prefix_buffer.push(0); prefix_buffer.push(0);
prefix_buffer.push(proximity); prefix_buffer.push(proximity);
batch.insert(&prefix_buffer, value, allocations); batch.insert(&prefix_buffer, data.to_vec());
prefix_buffer.truncate(prefix_len); prefix_buffer.truncate(prefix_len);
}, },
); );
prefix_buffer.clear(); prefix_buffer.clear();
} }
} }
batch.flush(allocations, &mut merge_buffer, &mut insert)?; batch.flush(&mut merge_buffer, &mut insert)?;
Ok(()) Ok(())
} }
/** /**
@ -482,17 +474,13 @@ struct PrefixAndProximityBatch {
impl PrefixAndProximityBatch { impl PrefixAndProximityBatch {
/// Insert the new key and value into the batch /// Insert the new key and value into the batch
fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>, allocations: &mut Allocations) { fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
Ok(position) => { Ok(position) => {
self.batch[position].1.push(Cow::Owned(new_value)); self.batch[position].1.push(Cow::Owned(new_value));
} }
Err(position) => { Err(position) => {
let mut key = allocations.take_byte_vector(); self.batch.insert(position, (new_key.to_vec(), vec![Cow::Owned(new_value)]));
key.extend_from_slice(new_key);
let mut mergeable_data = allocations.take_mergeable_data_vector();
mergeable_data.push(Cow::Owned(new_value));
self.batch.insert(position, (key, mergeable_data));
} }
} }
} }
@ -502,7 +490,6 @@ impl PrefixAndProximityBatch {
/// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap.
fn flush( fn flush(
&mut self, &mut self,
allocations: &mut Allocations,
merge_buffer: &mut Vec<u8>, merge_buffer: &mut Vec<u8>,
insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
) -> Result<()> { ) -> Result<()> {
@ -512,7 +499,7 @@ impl PrefixAndProximityBatch {
} }
merge_buffer.clear(); merge_buffer.clear();
let mut buffer = allocations.take_byte_vector(); let mut buffer = Vec::with_capacity(word1.len() + 1 + 6 + 1);
buffer.extend_from_slice(word1); buffer.extend_from_slice(word1);
buffer.push(0); buffer.push(0);
@ -528,8 +515,6 @@ impl PrefixAndProximityBatch {
}; };
insert(buffer.as_slice(), data)?; insert(buffer.as_slice(), data)?;
merge_buffer.clear(); merge_buffer.clear();
allocations.reclaim_byte_vector(key);
allocations.reclaim_mergeable_data_vector(mergeable_data);
} }
Ok(()) Ok(())
@ -591,36 +576,6 @@ pub fn write_into_lmdb_database_without_merging(
Ok(()) Ok(())
} }
struct Allocations {
byte_vectors: Vec<Vec<u8>>,
mergeable_data_vectors: Vec<Vec<Cow<'static, [u8]>>>,
}
impl Default for Allocations {
fn default() -> Self {
Self {
byte_vectors: Vec::with_capacity(65_536),
mergeable_data_vectors: Vec::with_capacity(4096),
}
}
}
impl Allocations {
fn take_byte_vector(&mut self) -> Vec<u8> {
self.byte_vectors.pop().unwrap_or_else(|| Vec::with_capacity(16))
}
fn take_mergeable_data_vector(&mut self) -> Vec<Cow<'static, [u8]>> {
self.mergeable_data_vectors.pop().unwrap_or_else(|| Vec::with_capacity(8))
}
fn reclaim_byte_vector(&mut self, mut data: Vec<u8>) {
data.clear();
self.byte_vectors.push(data);
}
fn reclaim_mergeable_data_vector(&mut self, mut data: Vec<Cow<'static, [u8]>>) {
data.clear();
self.mergeable_data_vectors.push(data);
}
}
#[derive(Default, Debug)] #[derive(Default, Debug)]
struct PrefixTrieNode { struct PrefixTrieNode {
children: Vec<(PrefixTrieNode, u8)>, children: Vec<(PrefixTrieNode, u8)>,
@ -970,7 +925,6 @@ mod tests {
let mut result = vec![]; let mut result = vec![];
let mut allocations = Allocations::default();
let mut iter = let mut iter =
IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| {
((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice())
@ -979,7 +933,6 @@ mod tests {
&mut iter, &mut iter,
|iter| Ok(iter.next()), |iter| Ok(iter.next()),
&prefixes, &prefixes,
&mut allocations,
2, 2,
|k, v| { |k, v| {
let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap();