From 306593144d74ed673592cf772e12d2225d1e5518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 6 Jul 2022 18:20:15 +0200 Subject: [PATCH 01/17] Refactor word prefix pair proximity indexation --- infos/src/main.rs | 1 + milli/src/heed_codec/mod.rs | 1 + milli/src/heed_codec/str_str_u8_codec.rs | 35 +- milli/src/lib.rs | 2 +- .../extract_word_pair_proximity_docids.rs | 1 + .../word_prefix_pair_proximity_docids/mod.rs | 468 ++++++++++++++++++ .../readme.md | 144 ++++++ 7 files changed, 649 insertions(+), 3 deletions(-) create mode 100644 milli/src/update/word_prefix_pair_proximity_docids/mod.rs create mode 100644 milli/src/update/word_prefix_pair_proximity_docids/readme.md diff --git a/infos/src/main.rs b/infos/src/main.rs index feec17557..4e05ce0a5 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -1153,6 +1153,7 @@ fn word_pair_proximities_docids( prefix.extend_from_slice(word1.as_bytes()); prefix.push(0); prefix.extend_from_slice(word2.as_bytes()); + prefix.push(0); let db = index.word_pair_proximity_docids.as_polymorph(); let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?; diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 2f2a01192..02235f26d 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -16,3 +16,4 @@ pub use self::roaring_bitmap_length::{ }; pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::StrStrU8Codec; +pub use self::str_str_u8_codec::UncheckedStrStrU8Codec; diff --git a/milli/src/heed_codec/str_str_u8_codec.rs b/milli/src/heed_codec/str_str_u8_codec.rs index 2454e7d56..888e08752 100644 --- a/milli/src/heed_codec/str_str_u8_codec.rs +++ b/milli/src/heed_codec/str_str_u8_codec.rs @@ -9,9 +9,11 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec { fn bytes_decode(bytes: &'a [u8]) -> Option { let (n, bytes) = bytes.split_last()?; let s1_end = bytes.iter().position(|b| *b == 0)?; - let (s1_bytes, s2_bytes) = 
bytes.split_at(s1_end); + let (s1_bytes, rest) = bytes.split_at(s1_end); + let rest = &rest[1..]; let s1 = str::from_utf8(s1_bytes).ok()?; - let s2 = str::from_utf8(&s2_bytes[1..]).ok()?; + let (_, s2_bytes) = rest.split_last()?; + let s2 = str::from_utf8(s2_bytes).ok()?; Some((s1, s2, *n)) } } @@ -24,6 +26,35 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec { bytes.extend_from_slice(s1.as_bytes()); bytes.push(0); bytes.extend_from_slice(s2.as_bytes()); + bytes.push(0); + bytes.push(*n); + Some(Cow::Owned(bytes)) + } +} +pub struct UncheckedStrStrU8Codec; + +impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec { + type DItem = (&'a [u8], &'a [u8], u8); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (n, bytes) = bytes.split_last()?; + let s1_end = bytes.iter().position(|b| *b == 0)?; + let (s1_bytes, rest) = bytes.split_at(s1_end); + let rest = &rest[1..]; + let (_, s2_bytes) = rest.split_last()?; + Some((s1_bytes, s2_bytes, *n)) + } +} + +impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec { + type EItem = (&'a [u8], &'a [u8], u8); + + fn bytes_encode((s1, s2, n): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1); + bytes.extend_from_slice(s1); + bytes.push(0); + bytes.extend_from_slice(s2); + bytes.push(0); bytes.push(*n); Some(Cow::Owned(bytes)) } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 85b25cad1..ac88ebdab 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,7 +37,7 @@ pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, - RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, + RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, UncheckedStrStrU8Codec, }; pub use self::index::Index; pub use self::search::{ diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs 
b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 90349eb93..5117bfaba 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -142,6 +142,7 @@ fn document_word_positions_into_sorter<'b>( key_buffer.extend_from_slice(w1.as_bytes()); key_buffer.push(0); key_buffer.extend_from_slice(w2.as_bytes()); + key_buffer.push(0); key_buffer.push(prox as u8); word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs new file mode 100644 index 000000000..119c0c53e --- /dev/null +++ b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs @@ -0,0 +1,468 @@ +use grenad::CompressionType; +use heed::types::ByteSlice; + +use heed::BytesDecode; +use log::debug; + +use std::borrow::Cow; +use std::collections::HashSet; +use std::io::BufReader; +use std::time::Instant; + +use crate::update::index_documents::{ + create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, +}; +use crate::{Index, Result, UncheckedStrStrU8Codec}; + +pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, + max_proximity: u8, + max_prefix_length: usize, +} + +impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { + WordPrefixPairProximityDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + max_nb_chunks: None, + max_memory: None, + max_proximity: 4, + max_prefix_length: 2, + } + } + + /// 
Set the maximum proximity required to make a prefix be part of the words prefixes + /// database. If two words are too far from the threshold the associated documents will + /// not be part of the prefix database. + /// + /// Default value is 4. This value must be lower or equal than 7 and will be clamped + /// to this bound otherwise. + pub fn max_proximity(&mut self, value: u8) -> &mut Self { + self.max_proximity = value.max(7); + self + } + + /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words + /// prefixes database. If the prefix length is higher than the threshold, the associated documents + /// will not be part of the prefix database. + /// + /// Default value is 2. + pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value; + self + } + + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] + pub fn execute<'a>( + mut self, + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &'a [String], + common_prefix_fst_words: &[&'a [String]], + del_prefix_fst_words: &HashSet>, + ) -> Result<()> { + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + // All of the word prefix pairs in the database that have a w2 + // that is contained in the `suppr_pw` set must be removed as well. + if !del_prefix_fst_words.is_empty() { + let mut iter = self + .index + .word_prefix_pair_proximity_docids + .remap_data_type::() + .iter_mut(self.wtxn)?; + while let Some(((_, w2, _), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(w2.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; + } + } + } + + // We construct a Trie of all the prefixes that are smaller than the max prefix length + // This is an optimisation that allows us to iterate over all prefixes of a word quickly. 
+ let new_prefix_fst_words = PrefixTrieNode::from_sorted_prefixes( + new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), + ); + + let common_prefix_fst_words = PrefixTrieNode::from_sorted_prefixes( + common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), + ); + + let mut allocations = Allocations::default(); + let mut batch = PrefixAndProximityBatch::default(); + + if !common_prefix_fst_words.is_empty() { + let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + + while let Some((key, data)) = cursor.move_on_next()? { + let (word1, word2, proximity) = + UncheckedStrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + + if proximity <= self.max_proximity { + batch.flush_if_necessary( + word1, + word2, + &mut allocations, + &mut |key, value| { + insert_into_database( + &mut self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + self.insert_word_prefix_pair_proximity_docids_into_batch( + word2, + proximity, + data, + &common_prefix_fst_words, + &mut batch, + &mut allocations, + )?; + } + } + batch.flush(&mut allocations, &mut |key, value| { + insert_into_database( + &mut self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + })?; + } + + if !new_prefix_fst_words.is_empty() { + let mut db_iter = self + .index + .word_pair_proximity_docids + .remap_key_type::() + .remap_data_type::() + .iter(self.wtxn)?; + + let mut writer = create_writer( + self.chunk_compression_type, + self.chunk_compression_level, + tempfile::tempfile()?, + ); + + while let Some(((word1, word2, proximity), data)) = db_iter.next().transpose()? 
{ + if proximity <= self.max_proximity { + batch.flush_if_necessary( + word1, + word2, + &mut allocations, + &mut |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + self.insert_word_prefix_pair_proximity_docids_into_batch( + word2, + proximity, + data, + &new_prefix_fst_words, + &mut batch, + &mut allocations, + )?; + } + } + batch.flush(&mut allocations, &mut |key, value| { + writer.insert(key, value).map_err(|e| e.into()) + })?; + + drop(db_iter); + writer_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + writer, + )?; + } + + Ok(()) + } + + fn insert_word_prefix_pair_proximity_docids_into_batch<'b, 'c>( + &self, + word2: &[u8], + proximity: u8, + data: &'b [u8], + prefixes: &'c PrefixTrieNode, + writer: &'b mut PrefixAndProximityBatch, + allocations: &mut Allocations, + ) -> Result<()> { + let mut prefix_buffer = allocations.take_byte_vector(); + prefixes.for_each_prefix_of(word2, &mut prefix_buffer, |prefix| { + let mut value = allocations.take_byte_vector(); + value.extend_from_slice(&data); + writer.insert(prefix, proximity, value, allocations); + }); + allocations.reclaim_byte_vector(prefix_buffer); + Ok(()) + } +} + +/** +A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). +The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. + +It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. + +A batch is valid only for a specific `word1`. Also, all prefixes stored in the batch start with the same letter. Make sure to +call [`self.flush_if_necessary`](Self::flush_if_necessary) before inserting a list of sorted `(prefix, proximity)` (and where each +`prefix` starts with the same letter) in order to uphold these invariants. 
+ +The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content +can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: +- key : (word1, prefix, proximity) as bytes +- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes +*/ +#[derive(Default)] +struct PrefixAndProximityBatch { + batch: Vec<(Vec, Vec>)>, + word1: Vec, + word2_start: u8, +} + +impl PrefixAndProximityBatch { + fn insert( + &mut self, + new_prefix: &[u8], + new_proximity: u8, + new_value: Vec, + allocations: &mut Allocations, + ) { + let mut key = allocations.take_byte_vector(); + key.extend_from_slice(new_prefix); + key.push(0); + key.push(new_proximity); + + if let Some(position) = self.batch.iter().position(|(k, _)| k >= &key) { + let (existing_key, existing_data) = &mut self.batch[position]; + if existing_key == &key { + existing_data.push(Cow::Owned(new_value)); + } else { + let mut mergeable_data = allocations.take_mergeable_data_vector(); + mergeable_data.push(Cow::Owned(new_value)); + self.batch.insert(position, (key, mergeable_data)); + } + } else { + let mut mergeable_data = allocations.take_mergeable_data_vector(); + mergeable_data.push(Cow::Owned(new_value)); + self.batch.push((key, mergeable_data)); + } + } + + /// Call [`self.flush`](Self::flush) if `word1` changed or if `word2` begins with a different letter than the + /// previous word2. Update `prev_word1` and `prev_word2_start` with the new values from `word1` and `word2`. 
+ fn flush_if_necessary( + &mut self, + word1: &[u8], + word2: &[u8], + allocations: &mut Allocations, + insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, + ) -> Result<()> { + let word2_start = word2[0]; + if word1 != self.word1 { + self.flush(allocations, insert)?; + self.word1.clear(); + self.word1.extend_from_slice(word1); + if word2_start != self.word2_start { + self.word2_start = word2_start; + } + } + if word2_start != self.word2_start { + self.flush(allocations, insert)?; + self.word2_start = word2_start; + } + Ok(()) + } + + /// Empties the batch, calling `insert` on each element. + /// + /// The key given to insert is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. + fn flush( + &mut self, + allocations: &mut Allocations, + insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, + ) -> Result<()> { + let PrefixAndProximityBatch { batch, word1: prev_word1, word2_start: _ } = self; + let mut buffer = allocations.take_byte_vector(); + buffer.extend_from_slice(prev_word1.as_slice()); + buffer.push(0); + + for (key, mergeable_data) in batch.drain(..) { + buffer.truncate(prev_word1.len() + 1); + buffer.extend_from_slice(key.as_slice()); + let data = merge_cbo_roaring_bitmaps(&buffer, &mergeable_data)?; + insert(buffer.as_slice(), &data)?; + + allocations.reclaim_byte_vector(key); + allocations.reclaim_mergeable_data_vector(mergeable_data); + } + Ok(()) + } +} + +fn insert_into_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + new_key: &[u8], + new_value: &[u8], +) -> Result<()> { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; + match iter.next().transpose()? { + Some((key, old_val)) if new_key == key => { + let val = + merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) + .map_err(|_| { + // TODO just wrap this error? 
+ crate::error::InternalError::IndexingMergingKeys { + process: "get-put-merge", + } + })?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(key, &val)? }; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; + } + } + Ok(()) +} + +// This is adapted from `sorter_into_lmdb_database` +pub fn writer_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + writer: grenad::Writer, +) -> Result<()> { + let file = writer.into_inner()?; + let reader = grenad::Reader::new(BufReader::new(file))?; + + let before = Instant::now(); + + if database.is_empty(wtxn)? { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; + } + } else { + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? 
{ + insert_into_database(wtxn, database, k, v)?; + } + } + + debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); + Ok(()) +} + +struct Allocations { + byte_vectors: Vec>, + mergeable_data_vectors: Vec>>, +} +impl Default for Allocations { + fn default() -> Self { + Self { + byte_vectors: Vec::with_capacity(65_536), + mergeable_data_vectors: Vec::with_capacity(4096), + } + } +} +impl Allocations { + fn take_byte_vector(&mut self) -> Vec { + self.byte_vectors.pop().unwrap_or_else(|| Vec::with_capacity(16)) + } + fn take_mergeable_data_vector(&mut self) -> Vec> { + self.mergeable_data_vectors.pop().unwrap_or_else(|| Vec::with_capacity(8)) + } + + fn reclaim_byte_vector(&mut self, mut data: Vec) { + data.clear(); + self.byte_vectors.push(data); + } + fn reclaim_mergeable_data_vector(&mut self, mut data: Vec>) { + data.clear(); + self.mergeable_data_vectors.push(data); + } +} + +#[derive(Default, Debug)] +struct PrefixTrieNode { + children: Vec<(PrefixTrieNode, u8)>, + is_end_node: bool, +} + +impl PrefixTrieNode { + fn is_empty(&self) -> bool { + self.children.is_empty() + } + fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { + let mut node = PrefixTrieNode::default(); + for prefix in prefixes { + node.insert_sorted_prefix(prefix.as_bytes().into_iter()); + } + node + } + fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter) { + if let Some(&c) = prefix.next() { + if let Some((node, byte)) = self.children.last_mut() { + if *byte == c { + node.insert_sorted_prefix(prefix); + return; + } + } + let mut new_node = PrefixTrieNode::default(); + new_node.insert_sorted_prefix(prefix); + self.children.push((new_node, c)); + } else { + self.is_end_node = true; + } + } + fn for_each_prefix_of(&self, word: &[u8], buffer: &mut Vec, mut do_fn: impl FnMut(&[u8])) { + let mut cur_node = self; + for &byte in word { + buffer.push(byte); + if let Some((child_node, _)) = cur_node.children.iter().find(|(_, c)| *c == byte) { + cur_node = child_node; + if 
cur_node.is_end_node { + do_fn(buffer.as_slice()); + } + } else { + break; + } + } + } + // fn print(&self, buffer: &mut String, ident: usize) { + // let mut spaces = String::new(); + // for _ in 0..ident { + // spaces.push(' ') + // } + // for (child, c) in &self.children { + // buffer.push(char::from_u32(*c as u32).unwrap()); + // println!("{spaces}{buffer}:"); + // child.print(buffer, ident + 4); + // buffer.pop(); + // } + // } +} diff --git a/milli/src/update/word_prefix_pair_proximity_docids/readme.md b/milli/src/update/word_prefix_pair_proximity_docids/readme.md new file mode 100644 index 000000000..7e467e92d --- /dev/null +++ b/milli/src/update/word_prefix_pair_proximity_docids/readme.md @@ -0,0 +1,144 @@ +## What is WordPrefixPairProximityDocids? +The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. + +The prefixes present in this database are only those that correspond to many different words present in the documents. + +## How is it created/updated? (simplified version) +To compute it, we have access to (mainly) two inputs: + +* a list of sorted prefixes, such as: +``` +c +ca +cat +d +do +dog +``` +Note that only prefixes which correspond to more than a certain number of different words from the database are included in this list. + +* a sorted list of word pairs and the distance between them (i.e. proximity), associated with a roaring bitmap, such as: +``` +good dog 3 -> docids1: [2, 5, 6] +good doggo 1 -> docids2: [8] +good dogma 1 -> docids3: [7, 19, 20] +good ghost 2 -> docids4: [1] +horror cathedral 4 -> docids5: [1, 2] +``` + +I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below: + +1. 
==Outer loop:== First, we iterate over each word pair and its proximity: +``` +word1 : good +word2 : dog +proximity: 3 +``` +2. ==Inner loop:== Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: +``` +Outer loop 1: +------------------------------ +word1 : good +word2 : dog +proximity: 3 +docids : docids1 + +prefixes: [d, do, dog] + +batch: [ + (d, 3) -> [docids1] + (do, 3) -> [docids1] + (dog, 3) -> [docids1] +] +``` +3. For illustration purpose, let's run through a second iteration of the outer loop: +``` +Outer loop 2: +------------------------------ +word1 : good +word2 : doggo +proximity: 1 +docids : docids2 + +prefixes: [d, do, dog] + +batch: [ + (d, 1) -> [docids2] + (d, 3) -> [docids1] + (do, 1) -> [docids2] + (do, 3) -> [docids1] + (dog, 1) -> [docids2] + (dog, 3) -> [docids1] +] +``` +Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some of the elements inserted in the second iteration of the outer loop appear *before* elements from the first iteration. + +4. And a third: +``` +Outer loop 3: +------------------------------ +word1 : good +word2 : dogma +proximity: 1 +docids : docids3 + +prefixes: [d, do, dog] + +batch: [ + (d, 1) -> [docids2, docids3] + (d, 3) -> [docids1] + (do, 1) -> [docids2, docids3] + (do, 3) -> [docids1] + (dog, 1) -> [docids2, docids3] + (dog, 3) -> [docids1] +] +``` +Notice that there were some conflicts which were resolved by merging the conflicting values together. + +5. On the fourth iteration of the outer loop, we have: +``` +Outer loop 4: +------------------------------ +word1 : good +word2 : ghost +proximity: 2 +``` +Because `word2` begins with a different letter than the previous `word2`, we know that: +1. All the prefixes of `word2` are greater than the prefixes of the previous word2 +2. 
And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch. +Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. Flushing the batch should also be done whenever `word1` is different than the previous `word1`. + +6. ==Flushing the batch==: to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: +``` +Flushing Batch loop 1: +------------------------------ +word1 : good +word2 : d +proximity: 1 +docids : [docids2, docids3] +``` +We then merge the array of `docids` (of type `Vec>`) using `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a roaring bitmap of all the document ids where `word1` is followed by `prefix` at a distance of `proximity`. +Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` into the database. + +7. That's it! ... except... + +## How is it created/updated (continued) + +I lied a little bit about the input data. In reality, we get two sets of the inputs described above, which come from different places: + +* For the list of sorted prefixes, we have: + * `new_prefixes`, which are all the prefixes that were not present in the database before the insertion of the new documents + * `common_prefixes` which are the prefixes that are present both in the database and in the newly added documents + +* For the list of word pairs and proximities, we have: + * `new_word_pairs`, which is the list of word pairs and their proximities present in the newly added documents + * `word_pairs_db`, which is the list of word pairs from the database. **This list includes all elements in `new_word_pairs`** since `new_word_pairs` was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` function. 
+ +To update the prefix database correctly, we call the algorithm described earlier first on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). Thus: + +1. For all the word pairs that were already present in the DB, we insert them again with the `new_prefixes`. Calling the algorithm on them with the `common_prefixes` would not result in any new data. +3. For all the new word pairs, we insert them twice: first with the `common_prefixes`, and then, because they are part of `word_pairs_db`, with the `new_prefixes`. + +Note, also, that since we read data from the database when iterating over `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-docids from the batch directly into the database (we would have a concurrent reader and writer). Therefore, when calling the algorithm on (`new_prefixes`, `word_pairs_db`), we insert the computed ((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. 
+ + From 86807ca848e1a3573cd8a7e010230d9852312c01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 13 Jul 2022 19:35:17 +0200 Subject: [PATCH 02/17] Refactor word prefix pair proximity indexation further --- .../word_prefix_pair_proximity_docids/mod.rs | 511 +++++++++++------- 1 file changed, 326 insertions(+), 185 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs index 119c0c53e..5b073bb95 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs @@ -5,6 +5,7 @@ use heed::BytesDecode; use log::debug; use std::borrow::Cow; +use std::cmp::Ordering; use std::collections::HashSet; use std::io::BufReader; use std::time::Instant; @@ -72,6 +73,84 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { del_prefix_fst_words: &HashSet>, ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + let mut allocations = Allocations::default(); + + let mut count = 0; + + let prefixes = PrefixTrieNode::from_sorted_prefixes( + common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), + ); + + if !prefixes.is_empty() { + let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + Self::execute_on_word_pairs_and_prefixes( + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.move_on_next()? 
{ + let (word1, word2, proximity) = UncheckedStrStrU8Codec::bytes_decode(key) + .ok_or(heed::Error::Decoding)?; + Ok(Some(((word1, word2, proximity), value))) + } else { + Ok(None) + } + }, + &prefixes, + &mut allocations, + self.max_proximity, + |key, value| { + count += 1; + insert_into_database( + &mut self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + } + dbg!(count); + + let prefixes = PrefixTrieNode::from_sorted_prefixes( + new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), + ); + + if !prefixes.is_empty() { + let mut db_iter = self + .index + .word_pair_proximity_docids + .remap_key_type::() + .remap_data_type::() + .iter(self.wtxn)?; + + let mut writer = create_writer( + self.chunk_compression_type, + self.chunk_compression_level, + tempfile::tempfile()?, + ); + + Self::execute_on_word_pairs_and_prefixes( + &mut db_iter, + |db_iter| db_iter.next().transpose().map_err(|e| e.into()), + &prefixes, + &mut allocations, + self.max_proximity, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); + writer_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + writer, + )?; + } // All of the word prefix pairs in the database that have a w2 // that is contained in the `suppr_pw` set must be removed as well. @@ -89,131 +168,71 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } } - // We construct a Trie of all the prefixes that are smaller than the max prefix length - // This is an optimisation that allows us to iterate over all prefixes of a word quickly. 
- let new_prefix_fst_words = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words - .into_iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - let common_prefix_fst_words = PrefixTrieNode::from_sorted_prefixes( - common_prefix_fst_words - .into_iter() - .map(|s| s.into_iter()) - .flatten() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - let mut allocations = Allocations::default(); - let mut batch = PrefixAndProximityBatch::default(); - - if !common_prefix_fst_words.is_empty() { - let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - - while let Some((key, data)) = cursor.move_on_next()? { - let (word1, word2, proximity) = - UncheckedStrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - - if proximity <= self.max_proximity { - batch.flush_if_necessary( - word1, - word2, - &mut allocations, - &mut |key, value| { - insert_into_database( - &mut self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - self.insert_word_prefix_pair_proximity_docids_into_batch( - word2, - proximity, - data, - &common_prefix_fst_words, - &mut batch, - &mut allocations, - )?; - } - } - batch.flush(&mut allocations, &mut |key, value| { - insert_into_database( - &mut self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - })?; - } - - if !new_prefix_fst_words.is_empty() { - let mut db_iter = self - .index - .word_pair_proximity_docids - .remap_key_type::() - .remap_data_type::() - .iter(self.wtxn)?; - - let mut writer = create_writer( - self.chunk_compression_type, - self.chunk_compression_level, - tempfile::tempfile()?, - ); - - while let Some(((word1, word2, proximity), data)) = db_iter.next().transpose()? 
{ - if proximity <= self.max_proximity { - batch.flush_if_necessary( - word1, - word2, - &mut allocations, - &mut |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - self.insert_word_prefix_pair_proximity_docids_into_batch( - word2, - proximity, - data, - &new_prefix_fst_words, - &mut batch, - &mut allocations, - )?; - } - } - batch.flush(&mut allocations, &mut |key, value| { - writer.insert(key, value).map_err(|e| e.into()) - })?; - - drop(db_iter); - writer_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - writer, - )?; - } - Ok(()) } - fn insert_word_prefix_pair_proximity_docids_into_batch<'b, 'c>( - &self, - word2: &[u8], - proximity: u8, - data: &'b [u8], - prefixes: &'c PrefixTrieNode, - writer: &'b mut PrefixAndProximityBatch, + fn execute_on_word_pairs_and_prefixes( + iter: &mut Iter, + mut next_word_pair_proximity: impl for<'a> FnMut( + &'a mut Iter, + ) -> Result< + Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, + >, + prefixes: &PrefixTrieNode, allocations: &mut Allocations, + max_proximity: u8, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, ) -> Result<()> { + let mut batch = PrefixAndProximityBatch::default(); + let mut prev_word2_start = 0; + + let mut prefix_search_start = PrefixTrieNodeSearchStart(0); + let mut empty_prefixes = false; + let mut prefix_buffer = allocations.take_byte_vector(); - prefixes.for_each_prefix_of(word2, &mut prefix_buffer, |prefix| { - let mut value = allocations.take_byte_vector(); - value.extend_from_slice(&data); - writer.insert(prefix, proximity, value, allocations); - }); - allocations.reclaim_byte_vector(prefix_buffer); + + while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? 
{ + if proximity > max_proximity { + continue; + }; + let word2_start_different_than_prev = word2[0] != prev_word2_start; + if empty_prefixes && !word2_start_different_than_prev { + continue; + } + let word1_different_than_prev = word1 != batch.word1; + if word1_different_than_prev || word2_start_different_than_prev { + batch.flush(allocations, &mut insert)?; + if word1_different_than_prev { + prefix_search_start.0 = 0; + batch.word1.clear(); + batch.word1.extend_from_slice(word1); + } + if word2_start_different_than_prev { + // word2_start_different_than_prev == true + prev_word2_start = word2[0]; + } + empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); + } + + if !empty_prefixes { + prefixes.for_each_prefix_of( + word2, + &mut prefix_buffer, + &prefix_search_start, + |prefix_buffer| { + let mut value = allocations.take_byte_vector(); + value.extend_from_slice(&data); + let prefix_len = prefix_buffer.len(); + prefix_buffer.push(0); + prefix_buffer.push(proximity); + batch.insert(&prefix_buffer, value, allocations); + prefix_buffer.truncate(prefix_len); + }, + ); + prefix_buffer.clear(); + } + } + batch.flush(allocations, &mut insert)?; Ok(()) } } @@ -224,10 +243,6 @@ The keys are sorted and conflicts are resolved by merging the vectors of bitstri It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. -A batch is valid only for a specific `word1`. Also, all prefixes stored in the batch start with the same letter. Make sure to -call [`self.flush_if_necessary`](Self::flush_if_necessary) before inserting a list of sorted `(prefix, proximity)` (and where each -`prefix` starts with the same letter) in order to uphold these invariants. - The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content can be inserted into the database in sorted order. 
When it is flushed, it calls a user-provided closure with the following arguments: - key : (word1, prefix, proximity) as bytes @@ -235,91 +250,95 @@ can be inserted into the database in sorted order. When it is flushed, it calls */ #[derive(Default)] struct PrefixAndProximityBatch { - batch: Vec<(Vec, Vec>)>, word1: Vec, - word2_start: u8, + batch: Vec<(Vec, Vec>)>, } impl PrefixAndProximityBatch { - fn insert( - &mut self, - new_prefix: &[u8], - new_proximity: u8, - new_value: Vec, - allocations: &mut Allocations, - ) { - let mut key = allocations.take_byte_vector(); - key.extend_from_slice(new_prefix); - key.push(0); - key.push(new_proximity); - - if let Some(position) = self.batch.iter().position(|(k, _)| k >= &key) { - let (existing_key, existing_data) = &mut self.batch[position]; - if existing_key == &key { - existing_data.push(Cow::Owned(new_value)); - } else { + fn insert(&mut self, new_key: &[u8], new_value: Vec, allocations: &mut Allocations) { + // this is a macro instead of a closure because the borrow checker will complain + // about the closure moving `new_value` + macro_rules! 
insert_new_key_value { + () => { + let mut key = allocations.take_byte_vector(); + key.extend_from_slice(new_key); let mut mergeable_data = allocations.take_mergeable_data_vector(); mergeable_data.push(Cow::Owned(new_value)); - self.batch.insert(position, (key, mergeable_data)); + self.batch.push((key, mergeable_data)); + }; + ($idx:expr) => { + let mut key = allocations.take_byte_vector(); + key.extend_from_slice(new_key); + let mut mergeable_data = allocations.take_mergeable_data_vector(); + mergeable_data.push(Cow::Owned(new_value)); + self.batch.insert($idx, (key, mergeable_data)); + }; + } + + if self.batch.is_empty() { + insert_new_key_value!(); + } else if self.batch.len() == 1 { + let (existing_key, existing_data) = &mut self.batch[0]; + match new_key.cmp(&existing_key) { + Ordering::Less => { + insert_new_key_value!(0); + } + Ordering::Equal => { + existing_data.push(Cow::Owned(new_value)); + } + Ordering::Greater => { + insert_new_key_value!(); + } } } else { - let mut mergeable_data = allocations.take_mergeable_data_vector(); - mergeable_data.push(Cow::Owned(new_value)); - self.batch.push((key, mergeable_data)); - } - } - - /// Call [`self.flush`](Self::flush) if `word1` changed or if `word2` begins with a different letter than the - /// previous word2. Update `prev_word1` and `prev_word2_start` with the new values from `word1` and `word2`. 
- fn flush_if_necessary( - &mut self, - word1: &[u8], - word2: &[u8], - allocations: &mut Allocations, - insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, - ) -> Result<()> { - let word2_start = word2[0]; - if word1 != self.word1 { - self.flush(allocations, insert)?; - self.word1.clear(); - self.word1.extend_from_slice(word1); - if word2_start != self.word2_start { - self.word2_start = word2_start; + match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { + Ok(position) => { + self.batch[position].1.push(Cow::Owned(new_value)); + } + Err(position) => { + insert_new_key_value!(position); + } } } - if word2_start != self.word2_start { - self.flush(allocations, insert)?; - self.word2_start = word2_start; - } - Ok(()) } /// Empties the batch, calling `insert` on each element. /// - /// The key given to insert is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. + /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. fn flush( &mut self, allocations: &mut Allocations, insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, ) -> Result<()> { - let PrefixAndProximityBatch { batch, word1: prev_word1, word2_start: _ } = self; + let PrefixAndProximityBatch { word1, batch } = self; + if batch.is_empty() { + return Ok(()); + } + let mut buffer = allocations.take_byte_vector(); - buffer.extend_from_slice(prev_word1.as_slice()); + buffer.extend_from_slice(word1); buffer.push(0); for (key, mergeable_data) in batch.drain(..) 
{ - buffer.truncate(prev_word1.len() + 1); + buffer.truncate(word1.len() + 1); buffer.extend_from_slice(key.as_slice()); - let data = merge_cbo_roaring_bitmaps(&buffer, &mergeable_data)?; - insert(buffer.as_slice(), &data)?; - + let merged; + let data = if mergeable_data.len() > 1 { + merged = merge_cbo_roaring_bitmaps(&buffer, &mergeable_data)?; + &merged + } else { + &mergeable_data[0] + }; + insert(buffer.as_slice(), data)?; allocations.reclaim_byte_vector(key); allocations.reclaim_mergeable_data_vector(mergeable_data); } + Ok(()) } } +// This is adapted from `sorter_into_lmdb_database` fn insert_into_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, @@ -356,7 +375,8 @@ pub fn writer_into_lmdb_database( ) -> Result<()> { let file = writer.into_inner()?; let reader = grenad::Reader::new(BufReader::new(file))?; - + let len = reader.len(); + dbg!(len); let before = Instant::now(); if database.is_empty(wtxn)? { @@ -413,10 +433,44 @@ struct PrefixTrieNode { is_end_node: bool, } +#[derive(Debug)] +struct PrefixTrieNodeSearchStart(usize); + impl PrefixTrieNode { fn is_empty(&self) -> bool { self.children.is_empty() } + + /// Returns false if the trie does not contain a prefix of the given word. + /// Returns true if the trie *may* contain a prefix of the given word. + /// + /// Moves the search start to the first node equal to the first letter of the word, + /// or to 0 otherwise. 
+ fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { + let byte = word[0]; + if self.children[search_start.0].1 == byte { + return true; + } else if let Some(position) = + self.children[search_start.0..].iter().position(|(_, c)| *c >= byte) + { + let (_, c) = self.children[search_start.0 + position]; + // dbg!(position, c, byte); + if c == byte { + // dbg!(); + search_start.0 += position; + true + } else { + // dbg!(); + search_start.0 = 0; + false + } + } else { + // dbg!(); + search_start.0 = 0; + false + } + } + fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { let mut node = PrefixTrieNode::default(); for prefix in prefixes { @@ -439,17 +493,41 @@ impl PrefixTrieNode { self.is_end_node = true; } } - fn for_each_prefix_of(&self, word: &[u8], buffer: &mut Vec, mut do_fn: impl FnMut(&[u8])) { + fn for_each_prefix_of( + &self, + word: &[u8], + buffer: &mut Vec, + search_start: &PrefixTrieNodeSearchStart, + mut do_fn: impl FnMut(&mut Vec), + ) { + let first_byte = word[0]; let mut cur_node = self; - for &byte in word { - buffer.push(byte); - if let Some((child_node, _)) = cur_node.children.iter().find(|(_, c)| *c == byte) { + buffer.push(first_byte); + if let Some((child_node, c)) = + cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte) + { + if *c == first_byte { cur_node = child_node; if cur_node.is_end_node { - do_fn(buffer.as_slice()); + do_fn(buffer); + } + for &byte in &word[1..] 
{ + buffer.push(byte); + if let Some((child_node, c)) = + cur_node.children.iter().find(|(_, c)| *c >= byte) + { + if *c == byte { + cur_node = child_node; + if cur_node.is_end_node { + do_fn(buffer); + } + } else { + break; + } + } else { + break; + } } - } else { - break; } } } @@ -466,3 +544,66 @@ impl PrefixTrieNode { // } // } } +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_trie() { + let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ + "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au", + "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c", + "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com", + "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des", + "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f", + "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi", + "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i", + "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka", + "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar", + "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni", + "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi", + "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res", + "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si", + "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t", + "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", + "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", + ])); + // let mut buffer = String::new(); + // trie.print(&mut buffer, 0); + // buffer.clear(); + let mut search_start = 
PrefixTrieNodeSearchStart(0); + let mut buffer = vec![]; + + let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); + println!("{search_start:?}"); + println!("is empty: {is_empty}"); + trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| { + let s = std::str::from_utf8(x).unwrap(); + println!("{s}"); + }); + buffer.clear(); + trie.for_each_prefix_of("trans".as_bytes(), &mut buffer, &search_start, |x| { + let s = std::str::from_utf8(x).unwrap(); + println!("{s}"); + }); + buffer.clear(); + + trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| { + let s = std::str::from_utf8(x).unwrap(); + println!("{s}"); + }); + buffer.clear(); + // trie.for_each_prefix_of("1", |x| { + // println!("{x}"); + // }); + // trie.for_each_prefix_of("19", |x| { + // println!("{x}"); + // }); + // trie.for_each_prefix_of("21", |x| { + // println!("{x}"); + // }); + // let mut buffer = vec![]; + // trie.for_each_prefix_of("integ", &mut buffer, |x| { + // println!("{x}"); + // }); + } +} From d3501141596657fbc74b7f38024d7727d6501519 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 14 Jul 2022 11:25:10 +0200 Subject: [PATCH 03/17] Add tests for WordPrefixPairProximityDocIds --- .../word_prefix_pair_proximity_docids/mod.rs | 373 ++++++++++++------ 1 file changed, 242 insertions(+), 131 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs index 5b073bb95..a5ece8005 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs @@ -88,7 +88,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { if !prefixes.is_empty() { let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - Self::execute_on_word_pairs_and_prefixes( + execute_on_word_pairs_and_prefixes( &mut cursor, |cursor| { if let Some((key, value)) = 
cursor.move_on_next()? { @@ -113,7 +113,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { }, )?; } - dbg!(count); let prefixes = PrefixTrieNode::from_sorted_prefixes( new_prefix_fst_words @@ -136,7 +135,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { tempfile::tempfile()?, ); - Self::execute_on_word_pairs_and_prefixes( + execute_on_word_pairs_and_prefixes( &mut db_iter, |db_iter| db_iter.next().transpose().map_err(|e| e.into()), &prefixes, @@ -145,7 +144,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { |key, value| writer.insert(key, value).map_err(|e| e.into()), )?; drop(db_iter); - writer_into_lmdb_database( + writer_of_new_elements_into_lmdb_database( self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), writer, @@ -170,73 +169,71 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { Ok(()) } - - fn execute_on_word_pairs_and_prefixes( - iter: &mut Iter, - mut next_word_pair_proximity: impl for<'a> FnMut( - &'a mut Iter, - ) -> Result< - Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, - >, - prefixes: &PrefixTrieNode, - allocations: &mut Allocations, - max_proximity: u8, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, - ) -> Result<()> { - let mut batch = PrefixAndProximityBatch::default(); - let mut prev_word2_start = 0; - - let mut prefix_search_start = PrefixTrieNodeSearchStart(0); - let mut empty_prefixes = false; - - let mut prefix_buffer = allocations.take_byte_vector(); - - while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? 
{ - if proximity > max_proximity { - continue; - }; - let word2_start_different_than_prev = word2[0] != prev_word2_start; - if empty_prefixes && !word2_start_different_than_prev { - continue; - } - let word1_different_than_prev = word1 != batch.word1; - if word1_different_than_prev || word2_start_different_than_prev { - batch.flush(allocations, &mut insert)?; - if word1_different_than_prev { - prefix_search_start.0 = 0; - batch.word1.clear(); - batch.word1.extend_from_slice(word1); - } - if word2_start_different_than_prev { - // word2_start_different_than_prev == true - prev_word2_start = word2[0]; - } - empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); - } - - if !empty_prefixes { - prefixes.for_each_prefix_of( - word2, - &mut prefix_buffer, - &prefix_search_start, - |prefix_buffer| { - let mut value = allocations.take_byte_vector(); - value.extend_from_slice(&data); - let prefix_len = prefix_buffer.len(); - prefix_buffer.push(0); - prefix_buffer.push(proximity); - batch.insert(&prefix_buffer, value, allocations); - prefix_buffer.truncate(prefix_len); - }, - ); - prefix_buffer.clear(); - } - } - batch.flush(allocations, &mut insert)?; - Ok(()) - } } +fn execute_on_word_pairs_and_prefixes( + iter: &mut Iter, + mut next_word_pair_proximity: impl for<'a> FnMut( + &'a mut Iter, + ) -> Result< + Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, + >, + prefixes: &PrefixTrieNode, + allocations: &mut Allocations, + max_proximity: u8, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, +) -> Result<()> { + let mut batch = PrefixAndProximityBatch::default(); + let mut prev_word2_start = 0; + let mut prefix_search_start = PrefixTrieNodeSearchStart(0); + let mut empty_prefixes = false; + + let mut prefix_buffer = allocations.take_byte_vector(); + + while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? 
{ + if proximity > max_proximity { + continue; + }; + let word2_start_different_than_prev = word2[0] != prev_word2_start; + if empty_prefixes && !word2_start_different_than_prev { + continue; + } + let word1_different_than_prev = word1 != batch.word1; + if word1_different_than_prev || word2_start_different_than_prev { + batch.flush(allocations, &mut insert)?; + if word1_different_than_prev { + prefix_search_start.0 = 0; + batch.word1.clear(); + batch.word1.extend_from_slice(word1); + } + if word2_start_different_than_prev { + // word2_start_different_than_prev == true + prev_word2_start = word2[0]; + } + empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); + } + + if !empty_prefixes { + prefixes.for_each_prefix_of( + word2, + &mut prefix_buffer, + &prefix_search_start, + |prefix_buffer| { + let mut value = allocations.take_byte_vector(); + value.extend_from_slice(&data); + let prefix_len = prefix_buffer.len(); + prefix_buffer.push(0); + prefix_buffer.push(proximity); + batch.insert(&prefix_buffer, value, allocations); + prefix_buffer.truncate(prefix_len); + }, + ); + prefix_buffer.clear(); + } + } + batch.flush(allocations, &mut insert)?; + Ok(()) +} /** A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. 
@@ -275,30 +272,32 @@ impl PrefixAndProximityBatch { }; } - if self.batch.is_empty() { - insert_new_key_value!(); - } else if self.batch.len() == 1 { - let (existing_key, existing_data) = &mut self.batch[0]; - match new_key.cmp(&existing_key) { - Ordering::Less => { - insert_new_key_value!(0); - } - Ordering::Equal => { - existing_data.push(Cow::Owned(new_value)); - } - Ordering::Greater => { - insert_new_key_value!(); + match self.batch.len() { + 0 => { + insert_new_key_value!(); + } + 1 => { + let (existing_key, existing_data) = &mut self.batch[0]; + match new_key.cmp(&existing_key) { + Ordering::Less => { + insert_new_key_value!(0); + } + Ordering::Equal => { + existing_data.push(Cow::Owned(new_value)); + } + Ordering::Greater => { + insert_new_key_value!(); + } } } - } else { - match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { + _ => match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { Ok(position) => { self.batch[position].1.push(Cow::Owned(new_value)); } Err(position) => { insert_new_key_value!(position); } - } + }, } } @@ -368,17 +367,13 @@ fn insert_into_database( } // This is adapted from `sorter_into_lmdb_database` -pub fn writer_into_lmdb_database( +pub fn writer_of_new_elements_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, writer: grenad::Writer, ) -> Result<()> { let file = writer.into_inner()?; let reader = grenad::Reader::new(BufReader::new(file))?; - let len = reader.len(); - dbg!(len); - let before = Instant::now(); - if database.is_empty(wtxn)? { let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; let mut cursor = reader.into_cursor()?; @@ -389,11 +384,9 @@ pub fn writer_into_lmdb_database( } else { let mut cursor = reader.into_cursor()?; while let Some((k, v)) = cursor.move_on_next()? 
{ - insert_into_database(wtxn, database, k, v)?; + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; } } - - debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) } @@ -454,18 +447,14 @@ impl PrefixTrieNode { self.children[search_start.0..].iter().position(|(_, c)| *c >= byte) { let (_, c) = self.children[search_start.0 + position]; - // dbg!(position, c, byte); if c == byte { - // dbg!(); search_start.0 += position; true } else { - // dbg!(); search_start.0 = 0; false } } else { - // dbg!(); search_start.0 = 0; false } @@ -546,7 +535,26 @@ impl PrefixTrieNode { } #[cfg(test)] mod tests { + use roaring::RoaringBitmap; + + use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; + use super::*; + + fn check_prefixes( + trie: &PrefixTrieNode, + search_start: &PrefixTrieNodeSearchStart, + word: &str, + expected_prefixes: &[&str], + ) { + let mut actual_prefixes = vec![]; + trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), &search_start, |x| { + let s = String::from_utf8(x.to_owned()).unwrap(); + actual_prefixes.push(s); + }); + assert_eq!(actual_prefixes, expected_prefixes); + } + #[test] fn test_trie() { let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ @@ -567,43 +575,146 @@ mod tests { "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", ])); - // let mut buffer = String::new(); - // trie.print(&mut buffer, 0); - // buffer.clear(); + let mut search_start = PrefixTrieNodeSearchStart(0); - let mut buffer = vec![]; let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); - println!("{search_start:?}"); - println!("is empty: {is_empty}"); - trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| { - let s = std::str::from_utf8(x).unwrap(); - println!("{s}"); - }); - buffer.clear(); - trie.for_each_prefix_of("trans".as_bytes(), &mut buffer, &search_start, |x| { - let s = 
// NOTE: this should fail, because the search start is already beyond 'a'
// not inserted because 3 > max_proximity
8: + (("jittery", "catb", 1), &serialised_bitmap789), + // 8: + (("jittery", "catc", 1), &serialised_bitmap_ranges), + ]; + + let expected_result = [ + // first batch: + (("healthy", "arb", 1), bitmap123.clone()), + (("healthy", "arb", 2), &bitmap123 | &bitmap456), + (("healthy", "arbre", 1), bitmap123.clone()), + (("healthy", "arbre", 2), &bitmap123 | &bitmap456), + // second batch: + (("healthy", "cat", 1), &bitmap456 | &bitmap123), + (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), + (("healthy", "catto", 1), bitmap123.clone()), + (("healthy", "catto", 2), bitmap_ranges.clone()), + // third batch + (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), + ]; + + let mut result = vec![]; + + let mut allocations = Allocations::default(); + let mut iter = + IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { + ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) + }); + execute_on_word_pairs_and_prefixes( + &mut iter, + |iter| Ok(iter.next()), + &prefixes, + &mut allocations, + 2, + |k, v| { + let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); + let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); + result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); + Ok(()) + }, + ) + .unwrap(); + + for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { + let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x; + let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y; + + assert_eq!(actual_word1, expected_word1); + assert_eq!(actual_prefix, expected_prefix); + assert_eq!(actual_proximity, expected_proximity); + assert_eq!(actual_bitmap, expected_bitmap); + } } } From 044356d22165977be4ecaaffc8ee4027ade7026c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 14 Jul 2022 11:53:21 +0200 Subject: [PATCH 04/17] Optimise WordPrefixPairProximityDocIds merge 
operation --- .../word_prefix_pair_proximity_docids/mod.rs | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs index a5ece8005..ad498b5da 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs @@ -8,12 +8,11 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::collections::HashSet; use std::io::BufReader; -use std::time::Instant; use crate::update::index_documents::{ create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, }; -use crate::{Index, Result, UncheckedStrStrU8Codec}; +use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -189,6 +188,7 @@ fn execute_on_word_pairs_and_prefixes( let mut empty_prefixes = false; let mut prefix_buffer = allocations.take_byte_vector(); + let mut merge_buffer = allocations.take_byte_vector(); while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? 
{ if proximity > max_proximity { @@ -200,7 +200,7 @@ fn execute_on_word_pairs_and_prefixes( } let word1_different_than_prev = word1 != batch.word1; if word1_different_than_prev || word2_start_different_than_prev { - batch.flush(allocations, &mut insert)?; + batch.flush(allocations, &mut merge_buffer, &mut insert)?; if word1_different_than_prev { prefix_search_start.0 = 0; batch.word1.clear(); @@ -231,7 +231,7 @@ fn execute_on_word_pairs_and_prefixes( prefix_buffer.clear(); } } - batch.flush(allocations, &mut insert)?; + batch.flush(allocations, &mut merge_buffer, &mut insert)?; Ok(()) } /** @@ -307,12 +307,14 @@ impl PrefixAndProximityBatch { fn flush( &mut self, allocations: &mut Allocations, + merge_buffer: &mut Vec, insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, ) -> Result<()> { let PrefixAndProximityBatch { word1, batch } = self; if batch.is_empty() { return Ok(()); } + merge_buffer.clear(); let mut buffer = allocations.take_byte_vector(); buffer.extend_from_slice(word1); @@ -321,14 +323,15 @@ impl PrefixAndProximityBatch { for (key, mergeable_data) in batch.drain(..) 
{ buffer.truncate(word1.len() + 1); buffer.extend_from_slice(key.as_slice()); - let merged; + let data = if mergeable_data.len() > 1 { - merged = merge_cbo_roaring_bitmaps(&buffer, &mergeable_data)?; - &merged + CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?; + merge_buffer.as_slice() } else { &mergeable_data[0] }; insert(buffer.as_slice(), data)?; + merge_buffer.clear(); allocations.reclaim_byte_vector(key); allocations.reclaim_mergeable_data_vector(mergeable_data); } @@ -443,20 +446,17 @@ impl PrefixTrieNode { let byte = word[0]; if self.children[search_start.0].1 == byte { return true; - } else if let Some(position) = - self.children[search_start.0..].iter().position(|(_, c)| *c >= byte) - { - let (_, c) = self.children[search_start.0 + position]; - if c == byte { - search_start.0 += position; - true - } else { - search_start.0 = 0; - false - } } else { - search_start.0 = 0; - false + match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { + Ok(position) => { + search_start.0 += position; + true + } + Err(_) => { + search_start.0 = 0; + false + } + } } } From 220921628b47c0f9e80db80896f640db4550fe08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 15:34:40 +0200 Subject: [PATCH 05/17] Simplify and document WordPrefixPairProximityDocIds::execute --- .../word_prefix_pair_proximity_docids/mod.rs | 103 ++++++++---------- .../readme.md | 8 +- 2 files changed, 50 insertions(+), 61 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs index ad498b5da..6345dd210 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs @@ -1,18 +1,14 @@ -use grenad::CompressionType; -use heed::types::ByteSlice; - -use heed::BytesDecode; -use log::debug; - -use std::borrow::Cow; -use std::cmp::Ordering; -use std::collections::HashSet; -use 
std::io::BufReader; - use crate::update::index_documents::{ create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, }; use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; +use std::borrow::Cow; +use std::collections::HashSet; +use std::io::BufReader; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -72,10 +68,11 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { del_prefix_fst_words: &HashSet>, ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + // This is an optimisation, to reuse allocations between loop iterations let mut allocations = Allocations::default(); - let mut count = 0; - + // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length let prefixes = PrefixTrieNode::from_sorted_prefixes( common_prefix_fst_words .into_iter() @@ -85,9 +82,14 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { .filter(|s| s.len() <= self.max_prefix_length), ); + // If the prefix trie is not empty, then we can iterate over all new + // word pairs to look for new (word1, common_prefix, proximity) elements + // to insert in the DB if !prefixes.is_empty() { let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + // This is the core of the algorithm execute_on_word_pairs_and_prefixes( + // the first two arguments tell how to iterate over the new word pairs &mut cursor, |cursor| { if let Some((key, value)) = cursor.move_on_next()? 
{ @@ -101,8 +103,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &prefixes, &mut allocations, self.max_proximity, + // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) |key, value| { - count += 1; insert_into_database( &mut self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), @@ -113,6 +115,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { )?; } + // Now we do the same thing with the new prefixes and all word pairs in the DB + let prefixes = PrefixTrieNode::from_sorted_prefixes( new_prefix_fst_words .into_iter() @@ -128,6 +132,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { .remap_data_type::() .iter(self.wtxn)?; + // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) + // element in an intermediary grenad let mut writer = create_writer( self.chunk_compression_type, self.chunk_compression_level, @@ -143,7 +149,12 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { |key, value| writer.insert(key, value).map_err(|e| e.into()), )?; drop(db_iter); - writer_of_new_elements_into_lmdb_database( + + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), writer, @@ -169,6 +180,15 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { Ok(()) } } + +/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. +/// +/// Its main arguments are: +/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements +/// 2. a prefix trie +/// 3. 
a closure to describe how to handle the new computed (word1, prefix, proximity) elements +/// +/// For more information about the fn execute_on_word_pairs_and_prefixes( iter: &mut Iter, mut next_word_pair_proximity: impl for<'a> FnMut( @@ -252,52 +272,19 @@ struct PrefixAndProximityBatch { } impl PrefixAndProximityBatch { + /// Insert the new key and value into the batch fn insert(&mut self, new_key: &[u8], new_value: Vec, allocations: &mut Allocations) { - // this is a macro instead of a closure because the borrow checker will complain - // about the closure moving `new_value` - macro_rules! insert_new_key_value { - () => { + match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { + Ok(position) => { + self.batch[position].1.push(Cow::Owned(new_value)); + } + Err(position) => { let mut key = allocations.take_byte_vector(); key.extend_from_slice(new_key); let mut mergeable_data = allocations.take_mergeable_data_vector(); mergeable_data.push(Cow::Owned(new_value)); - self.batch.push((key, mergeable_data)); - }; - ($idx:expr) => { - let mut key = allocations.take_byte_vector(); - key.extend_from_slice(new_key); - let mut mergeable_data = allocations.take_mergeable_data_vector(); - mergeable_data.push(Cow::Owned(new_value)); - self.batch.insert($idx, (key, mergeable_data)); - }; - } - - match self.batch.len() { - 0 => { - insert_new_key_value!(); + self.batch.insert(position, (key, mergeable_data)); } - 1 => { - let (existing_key, existing_data) = &mut self.batch[0]; - match new_key.cmp(&existing_key) { - Ordering::Less => { - insert_new_key_value!(0); - } - Ordering::Equal => { - existing_data.push(Cow::Owned(new_value)); - } - Ordering::Greater => { - insert_new_key_value!(); - } - } - } - _ => match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { - Ok(position) => { - self.batch[position].1.push(Cow::Owned(new_value)); - } - Err(position) => { - insert_new_key_value!(position); - } - }, } } @@ -369,8 +356,10 @@ fn 
insert_into_database( Ok(()) } -// This is adapted from `sorter_into_lmdb_database` -pub fn writer_of_new_elements_into_lmdb_database( +// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, +// but it uses `append` if the database is empty, and it assumes that the values in the +// writer don't conflict with values in the database. +pub fn write_into_lmdb_database_without_merging( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, writer: grenad::Writer, diff --git a/milli/src/update/word_prefix_pair_proximity_docids/readme.md b/milli/src/update/word_prefix_pair_proximity_docids/readme.md index 7e467e92d..0718fd79c 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids/readme.md +++ b/milli/src/update/word_prefix_pair_proximity_docids/readme.md @@ -1,7 +1,7 @@ ## What is WordPrefixPairProximityDocids? The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. -The prefixes present in this database are only those that correspond to many different words present in the documents. +The prefixes present in this database are only those that correspond to many different words in the documents. ## How is it created/updated? (simplified version) To compute it, we have access to (mainly) two inputs: @@ -28,13 +28,13 @@ horror cathedral 4 -> docids5: [1, 2] I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below: -1. ==Outer loop:== First, we iterate over each word pair and its proximity: +1. **Outer loop:** First, we iterate over each word pair and its proximity: ``` word1 : good word2 : dog proximity: 3 ``` -2. ==Inner loop:== Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. 
And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: +2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: ``` Outer loop 1: ------------------------------ @@ -108,7 +108,7 @@ Because `word2` begins with a different letter than the previous `word2`, we kno 2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch. Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. Flushing the batch should also be done whenever `word1` is different than the previous `word1`. -6. ==Flushing the batch==: to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: +6. 
**Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: ``` Flushing Batch loop 1: ------------------------------ From ea4a96761c37c5b033e6077822dd5c683374bc3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 15:39:39 +0200 Subject: [PATCH 06/17] Move content of readme for WordPrefixPairProximityDocids into the code --- .../word_prefix_pair_proximity_docids.rs | 871 +++++++++++++++--- .../word_prefix_pair_proximity_docids/mod.rs | 709 -------------- .../readme.md | 144 --- 3 files changed, 739 insertions(+), 985 deletions(-) delete mode 100644 milli/src/update/word_prefix_pair_proximity_docids/mod.rs delete mode 100644 milli/src/update/word_prefix_pair_proximity_docids/readme.md diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 574b49e97..4a3a7d13e 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,16 +1,161 @@ -use std::collections::{HashMap, HashSet}; +/*! + ## What is WordPrefixPairProximityDocids? +The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. +The prefixes present in this database are only those that correspond to many different words in the documents. + +## How is it created/updated? (simplified version) +To compute it, we have access to (mainly) two inputs: + +* a list of sorted prefixes, such as: +``` +c +ca +cat +d +do +dog +``` +Note that only prefixes which correspond to more than a certain number of different words from the database are included in this list. + +* a sorted list of word pairs and the distance between them (i.e. 
proximity), associated with a roaring bitmap, such as: +``` +good dog 3 -> docids1: [2, 5, 6] +good doggo 1 -> docids2: [8] +good dogma 1 -> docids3: [7, 19, 20] +good ghost 2 -> docids4: [1] +horror cathedral 4 -> docids5: [1, 2] +``` + +I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below: + +1. **Outer loop:** First, we iterate over each word pair and its proximity: +``` +word1 : good +word2 : dog +proximity: 3 +``` +2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: +``` +Outer loop 1: +------------------------------ +word1 : good +word2 : dog +proximity: 3 +docids : docids1 + +prefixes: [d, do, dog] + +batch: [ + (d, 3) -> [docids1] + (do, 3) -> [docids1] + (dog, 3) -> [docids1] +] +``` +3. For illustration purpose, let's run through a second iteration of the outer loop: +``` +Outer loop 2: +------------------------------ +word1 : good +word2 : doggo +proximity: 1 +docids : docids2 + +prefixes: [d, do, dog] + +batch: [ + (d, 1) -> [docids2] + (d, 3) -> [docids1] + (do, 1) -> [docids2] + (do, 3) -> [docids1] + (dog, 1) -> [docids2] + (dog, 3) -> [docids1] +] +``` +Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some of the elements inserted in the second iteration of the outer loop appear *before* elements from the first iteration. + +4. 
And a third: +``` +Outer loop 3: +------------------------------ +word1 : good +word2 : dogma +proximity: 1 +docids : docids3 + +prefixes: [d, do, dog] + +batch: [ + (d, 1) -> [docids2, docids3] + (d, 3) -> [docids1] + (do, 1) -> [docids2, docids3] + (do, 3) -> [docids1] + (dog, 1) -> [docids2, docids3] + (dog, 3) -> [docids1] +] +``` +Notice that there were some conflicts which were resolved by merging the conflicting values together. + +5. On the fourth iteration of the outer loop, we have: +``` +Outer loop 4: +------------------------------ +word1 : good +word2 : ghost +proximity: 2 +``` +Because `word2` begins with a different letter than the previous `word2`, we know that: +1. All the prefixes of `word2` are greater than the prefixes of the previous word2 +2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch. +Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. Flushing the batch should also be done whenever `word1` is different than the previous `word1`. + +6. **Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: +``` +Flushing Batch loop 1: +------------------------------ +word1 : good +word2 : d +proximity: 1 +docids : [docids2, docids3] +``` +We then merge the array of `docids` (of type `Vec>`) using `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a roaring bitmap of all the document ids where `word1` is followed by `prefix` at a distance of `proximity`. +Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` into the database. + +7. That's it! ... except... + +## How is it created/updated (continued) + +I lied a little bit about the input data. 
In reality, we get two sets of the inputs described above, which come from different places: + +* For the list of sorted prefixes, we have: + * `new_prefixes`, which are all the prefixes that were not present in the database before the insertion of the new documents + * `common_prefixes` which are the prefixes that are present both in the database and in the newly added documents + +* For the list of word pairs and proximities, we have: + * `new_word_pairs`, which is the list of word pairs and their proximities present in the newly added documents + * `word_pairs_db`, which is the list of word pairs from the database. **This list includes all elements in `new_word_pairs`** since `new_word_pairs` was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` function. + +To update the prefix database correctly, we call the algorithm described earlier first on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). Thus: + +1. For all the word pairs that were already present in the DB, we insert them again with the `new_prefixes`. Calling the algorithm on them with the `common_prefixes` would not result in any new data. +3. For all the new word pairs, we insert them twice: first with the `common_prefixes`, and then, because they are part of `word_pairs_db`, with the `new_prefixes`. + +Note, also, that since we read data from the database when iterating over `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-docids from the batch directly into the database (we would have a concurrent reader and writer). Therefore, when calling the algorithm on (`new_prefixes`, `word_pairs_db`), we insert the computed ((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. 
+ + + +*/ +use crate::update::index_documents::{ + create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, +}; +use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; use grenad::CompressionType; use heed::types::ByteSlice; use heed::BytesDecode; use log::debug; -use slice_group_by::GroupBy; - -use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, - CursorClonableMmap, MergeFn, -}; -use crate::{Index, Result, StrStrU8Codec}; +use std::borrow::Cow; +use std::collections::HashSet; +use std::io::BufReader; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -62,94 +207,104 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute( - self, + pub fn execute<'a>( + mut self, new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &[String], - common_prefix_fst_words: &[&[String]], + new_prefix_fst_words: &'a [String], + common_prefix_fst_words: &[&'a [String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - let new_prefix_fst_words: Vec<_> = - new_prefix_fst_words.linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect(); + // This is an optimisation, to reuse allocations between loop iterations + let mut allocations = Allocations::default(); - let mut new_wppd_iter = new_word_pair_proximity_docids.into_cursor()?; - let mut word_prefix_pair_proximity_docids_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - self.chunk_compression_type, - self.chunk_compression_level, - self.max_nb_chunks, - self.max_memory, + // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length + let prefixes = PrefixTrieNode::from_sorted_prefixes( + common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + 
.map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), ); - if !common_prefix_fst_words.is_empty() { - // We compute the prefix docids associated with the common prefixes between - // the old and new word prefix fst. - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_wppd_iter.move_on_next()? { - let (w1, w2, prox) = - StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - if prox > self.max_proximity { - continue; - } - - insert_current_prefix_data_in_sorter( - &mut buffer, - &mut current_prefixes, - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - common_prefix_fst_words, - self.max_prefix_length, - w1, - w2, - prox, - data, - )?; - } - - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, + // If the prefix trie is not empty, then we can iterate over all new + // word pairs to look for new (word1, common_prefix, proximity) elements + // to insert in the DB + if !prefixes.is_empty() { + let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + // This is the core of the algorithm + execute_on_word_pairs_and_prefixes( + // the first two arguments tell how to iterate over the new word pairs + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.move_on_next()? 
{ + let (word1, word2, proximity) = UncheckedStrStrU8Codec::bytes_decode(key) + .ok_or(heed::Error::Decoding)?; + Ok(Some(((word1, word2, proximity), value))) + } else { + Ok(None) + } + }, + &prefixes, + &mut allocations, + self.max_proximity, + // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) + |key, value| { + insert_into_database( + &mut self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, )?; } - if !new_prefix_fst_words.is_empty() { - // We compute the prefix docids associated with the newly added prefixes - // in the new word prefix fst. + // Now we do the same thing with the new prefixes and all word pairs in the DB + + let prefixes = PrefixTrieNode::from_sorted_prefixes( + new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), + ); + + if !prefixes.is_empty() { let mut db_iter = self .index .word_pair_proximity_docids + .remap_key_type::() .remap_data_type::() .iter(self.wtxn)?; - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? 
{ - if prox > self.max_proximity { - continue; - } + // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) + // element in an intermediary grenad + let mut writer = create_writer( + self.chunk_compression_type, + self.chunk_compression_level, + tempfile::tempfile()?, + ); - insert_current_prefix_data_in_sorter( - &mut buffer, - &mut current_prefixes, - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - &new_prefix_fst_words, - self.max_prefix_length, - w1, - w2, - prox, - data, - )?; - } + execute_on_word_pairs_and_prefixes( + &mut db_iter, + |db_iter| db_iter.next().transpose().map_err(|e| e.into()), + &prefixes, + &mut allocations, + self.max_proximity, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + writer, )?; } @@ -169,84 +324,359 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } } - // We finally write and merge the new word prefix pair proximity docids - // in the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - word_prefix_pair_proximity_docids_sorter, - merge_cbo_roaring_bitmaps, - )?; + Ok(()) + } +} + +/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. +/// +/// Its main arguments are: +/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements +/// 2. a prefix trie +/// 3. 
a closure to describe how to handle the new computed (word1, prefix, proximity) elements +/// +/// For more information about what this function does, read the module documentation. +fn execute_on_word_pairs_and_prefixes( + iter: &mut Iter, + mut next_word_pair_proximity: impl for<'a> FnMut( + &'a mut Iter, + ) -> Result< + Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, + >, + prefixes: &PrefixTrieNode, + allocations: &mut Allocations, + max_proximity: u8, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, +) -> Result<()> { + let mut batch = PrefixAndProximityBatch::default(); + let mut prev_word2_start = 0; + + let mut prefix_search_start = PrefixTrieNodeSearchStart(0); + let mut empty_prefixes = false; + + let mut prefix_buffer = allocations.take_byte_vector(); + let mut merge_buffer = allocations.take_byte_vector(); + + while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { + if proximity > max_proximity { + continue; + }; + let word2_start_different_than_prev = word2[0] != prev_word2_start; + if empty_prefixes && !word2_start_different_than_prev { + continue; + } + let word1_different_than_prev = word1 != batch.word1; + if word1_different_than_prev || word2_start_different_than_prev { + batch.flush(allocations, &mut merge_buffer, &mut insert)?; + if word1_different_than_prev { + prefix_search_start.0 = 0; + batch.word1.clear(); + batch.word1.extend_from_slice(word1); + } + if word2_start_different_than_prev { + // word2_start_different_than_prev == true + prev_word2_start = word2[0]; + } + empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); + } + + if !empty_prefixes { + prefixes.for_each_prefix_of( + word2, + &mut prefix_buffer, + &prefix_search_start, + |prefix_buffer| { + let mut value = allocations.take_byte_vector(); + value.extend_from_slice(&data); + let prefix_len = prefix_buffer.len(); + prefix_buffer.push(0); + prefix_buffer.push(proximity); + batch.insert(&prefix_buffer, value, 
allocations); + prefix_buffer.truncate(prefix_len); + }, + ); + prefix_buffer.clear(); + } + } + batch.flush(allocations, &mut merge_buffer, &mut insert)?; + Ok(()) +} +/** +A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). +The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. + +It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. + +The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content +can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: +- key : (word1, prefix, proximity) as bytes +- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes +*/ +#[derive(Default)] +struct PrefixAndProximityBatch { + word1: Vec, + batch: Vec<(Vec, Vec>)>, +} + +impl PrefixAndProximityBatch { + /// Insert the new key and value into the batch + fn insert(&mut self, new_key: &[u8], new_value: Vec, allocations: &mut Allocations) { + match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { + Ok(position) => { + self.batch[position].1.push(Cow::Owned(new_value)); + } + Err(position) => { + let mut key = allocations.take_byte_vector(); + key.extend_from_slice(new_key); + let mut mergeable_data = allocations.take_mergeable_data_vector(); + mergeable_data.push(Cow::Owned(new_value)); + self.batch.insert(position, (key, mergeable_data)); + } + } + } + + /// Empties the batch, calling `insert` on each element. + /// + /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. 
+ fn flush( + &mut self, + allocations: &mut Allocations, + merge_buffer: &mut Vec, + insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, + ) -> Result<()> { + let PrefixAndProximityBatch { word1, batch } = self; + if batch.is_empty() { + return Ok(()); + } + merge_buffer.clear(); + + let mut buffer = allocations.take_byte_vector(); + buffer.extend_from_slice(word1); + buffer.push(0); + + for (key, mergeable_data) in batch.drain(..) { + buffer.truncate(word1.len() + 1); + buffer.extend_from_slice(key.as_slice()); + + let data = if mergeable_data.len() > 1 { + CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?; + merge_buffer.as_slice() + } else { + &mergeable_data[0] + }; + insert(buffer.as_slice(), data)?; + merge_buffer.clear(); + allocations.reclaim_byte_vector(key); + allocations.reclaim_mergeable_data_vector(mergeable_data); + } Ok(()) } } -fn write_prefixes_in_sorter( - prefixes: &mut HashMap, Vec>>, - sorter: &mut grenad::Sorter, +// This is adapted from `sorter_into_lmdb_database` +fn insert_into_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + new_key: &[u8], + new_value: &[u8], ) -> Result<()> { - for (key, data_slices) in prefixes.drain() { - for data in data_slices { - if valid_lmdb_key(&key) { - sorter.insert(&key, data)?; - } + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; + match iter.next().transpose()? { + Some((key, old_val)) if new_key == key => { + let val = + merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) + .map_err(|_| { + // TODO just wrap this error? + crate::error::InternalError::IndexingMergingKeys { + process: "get-put-merge", + } + })?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(key, &val)? 
}; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; } } - Ok(()) } -/// Computes the current prefix based on the previous and the currently iterated value -/// i.e. w1, w2, prox. It also makes sure to follow the `max_prefix_length` setting. -/// -/// Uses the current prefixes values to insert the associated data i.e. RoaringBitmap, -/// into the sorter that will, later, be inserted in the LMDB database. -fn insert_current_prefix_data_in_sorter<'a>( - buffer: &mut Vec, - current_prefixes: &mut Option<&'a &'a [String]>, - prefixes_cache: &mut HashMap, Vec>>, - word_prefix_pair_proximity_docids_sorter: &mut grenad::Sorter, - prefix_fst_keys: &'a [&'a [std::string::String]], - max_prefix_length: usize, - w1: &str, - w2: &str, - prox: u8, - data: &[u8], +// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, +// but it uses `append` if the database is empty, and it assumes that the values in the +// writer don't conflict with values in the database. +pub fn write_into_lmdb_database_without_merging( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + writer: grenad::Writer, ) -> Result<()> { - *current_prefixes = match current_prefixes.take() { - Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter(prefixes_cache, word_prefix_pair_proximity_docids_sorter)?; - prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) + let file = writer.into_inner()?; + let reader = grenad::Reader::new(BufReader::new(file))?; + if database.is_empty(wtxn)? { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; } - }; + } else { + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? 
{ + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } + } + Ok(()) +} - if let Some(prefixes) = current_prefixes { - buffer.clear(); - buffer.extend_from_slice(w1.as_bytes()); - buffer.push(0); - for prefix in prefixes.iter() { - if prefix.len() <= max_prefix_length && w2.starts_with(prefix) { - buffer.truncate(w1.len() + 1); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.push(prox); +struct Allocations { + byte_vectors: Vec>, + mergeable_data_vectors: Vec>>, +} +impl Default for Allocations { + fn default() -> Self { + Self { + byte_vectors: Vec::with_capacity(65_536), + mergeable_data_vectors: Vec::with_capacity(4096), + } + } +} +impl Allocations { + fn take_byte_vector(&mut self) -> Vec { + self.byte_vectors.pop().unwrap_or_else(|| Vec::with_capacity(16)) + } + fn take_mergeable_data_vector(&mut self) -> Vec> { + self.mergeable_data_vectors.pop().unwrap_or_else(|| Vec::with_capacity(8)) + } - match prefixes_cache.get_mut(buffer.as_slice()) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); - } + fn reclaim_byte_vector(&mut self, mut data: Vec) { + data.clear(); + self.byte_vectors.push(data); + } + fn reclaim_mergeable_data_vector(&mut self, mut data: Vec>) { + data.clear(); + self.mergeable_data_vectors.push(data); + } +} + +#[derive(Default, Debug)] +struct PrefixTrieNode { + children: Vec<(PrefixTrieNode, u8)>, + is_end_node: bool, +} + +#[derive(Debug)] +struct PrefixTrieNodeSearchStart(usize); + +impl PrefixTrieNode { + fn is_empty(&self) -> bool { + self.children.is_empty() + } + + /// Returns false if the trie does not contain a prefix of the given word. + /// Returns true if the trie *may* contain a prefix of the given word. + /// + /// Moves the search start to the first node equal to the first letter of the word, + /// or to 0 otherwise. 
+ fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { + let byte = word[0]; + if self.children[search_start.0].1 == byte { + return true; + } else { + match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { + Ok(position) => { + search_start.0 += position; + true + } + Err(_) => { + search_start.0 = 0; + false } } } } - Ok(()) + fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { + let mut node = PrefixTrieNode::default(); + for prefix in prefixes { + node.insert_sorted_prefix(prefix.as_bytes().into_iter()); + } + node + } + fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter) { + if let Some(&c) = prefix.next() { + if let Some((node, byte)) = self.children.last_mut() { + if *byte == c { + node.insert_sorted_prefix(prefix); + return; + } + } + let mut new_node = PrefixTrieNode::default(); + new_node.insert_sorted_prefix(prefix); + self.children.push((new_node, c)); + } else { + self.is_end_node = true; + } + } + fn for_each_prefix_of( + &self, + word: &[u8], + buffer: &mut Vec, + search_start: &PrefixTrieNodeSearchStart, + mut do_fn: impl FnMut(&mut Vec), + ) { + let first_byte = word[0]; + let mut cur_node = self; + buffer.push(first_byte); + if let Some((child_node, c)) = + cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte) + { + if *c == first_byte { + cur_node = child_node; + if cur_node.is_end_node { + do_fn(buffer); + } + for &byte in &word[1..] 
{ + buffer.push(byte); + if let Some((child_node, c)) = + cur_node.children.iter().find(|(_, c)| *c >= byte) + { + if *c == byte { + cur_node = child_node; + if cur_node.is_end_node { + do_fn(buffer); + } + } else { + break; + } + } else { + break; + } + } + } + } + } + // fn print(&self, buffer: &mut String, ident: usize) { + // let mut spaces = String::new(); + // for _ in 0..ident { + // spaces.push(' ') + // } + // for (child, c) in &self.children { + // buffer.push(char::from_u32(*c as u32).unwrap()); + // println!("{spaces}{buffer}:"); + // child.print(buffer, ident + 4); + // buffer.pop(); + // } + // } } - #[cfg(test)] mod tests { + use roaring::RoaringBitmap; + + use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; + + use super::*; + use std::io::Cursor; use crate::db_snap; @@ -328,4 +758,181 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids, "update"); } + + fn check_prefixes( + trie: &PrefixTrieNode, + search_start: &PrefixTrieNodeSearchStart, + word: &str, + expected_prefixes: &[&str], + ) { + let mut actual_prefixes = vec![]; + trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), &search_start, |x| { + let s = String::from_utf8(x.to_owned()).unwrap(); + actual_prefixes.push(s); + }); + assert_eq!(actual_prefixes, expected_prefixes); + } + + #[test] + fn test_trie() { + let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ + "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au", + "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c", + "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com", + "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des", + "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f", + "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi", + "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", 
"ho", "hu", "i", + "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka", + "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar", + "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni", + "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi", + "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res", + "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si", + "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t", + "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", + "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", + ])); + + let mut search_start = PrefixTrieNodeSearchStart(0); + + let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); + assert!(!is_empty); + assert_eq!(search_start.0, 2); + + check_prefixes(&trie, &search_start, "affair", &["a"]); + check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]); + + let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start); + assert!(!is_empty); + assert_eq!(trie.children[search_start.0].1, b'u'); + + check_prefixes(&trie, &search_start, "unique", &["u", "un"]); + + // NOTE: this should fail, because the search start is already beyong 'a' + let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start); + assert!(!is_empty); + // search start is reset + assert_eq!(search_start.0, 0); + + let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ + "arb", "arbre", "cat", "catto", + ])); + check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]); + check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]); + } + + #[test] + fn test_execute_on_word_pairs_and_prefixes() { + let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ + "arb", 
"arbre", "cat", "catto", + ])); + + let mut serialised_bitmap123 = vec![]; + let mut bitmap123 = RoaringBitmap::new(); + bitmap123.insert(1); + bitmap123.insert(2); + bitmap123.insert(3); + CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123); + + let mut serialised_bitmap456 = vec![]; + let mut bitmap456 = RoaringBitmap::new(); + bitmap456.insert(4); + bitmap456.insert(5); + bitmap456.insert(6); + CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456); + + let mut serialised_bitmap789 = vec![]; + let mut bitmap789 = RoaringBitmap::new(); + bitmap789.insert(7); + bitmap789.insert(8); + bitmap789.insert(9); + CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789); + + let mut serialised_bitmap_ranges = vec![]; + let mut bitmap_ranges = RoaringBitmap::new(); + bitmap_ranges.insert_range(63_000..65_000); + bitmap_ranges.insert_range(123_000..128_000); + CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); + + let word_pairs = [ + // 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456) + (("healthy", "arbre", 2), &serialised_bitmap123), + // not inserted because 3 > max_proximity + (("healthy", "arbre", 3), &serialised_bitmap456), + // 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123) + (("healthy", "arbres", 1), &serialised_bitmap123), + // 1, 3: + (("healthy", "arbres", 2), &serialised_bitmap456), + // not be inserted because 3 > max_proximity + (("healthy", "arbres", 3), &serialised_bitmap789), + // not inserted because no prefixes for boat + (("healthy", "boat", 1), &serialised_bitmap123), + // not inserted because no prefixes for ca + (("healthy", "ca", 1), &serialised_bitmap123), + // 4: (healthy cat 1) with (bitmap456 + bitmap123) + (("healthy", "cats", 1), &serialised_bitmap456), + // 5: (healthy cat 2) with (bitmap789 + bitmap_ranges) + (("healthy", "cats", 2), &serialised_bitmap789), + // 4 + 6: (healthy catto 1) with (bitmap123) 
+ (("healthy", "cattos", 1), &serialised_bitmap123), + // 5 + 7: (healthy catto 2) with (bitmap_ranges) + (("healthy", "cattos", 2), &serialised_bitmap_ranges), + // 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges) + (("jittery", "cat", 1), &serialised_bitmap123), + // 8: + (("jittery", "cata", 1), &serialised_bitmap456), + // 8: + (("jittery", "catb", 1), &serialised_bitmap789), + // 8: + (("jittery", "catc", 1), &serialised_bitmap_ranges), + ]; + + let expected_result = [ + // first batch: + (("healthy", "arb", 1), bitmap123.clone()), + (("healthy", "arb", 2), &bitmap123 | &bitmap456), + (("healthy", "arbre", 1), bitmap123.clone()), + (("healthy", "arbre", 2), &bitmap123 | &bitmap456), + // second batch: + (("healthy", "cat", 1), &bitmap456 | &bitmap123), + (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), + (("healthy", "catto", 1), bitmap123.clone()), + (("healthy", "catto", 2), bitmap_ranges.clone()), + // third batch + (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), + ]; + + let mut result = vec![]; + + let mut allocations = Allocations::default(); + let mut iter = + IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { + ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) + }); + execute_on_word_pairs_and_prefixes( + &mut iter, + |iter| Ok(iter.next()), + &prefixes, + &mut allocations, + 2, + |k, v| { + let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); + let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); + result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); + Ok(()) + }, + ) + .unwrap(); + + for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { + let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x; + let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y; + + assert_eq!(actual_word1, expected_word1); + 
assert_eq!(actual_prefix, expected_prefix); + assert_eq!(actual_proximity, expected_proximity); + assert_eq!(actual_bitmap, expected_bitmap); + } + } } diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs deleted file mode 100644 index 6345dd210..000000000 --- a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs +++ /dev/null @@ -1,709 +0,0 @@ -use crate::update::index_documents::{ - create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, -}; -use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::BufReader; - -pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, - max_proximity: u8, - max_prefix_length: usize, -} - -impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { - WordPrefixPairProximityDocids { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - max_nb_chunks: None, - max_memory: None, - max_proximity: 4, - max_prefix_length: 2, - } - } - - /// Set the maximum proximity required to make a prefix be part of the words prefixes - /// database. If two words are too far from the threshold the associated documents will - /// not be part of the prefix database. - /// - /// Default value is 4. This value must be lower or equal than 7 and will be clamped - /// to this bound otherwise. 
- pub fn max_proximity(&mut self, value: u8) -> &mut Self { - self.max_proximity = value.max(7); - self - } - - /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words - /// prefixes database. If the prefix length is higher than the threshold, the associated documents - /// will not be part of the prefix database. - /// - /// Default value is 2. - pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value; - self - } - - #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute<'a>( - mut self, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &'a [String], - common_prefix_fst_words: &[&'a [String]], - del_prefix_fst_words: &HashSet>, - ) -> Result<()> { - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - // This is an optimisation, to reuse allocations between loop iterations - let mut allocations = Allocations::default(); - - // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length - let prefixes = PrefixTrieNode::from_sorted_prefixes( - common_prefix_fst_words - .into_iter() - .map(|s| s.into_iter()) - .flatten() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (word1, common_prefix, proximity) elements - // to insert in the DB - if !prefixes.is_empty() { - let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - // This is the core of the algorithm - execute_on_word_pairs_and_prefixes( - // the first two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.move_on_next()? 
{ - let (word1, word2, proximity) = UncheckedStrStrU8Codec::bytes_decode(key) - .ok_or(heed::Error::Decoding)?; - Ok(Some(((word1, word2, proximity), value))) - } else { - Ok(None) - } - }, - &prefixes, - &mut allocations, - self.max_proximity, - // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) - |key, value| { - insert_into_database( - &mut self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - - let prefixes = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words - .into_iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - if !prefixes.is_empty() { - let mut db_iter = self - .index - .word_pair_proximity_docids - .remap_key_type::() - .remap_data_type::() - .iter(self.wtxn)?; - - // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) - // element in an intermediary grenad - let mut writer = create_writer( - self.chunk_compression_type, - self.chunk_compression_level, - tempfile::tempfile()?, - ); - - execute_on_word_pairs_and_prefixes( - &mut db_iter, - |db_iter| db_iter.next().transpose().map_err(|e| e.into()), - &prefixes, - &mut allocations, - self.max_proximity, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - writer, - )?; - } - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. 
- if !del_prefix_fst_words.is_empty() { - let mut iter = self - .index - .word_prefix_pair_proximity_docids - .remap_data_type::() - .iter_mut(self.wtxn)?; - while let Some(((_, w2, _), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(w2.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; - } - } - } - - Ok(()) - } -} - -/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. -/// -/// Its main arguments are: -/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements -/// 2. a prefix trie -/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements -/// -/// For more information about the -fn execute_on_word_pairs_and_prefixes( - iter: &mut Iter, - mut next_word_pair_proximity: impl for<'a> FnMut( - &'a mut Iter, - ) -> Result< - Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, - >, - prefixes: &PrefixTrieNode, - allocations: &mut Allocations, - max_proximity: u8, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, -) -> Result<()> { - let mut batch = PrefixAndProximityBatch::default(); - let mut prev_word2_start = 0; - - let mut prefix_search_start = PrefixTrieNodeSearchStart(0); - let mut empty_prefixes = false; - - let mut prefix_buffer = allocations.take_byte_vector(); - let mut merge_buffer = allocations.take_byte_vector(); - - while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? 
{ - if proximity > max_proximity { - continue; - }; - let word2_start_different_than_prev = word2[0] != prev_word2_start; - if empty_prefixes && !word2_start_different_than_prev { - continue; - } - let word1_different_than_prev = word1 != batch.word1; - if word1_different_than_prev || word2_start_different_than_prev { - batch.flush(allocations, &mut merge_buffer, &mut insert)?; - if word1_different_than_prev { - prefix_search_start.0 = 0; - batch.word1.clear(); - batch.word1.extend_from_slice(word1); - } - if word2_start_different_than_prev { - // word2_start_different_than_prev == true - prev_word2_start = word2[0]; - } - empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); - } - - if !empty_prefixes { - prefixes.for_each_prefix_of( - word2, - &mut prefix_buffer, - &prefix_search_start, - |prefix_buffer| { - let mut value = allocations.take_byte_vector(); - value.extend_from_slice(&data); - let prefix_len = prefix_buffer.len(); - prefix_buffer.push(0); - prefix_buffer.push(proximity); - batch.insert(&prefix_buffer, value, allocations); - prefix_buffer.truncate(prefix_len); - }, - ); - prefix_buffer.clear(); - } - } - batch.flush(allocations, &mut merge_buffer, &mut insert)?; - Ok(()) -} -/** -A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). -The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. - -It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. - -The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content -can be inserted into the database in sorted order. 
When it is flushed, it calls a user-provided closure with the following arguments: -- key : (word1, prefix, proximity) as bytes -- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes -*/ -#[derive(Default)] -struct PrefixAndProximityBatch { - word1: Vec, - batch: Vec<(Vec, Vec>)>, -} - -impl PrefixAndProximityBatch { - /// Insert the new key and value into the batch - fn insert(&mut self, new_key: &[u8], new_value: Vec, allocations: &mut Allocations) { - match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { - Ok(position) => { - self.batch[position].1.push(Cow::Owned(new_value)); - } - Err(position) => { - let mut key = allocations.take_byte_vector(); - key.extend_from_slice(new_key); - let mut mergeable_data = allocations.take_mergeable_data_vector(); - mergeable_data.push(Cow::Owned(new_value)); - self.batch.insert(position, (key, mergeable_data)); - } - } - } - - /// Empties the batch, calling `insert` on each element. - /// - /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. - fn flush( - &mut self, - allocations: &mut Allocations, - merge_buffer: &mut Vec, - insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, - ) -> Result<()> { - let PrefixAndProximityBatch { word1, batch } = self; - if batch.is_empty() { - return Ok(()); - } - merge_buffer.clear(); - - let mut buffer = allocations.take_byte_vector(); - buffer.extend_from_slice(word1); - buffer.push(0); - - for (key, mergeable_data) in batch.drain(..) 
{ - buffer.truncate(word1.len() + 1); - buffer.extend_from_slice(key.as_slice()); - - let data = if mergeable_data.len() > 1 { - CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?; - merge_buffer.as_slice() - } else { - &mergeable_data[0] - }; - insert(buffer.as_slice(), data)?; - merge_buffer.clear(); - allocations.reclaim_byte_vector(key); - allocations.reclaim_mergeable_data_vector(mergeable_data); - } - - Ok(()) - } -} - -// This is adapted from `sorter_into_lmdb_database` -fn insert_into_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - new_key: &[u8], - new_value: &[u8], -) -> Result<()> { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; - match iter.next().transpose()? { - Some((key, old_val)) if new_key == key => { - let val = - merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) - .map_err(|_| { - // TODO just wrap this error? - crate::error::InternalError::IndexingMergingKeys { - process: "get-put-merge", - } - })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(key, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; - } - } - Ok(()) -} - -// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, -// but it uses `append` if the database is empty, and it assumes that the values in the -// writer don't conflict with values in the database. -pub fn write_into_lmdb_database_without_merging( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - writer: grenad::Writer, -) -> Result<()> { - let file = writer.into_inner()?; - let reader = grenad::Reader::new(BufReader::new(file))?; - if database.is_empty(wtxn)? { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? 
{ - // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; - } - } else { - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - Ok(()) -} - -struct Allocations { - byte_vectors: Vec>, - mergeable_data_vectors: Vec>>, -} -impl Default for Allocations { - fn default() -> Self { - Self { - byte_vectors: Vec::with_capacity(65_536), - mergeable_data_vectors: Vec::with_capacity(4096), - } - } -} -impl Allocations { - fn take_byte_vector(&mut self) -> Vec { - self.byte_vectors.pop().unwrap_or_else(|| Vec::with_capacity(16)) - } - fn take_mergeable_data_vector(&mut self) -> Vec> { - self.mergeable_data_vectors.pop().unwrap_or_else(|| Vec::with_capacity(8)) - } - - fn reclaim_byte_vector(&mut self, mut data: Vec) { - data.clear(); - self.byte_vectors.push(data); - } - fn reclaim_mergeable_data_vector(&mut self, mut data: Vec>) { - data.clear(); - self.mergeable_data_vectors.push(data); - } -} - -#[derive(Default, Debug)] -struct PrefixTrieNode { - children: Vec<(PrefixTrieNode, u8)>, - is_end_node: bool, -} - -#[derive(Debug)] -struct PrefixTrieNodeSearchStart(usize); - -impl PrefixTrieNode { - fn is_empty(&self) -> bool { - self.children.is_empty() - } - - /// Returns false if the trie does not contain a prefix of the given word. - /// Returns true if the trie *may* contain a prefix of the given word. - /// - /// Moves the search start to the first node equal to the first letter of the word, - /// or to 0 otherwise. 
- fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { - let byte = word[0]; - if self.children[search_start.0].1 == byte { - return true; - } else { - match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { - Ok(position) => { - search_start.0 += position; - true - } - Err(_) => { - search_start.0 = 0; - false - } - } - } - } - - fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { - let mut node = PrefixTrieNode::default(); - for prefix in prefixes { - node.insert_sorted_prefix(prefix.as_bytes().into_iter()); - } - node - } - fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter) { - if let Some(&c) = prefix.next() { - if let Some((node, byte)) = self.children.last_mut() { - if *byte == c { - node.insert_sorted_prefix(prefix); - return; - } - } - let mut new_node = PrefixTrieNode::default(); - new_node.insert_sorted_prefix(prefix); - self.children.push((new_node, c)); - } else { - self.is_end_node = true; - } - } - fn for_each_prefix_of( - &self, - word: &[u8], - buffer: &mut Vec, - search_start: &PrefixTrieNodeSearchStart, - mut do_fn: impl FnMut(&mut Vec), - ) { - let first_byte = word[0]; - let mut cur_node = self; - buffer.push(first_byte); - if let Some((child_node, c)) = - cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte) - { - if *c == first_byte { - cur_node = child_node; - if cur_node.is_end_node { - do_fn(buffer); - } - for &byte in &word[1..] 
{ - buffer.push(byte); - if let Some((child_node, c)) = - cur_node.children.iter().find(|(_, c)| *c >= byte) - { - if *c == byte { - cur_node = child_node; - if cur_node.is_end_node { - do_fn(buffer); - } - } else { - break; - } - } else { - break; - } - } - } - } - } - // fn print(&self, buffer: &mut String, ident: usize) { - // let mut spaces = String::new(); - // for _ in 0..ident { - // spaces.push(' ') - // } - // for (child, c) in &self.children { - // buffer.push(char::from_u32(*c as u32).unwrap()); - // println!("{spaces}{buffer}:"); - // child.print(buffer, ident + 4); - // buffer.pop(); - // } - // } -} -#[cfg(test)] -mod tests { - use roaring::RoaringBitmap; - - use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; - - use super::*; - - fn check_prefixes( - trie: &PrefixTrieNode, - search_start: &PrefixTrieNodeSearchStart, - word: &str, - expected_prefixes: &[&str], - ) { - let mut actual_prefixes = vec![]; - trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), &search_start, |x| { - let s = String::from_utf8(x.to_owned()).unwrap(); - actual_prefixes.push(s); - }); - assert_eq!(actual_prefixes, expected_prefixes); - } - - #[test] - fn test_trie() { - let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au", - "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c", - "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com", - "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des", - "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f", - "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi", - "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i", - "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka", - "ke", "ki", "ko", "l", "la", "le", 
"li", "lo", "lu", "m", "ma", "mal", "man", "mar", - "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni", - "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi", - "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res", - "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si", - "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t", - "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", - "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", - ])); - - let mut search_start = PrefixTrieNodeSearchStart(0); - - let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); - assert!(!is_empty); - assert_eq!(search_start.0, 2); - - check_prefixes(&trie, &search_start, "affair", &["a"]); - check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]); - - let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start); - assert!(!is_empty); - assert_eq!(trie.children[search_start.0].1, b'u'); - - check_prefixes(&trie, &search_start, "unique", &["u", "un"]); - - // NOTE: this should fail, because the search start is already beyong 'a' - let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start); - assert!(!is_empty); - // search start is reset - assert_eq!(search_start.0, 0); - - let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "arb", "arbre", "cat", "catto", - ])); - check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]); - check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]); - } - - #[test] - fn test_execute_on_word_pairs_and_prefixes() { - let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "arb", "arbre", "cat", "catto", - ])); - - let mut serialised_bitmap123 = vec![]; - let mut bitmap123 = RoaringBitmap::new(); - bitmap123.insert(1); - 
bitmap123.insert(2); - bitmap123.insert(3); - CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123); - - let mut serialised_bitmap456 = vec![]; - let mut bitmap456 = RoaringBitmap::new(); - bitmap456.insert(4); - bitmap456.insert(5); - bitmap456.insert(6); - CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456); - - let mut serialised_bitmap789 = vec![]; - let mut bitmap789 = RoaringBitmap::new(); - bitmap789.insert(7); - bitmap789.insert(8); - bitmap789.insert(9); - CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789); - - let mut serialised_bitmap_ranges = vec![]; - let mut bitmap_ranges = RoaringBitmap::new(); - bitmap_ranges.insert_range(63_000..65_000); - bitmap_ranges.insert_range(123_000..128_000); - CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); - - let word_pairs = [ - // 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456) - (("healthy", "arbre", 2), &serialised_bitmap123), - // not inserted because 3 > max_proximity - (("healthy", "arbre", 3), &serialised_bitmap456), - // 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123) - (("healthy", "arbres", 1), &serialised_bitmap123), - // 1, 3: - (("healthy", "arbres", 2), &serialised_bitmap456), - // not be inserted because 3 > max_proximity - (("healthy", "arbres", 3), &serialised_bitmap789), - // not inserted because no prefixes for boat - (("healthy", "boat", 1), &serialised_bitmap123), - // not inserted because no prefixes for ca - (("healthy", "ca", 1), &serialised_bitmap123), - // 4: (healthy cat 1) with (bitmap456 + bitmap123) - (("healthy", "cats", 1), &serialised_bitmap456), - // 5: (healthy cat 2) with (bitmap789 + bitmap_ranges) - (("healthy", "cats", 2), &serialised_bitmap789), - // 4 + 6: (healthy catto 1) with (bitmap123) - (("healthy", "cattos", 1), &serialised_bitmap123), - // 5 + 7: (healthy catto 2) with (bitmap_ranges) - (("healthy", "cattos", 2), 
&serialised_bitmap_ranges), - // 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges) - (("jittery", "cat", 1), &serialised_bitmap123), - // 8: - (("jittery", "cata", 1), &serialised_bitmap456), - // 8: - (("jittery", "catb", 1), &serialised_bitmap789), - // 8: - (("jittery", "catc", 1), &serialised_bitmap_ranges), - ]; - - let expected_result = [ - // first batch: - (("healthy", "arb", 1), bitmap123.clone()), - (("healthy", "arb", 2), &bitmap123 | &bitmap456), - (("healthy", "arbre", 1), bitmap123.clone()), - (("healthy", "arbre", 2), &bitmap123 | &bitmap456), - // second batch: - (("healthy", "cat", 1), &bitmap456 | &bitmap123), - (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), - (("healthy", "catto", 1), bitmap123.clone()), - (("healthy", "catto", 2), bitmap_ranges.clone()), - // third batch - (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), - ]; - - let mut result = vec![]; - - let mut allocations = Allocations::default(); - let mut iter = - IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { - ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) - }); - execute_on_word_pairs_and_prefixes( - &mut iter, - |iter| Ok(iter.next()), - &prefixes, - &mut allocations, - 2, - |k, v| { - let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); - let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); - result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); - Ok(()) - }, - ) - .unwrap(); - - for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { - let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x; - let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y; - - assert_eq!(actual_word1, expected_word1); - assert_eq!(actual_prefix, expected_prefix); - assert_eq!(actual_proximity, expected_proximity); - assert_eq!(actual_bitmap, expected_bitmap); - } - 
} -} diff --git a/milli/src/update/word_prefix_pair_proximity_docids/readme.md b/milli/src/update/word_prefix_pair_proximity_docids/readme.md deleted file mode 100644 index 0718fd79c..000000000 --- a/milli/src/update/word_prefix_pair_proximity_docids/readme.md +++ /dev/null @@ -1,144 +0,0 @@ -## What is WordPrefixPairProximityDocids? -The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. - -The prefixes present in this database are only those that correspond to many different words in the documents. - -## How is it created/updated? (simplified version) -To compute it, we have access to (mainly) two inputs: - -* a list of sorted prefixes, such as: -``` -c -ca -cat -d -do -dog -``` -Note that only prefixes which correspond to more than a certain number of different words from the database are included in this list. - -* a sorted list of word pairs and the distance between them (i.e. proximity), associated with a roaring bitmap, such as: -``` -good dog 3 -> docids1: [2, 5, 6] -good doggo 1 -> docids2: [8] -good dogma 1 -> docids3: [7, 19, 20] -good ghost 2 -> docids4: [1] -horror cathedral 4 -> docids5: [1, 2] -``` - -I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below: - -1. **Outer loop:** First, we iterate over each word pair and its proximity: -``` -word1 : good -word2 : dog -proximity: 3 -``` -2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. 
For example, at the end of the first inner loop, we may have: -``` -Outer loop 1: ------------------------------- -word1 : good -word2 : dog -proximity: 3 -docids : docids1 - -prefixes: [d, do, dog] - -batch: [ - (d, 3) -> [docids1] - (do, 3) -> [docids1] - (dog, 3) -> [docids1] -] -``` -3. For illustration purpose, let's run through a second iteration of the outer loop: -``` -Outer loop 2: ------------------------------- -word1 : good -word2 : doggo -proximity: 1 -docids : docids2 - -prefixes: [d, do, dog] - -batch: [ - (d, 1) -> [docids2] - (d, 3) -> [docids1] - (do, 1) -> [docids2] - (do, 3) -> [docids1] - (dog, 1) -> [docids2] - (dog, 3) -> [docids1] -] -``` -Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some of the elements inserted in the second iteration of the outer loop appear *before* elements from the first iteration. - -4. And a third: -``` -Outer loop 3: ------------------------------- -word1 : good -word2 : dogma -proximity: 1 -docids : docids3 - -prefixes: [d, do, dog] - -batch: [ - (d, 1) -> [docids2, docids3] - (d, 3) -> [docids1] - (do, 1) -> [docids2, docids3] - (do, 3) -> [docids1] - (dog, 1) -> [docids2, docids3] - (dog, 3) -> [docids1] -] -``` -Notice that there were some conflicts which were resolved by merging the conflicting values together. - -5. On the fourth iteration of the outer loop, we have: -``` -Outer loop 4: ------------------------------- -word1 : good -word2 : ghost -proximity: 2 -``` -Because `word2` begins with a different letter than the previous `word2`, we know that: -1. All the prefixes of `word2` are greater than the prefixes of the previous word2 -2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch. -Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. 
Flushing the batch should also be done whenever `word1` is different than the previous `word1`. - -6. **Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: -``` -Flushing Batch loop 1: ------------------------------- -word1 : good -word2 : d -proximity: 1 -docids : [docids2, docids3] -``` -We then merge the array of `docids` (of type `Vec>`) using `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a roaring bitmap of all the document ids where `word1` is followed by `prefix` at a distance of `proximity`. -Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` into the database. - -7. That's it! ... except... - -## How is it created/updated (continued) - -I lied a little bit about the input data. In reality, we get two sets of the inputs described above, which come from different places: - -* For the list of sorted prefixes, we have: - * `new_prefixes`, which are all the prefixes that were not present in the database before the insertion of the new documents - * `common_prefixes` which are the prefixes that are present both in the database and in the newly added documents - -* For the list of word pairs and proximities, we have: - * `new_word_pairs`, which is the list of word pairs and their proximities present in the newly added documents - * `word_pairs_db`, which is the list of word pairs from the database. **This list includes all elements in `new_word_pairs`** since `new_word_pairs` was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` function. - -To update the prefix database correctly, we call the algorithm described earlier first on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). Thus: - -1. For all the word pairs that were already present in the DB, we insert them again with the `new_prefixes`. 
Calling the algorithm on them with the `common_prefixes` would not result in any new data. -3. For all the new word pairs, we insert them twice: first with the `common_prefixes`, and then, because they are part of `word_pairs_db`, with the `new_prefixes`. - -Note, also, that since we read data from the database when iterating over `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-docids from the batch directly into the database (we would have a concurrent reader and writer). Therefore, when calling the algorithm on (`new_prefixes`, `word_pairs_db`), we insert the computed ((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. - - From 474500362c76e84dbf17fdbcc4c828a4762763e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 12:11:09 +0200 Subject: [PATCH 07/17] Update wpppd snapshots New snapshot (yes, it's wrong as well, it will get fixed later): --- source: milli/src/update/word_prefix_pair_proximity_docids.rs --- 5 a 1 [101, ] 5 a 2 [101, ] 5 am 1 [101, ] 5 b 4 [101, ] 5 be 4 [101, ] am a 3 [101, ] amazing a 1 [100, ] amazing a 2 [100, ] amazing a 3 [100, ] amazing an 1 [100, ] amazing an 2 [100, ] amazing b 2 [100, ] amazing be 2 [100, ] an a 1 [100, ] an a 2 [100, 202, ] an am 1 [100, ] an b 3 [100, ] an be 3 [100, ] and a 2 [100, ] and a 3 [100, ] and a 4 [100, ] and b 1 [100, ] and be 1 [100, ] d\0 0 [100, 202, ] an an 2 [100, ] and am 2 [100, ] and an 3 [100, ] at a 2 [100, 101, ] at a 3 [100, ] at am 2 [100, 101, ] at an 1 [100, 202, ] at an 3 [100, ] at b 3 [101, ] at b 4 [100, ] at be 3 [101, ] at be 4 [100, ] beautiful a 2 [100, ] beautiful a 3 [100, ] beautiful a 4 [100, ] beautiful am 3 [100, ] beautiful an 2 [100, ] beautiful an 4 [100, ] bell a 2 [101, ] bell a 4 [101, ] bell am 4 [101, ] extraordinary a 2 [202, ] extraordinary a 3 [202, 
] extraordinary an 2 [202, ] house a 4 [100, 202, ] house a 4 [100, ] house am 4 [100, ] house an 3 [100, 202, ] house b 2 [100, ] house be 2 [100, ] rings a 1 [101, ] rings a 3 [101, ] rings am 3 [101, ] rings b 2 [101, ] rings be 2 [101, ] the a 3 [101, ] the b 1 [101, ] the be 1 [101, ] --- ...ord_prefix_pair_proximity_docids.hash.snap | 4 ++ .../word_prefix_pair_proximity_docids.snap | 56 ------------------- 2 files changed, 4 insertions(+), 56 deletions(-) create mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap delete mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..574cfa72f --- /dev/null +++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/word_prefix_pair_proximity_docids.rs +--- +53e42e513b83885139e4f6d817888561 diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index aabd9ddec..000000000 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,56 +0,0 @@ ---- -source: milli/src/update/word_prefix_pair_proximity_docids.rs ---- -5 a 1 [101, ] -5 a 2 [101, ] -5 am 1 [101, ] -5 b 4 [101, ] -5 be 4 [101, ] -am a 3 [101, ] -amazing a 1 [100, ] 
-amazing a 2 [100, ] -amazing a 3 [100, ] -amazing b 2 [100, ] -amazing be 2 [100, ] -an a 1 [100, ] -an a 2 [100, 202, ] -an am 1 [100, ] -an b 3 [100, ] -an be 3 [100, ] -and a 2 [100, ] -and a 3 [100, ] -and a 4 [100, ] -and am 2 [100, ] -and b 1 [100, ] -and be 1 [100, ] -at a 1 [100, 202, ] -at a 2 [100, 101, ] -at a 3 [100, ] -at am 2 [100, 101, ] -at b 3 [101, ] -at b 4 [100, ] -at be 3 [101, ] -at be 4 [100, ] -beautiful a 2 [100, ] -beautiful a 3 [100, ] -beautiful a 4 [100, ] -beautiful am 3 [100, ] -bell a 2 [101, ] -bell a 4 [101, ] -bell am 4 [101, ] -extraordinary a 2 [202, ] -extraordinary a 3 [202, ] -house a 3 [100, 202, ] -house a 4 [100, 202, ] -house am 4 [100, ] -house b 2 [100, ] -house be 2 [100, ] -rings a 1 [101, ] -rings a 3 [101, ] -rings am 3 [101, ] -rings b 2 [101, ] -rings be 2 [101, ] -the a 3 [101, ] -the b 1 [101, ] -the be 1 [101, ] - From 06f3fd8c6df0232710ef7e19331499e83193aa58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 15:57:44 +0200 Subject: [PATCH 08/17] Add more comments to WordPrefixPairProximityDocids::execute --- .../update/word_prefix_pair_proximity_docids.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 4a3a7d13e..d08646b27 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -351,23 +351,34 @@ fn execute_on_word_pairs_and_prefixes( let mut batch = PrefixAndProximityBatch::default(); let mut prev_word2_start = 0; + // Optimisation: the index at the root of the prefix trie where to search for let mut prefix_search_start = PrefixTrieNodeSearchStart(0); + + // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter let mut empty_prefixes = false; let mut prefix_buffer = allocations.take_byte_vector(); let mut merge_buffer 
= allocations.take_byte_vector(); while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { + // skip this iteration if the proximity is over the threshold if proximity > max_proximity { continue; }; let word2_start_different_than_prev = word2[0] != prev_word2_start; + // if there were no potential prefixes for the previous word2 based on its first letter, + // and if the current word2 starts with the same letter, then there is also no potential + // prefixes for the current word2, and we can skip to the next iteration if empty_prefixes && !word2_start_different_than_prev { continue; } + + // if word1 is different than the previous word1 OR if the start of word2 is different + // than the previous start of word2, then we'll need to flush the batch let word1_different_than_prev = word1 != batch.word1; if word1_different_than_prev || word2_start_different_than_prev { batch.flush(allocations, &mut merge_buffer, &mut insert)?; + // don't forget to reset the value of batch.word1 and prev_word2_start if word1_different_than_prev { prefix_search_start.0 = 0; batch.word1.clear(); @@ -377,10 +388,12 @@ fn execute_on_word_pairs_and_prefixes( // word2_start_different_than_prev == true prev_word2_start = word2[0]; } + // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2 empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); } if !empty_prefixes { + // All conditions are satisfied, we can now insert each new prefix of word2 into the batch prefixes.for_each_prefix_of( word2, &mut prefix_buffer, @@ -618,6 +631,10 @@ impl PrefixTrieNode { self.is_end_node = true; } } + + /// Call the given closure on each prefix of the word contained in the prefix trie. + /// + /// The search starts from the given `search_start`. 
fn for_each_prefix_of( &self, word: &[u8], From 34c991ea02bd3a5f8151de605f4e09849975c889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 07:03:30 +0200 Subject: [PATCH 09/17] Add newlines in documentation of word_prefix_pair_proximity_docids --- .../word_prefix_pair_proximity_docids.rs | 95 ++++++++++++++----- 1 file changed, 71 insertions(+), 24 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index d08646b27..0426edef9 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,8 +1,12 @@ /*! ## What is WordPrefixPairProximityDocids? -The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. +The word-prefix-pair-proximity-docids database is a database whose keys are of +the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of +the documents which contain `word` followed by another word starting with +`prefix` at a distance of `proximity`. -The prefixes present in this database are only those that correspond to many different words in the documents. +The prefixes present in this database are only those that correspond to many +different words in the documents. ## How is it created/updated? (simplified version) To compute it, we have access to (mainly) two inputs: @@ -16,9 +20,11 @@ d do dog ``` -Note that only prefixes which correspond to more than a certain number of different words from the database are included in this list. +Note that only prefixes which correspond to more than a certain number of +different words from the database are included in this list. -* a sorted list of word pairs and the distance between them (i.e. 
proximity), associated with a roaring bitmap, such as: +* a sorted list of word pairs and the distance between them (i.e. proximity), +* associated with a roaring bitmap, such as: ``` good dog 3 -> docids1: [2, 5, 6] good doggo 1 -> docids2: [8] @@ -27,7 +33,8 @@ good ghost 2 -> docids4: [1] horror cathedral 4 -> docids5: [1, 2] ``` -I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below: +I illustrate a simplified version of the algorithm to create the word-prefix +pair-proximity database below: 1. **Outer loop:** First, we iterate over each word pair and its proximity: ``` @@ -35,7 +42,10 @@ word1 : good word2 : dog proximity: 3 ``` -2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: +2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are +in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) +and the value (`docids`) to a sorted map which we call the “batch”. For example, +at the end of the first inner loop, we may have: ``` Outer loop 1: ------------------------------ @@ -72,7 +82,9 @@ batch: [ (dog, 3) -> [docids1] ] ``` -Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some of the elements inserted in the second iteration of the outer loop appear *before* elements from the first iteration. +Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some +of the elements inserted in the second iteration of the outer loop appear +*before* elements from the first iteration. 4. And a third: ``` @@ -94,7 +106,8 @@ batch: [ (dog, 3) -> [docids1] ] ``` -Notice that there were some conflicts which were resolved by merging the conflicting values together. 
+Notice that there were some conflicts which were resolved by merging the +conflicting values together. 5. On the fourth iteration of the outer loop, we have: ``` @@ -104,12 +117,20 @@ word1 : good word2 : ghost proximity: 2 ``` -Because `word2` begins with a different letter than the previous `word2`, we know that: -1. All the prefixes of `word2` are greater than the prefixes of the previous word2 -2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch. -Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. Flushing the batch should also be done whenever `word1` is different than the previous `word1`. +Because `word2` begins with a different letter than the previous `word2`, +we know that: -6. **Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: +1. All the prefixes of `word2` are greater than the prefixes of the previous word2 +2. And therefore, every instance of (`word2`, `prefix`) will be greater than +any element in the batch. + +Therefore, we know that we can insert every element from the batch into the +database before proceeding any further. This operation is called +“flushing the batch”. Flushing the batch should also be done whenever `word1` +is different than the previous `word1`. + +6. **Flushing the batch:** to flush the batch, we look at the `word1` and +iterate over the elements of the batch in sorted order: ``` Flushing Batch loop 1: ------------------------------ @@ -118,29 +139,55 @@ word2 : d proximity: 1 docids : [docids2, docids3] ``` -We then merge the array of `docids` (of type `Vec>`) using `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a roaring bitmap of all the document ids where `word1` is followed by `prefix` at a distance of `proximity`. 
-Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` into the database. +We then merge the array of `docids` (of type `Vec>`) using +`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a +roaring bitmap of all the document ids where `word1` is followed by `prefix` +at a distance of `proximity`. +Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` +into the database. 7. That's it! ... except... ## How is it created/updated (continued) -I lied a little bit about the input data. In reality, we get two sets of the inputs described above, which come from different places: +I lied a little bit about the input data. In reality, we get two sets of the +inputs described above, which come from different places: * For the list of sorted prefixes, we have: - * `new_prefixes`, which are all the prefixes that were not present in the database before the insertion of the new documents - * `common_prefixes` which are the prefixes that are present both in the database and in the newly added documents + 1. `new_prefixes`, which are all the prefixes that were not present in the + database before the insertion of the new documents + + 2. `common_prefixes` which are the prefixes that are present both in the + database and in the newly added documents * For the list of word pairs and proximities, we have: - * `new_word_pairs`, which is the list of word pairs and their proximities present in the newly added documents - * `word_pairs_db`, which is the list of word pairs from the database. **This list includes all elements in `new_word_pairs`** since `new_word_pairs` was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` function. + 1. 
`new_word_pairs`, which is the list of word pairs and their proximities + present in the newly added documents -To update the prefix database correctly, we call the algorithm described earlier first on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). Thus: + 2. `word_pairs_db`, which is the list of word pairs from the database. + **This list includes all elements in `new_word_pairs`** since `new_word_pairs` + was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` + function. -1. For all the word pairs that were already present in the DB, we insert them again with the `new_prefixes`. Calling the algorithm on them with the `common_prefixes` would not result in any new data. -3. For all the new word pairs, we insert them twice: first with the `common_prefixes`, and then, because they are part of `word_pairs_db`, with the `new_prefixes`. +To update the prefix database correctly, we call the algorithm described earlier first +on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). +Thus: -Note, also, that since we read data from the database when iterating over `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-docids from the batch directly into the database (we would have a concurrent reader and writer). Therefore, when calling the algorithm on (`new_prefixes`, `word_pairs_db`), we insert the computed ((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. +1. For all the word pairs that were already present in the DB, we insert them +again with the `new_prefixes`. Calling the algorithm on them with the +`common_prefixes` would not result in any new data. + +2.
For all the new word pairs, we insert them twice: first with the `common_prefixes`, +and then, because they are part of `word_pairs_db`, with the `new_prefixes`. + +Note, also, that since we read data from the database when iterating over +`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-docids +from the batch directly into the database (we would have a concurrent +reader and writer). Therefore, when calling the algorithm on +(`new_prefixes`, `word_pairs_db`), we insert the computed +((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad +Writer instead of the DB. At the end of the outer loop, we finally read from +the grenad and insert its elements in the database. From f6f8f543e105dbbc865f3a269c0ab12a3d8e7c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 07:08:36 +0200 Subject: [PATCH 10/17] Run cargo fmt --- milli/src/heed_codec/mod.rs | 3 +-- .../word_prefix_pair_proximity_docids.rs | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 02235f26d..f3691b7d8 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -15,5 +15,4 @@ pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; pub use self::str_beu32_codec::StrBEU32Codec; -pub use self::str_str_u8_codec::StrStrU8Codec; -pub use self::str_str_u8_codec::UncheckedStrStrU8Codec; +pub use self::str_str_u8_codec::{StrStrU8Codec, UncheckedStrStrU8Codec}; diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 0426edef9..07908efb5 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -192,17 +192,19 @@
*/ -use crate::update::index_documents::{ - create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, -}; -use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; +use std::borrow::Cow; +use std::collections::HashSet; +use std::io::BufReader; + use grenad::CompressionType; use heed::types::ByteSlice; use heed::BytesDecode; use log::debug; -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::BufReader; + +use crate::update::index_documents::{ + create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, +}; +use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -737,9 +739,8 @@ impl PrefixTrieNode { mod tests { use roaring::RoaringBitmap; - use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; - use super::*; + use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; use std::io::Cursor; From 730911143376a367dacdc59cc6db004c812c4fff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 08:52:01 +0200 Subject: [PATCH 11/17] Don't run block code in doc tests of word_pair_proximity_docids --- .../update/word_prefix_pair_proximity_docids.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 07908efb5..90430c0dd 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -12,7 +12,7 @@ different words in the documents. To compute it, we have access to (mainly) two inputs: * a list of sorted prefixes, such as: -``` +```text c ca cat @@ -25,7 +25,7 @@ different words from the database are included in this list. * a sorted list of word pairs and the distance between them (i.e. 
proximity), * associated with a roaring bitmap, such as: -``` +```text good dog 3 -> docids1: [2, 5, 6] good doggo 1 -> docids2: [8] good dogma 1 -> docids3: [7, 19, 20] @@ -37,7 +37,7 @@ I illustrate a simplified version of the algorithm to create the word-prefix pair-proximity database below: 1. **Outer loop:** First, we iterate over each word pair and its proximity: -``` +```text word1 : good word2 : dog proximity: 3 @@ -46,7 +46,7 @@ proximity: 3 in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: -``` +```text Outer loop 1: ------------------------------ word1 : good @@ -63,7 +63,7 @@ batch: [ ] ``` 3. For illustration purpose, let's run through a second iteration of the outer loop: -``` +```text Outer loop 2: ------------------------------ word1 : good @@ -87,7 +87,7 @@ of the elements inserted in the second iteration of the outer loop appear *before* elements from the first iteration. 4. And a third: -``` +```text Outer loop 3: ------------------------------ word1 : good @@ -110,7 +110,7 @@ Notice that there were some conflicts which were resolved by merging the conflicting values together. 5. On the fourth iteration of the outer loop, we have: -``` +```text Outer loop 4: ------------------------------ word1 : good @@ -131,7 +131,7 @@ is different than the previous `word1`. 6. 
**Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: -``` +```text Flushing Batch loop 1: ------------------------------ word1 : good From ef75a77464c63f3723e4c7513c5ca53853a5e9cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 12:04:48 +0200 Subject: [PATCH 12/17] Fix undefined behaviour caused by reusing key from the database New full snapshot: --- source: milli/src/update/word_prefix_pair_proximity_docids.rs --- 5 a 1 [101, ] 5 a 2 [101, ] 5 am 1 [101, ] 5 b 4 [101, ] 5 be 4 [101, ] am a 3 [101, ] amazing a 1 [100, ] amazing a 2 [100, ] amazing a 3 [100, ] amazing an 1 [100, ] amazing an 2 [100, ] amazing b 2 [100, ] amazing be 2 [100, ] an a 1 [100, ] an a 2 [100, 202, ] an am 1 [100, ] an an 2 [100, ] an b 3 [100, ] an be 3 [100, ] and a 2 [100, ] and a 3 [100, ] and a 4 [100, ] and am 2 [100, ] and an 3 [100, ] and b 1 [100, ] and be 1 [100, ] at a 1 [100, 202, ] at a 2 [100, 101, ] at a 3 [100, ] at am 2 [100, 101, ] at an 1 [100, 202, ] at an 3 [100, ] at b 3 [101, ] at b 4 [100, ] at be 3 [101, ] at be 4 [100, ] beautiful a 2 [100, ] beautiful a 3 [100, ] beautiful a 4 [100, ] beautiful am 3 [100, ] beautiful an 2 [100, ] beautiful an 4 [100, ] bell a 2 [101, ] bell a 4 [101, ] bell am 4 [101, ] extraordinary a 2 [202, ] extraordinary a 3 [202, ] extraordinary an 2 [202, ] house a 3 [100, 202, ] house a 4 [100, 202, ] house am 4 [100, ] house an 3 [100, 202, ] house b 2 [100, ] house be 2 [100, ] rings a 1 [101, ] rings a 3 [101, ] rings am 3 [101, ] rings b 2 [101, ] rings be 2 [101, ] the a 3 [101, ] the b 1 [101, ] the be 1 [101, ] --- .../update/word_prefix_pair_proximity_docids.hash.snap | 2 +- milli/src/update/word_prefix_pair_proximity_docids.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap index 574cfa72f..a39ee07b5 100644 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap +++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/word_prefix_pair_proximity_docids.rs --- -53e42e513b83885139e4f6d817888561 +5ed4bf83317b10962a55ade353427bdd diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 90430c0dd..bcd940410 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -554,8 +554,8 @@ fn insert_into_database( process: "get-put-merge", } })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(key, &val)? }; + // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour + unsafe { iter.put_current(new_key, &val)? }; } _ => { drop(iter); @@ -579,7 +579,7 @@ pub fn write_into_lmdb_database_without_merging( let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; let mut cursor = reader.into_cursor()?; while let Some((k, v)) = cursor.move_on_next()? { - // safety: we don't keep references from inside the LMDB database. + // safety: the key comes from the grenad reader, not the database unsafe { out_iter.append(k, v)? 
}; } } else { From 1bc4788e5998960ce901d092cd5f2b69043f0b0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 12:47:07 +0200 Subject: [PATCH 13/17] Remove cached Allocations struct from wpppd indexing --- .../word_prefix_pair_proximity_docids.rs | 63 +++---------------- 1 file changed, 8 insertions(+), 55 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index bcd940410..e8d63acbb 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -265,9 +265,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - // This is an optimisation, to reuse allocations between loop iterations - let mut allocations = Allocations::default(); - // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length let prefixes = PrefixTrieNode::from_sorted_prefixes( common_prefix_fst_words @@ -297,7 +294,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } }, &prefixes, - &mut allocations, self.max_proximity, // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) |key, value| { @@ -340,7 +336,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &mut db_iter, |db_iter| db_iter.next().transpose().map_err(|e| e.into()), &prefixes, - &mut allocations, self.max_proximity, |key, value| writer.insert(key, value).map_err(|e| e.into()), )?; @@ -393,7 +388,6 @@ fn execute_on_word_pairs_and_prefixes( Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, >, prefixes: &PrefixTrieNode, - allocations: &mut Allocations, max_proximity: u8, mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, ) -> Result<()> { @@ -406,8 +400,8 @@ fn execute_on_word_pairs_and_prefixes( // 
Optimisation: true if there are no potential prefixes for the current word2 based on its first letter let mut empty_prefixes = false; - let mut prefix_buffer = allocations.take_byte_vector(); - let mut merge_buffer = allocations.take_byte_vector(); + let mut prefix_buffer = Vec::with_capacity(8); + let mut merge_buffer = Vec::with_capacity(65_536); while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { // skip this iteration if the proximity is over the threshold @@ -426,7 +420,7 @@ fn execute_on_word_pairs_and_prefixes( // than the previous start of word2, then we'll need to flush the batch let word1_different_than_prev = word1 != batch.word1; if word1_different_than_prev || word2_start_different_than_prev { - batch.flush(allocations, &mut merge_buffer, &mut insert)?; + batch.flush(&mut merge_buffer, &mut insert)?; // don't forget to reset the value of batch.word1 and prev_word2_start if word1_different_than_prev { prefix_search_start.0 = 0; @@ -448,19 +442,17 @@ fn execute_on_word_pairs_and_prefixes( &mut prefix_buffer, &prefix_search_start, |prefix_buffer| { - let mut value = allocations.take_byte_vector(); - value.extend_from_slice(&data); let prefix_len = prefix_buffer.len(); prefix_buffer.push(0); prefix_buffer.push(proximity); - batch.insert(&prefix_buffer, value, allocations); + batch.insert(&prefix_buffer, data.to_vec()); prefix_buffer.truncate(prefix_len); }, ); prefix_buffer.clear(); } } - batch.flush(allocations, &mut merge_buffer, &mut insert)?; + batch.flush(&mut merge_buffer, &mut insert)?; Ok(()) } /** @@ -482,17 +474,13 @@ struct PrefixAndProximityBatch { impl PrefixAndProximityBatch { /// Insert the new key and value into the batch - fn insert(&mut self, new_key: &[u8], new_value: Vec, allocations: &mut Allocations) { + fn insert(&mut self, new_key: &[u8], new_value: Vec) { match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { Ok(position) => { self.batch[position].1.push(Cow::Owned(new_value)); } 
Err(position) => { - let mut key = allocations.take_byte_vector(); - key.extend_from_slice(new_key); - let mut mergeable_data = allocations.take_mergeable_data_vector(); - mergeable_data.push(Cow::Owned(new_value)); - self.batch.insert(position, (key, mergeable_data)); + self.batch.insert(position, (new_key.to_vec(), vec![Cow::Owned(new_value)])); } } } @@ -502,7 +490,6 @@ impl PrefixAndProximityBatch { /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. fn flush( &mut self, - allocations: &mut Allocations, merge_buffer: &mut Vec, insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, ) -> Result<()> { @@ -512,7 +499,7 @@ impl PrefixAndProximityBatch { } merge_buffer.clear(); - let mut buffer = allocations.take_byte_vector(); + let mut buffer = Vec::with_capacity(word1.len() + 1 + 6 + 1); buffer.extend_from_slice(word1); buffer.push(0); @@ -528,8 +515,6 @@ impl PrefixAndProximityBatch { }; insert(buffer.as_slice(), data)?; merge_buffer.clear(); - allocations.reclaim_byte_vector(key); - allocations.reclaim_mergeable_data_vector(mergeable_data); } Ok(()) @@ -591,36 +576,6 @@ pub fn write_into_lmdb_database_without_merging( Ok(()) } -struct Allocations { - byte_vectors: Vec>, - mergeable_data_vectors: Vec>>, -} -impl Default for Allocations { - fn default() -> Self { - Self { - byte_vectors: Vec::with_capacity(65_536), - mergeable_data_vectors: Vec::with_capacity(4096), - } - } -} -impl Allocations { - fn take_byte_vector(&mut self) -> Vec { - self.byte_vectors.pop().unwrap_or_else(|| Vec::with_capacity(16)) - } - fn take_mergeable_data_vector(&mut self) -> Vec> { - self.mergeable_data_vectors.pop().unwrap_or_else(|| Vec::with_capacity(8)) - } - - fn reclaim_byte_vector(&mut self, mut data: Vec) { - data.clear(); - self.byte_vectors.push(data); - } - fn reclaim_mergeable_data_vector(&mut self, mut data: Vec>) { - data.clear(); - self.mergeable_data_vectors.push(data); - } -} - 
#[derive(Default, Debug)] struct PrefixTrieNode { children: Vec<(PrefixTrieNode, u8)>, @@ -970,7 +925,6 @@ mod tests { let mut result = vec![]; - let mut allocations = Allocations::default(); let mut iter = IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) @@ -979,7 +933,6 @@ &mut iter, |iter| Ok(iter.next()), &prefixes, - &mut allocations, 2, |k, v| { let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); From 405555b4015635cfb57795108ff18a8d534101d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 12:21:05 +0200 Subject: [PATCH 14/17] Add some documentation to PrefixTrieNode --- .../word_prefix_pair_proximity_docids.rs | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index e8d63acbb..367fdc7ab 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -576,6 +576,28 @@ pub fn write_into_lmdb_database_without_merging( Ok(()) } +/** A prefix trie. Used to iterate quickly over the prefixes of a word that are +within a set. + +## Structure +The trie is made of nodes composed of: +1. a byte character (e.g. 'a') +2. whether the node is an end node or not +3. a list of children nodes, sorted by their byte character + +For example, the trie that stores the strings `[ac, ae, ar, cei, cel, ch, r, rel, ri]` +is drawn below. Nodes with a double border are "end nodes".
+ +┌──────────────────────┐ ┌──────────────────────┐ ╔══════════════════════╗ +│ a │ │ c │ ║ r ║ +└──────────────────────┘ └──────────────────────┘ ╚══════════════════════╝ +╔══════╗╔══════╗╔══════╗ ┌─────────┐ ╔═════════╗ ┌─────────┐ ╔══════════╗ +║ c ║║ e ║║ r ║ │ e │ ║ h ║ │ e │ ║ i ║ +╚══════╝╚══════╝╚══════╝ └─────────┘ ╚═════════╝ └─────────┘ ╚══════════╝ + ╔═══╗ ╔═══╗ ╔═══╗ + ║ i ║ ║ l ║ ║ l ║ + ╚═══╝ ╚═══╝ ╚═══╝ +*/ #[derive(Default, Debug)] struct PrefixTrieNode { children: Vec<(PrefixTrieNode, u8)>, From 4f9edf13d7c6ec98fcda2be95be777ecae36a3f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 13:11:38 +0200 Subject: [PATCH 15/17] Remove commented-out function --- .../src/update/word_prefix_pair_proximity_docids.rs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 367fdc7ab..4e25e0c73 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -699,18 +699,6 @@ impl PrefixTrieNode { } } } - // fn print(&self, buffer: &mut String, ident: usize) { - // let mut spaces = String::new(); - // for _ in 0..ident { - // spaces.push(' ') - // } - // for (child, c) in &self.children { - // buffer.push(char::from_u32(*c as u32).unwrap()); - // println!("{spaces}{buffer}:"); - // child.print(buffer, ident + 4); - // buffer.pop(); - // } - // } } #[cfg(test)] mod tests { From 78d9f0622df253b5e90b400e710a83ba2fddb789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 12:21:00 +0200 Subject: [PATCH 16/17] cargo fmt --- milli/src/update/word_prefix_pair_proximity_docids.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 4e25e0c73..cf5e19a5c 100644 --- 
a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -702,16 +702,14 @@ impl PrefixTrieNode { } #[cfg(test)] mod tests { + use std::io::Cursor; + use roaring::RoaringBitmap; use super::*; - use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; - - use std::io::Cursor; - - use crate::db_snap; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; + use crate::{db_snap, CboRoaringBitmapCodec, StrStrU8Codec}; fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { let mut documents = Vec::new(); From 93252769af3096971fdef83c6ab02c122e7beb41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 12:41:22 +0200 Subject: [PATCH 17/17] Apply review suggestions --- milli/src/update/word_prefix_pair_proximity_docids.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index cf5e19a5c..724858e4f 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -165,7 +165,7 @@ inputs described above, which come from different places: present in the newly added documents 2. `word_pairs_db`, which is the list of word pairs from the database. - This list includes all elements in `new_word_pairs`** since `new_word_pairs` + This list includes all elements in `new_word_pairs` since `new_word_pairs` was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` function. @@ -380,10 +380,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { /// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements /// /// For more information about what this function does, read the module documentation. 
-fn execute_on_word_pairs_and_prefixes( - iter: &mut Iter, +fn execute_on_word_pairs_and_prefixes( + iter: &mut I, mut next_word_pair_proximity: impl for<'a> FnMut( - &'a mut Iter, + &'a mut I, ) -> Result< Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, >,