Add tests for WordPrefixPairProximityDocIds

Loïc Lecrenier 2022-07-14 11:25:10 +02:00
parent 86807ca848
commit d350114159


@@ -88,7 +88,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
         if !prefixes.is_empty() {
             let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
-            Self::execute_on_word_pairs_and_prefixes(
+            execute_on_word_pairs_and_prefixes(
                 &mut cursor,
                 |cursor| {
                     if let Some((key, value)) = cursor.move_on_next()? {
@@ -113,7 +113,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
                 },
             )?;
         }
-        dbg!(count);
         let prefixes = PrefixTrieNode::from_sorted_prefixes(
             new_prefix_fst_words
@@ -136,7 +135,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
             tempfile::tempfile()?,
         );
-        Self::execute_on_word_pairs_and_prefixes(
+        execute_on_word_pairs_and_prefixes(
             &mut db_iter,
             |db_iter| db_iter.next().transpose().map_err(|e| e.into()),
             &prefixes,
@@ -145,7 +144,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
            |key, value| writer.insert(key, value).map_err(|e| e.into()),
        )?;
        drop(db_iter);
-       writer_into_lmdb_database(
+       writer_of_new_elements_into_lmdb_database(
            self.wtxn,
            *self.index.word_prefix_pair_proximity_docids.as_polymorph(),
            writer,
@@ -170,8 +169,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
         Ok(())
     }
+}
 fn execute_on_word_pairs_and_prefixes<Iter>(
     iter: &mut Iter,
     mut next_word_pair_proximity: impl for<'a> FnMut(
         &'a mut Iter,
@@ -182,7 +181,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
     allocations: &mut Allocations,
     max_proximity: u8,
     mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
 ) -> Result<()> {
     let mut batch = PrefixAndProximityBatch::default();
     let mut prev_word2_start = 0;
@@ -234,9 +233,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
     }
     batch.flush(allocations, &mut insert)?;
     Ok(())
-}
 }
 /**
 A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps).
 The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
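The batch described above is, in essence, a sorted vector used as a map: inserting a (prefix, proximity) key that is already present appends the new serialized bitmap to that key's vector instead of overwriting it, while a missing key is inserted at its sorted position. A minimal standalone sketch of that idea follows; it is illustrative only, with owned Vec<u8> values standing in for the Cow-based buffers and the insert_new_key_value! macro used by the real struct:

    // Illustrative sketch: a sorted Vec<(key, values)> acting as a map,
    // merging the value lists when the same key is inserted twice.
    fn insert_merging(batch: &mut Vec<(Vec<u8>, Vec<Vec<u8>>)>, new_key: &[u8], new_value: Vec<u8>) {
        match batch.binary_search_by(|(k, _)| k.as_slice().cmp(new_key)) {
            // Key already present: push the new bitstring next to the existing ones.
            Ok(pos) => batch[pos].1.push(new_value),
            // Key absent: insert it at its sorted position to keep the batch ordered.
            Err(pos) => batch.insert(pos, (new_key.to_vec(), vec![new_value])),
        }
    }

The insert logic in the hunks below special-cases batches of length 0 and 1 before falling back to this binary-search path.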
@@ -275,9 +272,11 @@ impl PrefixAndProximityBatch {
             };
         }
-        if self.batch.is_empty() {
+        match self.batch.len() {
+            0 => {
                 insert_new_key_value!();
-        } else if self.batch.len() == 1 {
+            }
+            1 => {
                 let (existing_key, existing_data) = &mut self.batch[0];
                 match new_key.cmp(&existing_key) {
                     Ordering::Less => {
@@ -290,15 +289,15 @@ impl PrefixAndProximityBatch {
                         insert_new_key_value!();
                     }
                 }
-        } else {
-            match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
+            }
+            _ => match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
                 Ok(position) => {
                     self.batch[position].1.push(Cow::Owned(new_value));
                 }
                 Err(position) => {
                     insert_new_key_value!(position);
                 }
-            }
+            },
         }
     }
@@ -368,17 +367,13 @@ fn insert_into_database(
 }
 // This is adapted from `sorter_into_lmdb_database`
-pub fn writer_into_lmdb_database(
+pub fn writer_of_new_elements_into_lmdb_database(
     wtxn: &mut heed::RwTxn,
     database: heed::PolyDatabase,
     writer: grenad::Writer<std::fs::File>,
 ) -> Result<()> {
     let file = writer.into_inner()?;
     let reader = grenad::Reader::new(BufReader::new(file))?;
-    let len = reader.len();
-    dbg!(len);
-    let before = Instant::now();
     if database.is_empty(wtxn)? {
         let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
         let mut cursor = reader.into_cursor()?;
@@ -389,11 +384,9 @@ pub fn writer_into_lmdb_database(
     } else {
         let mut cursor = reader.into_cursor()?;
         while let Some((k, v)) = cursor.move_on_next()? {
-            insert_into_database(wtxn, database, k, v)?;
+            database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
         }
     }
-    debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
     Ok(())
 }
@@ -454,18 +447,14 @@ impl PrefixTrieNode {
            self.children[search_start.0..].iter().position(|(_, c)| *c >= byte)
         {
             let (_, c) = self.children[search_start.0 + position];
-            // dbg!(position, c, byte);
             if c == byte {
-                // dbg!();
                 search_start.0 += position;
                 true
             } else {
-                // dbg!();
                 search_start.0 = 0;
                 false
             }
         } else {
-            // dbg!();
             search_start.0 = 0;
             false
         }
@@ -546,7 +535,26 @@ impl PrefixTrieNode {
 }
 #[cfg(test)]
 mod tests {
+    use roaring::RoaringBitmap;
+    use crate::{CboRoaringBitmapCodec, StrStrU8Codec};
     use super::*;
+    fn check_prefixes(
+        trie: &PrefixTrieNode,
+        search_start: &PrefixTrieNodeSearchStart,
+        word: &str,
+        expected_prefixes: &[&str],
+    ) {
+        let mut actual_prefixes = vec![];
+        trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), &search_start, |x| {
+            let s = String::from_utf8(x.to_owned()).unwrap();
+            actual_prefixes.push(s);
+        });
+        assert_eq!(actual_prefixes, expected_prefixes);
+    }
     #[test]
     fn test_trie() {
         let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
@@ -567,43 +575,146 @@ mod tests {
             "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve",
             "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z",
         ]));
-        // let mut buffer = String::new();
-        // trie.print(&mut buffer, 0);
-        // buffer.clear();
         let mut search_start = PrefixTrieNodeSearchStart(0);
-        let mut buffer = vec![];
         let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start);
-        println!("{search_start:?}");
-        println!("is empty: {is_empty}");
-        trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| {
-            let s = std::str::from_utf8(x).unwrap();
-            println!("{s}");
-        });
-        buffer.clear();
-        trie.for_each_prefix_of("trans".as_bytes(), &mut buffer, &search_start, |x| {
-            let s = std::str::from_utf8(x).unwrap();
-            println!("{s}");
-        });
-        buffer.clear();
-        trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| {
-            let s = std::str::from_utf8(x).unwrap();
-            println!("{s}");
+        assert!(!is_empty);
+        assert_eq!(search_start.0, 2);
+        check_prefixes(&trie, &search_start, "affair", &["a"]);
+        check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]);
let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start);
assert!(!is_empty);
assert_eq!(trie.children[search_start.0].1, b'u');
check_prefixes(&trie, &search_start, "unique", &["u", "un"]);
// NOTE: this should fail, because the search start is already beyong 'a'
let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start);
assert!(!is_empty);
// search start is reset
assert_eq!(search_start.0, 0);
let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
"arb", "arbre", "cat", "catto",
]));
check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]);
check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]);
}
+    #[test]
+    fn test_execute_on_word_pairs_and_prefixes() {
+        let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
+            "arb", "arbre", "cat", "catto",
+        ]));
+        let mut serialised_bitmap123 = vec![];
+        let mut bitmap123 = RoaringBitmap::new();
+        bitmap123.insert(1);
+        bitmap123.insert(2);
+        bitmap123.insert(3);
+        CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123);
+        let mut serialised_bitmap456 = vec![];
+        let mut bitmap456 = RoaringBitmap::new();
+        bitmap456.insert(4);
+        bitmap456.insert(5);
+        bitmap456.insert(6);
+        CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456);
+        let mut serialised_bitmap789 = vec![];
+        let mut bitmap789 = RoaringBitmap::new();
+        bitmap789.insert(7);
+        bitmap789.insert(8);
+        bitmap789.insert(9);
+        CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789);
+        let mut serialised_bitmap_ranges = vec![];
+        let mut bitmap_ranges = RoaringBitmap::new();
+        bitmap_ranges.insert_range(63_000..65_000);
+        bitmap_ranges.insert_range(123_000..128_000);
+        CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
+        let word_pairs = [
+            // 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456)
+            (("healthy", "arbre", 2), &serialised_bitmap123),
+            // not inserted because 3 > max_proximity
+            (("healthy", "arbre", 3), &serialised_bitmap456),
+            // 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123)
+            (("healthy", "arbres", 1), &serialised_bitmap123),
+            // 1, 3:
+            (("healthy", "arbres", 2), &serialised_bitmap456),
+            // not inserted because 3 > max_proximity
+            (("healthy", "arbres", 3), &serialised_bitmap789),
+            // not inserted because no prefixes for boat
+            (("healthy", "boat", 1), &serialised_bitmap123),
+            // not inserted because no prefixes for ca
+            (("healthy", "ca", 1), &serialised_bitmap123),
+            // 4: (healthy cat 1) with (bitmap456 + bitmap123)
+            (("healthy", "cats", 1), &serialised_bitmap456),
+            // 5: (healthy cat 2) with (bitmap789 + bitmap_ranges)
+            (("healthy", "cats", 2), &serialised_bitmap789),
+            // 4 + 6: (healthy catto 1) with (bitmap123)
+            (("healthy", "cattos", 1), &serialised_bitmap123),
+            // 5 + 7: (healthy catto 2) with (bitmap_ranges)
+            (("healthy", "cattos", 2), &serialised_bitmap_ranges),
+            // 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges)
+            (("jittery", "cat", 1), &serialised_bitmap123),
+            // 8:
+            (("jittery", "cata", 1), &serialised_bitmap456),
+            // 8:
+            (("jittery", "catb", 1), &serialised_bitmap789),
+            // 8:
+            (("jittery", "catc", 1), &serialised_bitmap_ranges),
+        ];
+        let expected_result = [
+            // first batch:
+            (("healthy", "arb", 1), bitmap123.clone()),
+            (("healthy", "arb", 2), &bitmap123 | &bitmap456),
+            (("healthy", "arbre", 1), bitmap123.clone()),
+            (("healthy", "arbre", 2), &bitmap123 | &bitmap456),
+            // second batch:
+            (("healthy", "cat", 1), &bitmap456 | &bitmap123),
+            (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges),
+            (("healthy", "catto", 1), bitmap123.clone()),
+            (("healthy", "catto", 2), bitmap_ranges.clone()),
+            // third batch:
+            (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
+        ];
+        let mut result = vec![];
+        let mut allocations = Allocations::default();
+        let mut iter =
+            IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| {
+                ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice())
             });
-        buffer.clear();
-        // trie.for_each_prefix_of("1", |x| {
-        //     println!("{x}");
-        // });
-        // trie.for_each_prefix_of("19", |x| {
-        //     println!("{x}");
-        // });
-        // trie.for_each_prefix_of("21", |x| {
-        //     println!("{x}");
-        // });
-        // let mut buffer = vec![];
-        // trie.for_each_prefix_of("integ", &mut buffer, |x| {
-        //     println!("{x}");
-        // });
+        execute_on_word_pairs_and_prefixes(
+            &mut iter,
+            |iter| Ok(iter.next()),
+            &prefixes,
+            &mut allocations,
+            2,
+            |k, v| {
+                let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap();
+                let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
+                result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap));
+                Ok(())
+            },
+        )
+        .unwrap();
+        for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) {
+            let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x;
+            let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y;
+            assert_eq!(actual_word1, expected_word1);
+            assert_eq!(actual_prefix, expected_prefix);
+            assert_eq!(actual_proximity, expected_proximity);
+            assert_eq!(actual_bitmap, expected_bitmap);
+        }
+    }
 }