mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Add tests for WordPrefixPairProximityDocIds
This commit is contained in:
parent
86807ca848
commit
d350114159
@ -88,7 +88,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
|
|
||||||
if !prefixes.is_empty() {
|
if !prefixes.is_empty() {
|
||||||
let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
|
let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
|
||||||
Self::execute_on_word_pairs_and_prefixes(
|
execute_on_word_pairs_and_prefixes(
|
||||||
&mut cursor,
|
&mut cursor,
|
||||||
|cursor| {
|
|cursor| {
|
||||||
if let Some((key, value)) = cursor.move_on_next()? {
|
if let Some((key, value)) = cursor.move_on_next()? {
|
||||||
@ -113,7 +113,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
dbg!(count);
|
|
||||||
|
|
||||||
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
||||||
new_prefix_fst_words
|
new_prefix_fst_words
|
||||||
@ -136,7 +135,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
|
|
||||||
Self::execute_on_word_pairs_and_prefixes(
|
execute_on_word_pairs_and_prefixes(
|
||||||
&mut db_iter,
|
&mut db_iter,
|
||||||
|db_iter| db_iter.next().transpose().map_err(|e| e.into()),
|
|db_iter| db_iter.next().transpose().map_err(|e| e.into()),
|
||||||
&prefixes,
|
&prefixes,
|
||||||
@ -145,7 +144,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
||||||
)?;
|
)?;
|
||||||
drop(db_iter);
|
drop(db_iter);
|
||||||
writer_into_lmdb_database(
|
writer_of_new_elements_into_lmdb_database(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
||||||
writer,
|
writer,
|
||||||
@ -170,8 +169,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
}
|
||||||
fn execute_on_word_pairs_and_prefixes<Iter>(
|
fn execute_on_word_pairs_and_prefixes<Iter>(
|
||||||
iter: &mut Iter,
|
iter: &mut Iter,
|
||||||
mut next_word_pair_proximity: impl for<'a> FnMut(
|
mut next_word_pair_proximity: impl for<'a> FnMut(
|
||||||
&'a mut Iter,
|
&'a mut Iter,
|
||||||
@ -182,7 +181,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
allocations: &mut Allocations,
|
allocations: &mut Allocations,
|
||||||
max_proximity: u8,
|
max_proximity: u8,
|
||||||
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
|
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut batch = PrefixAndProximityBatch::default();
|
let mut batch = PrefixAndProximityBatch::default();
|
||||||
let mut prev_word2_start = 0;
|
let mut prev_word2_start = 0;
|
||||||
|
|
||||||
@ -234,9 +233,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
batch.flush(allocations, &mut insert)?;
|
batch.flush(allocations, &mut insert)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps).
|
A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps).
|
||||||
The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
|
The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
|
||||||
@ -275,9 +272,11 @@ impl PrefixAndProximityBatch {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.batch.is_empty() {
|
match self.batch.len() {
|
||||||
|
0 => {
|
||||||
insert_new_key_value!();
|
insert_new_key_value!();
|
||||||
} else if self.batch.len() == 1 {
|
}
|
||||||
|
1 => {
|
||||||
let (existing_key, existing_data) = &mut self.batch[0];
|
let (existing_key, existing_data) = &mut self.batch[0];
|
||||||
match new_key.cmp(&existing_key) {
|
match new_key.cmp(&existing_key) {
|
||||||
Ordering::Less => {
|
Ordering::Less => {
|
||||||
@ -290,15 +289,15 @@ impl PrefixAndProximityBatch {
|
|||||||
insert_new_key_value!();
|
insert_new_key_value!();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
|
_ => match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
|
||||||
Ok(position) => {
|
Ok(position) => {
|
||||||
self.batch[position].1.push(Cow::Owned(new_value));
|
self.batch[position].1.push(Cow::Owned(new_value));
|
||||||
}
|
}
|
||||||
Err(position) => {
|
Err(position) => {
|
||||||
insert_new_key_value!(position);
|
insert_new_key_value!(position);
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -368,17 +367,13 @@ fn insert_into_database(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// This is adapted from `sorter_into_lmdb_database`
|
// This is adapted from `sorter_into_lmdb_database`
|
||||||
pub fn writer_into_lmdb_database(
|
pub fn writer_of_new_elements_into_lmdb_database(
|
||||||
wtxn: &mut heed::RwTxn,
|
wtxn: &mut heed::RwTxn,
|
||||||
database: heed::PolyDatabase,
|
database: heed::PolyDatabase,
|
||||||
writer: grenad::Writer<std::fs::File>,
|
writer: grenad::Writer<std::fs::File>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let file = writer.into_inner()?;
|
let file = writer.into_inner()?;
|
||||||
let reader = grenad::Reader::new(BufReader::new(file))?;
|
let reader = grenad::Reader::new(BufReader::new(file))?;
|
||||||
let len = reader.len();
|
|
||||||
dbg!(len);
|
|
||||||
let before = Instant::now();
|
|
||||||
|
|
||||||
if database.is_empty(wtxn)? {
|
if database.is_empty(wtxn)? {
|
||||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||||
let mut cursor = reader.into_cursor()?;
|
let mut cursor = reader.into_cursor()?;
|
||||||
@ -389,11 +384,9 @@ pub fn writer_into_lmdb_database(
|
|||||||
} else {
|
} else {
|
||||||
let mut cursor = reader.into_cursor()?;
|
let mut cursor = reader.into_cursor()?;
|
||||||
while let Some((k, v)) = cursor.move_on_next()? {
|
while let Some((k, v)) = cursor.move_on_next()? {
|
||||||
insert_into_database(wtxn, database, k, v)?;
|
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -454,18 +447,14 @@ impl PrefixTrieNode {
|
|||||||
self.children[search_start.0..].iter().position(|(_, c)| *c >= byte)
|
self.children[search_start.0..].iter().position(|(_, c)| *c >= byte)
|
||||||
{
|
{
|
||||||
let (_, c) = self.children[search_start.0 + position];
|
let (_, c) = self.children[search_start.0 + position];
|
||||||
// dbg!(position, c, byte);
|
|
||||||
if c == byte {
|
if c == byte {
|
||||||
// dbg!();
|
|
||||||
search_start.0 += position;
|
search_start.0 += position;
|
||||||
true
|
true
|
||||||
} else {
|
} else {
|
||||||
// dbg!();
|
|
||||||
search_start.0 = 0;
|
search_start.0 = 0;
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// dbg!();
|
|
||||||
search_start.0 = 0;
|
search_start.0 = 0;
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
@ -546,7 +535,26 @@ impl PrefixTrieNode {
|
|||||||
}
|
}
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::{CboRoaringBitmapCodec, StrStrU8Codec};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
fn check_prefixes(
|
||||||
|
trie: &PrefixTrieNode,
|
||||||
|
search_start: &PrefixTrieNodeSearchStart,
|
||||||
|
word: &str,
|
||||||
|
expected_prefixes: &[&str],
|
||||||
|
) {
|
||||||
|
let mut actual_prefixes = vec![];
|
||||||
|
trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), &search_start, |x| {
|
||||||
|
let s = String::from_utf8(x.to_owned()).unwrap();
|
||||||
|
actual_prefixes.push(s);
|
||||||
|
});
|
||||||
|
assert_eq!(actual_prefixes, expected_prefixes);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_trie() {
|
fn test_trie() {
|
||||||
let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
|
let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
|
||||||
@ -567,43 +575,146 @@ mod tests {
|
|||||||
"ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve",
|
"ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve",
|
||||||
"vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z",
|
"vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z",
|
||||||
]));
|
]));
|
||||||
// let mut buffer = String::new();
|
|
||||||
// trie.print(&mut buffer, 0);
|
|
||||||
// buffer.clear();
|
|
||||||
let mut search_start = PrefixTrieNodeSearchStart(0);
|
let mut search_start = PrefixTrieNodeSearchStart(0);
|
||||||
let mut buffer = vec![];
|
|
||||||
|
|
||||||
let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start);
|
let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start);
|
||||||
println!("{search_start:?}");
|
assert!(!is_empty);
|
||||||
println!("is empty: {is_empty}");
|
assert_eq!(search_start.0, 2);
|
||||||
trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| {
|
|
||||||
let s = std::str::from_utf8(x).unwrap();
|
|
||||||
println!("{s}");
|
|
||||||
});
|
|
||||||
buffer.clear();
|
|
||||||
trie.for_each_prefix_of("trans".as_bytes(), &mut buffer, &search_start, |x| {
|
|
||||||
let s = std::str::from_utf8(x).unwrap();
|
|
||||||
println!("{s}");
|
|
||||||
});
|
|
||||||
buffer.clear();
|
|
||||||
|
|
||||||
trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| {
|
check_prefixes(&trie, &search_start, "affair", &["a"]);
|
||||||
let s = std::str::from_utf8(x).unwrap();
|
check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]);
|
||||||
println!("{s}");
|
|
||||||
|
let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start);
|
||||||
|
assert!(!is_empty);
|
||||||
|
assert_eq!(trie.children[search_start.0].1, b'u');
|
||||||
|
|
||||||
|
check_prefixes(&trie, &search_start, "unique", &["u", "un"]);
|
||||||
|
|
||||||
|
// NOTE: this should fail, because the search start is already beyond 'a'
|
||||||
|
let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start);
|
||||||
|
assert!(!is_empty);
|
||||||
|
// search start is reset
|
||||||
|
assert_eq!(search_start.0, 0);
|
||||||
|
|
||||||
|
let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
|
||||||
|
"arb", "arbre", "cat", "catto",
|
||||||
|
]));
|
||||||
|
check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]);
|
||||||
|
check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_execute_on_word_pairs_and_prefixes() {
|
||||||
|
let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
|
||||||
|
"arb", "arbre", "cat", "catto",
|
||||||
|
]));
|
||||||
|
|
||||||
|
let mut serialised_bitmap123 = vec![];
|
||||||
|
let mut bitmap123 = RoaringBitmap::new();
|
||||||
|
bitmap123.insert(1);
|
||||||
|
bitmap123.insert(2);
|
||||||
|
bitmap123.insert(3);
|
||||||
|
CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123);
|
||||||
|
|
||||||
|
let mut serialised_bitmap456 = vec![];
|
||||||
|
let mut bitmap456 = RoaringBitmap::new();
|
||||||
|
bitmap456.insert(4);
|
||||||
|
bitmap456.insert(5);
|
||||||
|
bitmap456.insert(6);
|
||||||
|
CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456);
|
||||||
|
|
||||||
|
let mut serialised_bitmap789 = vec![];
|
||||||
|
let mut bitmap789 = RoaringBitmap::new();
|
||||||
|
bitmap789.insert(7);
|
||||||
|
bitmap789.insert(8);
|
||||||
|
bitmap789.insert(9);
|
||||||
|
CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789);
|
||||||
|
|
||||||
|
let mut serialised_bitmap_ranges = vec![];
|
||||||
|
let mut bitmap_ranges = RoaringBitmap::new();
|
||||||
|
bitmap_ranges.insert_range(63_000..65_000);
|
||||||
|
bitmap_ranges.insert_range(123_000..128_000);
|
||||||
|
CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
|
||||||
|
|
||||||
|
let word_pairs = [
|
||||||
|
// 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456)
|
||||||
|
(("healthy", "arbre", 2), &serialised_bitmap123),
|
||||||
|
// not inserted because 3 > max_proximity
|
||||||
|
(("healthy", "arbre", 3), &serialised_bitmap456),
|
||||||
|
// 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123)
|
||||||
|
(("healthy", "arbres", 1), &serialised_bitmap123),
|
||||||
|
// 1, 3:
|
||||||
|
(("healthy", "arbres", 2), &serialised_bitmap456),
|
||||||
|
// not inserted because 3 > max_proximity
|
||||||
|
(("healthy", "arbres", 3), &serialised_bitmap789),
|
||||||
|
// not inserted because no prefixes for boat
|
||||||
|
(("healthy", "boat", 1), &serialised_bitmap123),
|
||||||
|
// not inserted because no prefixes for ca
|
||||||
|
(("healthy", "ca", 1), &serialised_bitmap123),
|
||||||
|
// 4: (healthy cat 1) with (bitmap456 + bitmap123)
|
||||||
|
(("healthy", "cats", 1), &serialised_bitmap456),
|
||||||
|
// 5: (healthy cat 2) with (bitmap789 + bitmap_ranges)
|
||||||
|
(("healthy", "cats", 2), &serialised_bitmap789),
|
||||||
|
// 4 + 6: (healthy catto 1) with (bitmap123)
|
||||||
|
(("healthy", "cattos", 1), &serialised_bitmap123),
|
||||||
|
// 5 + 7: (healthy catto 2) with (bitmap_ranges)
|
||||||
|
(("healthy", "cattos", 2), &serialised_bitmap_ranges),
|
||||||
|
// 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges)
|
||||||
|
(("jittery", "cat", 1), &serialised_bitmap123),
|
||||||
|
// 8:
|
||||||
|
(("jittery", "cata", 1), &serialised_bitmap456),
|
||||||
|
// 8:
|
||||||
|
(("jittery", "catb", 1), &serialised_bitmap789),
|
||||||
|
// 8:
|
||||||
|
(("jittery", "catc", 1), &serialised_bitmap_ranges),
|
||||||
|
];
|
||||||
|
|
||||||
|
let expected_result = [
|
||||||
|
// first batch:
|
||||||
|
(("healthy", "arb", 1), bitmap123.clone()),
|
||||||
|
(("healthy", "arb", 2), &bitmap123 | &bitmap456),
|
||||||
|
(("healthy", "arbre", 1), bitmap123.clone()),
|
||||||
|
(("healthy", "arbre", 2), &bitmap123 | &bitmap456),
|
||||||
|
// second batch:
|
||||||
|
(("healthy", "cat", 1), &bitmap456 | &bitmap123),
|
||||||
|
(("healthy", "cat", 2), &bitmap789 | &bitmap_ranges),
|
||||||
|
(("healthy", "catto", 1), bitmap123.clone()),
|
||||||
|
(("healthy", "catto", 2), bitmap_ranges.clone()),
|
||||||
|
// third batch
|
||||||
|
(("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
|
||||||
|
];
|
||||||
|
|
||||||
|
let mut result = vec![];
|
||||||
|
|
||||||
|
let mut allocations = Allocations::default();
|
||||||
|
let mut iter =
|
||||||
|
IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| {
|
||||||
|
((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice())
|
||||||
});
|
});
|
||||||
buffer.clear();
|
execute_on_word_pairs_and_prefixes(
|
||||||
// trie.for_each_prefix_of("1", |x| {
|
&mut iter,
|
||||||
// println!("{x}");
|
|iter| Ok(iter.next()),
|
||||||
// });
|
&prefixes,
|
||||||
// trie.for_each_prefix_of("19", |x| {
|
&mut allocations,
|
||||||
// println!("{x}");
|
2,
|
||||||
// });
|
|k, v| {
|
||||||
// trie.for_each_prefix_of("21", |x| {
|
let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap();
|
||||||
// println!("{x}");
|
let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
|
||||||
// });
|
result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap));
|
||||||
// let mut buffer = vec![];
|
Ok(())
|
||||||
// trie.for_each_prefix_of("integ", &mut buffer, |x| {
|
},
|
||||||
// println!("{x}");
|
)
|
||||||
// });
|
.unwrap();
|
||||||
|
|
||||||
|
for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) {
|
||||||
|
let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x;
|
||||||
|
let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y;
|
||||||
|
|
||||||
|
assert_eq!(actual_word1, expected_word1);
|
||||||
|
assert_eq!(actual_prefix, expected_prefix);
|
||||||
|
assert_eq!(actual_proximity, expected_proximity);
|
||||||
|
assert_eq!(actual_bitmap, expected_bitmap);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user