mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-23 19:57:30 +01:00
Change encoding of word_pair_proximity DB to (proximity, word1, word2)
Same for word_prefix_pair_proximity
This commit is contained in:
parent
19b2326f3d
commit
bdeb47305e
@ -7,12 +7,11 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec {
|
|||||||
type DItem = (&'a str, &'a str, u8);
|
type DItem = (&'a str, &'a str, u8);
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
let (n, bytes) = bytes.split_last()?;
|
let (n, bytes) = bytes.split_first()?;
|
||||||
let s1_end = bytes.iter().position(|b| *b == 0)?;
|
let s1_end = bytes.iter().position(|b| *b == 0)?;
|
||||||
let (s1_bytes, rest) = bytes.split_at(s1_end);
|
let (s1_bytes, rest) = bytes.split_at(s1_end);
|
||||||
let rest = &rest[1..];
|
let s2_bytes = &rest[1..];
|
||||||
let s1 = str::from_utf8(s1_bytes).ok()?;
|
let s1 = str::from_utf8(s1_bytes).ok()?;
|
||||||
let (_, s2_bytes) = rest.split_last()?;
|
|
||||||
let s2 = str::from_utf8(s2_bytes).ok()?;
|
let s2 = str::from_utf8(s2_bytes).ok()?;
|
||||||
Some((s1, s2, *n))
|
Some((s1, s2, *n))
|
||||||
}
|
}
|
||||||
@ -22,12 +21,11 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec {
|
|||||||
type EItem = (&'a str, &'a str, u8);
|
type EItem = (&'a str, &'a str, u8);
|
||||||
|
|
||||||
fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
|
fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1);
|
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
|
||||||
|
bytes.push(*n);
|
||||||
bytes.extend_from_slice(s1.as_bytes());
|
bytes.extend_from_slice(s1.as_bytes());
|
||||||
bytes.push(0);
|
bytes.push(0);
|
||||||
bytes.extend_from_slice(s2.as_bytes());
|
bytes.extend_from_slice(s2.as_bytes());
|
||||||
bytes.push(0);
|
|
||||||
bytes.push(*n);
|
|
||||||
Some(Cow::Owned(bytes))
|
Some(Cow::Owned(bytes))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -37,11 +35,10 @@ impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec {
|
|||||||
type DItem = (&'a [u8], &'a [u8], u8);
|
type DItem = (&'a [u8], &'a [u8], u8);
|
||||||
|
|
||||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
let (n, bytes) = bytes.split_last()?;
|
let (n, bytes) = bytes.split_first()?;
|
||||||
let s1_end = bytes.iter().position(|b| *b == 0)?;
|
let s1_end = bytes.iter().position(|b| *b == 0)?;
|
||||||
let (s1_bytes, rest) = bytes.split_at(s1_end);
|
let (s1_bytes, rest) = bytes.split_at(s1_end);
|
||||||
let rest = &rest[1..];
|
let s2_bytes = &rest[1..];
|
||||||
let (_, s2_bytes) = rest.split_last()?;
|
|
||||||
Some((s1_bytes, s2_bytes, *n))
|
Some((s1_bytes, s2_bytes, *n))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -50,12 +47,11 @@ impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec {
|
|||||||
type EItem = (&'a [u8], &'a [u8], u8);
|
type EItem = (&'a [u8], &'a [u8], u8);
|
||||||
|
|
||||||
fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
|
fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1);
|
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
|
||||||
|
bytes.push(*n);
|
||||||
bytes.extend_from_slice(s1);
|
bytes.extend_from_slice(s1);
|
||||||
bytes.push(0);
|
bytes.push(0);
|
||||||
bytes.extend_from_slice(s2);
|
bytes.extend_from_slice(s2);
|
||||||
bytes.push(0);
|
|
||||||
bytes.push(*n);
|
|
||||||
Some(Cow::Owned(bytes))
|
Some(Cow::Owned(bytes))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -194,7 +194,7 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
|
|||||||
(word1, prefix, proximity),
|
(word1, prefix, proximity),
|
||||||
b,
|
b,
|
||||||
)| {
|
)| {
|
||||||
&format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b))
|
&format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b))
|
||||||
});
|
});
|
||||||
snap
|
snap
|
||||||
}
|
}
|
||||||
|
@ -151,11 +151,10 @@ fn document_word_positions_into_sorter<'b>(
|
|||||||
let mut key_buffer = Vec::new();
|
let mut key_buffer = Vec::new();
|
||||||
for ((w1, w2), prox) in word_pair_proximity {
|
for ((w1, w2), prox) in word_pair_proximity {
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
|
key_buffer.push(prox as u8);
|
||||||
key_buffer.extend_from_slice(w1.as_bytes());
|
key_buffer.extend_from_slice(w1.as_bytes());
|
||||||
key_buffer.push(0);
|
key_buffer.push(0);
|
||||||
key_buffer.extend_from_slice(w2.as_bytes());
|
key_buffer.extend_from_slice(w2.as_bytes());
|
||||||
key_buffer.push(0);
|
|
||||||
key_buffer.push(prox as u8);
|
|
||||||
|
|
||||||
word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
|
word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
|
||||||
}
|
}
|
||||||
|
@ -1,46 +1,46 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/word_prefix_pair_proximity_docids.rs
|
source: milli/src/update/word_prefix_pair_proximity_docids.rs
|
||||||
---
|
---
|
||||||
5 a 1 [101, ]
|
1 5 a [101, ]
|
||||||
5 a 2 [101, ]
|
1 amazing a [100, ]
|
||||||
5 b 4 [101, ]
|
1 an a [100, ]
|
||||||
5 be 4 [101, ]
|
1 and b [100, ]
|
||||||
am a 3 [101, ]
|
1 and be [100, ]
|
||||||
amazing a 1 [100, ]
|
1 at a [100, ]
|
||||||
amazing a 2 [100, ]
|
1 rings a [101, ]
|
||||||
amazing a 3 [100, ]
|
1 the b [101, ]
|
||||||
amazing b 2 [100, ]
|
1 the be [101, ]
|
||||||
amazing be 2 [100, ]
|
2 5 a [101, ]
|
||||||
an a 1 [100, ]
|
2 amazing a [100, ]
|
||||||
an a 2 [100, ]
|
2 amazing b [100, ]
|
||||||
an b 3 [100, ]
|
2 amazing be [100, ]
|
||||||
an be 3 [100, ]
|
2 an a [100, ]
|
||||||
and a 2 [100, ]
|
2 and a [100, ]
|
||||||
and a 3 [100, ]
|
2 at a [100, 101, ]
|
||||||
and a 4 [100, ]
|
2 beautiful a [100, ]
|
||||||
and b 1 [100, ]
|
2 bell a [101, ]
|
||||||
and be 1 [100, ]
|
2 house b [100, ]
|
||||||
at a 1 [100, ]
|
2 house be [100, ]
|
||||||
at a 2 [100, 101, ]
|
2 rings b [101, ]
|
||||||
at a 3 [100, ]
|
2 rings be [101, ]
|
||||||
at b 3 [101, ]
|
3 am a [101, ]
|
||||||
at b 4 [100, ]
|
3 amazing a [100, ]
|
||||||
at be 3 [101, ]
|
3 an b [100, ]
|
||||||
at be 4 [100, ]
|
3 an be [100, ]
|
||||||
beautiful a 2 [100, ]
|
3 and a [100, ]
|
||||||
beautiful a 3 [100, ]
|
3 at a [100, ]
|
||||||
beautiful a 4 [100, ]
|
3 at b [101, ]
|
||||||
bell a 2 [101, ]
|
3 at be [101, ]
|
||||||
bell a 4 [101, ]
|
3 beautiful a [100, ]
|
||||||
house a 3 [100, ]
|
3 house a [100, ]
|
||||||
house a 4 [100, ]
|
3 rings a [101, ]
|
||||||
house b 2 [100, ]
|
3 the a [101, ]
|
||||||
house be 2 [100, ]
|
4 5 b [101, ]
|
||||||
rings a 1 [101, ]
|
4 5 be [101, ]
|
||||||
rings a 3 [101, ]
|
4 and a [100, ]
|
||||||
rings b 2 [101, ]
|
4 at b [100, ]
|
||||||
rings be 2 [101, ]
|
4 at be [100, ]
|
||||||
the a 3 [101, ]
|
4 beautiful a [100, ]
|
||||||
the b 1 [101, ]
|
4 bell a [101, ]
|
||||||
the be 1 [101, ]
|
4 house a [100, ]
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
---
|
---
|
||||||
source: milli/src/update/word_prefix_pair_proximity_docids.rs
|
source: milli/src/update/word_prefix_pair_proximity_docids.rs
|
||||||
---
|
---
|
||||||
5ed4bf83317b10962a55ade353427bdd
|
fb88e49fd666886731b62baef8f44995
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*!
|
/*!
|
||||||
## What is WordPrefixPairProximityDocids?
|
## What is WordPrefixPairProximityDocids?
|
||||||
The word-prefix-pair-proximity-docids database is a database whose keys are of
|
The word-prefix-pair-proximity-docids database is a database whose keys are of
|
||||||
the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of
|
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
|
||||||
the documents which contain `word` followed by another word starting with
|
the documents which contain `word` followed by another word starting with
|
||||||
`prefix` at a distance of `proximity`.
|
`prefix` at a distance of `proximity`.
|
||||||
|
|
||||||
@ -23,127 +23,100 @@ dog
|
|||||||
Note that only prefixes which correspond to more than a certain number of
|
Note that only prefixes which correspond to more than a certain number of
|
||||||
different words from the database are included in this list.
|
different words from the database are included in this list.
|
||||||
|
|
||||||
* a sorted list of word pairs and the distance between them (i.e. proximity),
|
* a sorted list of proximities and word pairs (the proximity is the distance between the two words),
|
||||||
* associated with a roaring bitmap, such as:
|
associated with a roaring bitmap, such as:
|
||||||
```text
|
```text
|
||||||
good dog 3 -> docids1: [2, 5, 6]
|
1 good doggo -> docids1: [8]
|
||||||
good doggo 1 -> docids2: [8]
|
1 good door -> docids2: [7, 19, 20]
|
||||||
good dogma 1 -> docids3: [7, 19, 20]
|
1 good ghost -> docids3: [1]
|
||||||
good ghost 2 -> docids4: [1]
|
2 good dog -> docids4: [2, 5, 6]
|
||||||
horror cathedral 4 -> docids5: [1, 2]
|
2 horror cathedral -> docids5: [1, 2]
|
||||||
```
|
```
|
||||||
|
|
||||||
I illustrate a simplified version of the algorithm to create the word-prefix
|
I illustrate a simplified version of the algorithm to create the word-prefix
|
||||||
pair-proximity database below:
|
pair-proximity database below:
|
||||||
|
|
||||||
1. **Outer loop:** First, we iterate over each word pair and its proximity:
|
1. **Outer loop:** First, we iterate over each proximity and word pair:
|
||||||
```text
|
```text
|
||||||
|
proximity: 1
|
||||||
word1 : good
|
word1 : good
|
||||||
word2 : dog
|
word2 : doggo
|
||||||
proximity: 3
|
|
||||||
```
|
```
|
||||||
2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
|
2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
|
||||||
in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`)
|
in the list of sorted prefixes. And we insert the key `prefix`
|
||||||
and the value (`docids`) to a sorted map which we call the “batch”. For example,
|
and the value (`docids`) to a sorted map which we call the “batch”. For example,
|
||||||
at the end of the first inner loop, we may have:
|
at the end of the first inner loop, we may have:
|
||||||
```text
|
```text
|
||||||
Outer loop 1:
|
Outer loop 1:
|
||||||
------------------------------
|
------------------------------
|
||||||
|
proximity: 1
|
||||||
word1 : good
|
word1 : good
|
||||||
word2 : dog
|
word2 : doggo
|
||||||
proximity: 3
|
|
||||||
docids : docids1
|
docids : docids1
|
||||||
|
|
||||||
prefixes: [d, do, dog]
|
prefixes: [d, do, dog]
|
||||||
|
|
||||||
batch: [
|
batch: [
|
||||||
(d, 3) -> [docids1]
|
d -> [docids1]
|
||||||
(do, 3) -> [docids1]
|
do -> [docids1]
|
||||||
(dog, 3) -> [docids1]
|
dog -> [docids1]
|
||||||
]
|
]
|
||||||
```
|
```
|
||||||
3. For illustration purpose, let's run through a second iteration of the outer loop:
|
3. For illustration purpose, let's run through a second iteration of the outer loop:
|
||||||
```text
|
```text
|
||||||
Outer loop 2:
|
Outer loop 2:
|
||||||
------------------------------
|
------------------------------
|
||||||
word1 : good
|
|
||||||
word2 : doggo
|
|
||||||
proximity: 1
|
proximity: 1
|
||||||
|
word1 : good
|
||||||
|
word2 : door
|
||||||
docids : docids2
|
docids : docids2
|
||||||
|
|
||||||
prefixes: [d, do, dog]
|
prefixes: [d, do, doo]
|
||||||
|
|
||||||
batch: [
|
batch: [
|
||||||
(d, 1) -> [docids2]
|
d -> [docids1, docids2]
|
||||||
(d, 3) -> [docids1]
|
do -> [docids1, docids2]
|
||||||
(do, 1) -> [docids2]
|
dog -> [docids1]
|
||||||
(do, 3) -> [docids1]
|
doo -> [docids2]
|
||||||
(dog, 1) -> [docids2]
|
|
||||||
(dog, 3) -> [docids1]
|
|
||||||
]
|
|
||||||
```
|
|
||||||
Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some
|
|
||||||
of the elements inserted in the second iteration of the outer loop appear
|
|
||||||
*before* elements from the first iteration.
|
|
||||||
|
|
||||||
4. And a third:
|
|
||||||
```text
|
|
||||||
Outer loop 3:
|
|
||||||
------------------------------
|
|
||||||
word1 : good
|
|
||||||
word2 : dogma
|
|
||||||
proximity: 1
|
|
||||||
docids : docids3
|
|
||||||
|
|
||||||
prefixes: [d, do, dog]
|
|
||||||
|
|
||||||
batch: [
|
|
||||||
(d, 1) -> [docids2, docids3]
|
|
||||||
(d, 3) -> [docids1]
|
|
||||||
(do, 1) -> [docids2, docids3]
|
|
||||||
(do, 3) -> [docids1]
|
|
||||||
(dog, 1) -> [docids2, docids3]
|
|
||||||
(dog, 3) -> [docids1]
|
|
||||||
]
|
]
|
||||||
```
|
```
|
||||||
Notice that there were some conflicts which were resolved by merging the
|
Notice that there were some conflicts which were resolved by merging the
|
||||||
conflicting values together.
|
conflicting values together. Also, an additional prefix was added at the
|
||||||
|
end of the batch.
|
||||||
|
|
||||||
5. On the fourth iteration of the outer loop, we have:
|
4. On the third iteration of the outer loop, we have:
|
||||||
```text
|
```text
|
||||||
Outer loop 4:
|
Outer loop 3:
|
||||||
------------------------------
|
------------------------------
|
||||||
|
proximity: 1
|
||||||
word1 : good
|
word1 : good
|
||||||
word2 : ghost
|
word2 : ghost
|
||||||
proximity: 2
|
|
||||||
```
|
```
|
||||||
Because `word2` begins with a different letter than the previous `word2`,
|
Because `word2` begins with a different letter than the previous `word2`,
|
||||||
we know that:
|
we know that all the prefixes of `word2` are greater than the prefixes of the previous `word2`.
|
||||||
|
|
||||||
1. All the prefixes of `word2` are greater than the prefixes of the previous word2
|
|
||||||
2. And therefore, every instance of (`word2`, `prefix`) will be greater than
|
|
||||||
any element in the batch.
|
|
||||||
|
|
||||||
Therefore, we know that we can insert every element from the batch into the
|
Therefore, we know that we can insert every element from the batch into the
|
||||||
database before proceeding any further. This operation is called
|
database before proceeding any further. This operation is called
|
||||||
“flushing the batch”. Flushing the batch should also be done whenever `word1`
|
“flushing the batch”. Flushing the batch should also be done whenever:
|
||||||
is different than the previous `word1`.
|
* `proximity` is different than the previous `proximity`.
|
||||||
|
* `word1` is different than the previous `word1`.
|
||||||
|
* `word2` starts with a different letter than the previous word2
|
||||||
|
|
||||||
6. **Flushing the batch:** to flush the batch, we look at the `word1` and
|
6. **Flushing the batch:** to flush the batch, we iterate over its elements:
|
||||||
iterate over the elements of the batch in sorted order:
|
|
||||||
```text
|
```text
|
||||||
Flushing Batch loop 1:
|
Flushing Batch loop 1:
|
||||||
------------------------------
|
------------------------------
|
||||||
word1 : good
|
proximity : 1
|
||||||
word2 : d
|
word1 : good
|
||||||
proximity: 1
|
prefix : d
|
||||||
|
|
||||||
docids : [docids2, docids3]
|
docids : [docids1, docids2]
|
||||||
```
|
```
|
||||||
We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using
|
We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using
|
||||||
`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a
|
`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a
|
||||||
roaring bitmap of all the document ids where `word1` is followed by `prefix`
|
roaring bitmap of all the document ids where `word1` is followed by `prefix`
|
||||||
at a distance of `proximity`.
|
at a distance of `proximity`.
|
||||||
Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids`
|
Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids`
|
||||||
into the database.
|
into the database.
|
||||||
|
|
||||||
7. That's it! ... except...
|
7. That's it! ... except...
|
||||||
@ -184,8 +157,8 @@ Note, also, that since we read data from the database when iterating over
|
|||||||
`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-
|
`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-
|
||||||
docids from the batch directly into the database (we would have a concurrent
|
docids from the batch directly into the database (we would have a concurrent
|
||||||
reader and writer). Therefore, when calling the algorithm on
|
reader and writer). Therefore, when calling the algorithm on
|
||||||
(`new_prefixes`, `word_pairs_db`), we insert the computed
|
`(new_prefixes, word_pairs_db)`, we insert the computed
|
||||||
((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad
|
`((proximity, word, prefix), docids)` elements in an intermediary grenad
|
||||||
Writer instead of the DB. At the end of the outer loop, we finally read from
|
Writer instead of the DB. At the end of the outer loop, we finally read from
|
||||||
the grenad and insert its elements in the database.
|
the grenad and insert its elements in the database.
|
||||||
|
|
||||||
@ -406,7 +379,7 @@ fn execute_on_word_pairs_and_prefixes<I>(
|
|||||||
while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? {
|
while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? {
|
||||||
// skip this iteration if the proximity is over the threshold
|
// stop iterating once the proximity is over the threshold: keys are sorted by proximity first, so all remaining pairs are over it too
|
||||||
if proximity > max_proximity {
|
if proximity > max_proximity {
|
||||||
continue;
|
break;
|
||||||
};
|
};
|
||||||
let word2_start_different_than_prev = word2[0] != prev_word2_start;
|
let word2_start_different_than_prev = word2[0] != prev_word2_start;
|
||||||
// if there were no potential prefixes for the previous word2 based on its first letter,
|
// if there were no potential prefixes for the previous word2 based on its first letter,
|
||||||
@ -416,16 +389,21 @@ fn execute_on_word_pairs_and_prefixes<I>(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// if word1 is different than the previous word1 OR if the start of word2 is different
|
// if the proximity is different to the previous one, OR
|
||||||
// than the previous start of word2, then we'll need to flush the batch
|
// if word1 is different than the previous word1, OR
|
||||||
|
// if the start of word2 is different than the previous start of word2,
|
||||||
|
// THEN we'll need to flush the batch
|
||||||
|
let prox_different_than_prev = proximity != batch.proximity;
|
||||||
let word1_different_than_prev = word1 != batch.word1;
|
let word1_different_than_prev = word1 != batch.word1;
|
||||||
if word1_different_than_prev || word2_start_different_than_prev {
|
if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
|
||||||
|
{
|
||||||
batch.flush(&mut merge_buffer, &mut insert)?;
|
batch.flush(&mut merge_buffer, &mut insert)?;
|
||||||
// don't forget to reset the value of batch.word1 and prev_word2_start
|
// don't forget to reset the value of batch.word1 and prev_word2_start
|
||||||
if word1_different_than_prev {
|
if word1_different_than_prev {
|
||||||
prefix_search_start.0 = 0;
|
prefix_search_start.0 = 0;
|
||||||
batch.word1.clear();
|
batch.word1.clear();
|
||||||
batch.word1.extend_from_slice(word1);
|
batch.word1.extend_from_slice(word1);
|
||||||
|
batch.proximity = proximity;
|
||||||
}
|
}
|
||||||
if word2_start_different_than_prev {
|
if word2_start_different_than_prev {
|
||||||
// word2_start_different_than_prev == true
|
// word2_start_different_than_prev == true
|
||||||
@ -437,74 +415,70 @@ fn execute_on_word_pairs_and_prefixes<I>(
|
|||||||
|
|
||||||
if !empty_prefixes {
|
if !empty_prefixes {
|
||||||
// All conditions are satisfied, we can now insert each new prefix of word2 into the batch
|
// All conditions are satisfied, we can now insert each new prefix of word2 into the batch
|
||||||
|
prefix_buffer.clear();
|
||||||
prefixes.for_each_prefix_of(
|
prefixes.for_each_prefix_of(
|
||||||
word2,
|
word2,
|
||||||
&mut prefix_buffer,
|
&mut prefix_buffer,
|
||||||
&prefix_search_start,
|
&prefix_search_start,
|
||||||
|prefix_buffer| {
|
|prefix_buffer| {
|
||||||
let prefix_len = prefix_buffer.len();
|
|
||||||
prefix_buffer.push(0);
|
|
||||||
prefix_buffer.push(proximity);
|
|
||||||
batch.insert(&prefix_buffer, data.to_vec());
|
batch.insert(&prefix_buffer, data.to_vec());
|
||||||
prefix_buffer.truncate(prefix_len);
|
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
prefix_buffer.clear();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
batch.flush(&mut merge_buffer, &mut insert)?;
|
batch.flush(&mut merge_buffer, &mut insert)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps).
|
A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps).
|
||||||
The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
|
The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
|
||||||
|
|
||||||
It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently.
|
It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently.
|
||||||
|
|
||||||
The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content
|
The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content
|
||||||
can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments:
|
can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments:
|
||||||
- key : (word1, prefix, proximity) as bytes
|
- key : (proximity, word1, prefix) as bytes
|
||||||
- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes
|
- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes
|
||||||
*/
|
*/
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct PrefixAndProximityBatch {
|
struct PrefixAndProximityBatch {
|
||||||
|
proximity: u8,
|
||||||
word1: Vec<u8>,
|
word1: Vec<u8>,
|
||||||
batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>,
|
batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PrefixAndProximityBatch {
|
impl PrefixAndProximityBatch {
|
||||||
/// Insert the new key and value into the batch
|
/// Insert the new key and value into the batch
|
||||||
|
///
|
||||||
|
/// The key must either exist in the batch or be greater than all existing keys
|
||||||
fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
|
fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
|
||||||
match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
|
match self.batch.iter_mut().find(|el| el.0 == new_key) {
|
||||||
Ok(position) => {
|
Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)),
|
||||||
self.batch[position].1.push(Cow::Owned(new_value));
|
None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])),
|
||||||
}
|
|
||||||
Err(position) => {
|
|
||||||
self.batch.insert(position, (new_key.to_vec(), vec![Cow::Owned(new_value)]));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Empties the batch, calling `insert` on each element.
|
/// Empties the batch, calling `insert` on each element.
|
||||||
///
|
///
|
||||||
/// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap.
|
/// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap.
|
||||||
fn flush(
|
fn flush(
|
||||||
&mut self,
|
&mut self,
|
||||||
merge_buffer: &mut Vec<u8>,
|
merge_buffer: &mut Vec<u8>,
|
||||||
insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
|
insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let PrefixAndProximityBatch { word1, batch } = self;
|
let PrefixAndProximityBatch { proximity, word1, batch } = self;
|
||||||
if batch.is_empty() {
|
if batch.is_empty() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
merge_buffer.clear();
|
merge_buffer.clear();
|
||||||
|
|
||||||
let mut buffer = Vec::with_capacity(word1.len() + 1 + 6 + 1);
|
let mut buffer = Vec::with_capacity(word1.len() + 1 + 6);
|
||||||
|
buffer.push(*proximity);
|
||||||
buffer.extend_from_slice(word1);
|
buffer.extend_from_slice(word1);
|
||||||
buffer.push(0);
|
buffer.push(0);
|
||||||
|
|
||||||
for (key, mergeable_data) in batch.drain(..) {
|
for (key, mergeable_data) in batch.drain(..) {
|
||||||
buffer.truncate(word1.len() + 1);
|
buffer.truncate(1 + word1.len() + 1);
|
||||||
buffer.extend_from_slice(key.as_slice());
|
buffer.extend_from_slice(key.as_slice());
|
||||||
|
|
||||||
let data = if mergeable_data.len() > 1 {
|
let data = if mergeable_data.len() > 1 {
|
||||||
@ -884,51 +858,33 @@ mod tests {
|
|||||||
CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
|
CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
|
||||||
|
|
||||||
let word_pairs = [
|
let word_pairs = [
|
||||||
// 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456)
|
|
||||||
(("healthy", "arbre", 2), &serialised_bitmap123),
|
|
||||||
// not inserted because 3 > max_proximity
|
|
||||||
(("healthy", "arbre", 3), &serialised_bitmap456),
|
|
||||||
// 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123)
|
|
||||||
(("healthy", "arbres", 1), &serialised_bitmap123),
|
(("healthy", "arbres", 1), &serialised_bitmap123),
|
||||||
// 1, 3:
|
|
||||||
(("healthy", "arbres", 2), &serialised_bitmap456),
|
|
||||||
// not be inserted because 3 > max_proximity
|
|
||||||
(("healthy", "arbres", 3), &serialised_bitmap789),
|
|
||||||
// not inserted because no prefixes for boat
|
|
||||||
(("healthy", "boat", 1), &serialised_bitmap123),
|
(("healthy", "boat", 1), &serialised_bitmap123),
|
||||||
// not inserted because no prefixes for ca
|
|
||||||
(("healthy", "ca", 1), &serialised_bitmap123),
|
(("healthy", "ca", 1), &serialised_bitmap123),
|
||||||
// 4: (healthy cat 1) with (bitmap456 + bitmap123)
|
|
||||||
(("healthy", "cats", 1), &serialised_bitmap456),
|
(("healthy", "cats", 1), &serialised_bitmap456),
|
||||||
// 5: (healthy cat 2) with (bitmap789 + bitmap_ranges)
|
|
||||||
(("healthy", "cats", 2), &serialised_bitmap789),
|
|
||||||
// 4 + 6: (healthy catto 1) with (bitmap123)
|
|
||||||
(("healthy", "cattos", 1), &serialised_bitmap123),
|
(("healthy", "cattos", 1), &serialised_bitmap123),
|
||||||
// 5 + 7: (healthy catto 2) with (bitmap_ranges)
|
|
||||||
(("healthy", "cattos", 2), &serialised_bitmap_ranges),
|
|
||||||
// 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges)
|
|
||||||
(("jittery", "cat", 1), &serialised_bitmap123),
|
(("jittery", "cat", 1), &serialised_bitmap123),
|
||||||
// 8:
|
|
||||||
(("jittery", "cata", 1), &serialised_bitmap456),
|
(("jittery", "cata", 1), &serialised_bitmap456),
|
||||||
// 8:
|
|
||||||
(("jittery", "catb", 1), &serialised_bitmap789),
|
(("jittery", "catb", 1), &serialised_bitmap789),
|
||||||
// 8:
|
|
||||||
(("jittery", "catc", 1), &serialised_bitmap_ranges),
|
(("jittery", "catc", 1), &serialised_bitmap_ranges),
|
||||||
|
(("healthy", "arbre", 2), &serialised_bitmap123),
|
||||||
|
(("healthy", "arbres", 2), &serialised_bitmap456),
|
||||||
|
(("healthy", "cats", 2), &serialised_bitmap789),
|
||||||
|
(("healthy", "cattos", 2), &serialised_bitmap_ranges),
|
||||||
|
(("healthy", "arbre", 3), &serialised_bitmap456),
|
||||||
|
(("healthy", "arbres", 3), &serialised_bitmap789),
|
||||||
];
|
];
|
||||||
|
|
||||||
let expected_result = [
|
let expected_result = [
|
||||||
// first batch:
|
|
||||||
(("healthy", "arb", 1), bitmap123.clone()),
|
(("healthy", "arb", 1), bitmap123.clone()),
|
||||||
(("healthy", "arb", 2), &bitmap123 | &bitmap456),
|
|
||||||
(("healthy", "arbre", 1), bitmap123.clone()),
|
(("healthy", "arbre", 1), bitmap123.clone()),
|
||||||
(("healthy", "arbre", 2), &bitmap123 | &bitmap456),
|
|
||||||
// second batch:
|
|
||||||
(("healthy", "cat", 1), &bitmap456 | &bitmap123),
|
(("healthy", "cat", 1), &bitmap456 | &bitmap123),
|
||||||
(("healthy", "cat", 2), &bitmap789 | &bitmap_ranges),
|
|
||||||
(("healthy", "catto", 1), bitmap123.clone()),
|
(("healthy", "catto", 1), bitmap123.clone()),
|
||||||
(("healthy", "catto", 2), bitmap_ranges.clone()),
|
|
||||||
// third batch
|
|
||||||
(("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
|
(("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
|
||||||
|
(("healthy", "arb", 2), &bitmap123 | &bitmap456),
|
||||||
|
(("healthy", "arbre", 2), &bitmap123 | &bitmap456),
|
||||||
|
(("healthy", "cat", 2), &bitmap789 | &bitmap_ranges),
|
||||||
|
(("healthy", "catto", 2), bitmap_ranges.clone()),
|
||||||
];
|
];
|
||||||
|
|
||||||
let mut result = vec![];
|
let mut result = vec![];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user