Mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-26 23:04:26 +01:00)
Refine some details in word_prefix_pair_proximity indexing code
This commit is contained in:
Parent: e6e76fbefe
Commit: ab2f6f3aa4
@ -35,9 +35,6 @@ pub fn index_prefix_word_database(
|
|||||||
.filter(|s| s.len() <= max_prefix_length)
|
.filter(|s| s.len() <= max_prefix_length)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
// If the prefix trie is not empty, then we can iterate over all new
|
|
||||||
// word pairs to look for new (word1, common_prefix, proximity) elements
|
|
||||||
// to insert in the DB
|
|
||||||
for proximity in 1..=max_proximity - 1 {
|
for proximity in 1..=max_proximity - 1 {
|
||||||
for prefix in common_prefixes.iter() {
|
for prefix in common_prefixes.iter() {
|
||||||
let mut prefix_key = vec![];
|
let mut prefix_key = vec![];
|
||||||
@ -135,13 +132,11 @@ pub fn index_prefix_word_database(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
|
/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
|
||||||
///
|
///
|
||||||
/// Its main arguments are:
|
/// Its arguments are:
|
||||||
/// 1. a sorted prefix iterator over ((word1, word2, proximity), docids) elements
|
/// - an iterator over the words following the given `prefix` with the given `proximity`
|
||||||
/// 2. a closure to describe how to handle the new computed (word1, prefix, proximity) elements
|
/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements
|
||||||
///
|
|
||||||
/// For more information about what this function does, read the module documentation.
|
|
||||||
fn execute_on_word_pairs_and_prefixes<I>(
|
fn execute_on_word_pairs_and_prefixes<I>(
|
||||||
proximity: u8,
|
proximity: u8,
|
||||||
prefix: &[u8],
|
prefix: &[u8],
|
||||||
@ -151,28 +146,32 @@ fn execute_on_word_pairs_and_prefixes<I>(
|
|||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = <_>::default();
|
let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = <_>::default();
|
||||||
|
|
||||||
while let Some((word2, data)) = next_word2_and_docids(iter)? {
|
// Memory usage check:
|
||||||
|
// The content of the loop will be called for each `word2` that follows a word beginning
|
||||||
|
// with `prefix` with the given proximity.
|
||||||
|
// In practice, I don't think the batch can ever get too big.
|
||||||
|
while let Some((word2, docids)) = next_word2_and_docids(iter)? {
|
||||||
let entry = batch.entry(word2.to_owned()).or_default();
|
let entry = batch.entry(word2.to_owned()).or_default();
|
||||||
entry.push(Cow::Owned(data.to_owned()));
|
entry.push(Cow::Owned(docids.to_owned()));
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut key_buffer = Vec::with_capacity(8);
|
let mut key_buffer = Vec::with_capacity(512);
|
||||||
key_buffer.push(proximity);
|
key_buffer.push(proximity);
|
||||||
key_buffer.extend_from_slice(prefix);
|
key_buffer.extend_from_slice(prefix);
|
||||||
key_buffer.push(0);
|
key_buffer.push(0);
|
||||||
|
|
||||||
let mut value_buffer = Vec::with_capacity(65_536);
|
let mut value_buffer = Vec::with_capacity(65_536);
|
||||||
|
|
||||||
for (key, values) in batch {
|
for (word2, docids) in batch {
|
||||||
key_buffer.truncate(prefix.len() + 2);
|
key_buffer.truncate(prefix.len() + 2);
|
||||||
value_buffer.clear();
|
value_buffer.clear();
|
||||||
|
|
||||||
key_buffer.extend_from_slice(&key);
|
key_buffer.extend_from_slice(&word2);
|
||||||
let data = if values.len() > 1 {
|
let data = if docids.len() > 1 {
|
||||||
CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?;
|
CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?;
|
||||||
value_buffer.as_slice()
|
value_buffer.as_slice()
|
||||||
} else {
|
} else {
|
||||||
&values[0]
|
&docids[0]
|
||||||
};
|
};
|
||||||
insert(key_buffer.as_slice(), data)?;
|
insert(key_buffer.as_slice(), data)?;
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
/*!
|
/*!
|
||||||
## What is WordPrefix?
|
|
||||||
The word-prefix-pair-proximity-docids database is a database whose keys are of
|
The word-prefix-pair-proximity-docids database is a database whose keys are of
|
||||||
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
|
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
|
||||||
the documents which contain `word` followed by another word starting with
|
the documents which contain `word` followed by another word starting with
|
||||||
@ -320,7 +319,7 @@ fn execute_on_word_pairs_and_prefixes<I>(
|
|||||||
let mut merge_buffer = Vec::with_capacity(65_536);
|
let mut merge_buffer = Vec::with_capacity(65_536);
|
||||||
|
|
||||||
while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? {
|
while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? {
|
||||||
// skip this iteration if the proximity is over the threshold
|
// stop indexing if the proximity is over the threshold
|
||||||
if proximity > max_proximity {
|
if proximity > max_proximity {
|
||||||
break;
|
break;
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user