mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-23 11:47:28 +01:00
Merge #431
431: Fix and improve word prefix pair proximity r=ManyTheFish a=Kerollmops This PR first fixes the algorithm we used to select and compute the word prefix pair proximity database. The previous version was skipping nearly all of the prefixes. The issue is that this fix made the method take more time, while we were trying to reduce the time spent in it. With `@ManyTheFish` we found out that we could skip some of the work we were doing by: - discarding the prefixes that are longer than a specific threshold (default: 2). - discarding the word prefix pairs with a proximity bigger than a specific threshold (default: 4). - removing the unused threshold that was specifying a minimum amount of word docids to merge. We will take more time to do some more optimizations, like no longer clearing and recomputing the database from scratch: instead, we will compute the subsets of keys to create, keep and merge. That change is a little bit more complex than what this PR does. I keep this PR as a draft as I want to further test whether the real gain is sufficient and whether the change is valid. I advise reviewers to review commit by commit to see the changes bit by bit, as reviewing the whole PR at once can be hard. Co-authored-by: Clément Renault <clement@meilisearch.com>
This commit is contained in:
commit
38d23546a5
@ -18,7 +18,8 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
pub(crate) chunk_compression_level: Option<u32>,
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
pub(crate) max_nb_chunks: Option<usize>,
|
pub(crate) max_nb_chunks: Option<usize>,
|
||||||
pub(crate) max_memory: Option<usize>,
|
pub(crate) max_memory: Option<usize>,
|
||||||
threshold: u32,
|
max_proximity: u8,
|
||||||
|
max_prefix_length: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||||
@ -33,18 +34,29 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
chunk_compression_level: None,
|
chunk_compression_level: None,
|
||||||
max_nb_chunks: None,
|
max_nb_chunks: None,
|
||||||
max_memory: None,
|
max_memory: None,
|
||||||
threshold: 100,
|
max_proximity: 4,
|
||||||
|
max_prefix_length: 2,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Set the number of words required to make a prefix be part of the words prefixes
|
/// Set the maximum proximity required to make a prefix be part of the words prefixes
|
||||||
/// database. If a word prefix is supposed to match more than this number of words in the
|
/// database. If two words are too far from the threshold the associated documents will
|
||||||
/// dictionnary, therefore this prefix is added to the words prefixes datastructures.
|
/// not be part of the prefix database.
|
||||||
///
|
///
|
||||||
/// Default value is 100. This value must be higher than 50 and will be clamped
|
/// Default value is 4. This value must be lower than or equal to 7 and will be clamped
|
||||||
/// to these bound otherwise.
|
/// to this bound otherwise.
|
||||||
pub fn threshold(&mut self, value: u32) -> &mut Self {
|
pub fn max_proximity(&mut self, value: u8) -> &mut Self {
|
||||||
self.threshold = value.max(50);
|
self.max_proximity = value.max(7);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
|
||||||
|
/// prefixes database. If the prefix length is higher than the threshold, the associated documents
|
||||||
|
/// will not be part of the prefix database.
|
||||||
|
///
|
||||||
|
/// Default value is 2.
|
||||||
|
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
||||||
|
self.max_prefix_length = value;
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -64,28 +76,29 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||||
let prefix_fst_keys = prefix_fst.into_stream().into_bytes();
|
let prefix_fst_keys = prefix_fst.into_stream().into_strs()?;
|
||||||
let prefix_fst_keys: Vec<_> = prefix_fst_keys
|
let prefix_fst_keys: Vec<_> =
|
||||||
.as_slice()
|
prefix_fst_keys.as_slice().linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect();
|
||||||
.linear_group_by_key(|x| std::str::from_utf8(&x).unwrap().chars().nth(0).unwrap())
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let mut db =
|
let mut db =
|
||||||
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(self.wtxn)?;
|
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(self.wtxn)?;
|
||||||
|
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
let mut current_prefixes: Option<&&[Vec<u8>]> = None;
|
let mut current_prefixes: Option<&&[String]> = None;
|
||||||
let mut prefixes_cache = HashMap::new();
|
let mut prefixes_cache = HashMap::new();
|
||||||
while let Some(((w1, w2, prox), data)) = db.next().transpose()? {
|
while let Some(((w1, w2, prox), data)) = db.next().transpose()? {
|
||||||
|
if prox > self.max_proximity {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
current_prefixes = match current_prefixes.take() {
|
current_prefixes = match current_prefixes.take() {
|
||||||
Some(prefixes) if w2.as_bytes().starts_with(&prefixes[0]) => Some(prefixes),
|
Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes),
|
||||||
_otherwise => {
|
_otherwise => {
|
||||||
write_prefixes_in_sorter(
|
write_prefixes_in_sorter(
|
||||||
&mut prefixes_cache,
|
&mut prefixes_cache,
|
||||||
&mut word_prefix_pair_proximity_docids_sorter,
|
&mut word_prefix_pair_proximity_docids_sorter,
|
||||||
self.threshold,
|
|
||||||
)?;
|
)?;
|
||||||
prefix_fst_keys.iter().find(|prefixes| w2.as_bytes().starts_with(&prefixes[0]))
|
prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0]))
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -93,9 +106,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
buffer.clear();
|
buffer.clear();
|
||||||
buffer.extend_from_slice(w1.as_bytes());
|
buffer.extend_from_slice(w1.as_bytes());
|
||||||
buffer.push(0);
|
buffer.push(0);
|
||||||
for prefix in prefixes.iter().filter(|prefix| w2.as_bytes().starts_with(prefix)) {
|
for prefix in prefixes.iter() {
|
||||||
|
if prefix.len() <= self.max_prefix_length && w2.starts_with(prefix) {
|
||||||
buffer.truncate(w1.len() + 1);
|
buffer.truncate(w1.len() + 1);
|
||||||
buffer.extend_from_slice(prefix);
|
buffer.extend_from_slice(prefix.as_bytes());
|
||||||
buffer.push(prox);
|
buffer.push(prox);
|
||||||
|
|
||||||
match prefixes_cache.get_mut(&buffer) {
|
match prefixes_cache.get_mut(&buffer) {
|
||||||
@ -107,11 +121,11 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
write_prefixes_in_sorter(
|
write_prefixes_in_sorter(
|
||||||
&mut prefixes_cache,
|
&mut prefixes_cache,
|
||||||
&mut word_prefix_pair_proximity_docids_sorter,
|
&mut word_prefix_pair_proximity_docids_sorter,
|
||||||
self.threshold,
|
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
drop(prefix_fst);
|
drop(prefix_fst);
|
||||||
@ -133,20 +147,11 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
fn write_prefixes_in_sorter(
|
fn write_prefixes_in_sorter(
|
||||||
prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>,
|
prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>,
|
||||||
sorter: &mut grenad::Sorter<MergeFn>,
|
sorter: &mut grenad::Sorter<MergeFn>,
|
||||||
min_word_per_prefix: u32,
|
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
for (i, (key, data_slices)) in prefixes.drain().enumerate() {
|
for (key, data_slices) in prefixes.drain() {
|
||||||
// if the number of words prefixed by the prefix is higher than the threshold,
|
|
||||||
// we insert it in the sorter.
|
|
||||||
if data_slices.len() > min_word_per_prefix as usize {
|
|
||||||
for data in data_slices {
|
for data in data_slices {
|
||||||
sorter.insert(&key, data)?;
|
sorter.insert(&key, data)?;
|
||||||
}
|
}
|
||||||
// if the first prefix isn't elligible for insertion,
|
|
||||||
// then the other prefixes can't be elligible.
|
|
||||||
} else if i == 0 {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user