mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 14:04:31 +01:00
Use a HashMap instead of BTreeMaps in the word pair proximity (wpp) extractor
This commit is contained in:
parent
7ba49b849e
commit
f13e076b8a
@ -1,4 +1,4 @@
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
|
||||
use heed::RoTxn;
|
||||
use itertools::merge_join_by;
|
||||
@ -35,10 +35,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
/// TODO: mutualize those buffers
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut add_word_pair_proximity = BTreeMap::new();
|
||||
let mut del_word_pair_proximity = BTreeMap::new();
|
||||
let mut word_pair_proximity = HashMap::new();
|
||||
let mut word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
|
||||
@ -51,7 +49,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
document_tokenizer,
|
||||
fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
&mut |(w1, w2), prox| {
|
||||
word_pair_proximity
|
||||
.entry((w1, w2))
|
||||
.and_modify(|(del_p, _add_p)| {
|
||||
*del_p = std::cmp::min(*del_p, prox);
|
||||
})
|
||||
.or_insert((prox, 0));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Update(inner) => {
|
||||
@ -61,7 +66,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
document_tokenizer,
|
||||
fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
&mut |(w1, w2), prox| {
|
||||
word_pair_proximity
|
||||
.entry((w1, w2))
|
||||
.and_modify(|(del_p, _add_p)| {
|
||||
*del_p = std::cmp::min(*del_p, prox);
|
||||
})
|
||||
.or_insert((prox, 0));
|
||||
},
|
||||
)?;
|
||||
let document = inner.new();
|
||||
process_document_tokens(
|
||||
@ -69,7 +81,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
document_tokenizer,
|
||||
fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
&mut |(w1, w2), prox| {
|
||||
word_pair_proximity
|
||||
.entry((w1, w2))
|
||||
.and_modify(|(_del_p, add_p)| {
|
||||
*add_p = std::cmp::min(*add_p, prox);
|
||||
})
|
||||
.or_insert((0, prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Insertion(inner) => {
|
||||
@ -79,35 +98,23 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
document_tokenizer,
|
||||
fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
&mut |(w1, w2), prox| {
|
||||
word_pair_proximity
|
||||
.entry((w1, w2))
|
||||
.and_modify(|(_del_p, add_p)| {
|
||||
*add_p = std::cmp::min(*add_p, prox);
|
||||
})
|
||||
.or_insert((0, prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
use itertools::EitherOrBoth::*;
|
||||
for eob in
|
||||
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
|
||||
d.cmp(a)
|
||||
})
|
||||
{
|
||||
match eob {
|
||||
Left(((w1, w2), prox)) => {
|
||||
let key = build_key(*prox, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_del_u32(key, docid)?;
|
||||
}
|
||||
Right(((w1, w2), prox)) => {
|
||||
let key = build_key(*prox, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_add_u32(key, docid)?;
|
||||
}
|
||||
Both(((w1, w2), del_prox), (_, add_prox)) => {
|
||||
if del_prox != add_prox {
|
||||
let key = build_key(*del_prox, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_del_u32(key, docid)?;
|
||||
let key = build_key(*add_prox, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_add_u32(key, docid)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
for ((w1, w2), (del_p, add_p)) in word_pair_proximity.iter() {
|
||||
let key = build_key(*del_p, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_del_u32(key, docid)?;
|
||||
let key = build_key(*add_p, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_add_u32(key, docid)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -125,18 +132,19 @@ fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec<u8>) -> &
|
||||
|
||||
fn word_positions_into_word_pair_proximity(
|
||||
word_positions: &mut VecDeque<(String, u16)>,
|
||||
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||
word_pair_proximity: &mut dyn FnMut((String, String), u8),
|
||||
) -> Result<()> {
|
||||
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||
for (word, position) in word_positions.iter() {
|
||||
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||
word_pair_proximity
|
||||
.entry((head_word.clone(), word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = std::cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
word_pair_proximity((head_word.clone(), word.clone()), prox);
|
||||
// word_pair_proximity
|
||||
// .entry((head_word.clone(), word.clone()))
|
||||
// .and_modify(|p| {
|
||||
// *p = std::cmp::min(*p, prox);
|
||||
// })
|
||||
// .or_insert(prox);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@ -147,7 +155,7 @@ fn process_document_tokens(
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
word_positions: &mut VecDeque<(String, u16)>,
|
||||
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||
word_pair_proximity: &mut dyn FnMut((String, String), u8),
|
||||
) -> Result<()> {
|
||||
let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
|
Loading…
x
Reference in New Issue
Block a user