mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
Use the groups of four positions to speed up disjunctions tests
This commit is contained in:
parent
605f75b56f
commit
4afc4d0751
@ -88,7 +88,7 @@ struct IndexerOpt {
|
||||
max_memory: Option<usize>,
|
||||
|
||||
/// Size of the ARC cache when indexing.
|
||||
#[structopt(long, default_value = "65535")]
|
||||
#[structopt(long, default_value = "43690")]
|
||||
arc_cache_size: usize,
|
||||
|
||||
/// The name of the compression algorithm to use when compressing intermediate
|
||||
@ -184,7 +184,7 @@ impl Store {
|
||||
let position = position - position % 4;
|
||||
let word_vec = SmallVec32::from(word.as_bytes());
|
||||
let ids = RoaringBitmap::from_iter(Some(id));
|
||||
let (_, lrus) = self.word_position_docids.insert((word_vec, position), ids, |old, new| old.union_with(&new));
|
||||
let (_, lrus) = self.word_four_positions_docids.insert((word_vec, position), ids, |old, new| old.union_with(&new));
|
||||
Self::write_word_four_positions_docids(&mut self.sorter, lrus)
|
||||
}
|
||||
|
||||
|
@ -24,6 +24,11 @@ pub fn extract_position(position: u32) -> (u32, u32) {
|
||||
(position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE)
|
||||
}
|
||||
|
||||
// Returns the group of four positions in which this position reside (i.e. 0, 4, 12).
|
||||
pub fn group_of_four(position: u32) -> u32 {
|
||||
position - position % 4
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub enum Node {
|
||||
// Is this node is the first node.
|
||||
|
101
src/search.rs
101
src/search.rs
@ -176,6 +176,24 @@ impl<'a> Search<'a> {
|
||||
Ok(union_docids)
|
||||
}
|
||||
|
||||
/// Returns the union of the same gorup of four positions for all the given words.
|
||||
fn union_word_four_positions(
|
||||
rtxn: &heed::RoTxn,
|
||||
index: &Index,
|
||||
words: &[(String, u8, RoaringBitmap)],
|
||||
group: Position,
|
||||
) -> anyhow::Result<RoaringBitmap>
|
||||
{
|
||||
let mut union_docids = RoaringBitmap::new();
|
||||
for (word, _distance, _positions) in words {
|
||||
// TODO would be better to check if the group exist
|
||||
if let Some(docids) = index.word_four_positions_docids.get(rtxn, &(word, group))? {
|
||||
union_docids.union_with(&docids);
|
||||
}
|
||||
}
|
||||
Ok(union_docids)
|
||||
}
|
||||
|
||||
/// Returns the union of the same attribute for all the given words.
|
||||
fn union_word_attribute(
|
||||
rtxn: &heed::RoTxn,
|
||||
@ -203,6 +221,8 @@ impl<'a> Search<'a> {
|
||||
derived_words: &[Vec<(String, u8, RoaringBitmap)>],
|
||||
union_cache: &mut HashMap<(usize, u32), RoaringBitmap>,
|
||||
non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>,
|
||||
group_four_union_cache: &mut HashMap<(usize, u32), RoaringBitmap>,
|
||||
group_four_non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>,
|
||||
attribute_union_cache: &mut HashMap<(usize, u32), RoaringBitmap>,
|
||||
attribute_non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>,
|
||||
) -> bool
|
||||
@ -214,37 +234,68 @@ impl<'a> Search<'a> {
|
||||
let (rattr, _) = node::extract_position(rpos);
|
||||
|
||||
if lattr == rattr {
|
||||
// We retrieve or compute the intersection between the two given words and positions.
|
||||
*non_disjoint_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| {
|
||||
// We retrieve or compute the unions for the two words and positions.
|
||||
union_cache.entry((lword, lpos)).or_insert_with(|| {
|
||||
let words: &Vec<_> = &derived_words[lword];
|
||||
Self::union_word_position(rtxn, index, words, lpos).unwrap()
|
||||
});
|
||||
union_cache.entry((rword, rpos)).or_insert_with(|| {
|
||||
let words: &Vec<_> = &derived_words[rword];
|
||||
Self::union_word_position(rtxn, index, words, rpos).unwrap()
|
||||
});
|
||||
// TODO move this function to a better place.
|
||||
let lgroup = node::group_of_four(lpos);
|
||||
let rgroup = node::group_of_four(rpos);
|
||||
|
||||
// TODO is there a way to avoid this double gets?
|
||||
let lunion_docids = union_cache.get(&(lword, lpos)).unwrap();
|
||||
let runion_docids = union_cache.get(&(rword, rpos)).unwrap();
|
||||
// We can't compute a disjunction on a group of four positions if those
|
||||
// two positions are in the same group, we must go down to the position.
|
||||
if lgroup == rgroup {
|
||||
// We retrieve or compute the intersection between the two given words and positions.
|
||||
*non_disjoint_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| {
|
||||
// We retrieve or compute the unions for the two words and positions.
|
||||
union_cache.entry((lword, lpos)).or_insert_with(|| {
|
||||
let words = &derived_words[lword];
|
||||
Self::union_word_position(rtxn, index, words, lpos).unwrap()
|
||||
});
|
||||
union_cache.entry((rword, rpos)).or_insert_with(|| {
|
||||
let words = &derived_words[rword];
|
||||
Self::union_word_position(rtxn, index, words, rpos).unwrap()
|
||||
});
|
||||
|
||||
// We first check that the docids of these unions are part of the candidates.
|
||||
if lunion_docids.is_disjoint(candidates) { return false }
|
||||
if runion_docids.is_disjoint(candidates) { return false }
|
||||
// TODO is there a way to avoid this double gets?
|
||||
let lunion_docids = union_cache.get(&(lword, lpos)).unwrap();
|
||||
let runion_docids = union_cache.get(&(rword, rpos)).unwrap();
|
||||
|
||||
!lunion_docids.is_disjoint(&runion_docids)
|
||||
})
|
||||
// We first check that the docids of these unions are part of the candidates.
|
||||
if lunion_docids.is_disjoint(candidates) { return false }
|
||||
if runion_docids.is_disjoint(candidates) { return false }
|
||||
|
||||
!lunion_docids.is_disjoint(&runion_docids)
|
||||
})
|
||||
} else {
|
||||
// We retrieve or compute the intersection between the two given words and positions.
|
||||
*group_four_non_disjoint_cache.entry(((lword, lgroup), (rword, rgroup))).or_insert_with(|| {
|
||||
// We retrieve or compute the unions for the two words and group of four positions.
|
||||
group_four_union_cache.entry((lword, lgroup)).or_insert_with(|| {
|
||||
let words = &derived_words[lword];
|
||||
Self::union_word_four_positions(rtxn, index, words, lgroup).unwrap()
|
||||
});
|
||||
group_four_union_cache.entry((rword, rgroup)).or_insert_with(|| {
|
||||
let words = &derived_words[rword];
|
||||
Self::union_word_four_positions(rtxn, index, words, rgroup).unwrap()
|
||||
});
|
||||
|
||||
// TODO is there a way to avoid this double gets?
|
||||
let lunion_group_docids = group_four_union_cache.get(&(lword, lgroup)).unwrap();
|
||||
let runion_group_docids = group_four_union_cache.get(&(rword, rgroup)).unwrap();
|
||||
|
||||
// We first check that the docids of these unions are part of the candidates.
|
||||
if lunion_group_docids.is_disjoint(candidates) { return false }
|
||||
if runion_group_docids.is_disjoint(candidates) { return false }
|
||||
|
||||
!lunion_group_docids.is_disjoint(&runion_group_docids)
|
||||
})
|
||||
}
|
||||
} else {
|
||||
*attribute_non_disjoint_cache.entry(((lword, lattr), (rword, rattr))).or_insert_with(|| {
|
||||
// We retrieve or compute the unions for the two words and positions.
|
||||
attribute_union_cache.entry((lword, lattr)).or_insert_with(|| {
|
||||
let words: &Vec<_> = &derived_words[lword];
|
||||
let words = &derived_words[lword];
|
||||
Self::union_word_attribute(rtxn, index, words, lattr).unwrap()
|
||||
});
|
||||
attribute_union_cache.entry((rword, rattr)).or_insert_with(|| {
|
||||
let words: &Vec<_> = &derived_words[rword];
|
||||
let words = &derived_words[rword];
|
||||
Self::union_word_attribute(rtxn, index, words, rattr).unwrap()
|
||||
});
|
||||
|
||||
@ -290,6 +341,9 @@ impl<'a> Search<'a> {
|
||||
let union_cache = HashMap::new();
|
||||
let mut non_disjoint_cache = HashMap::new();
|
||||
|
||||
let mut group_four_union_cache = HashMap::new();
|
||||
let mut group_four_non_disjoint_cache = HashMap::new();
|
||||
|
||||
let mut attribute_union_cache = HashMap::new();
|
||||
let mut attribute_non_disjoint_cache = HashMap::new();
|
||||
|
||||
@ -306,13 +360,13 @@ impl<'a> Search<'a> {
|
||||
&derived_words,
|
||||
&mut union_cache_cloned.borrow_mut(),
|
||||
&mut non_disjoint_cache,
|
||||
&mut group_four_union_cache,
|
||||
&mut group_four_non_disjoint_cache,
|
||||
&mut attribute_union_cache,
|
||||
&mut attribute_non_disjoint_cache,
|
||||
)
|
||||
};
|
||||
|
||||
// We instantiate an astar bag Iterator that returns the best paths incrementally,
|
||||
// it means that it will first return the best paths then the next best paths...
|
||||
let astar_iter = AstarBagIter::new(
|
||||
Node::Uninit, // start
|
||||
|n| n.successors(&union_positions, &mut contains_documents), // successors
|
||||
@ -322,7 +376,6 @@ impl<'a> Search<'a> {
|
||||
|
||||
let mut documents = Vec::new();
|
||||
for (paths, proximity) in astar_iter {
|
||||
|
||||
let mut union_cache = union_cache.borrow_mut();
|
||||
let mut candidates = candidates.borrow_mut();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user