Use the cache when retrieving the documents at the end

This commit is contained in:
Kerollmops 2020-06-21 12:24:51 +02:00
parent 1628a31efa
commit 8148210860
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 34 additions and 50 deletions

View File

@ -119,32 +119,29 @@ impl Node {
}
}
pub struct BestProximity<F> {
pub struct BestProximity {
positions: Vec<Vec<u32>>,
best_proximity: u32,
contains_documents: F,
}
impl<F> BestProximity<F> {
pub fn new(positions: Vec<Vec<u32>>, contains_documents: F) -> BestProximity<F> {
impl BestProximity {
pub fn new(positions: Vec<Vec<u32>>) -> BestProximity {
let best_proximity = (positions.len() as u32).saturating_sub(1);
BestProximity { positions, best_proximity, contains_documents }
BestProximity { positions, best_proximity }
}
}
impl<F> Iterator for BestProximity<F>
impl BestProximity {
pub fn next<F>(&mut self, mut contains_documents: F) -> Option<(u32, Vec<Vec<u32>>)>
where F: FnMut((usize, u32), (usize, u32)) -> bool,
{
type Item = (u32, Vec<Vec<u32>>);
fn next(&mut self) -> Option<Self::Item> {
let before = Instant::now();
if self.best_proximity == self.positions.len() as u32 * (MAX_DISTANCE - 1) {
return None;
}
let BestProximity { positions, best_proximity, contains_documents } = self;
let BestProximity { positions, best_proximity } = self;
let result = astar_bag(
&Node::Uninit, // start
@ -152,7 +149,7 @@ where F: FnMut((usize, u32), (usize, u32)) -> bool,
|_| 0, // heuristic
|n| { // success
let c = n.is_complete(&positions) && n.proximity() >= *best_proximity;
if n.is_reachable(contains_documents) { Some(c) } else { None }
if n.is_reachable(&mut contains_documents) { Some(c) } else { None }
},
);
@ -186,16 +183,17 @@ mod tests {
vec![ 1, ],
vec![ 3, 6],
];
let mut iter = BestProximity::new(positions, |_, _| true);
let mut iter = BestProximity::new(positions);
let f = |_, _| true;
assert_eq!(iter.next(), Some((1+2, vec![vec![0, 1, 3]]))); // 3
assert_eq!(iter.next(), Some((2+2, vec![vec![2, 1, 3]]))); // 4
assert_eq!(iter.next(), Some((3+2, vec![vec![3, 1, 3]]))); // 5
assert_eq!(iter.next(), Some((1+5, vec![vec![0, 1, 6], vec![4, 1, 3]]))); // 6
assert_eq!(iter.next(), Some((2+5, vec![vec![2, 1, 6]]))); // 7
assert_eq!(iter.next(), Some((3+5, vec![vec![3, 1, 6]]))); // 8
assert_eq!(iter.next(), Some((4+5, vec![vec![4, 1, 6]]))); // 9
assert_eq!(iter.next(), None);
assert_eq!(iter.next(f), Some((1+2, vec![vec![0, 1, 3]]))); // 3
assert_eq!(iter.next(f), Some((2+2, vec![vec![2, 1, 3]]))); // 4
assert_eq!(iter.next(f), Some((3+2, vec![vec![3, 1, 3]]))); // 5
assert_eq!(iter.next(f), Some((1+5, vec![vec![0, 1, 6], vec![4, 1, 3]]))); // 6
assert_eq!(iter.next(f), Some((2+5, vec![vec![2, 1, 6]]))); // 7
assert_eq!(iter.next(f), Some((3+5, vec![vec![3, 1, 6]]))); // 8
assert_eq!(iter.next(f), Some((4+5, vec![vec![4, 1, 6]]))); // 9
assert_eq!(iter.next(f), None);
}
#[test]
@ -205,12 +203,13 @@ mod tests {
vec![ 1, 1000, 2001 ],
vec![ 3, 6, 2002, 3000],
];
let mut iter = BestProximity::new(positions, |_, _| true);
let mut iter = BestProximity::new(positions);
let f = |_, _| true;
assert_eq!(iter.next(), Some((1+1, vec![vec![2000, 2001, 2002]]))); // 2
assert_eq!(iter.next(), Some((1+2, vec![vec![0, 1, 3]]))); // 3
assert_eq!(iter.next(), Some((2+2, vec![vec![2, 1, 3]]))); // 4
assert_eq!(iter.next(), Some((1+5, vec![vec![0, 1, 6]]))); // 6
assert_eq!(iter.next(f), Some((1+1, vec![vec![2000, 2001, 2002]]))); // 2
assert_eq!(iter.next(f), Some((1+2, vec![vec![0, 1, 3]]))); // 3
assert_eq!(iter.next(f), Some((2+2, vec![vec![2, 1, 3]]))); // 4
assert_eq!(iter.next(f), Some((1+5, vec![vec![0, 1, 6]]))); // 6
// We ignore others here...
}

View File

@ -143,7 +143,7 @@ impl Index {
let mut union_cache = HashMap::new();
let mut intersect_cache = HashMap::new();
// Returns `true` if there are documents in common between the two given words and positions.
let contains_documents = |(lword, lpos): (usize, u32), (rword, rpos): (usize, u32)| {
let mut contains_documents = |(lword, lpos), (rword, rpos), union_cache: &mut HashMap<_, _>| {
let proximity = best_proximity::positions_proximity(lpos, rpos);
if proximity == 0 { return false }
@ -162,7 +162,8 @@ impl Index {
})
};
for (proximity, mut positions) in BestProximity::new(positions, contains_documents) {
let mut iter = BestProximity::new(positions);
while let Some((proximity, mut positions)) = iter.next(|l, r| contains_documents(l, r, &mut union_cache)) {
positions.sort_unstable();
let same_prox_before = Instant::now();
@ -172,34 +173,18 @@ impl Index {
let before = Instant::now();
let mut intersect_docids: Option<RoaringBitmap> = None;
for (derived_words, pos) in words.iter().zip(positions.clone()) {
let mut count = 0;
let mut union_docids = RoaringBitmap::default();
for (word, pos) in positions.iter().enumerate() {
let before = Instant::now();
// TODO re-enable the prefixes system
for (word, attrs) in derived_words.iter() {
if attrs.contains(pos) {
let mut key = word.clone();
key.extend_from_slice(&pos.to_be_bytes());
if let Some(attrs) = self.postings_ids.get(rtxn, &key)? {
let right = RoaringBitmap::deserialize_from_slice(attrs)?;
union_docids.union_with(&right);
count += 1;
}
}
}
let union_docids = union_cache.entry((word, *pos)).or_insert_with(|| unions_word_pos(word, *pos));
let before_intersect = Instant::now();
match &mut intersect_docids {
Some(left) => left.intersect_with(&union_docids),
None => intersect_docids = Some(union_docids),
None => intersect_docids = Some(union_docids.clone()),
}
eprintln!("retrieving {} word took {:.02?} and took {:.02?} to intersect",
count, before.elapsed(), before_intersect.elapsed());
eprintln!("retrieving words took {:.02?} and took {:.02?} to intersect",
before.elapsed(), before_intersect.elapsed());
}
eprintln!("for proximity {:?} {:?} we took {:.02?} to find {} documents",