From 81482108601955c69110e473a3854d16283b97c1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 21 Jun 2020 12:24:51 +0200 Subject: [PATCH] Use the cache when retrieving the documents at the end --- src/best_proximity.rs | 53 +++++++++++++++++++++---------------------- src/lib.rs | 31 +++++++------------------ 2 files changed, 34 insertions(+), 50 deletions(-) diff --git a/src/best_proximity.rs b/src/best_proximity.rs index 467e72a5a..cb6302a16 100644 --- a/src/best_proximity.rs +++ b/src/best_proximity.rs @@ -119,32 +119,29 @@ impl Node { } } -pub struct BestProximity { +pub struct BestProximity { positions: Vec>, best_proximity: u32, - contains_documents: F, } -impl BestProximity { - pub fn new(positions: Vec>, contains_documents: F) -> BestProximity { +impl BestProximity { + pub fn new(positions: Vec>) -> BestProximity { let best_proximity = (positions.len() as u32).saturating_sub(1); - BestProximity { positions, best_proximity, contains_documents } + BestProximity { positions, best_proximity } } } -impl Iterator for BestProximity -where F: FnMut((usize, u32), (usize, u32)) -> bool, -{ - type Item = (u32, Vec>); - - fn next(&mut self) -> Option { +impl BestProximity { + pub fn next(&mut self, mut contains_documents: F) -> Option<(u32, Vec>)> + where F: FnMut((usize, u32), (usize, u32)) -> bool, + { let before = Instant::now(); if self.best_proximity == self.positions.len() as u32 * (MAX_DISTANCE - 1) { return None; } - let BestProximity { positions, best_proximity, contains_documents } = self; + let BestProximity { positions, best_proximity } = self; let result = astar_bag( &Node::Uninit, // start @@ -152,7 +149,7 @@ where F: FnMut((usize, u32), (usize, u32)) -> bool, |_| 0, // heuristic |n| { // success let c = n.is_complete(&positions) && n.proximity() >= *best_proximity; - if n.is_reachable(contains_documents) { Some(c) } else { None } + if n.is_reachable(&mut contains_documents) { Some(c) } else { None } }, ); @@ -186,16 +183,17 @@ mod tests { vec![ 1, ], vec![ 3, 6], ]; - let mut iter = BestProximity::new(positions, |_, _| true); + let mut iter = BestProximity::new(positions); + let f = |_, _| true; - assert_eq!(iter.next(), Some((1+2, vec![vec![0, 1, 3]]))); // 3 - assert_eq!(iter.next(), Some((2+2, vec![vec![2, 1, 3]]))); // 4 - assert_eq!(iter.next(), Some((3+2, vec![vec![3, 1, 3]]))); // 5 - assert_eq!(iter.next(), Some((1+5, vec![vec![0, 1, 6], vec![4, 1, 3]]))); // 6 - assert_eq!(iter.next(), Some((2+5, vec![vec![2, 1, 6]]))); // 7 - assert_eq!(iter.next(), Some((3+5, vec![vec![3, 1, 6]]))); // 8 - assert_eq!(iter.next(), Some((4+5, vec![vec![4, 1, 6]]))); // 9 - assert_eq!(iter.next(), None); + assert_eq!(iter.next(f), Some((1+2, vec![vec![0, 1, 3]]))); // 3 + assert_eq!(iter.next(f), Some((2+2, vec![vec![2, 1, 3]]))); // 4 + assert_eq!(iter.next(f), Some((3+2, vec![vec![3, 1, 3]]))); // 5 + assert_eq!(iter.next(f), Some((1+5, vec![vec![0, 1, 6], vec![4, 1, 3]]))); // 6 + assert_eq!(iter.next(f), Some((2+5, vec![vec![2, 1, 6]]))); // 7 + assert_eq!(iter.next(f), Some((3+5, vec![vec![3, 1, 6]]))); // 8 + assert_eq!(iter.next(f), Some((4+5, vec![vec![4, 1, 6]]))); // 9 + assert_eq!(iter.next(f), None); } #[test] @@ -205,12 +203,13 @@ mod tests { vec![ 1, 1000, 2001 ], vec![ 3, 6, 2002, 3000], ]; - let mut iter = BestProximity::new(positions, |_, _| true); + let mut iter = BestProximity::new(positions); + let f = |_, _| true; - assert_eq!(iter.next(), Some((1+1, vec![vec![2000, 2001, 2002]]))); // 2 - assert_eq!(iter.next(), Some((1+2, vec![vec![0, 1, 3]]))); // 3 - assert_eq!(iter.next(), Some((2+2, vec![vec![2, 1, 3]]))); // 4 - assert_eq!(iter.next(), Some((1+5, vec![vec![0, 1, 6]]))); // 6 + assert_eq!(iter.next(f), Some((1+1, vec![vec![2000, 2001, 2002]]))); // 2 + assert_eq!(iter.next(f), Some((1+2, vec![vec![0, 1, 3]]))); // 3 + assert_eq!(iter.next(f), Some((2+2, vec![vec![2, 1, 3]]))); // 4 + assert_eq!(iter.next(f), Some((1+5, vec![vec![0, 1, 6]]))); // 6 // We ignore others here... } diff --git a/src/lib.rs b/src/lib.rs index ead9ee278..f09158fb7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -143,7 +143,7 @@ impl Index { let mut union_cache = HashMap::new(); let mut intersect_cache = HashMap::new(); // Returns `true` if there is documents in common between the two words and positions given. - let contains_documents = |(lword, lpos): (usize, u32), (rword, rpos): (usize, u32)| { + let mut contains_documents = |(lword, lpos), (rword, rpos), union_cache: &mut HashMap<_, _>| { let proximity = best_proximity::positions_proximity(lpos, rpos); if proximity == 0 { return false } @@ -162,7 +162,8 @@ impl Index { }) }; - for (proximity, mut positions) in BestProximity::new(positions, contains_documents) { + let mut iter = BestProximity::new(positions); + while let Some((proximity, mut positions)) = iter.next(|l, r| contains_documents(l, r, &mut union_cache)) { positions.sort_unstable(); let same_prox_before = Instant::now(); @@ -172,34 +173,18 @@ impl Index { let before = Instant::now(); let mut intersect_docids: Option = None; - for (derived_words, pos) in words.iter().zip(positions.clone()) { - let mut count = 0; - let mut union_docids = RoaringBitmap::default(); - + for (word, pos) in positions.iter().enumerate() { let before = Instant::now(); - - // TODO re-enable the prefixes system - for (word, attrs) in derived_words.iter() { - if attrs.contains(pos) { - let mut key = word.clone(); - key.extend_from_slice(&pos.to_be_bytes()); - if let Some(attrs) = self.postings_ids.get(rtxn, &key)? { - let right = RoaringBitmap::deserialize_from_slice(attrs)?; - union_docids.union_with(&right); - count += 1; - } - } - } + let union_docids = union_cache.entry((word, *pos)).or_insert_with(|| unions_word_pos(word, *pos)); let before_intersect = Instant::now(); - match &mut intersect_docids { Some(left) => left.intersect_with(&union_docids), - None => intersect_docids = Some(union_docids), + None => intersect_docids = Some(union_docids.clone()), } - eprintln!("retrieving {} word took {:.02?} and took {:.02?} to intersect", - count, before.elapsed(), before_intersect.elapsed()); + eprintln!("retrieving words took {:.02?} and took {:.02?} to intersect", + before.elapsed(), before_intersect.elapsed()); } eprintln!("for proximity {:?} {:?} we took {:.02?} to find {} documents",