Use the cache when retrieving the documents at the end

This commit is contained in:
Kerollmops 2020-06-21 12:24:51 +02:00
parent 1628a31efa
commit 8148210860
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 34 additions and 50 deletions

View File

@ -119,32 +119,29 @@ impl Node {
} }
} }
pub struct BestProximity<F> { pub struct BestProximity {
positions: Vec<Vec<u32>>, positions: Vec<Vec<u32>>,
best_proximity: u32, best_proximity: u32,
contains_documents: F,
} }
impl<F> BestProximity<F> { impl BestProximity {
pub fn new(positions: Vec<Vec<u32>>, contains_documents: F) -> BestProximity<F> { pub fn new(positions: Vec<Vec<u32>>) -> BestProximity {
let best_proximity = (positions.len() as u32).saturating_sub(1); let best_proximity = (positions.len() as u32).saturating_sub(1);
BestProximity { positions, best_proximity, contains_documents } BestProximity { positions, best_proximity }
} }
} }
impl<F> Iterator for BestProximity<F> impl BestProximity {
pub fn next<F>(&mut self, mut contains_documents: F) -> Option<(u32, Vec<Vec<u32>>)>
where F: FnMut((usize, u32), (usize, u32)) -> bool, where F: FnMut((usize, u32), (usize, u32)) -> bool,
{ {
type Item = (u32, Vec<Vec<u32>>);
fn next(&mut self) -> Option<Self::Item> {
let before = Instant::now(); let before = Instant::now();
if self.best_proximity == self.positions.len() as u32 * (MAX_DISTANCE - 1) { if self.best_proximity == self.positions.len() as u32 * (MAX_DISTANCE - 1) {
return None; return None;
} }
let BestProximity { positions, best_proximity, contains_documents } = self; let BestProximity { positions, best_proximity } = self;
let result = astar_bag( let result = astar_bag(
&Node::Uninit, // start &Node::Uninit, // start
@ -152,7 +149,7 @@ where F: FnMut((usize, u32), (usize, u32)) -> bool,
|_| 0, // heuristic |_| 0, // heuristic
|n| { // success |n| { // success
let c = n.is_complete(&positions) && n.proximity() >= *best_proximity; let c = n.is_complete(&positions) && n.proximity() >= *best_proximity;
if n.is_reachable(contains_documents) { Some(c) } else { None } if n.is_reachable(&mut contains_documents) { Some(c) } else { None }
}, },
); );
@ -186,16 +183,17 @@ mod tests {
vec![ 1, ], vec![ 1, ],
vec![ 3, 6], vec![ 3, 6],
]; ];
let mut iter = BestProximity::new(positions, |_, _| true); let mut iter = BestProximity::new(positions);
let f = |_, _| true;
assert_eq!(iter.next(), Some((1+2, vec![vec![0, 1, 3]]))); // 3 assert_eq!(iter.next(f), Some((1+2, vec![vec![0, 1, 3]]))); // 3
assert_eq!(iter.next(), Some((2+2, vec![vec![2, 1, 3]]))); // 4 assert_eq!(iter.next(f), Some((2+2, vec![vec![2, 1, 3]]))); // 4
assert_eq!(iter.next(), Some((3+2, vec![vec![3, 1, 3]]))); // 5 assert_eq!(iter.next(f), Some((3+2, vec![vec![3, 1, 3]]))); // 5
assert_eq!(iter.next(), Some((1+5, vec![vec![0, 1, 6], vec![4, 1, 3]]))); // 6 assert_eq!(iter.next(f), Some((1+5, vec![vec![0, 1, 6], vec![4, 1, 3]]))); // 6
assert_eq!(iter.next(), Some((2+5, vec![vec![2, 1, 6]]))); // 7 assert_eq!(iter.next(f), Some((2+5, vec![vec![2, 1, 6]]))); // 7
assert_eq!(iter.next(), Some((3+5, vec![vec![3, 1, 6]]))); // 8 assert_eq!(iter.next(f), Some((3+5, vec![vec![3, 1, 6]]))); // 8
assert_eq!(iter.next(), Some((4+5, vec![vec![4, 1, 6]]))); // 9 assert_eq!(iter.next(f), Some((4+5, vec![vec![4, 1, 6]]))); // 9
assert_eq!(iter.next(), None); assert_eq!(iter.next(f), None);
} }
#[test] #[test]
@ -205,12 +203,13 @@ mod tests {
vec![ 1, 1000, 2001 ], vec![ 1, 1000, 2001 ],
vec![ 3, 6, 2002, 3000], vec![ 3, 6, 2002, 3000],
]; ];
let mut iter = BestProximity::new(positions, |_, _| true); let mut iter = BestProximity::new(positions);
let f = |_, _| true;
assert_eq!(iter.next(), Some((1+1, vec![vec![2000, 2001, 2002]]))); // 2 assert_eq!(iter.next(f), Some((1+1, vec![vec![2000, 2001, 2002]]))); // 2
assert_eq!(iter.next(), Some((1+2, vec![vec![0, 1, 3]]))); // 3 assert_eq!(iter.next(f), Some((1+2, vec![vec![0, 1, 3]]))); // 3
assert_eq!(iter.next(), Some((2+2, vec![vec![2, 1, 3]]))); // 4 assert_eq!(iter.next(f), Some((2+2, vec![vec![2, 1, 3]]))); // 4
assert_eq!(iter.next(), Some((1+5, vec![vec![0, 1, 6]]))); // 6 assert_eq!(iter.next(f), Some((1+5, vec![vec![0, 1, 6]]))); // 6
// We ignore others here... // We ignore others here...
} }

View File

@ -143,7 +143,7 @@ impl Index {
let mut union_cache = HashMap::new(); let mut union_cache = HashMap::new();
let mut intersect_cache = HashMap::new(); let mut intersect_cache = HashMap::new();
// Returns `true` if there are documents in common between the two words and positions given. // Returns `true` if there are documents in common between the two words and positions given.
let contains_documents = |(lword, lpos): (usize, u32), (rword, rpos): (usize, u32)| { let mut contains_documents = |(lword, lpos), (rword, rpos), union_cache: &mut HashMap<_, _>| {
let proximity = best_proximity::positions_proximity(lpos, rpos); let proximity = best_proximity::positions_proximity(lpos, rpos);
if proximity == 0 { return false } if proximity == 0 { return false }
@ -162,7 +162,8 @@ impl Index {
}) })
}; };
for (proximity, mut positions) in BestProximity::new(positions, contains_documents) { let mut iter = BestProximity::new(positions);
while let Some((proximity, mut positions)) = iter.next(|l, r| contains_documents(l, r, &mut union_cache)) {
positions.sort_unstable(); positions.sort_unstable();
let same_prox_before = Instant::now(); let same_prox_before = Instant::now();
@ -172,34 +173,18 @@ impl Index {
let before = Instant::now(); let before = Instant::now();
let mut intersect_docids: Option<RoaringBitmap> = None; let mut intersect_docids: Option<RoaringBitmap> = None;
for (derived_words, pos) in words.iter().zip(positions.clone()) { for (word, pos) in positions.iter().enumerate() {
let mut count = 0;
let mut union_docids = RoaringBitmap::default();
let before = Instant::now(); let before = Instant::now();
let union_docids = union_cache.entry((word, *pos)).or_insert_with(|| unions_word_pos(word, *pos));
// TODO re-enable the prefixes system
for (word, attrs) in derived_words.iter() {
if attrs.contains(pos) {
let mut key = word.clone();
key.extend_from_slice(&pos.to_be_bytes());
if let Some(attrs) = self.postings_ids.get(rtxn, &key)? {
let right = RoaringBitmap::deserialize_from_slice(attrs)?;
union_docids.union_with(&right);
count += 1;
}
}
}
let before_intersect = Instant::now(); let before_intersect = Instant::now();
match &mut intersect_docids { match &mut intersect_docids {
Some(left) => left.intersect_with(&union_docids), Some(left) => left.intersect_with(&union_docids),
None => intersect_docids = Some(union_docids), None => intersect_docids = Some(union_docids.clone()),
} }
eprintln!("retrieving {} word took {:.02?} and took {:.02?} to intersect", eprintln!("retrieving words took {:.02?} and took {:.02?} to intersect",
count, before.elapsed(), before_intersect.elapsed()); before.elapsed(), before_intersect.elapsed());
} }
eprintln!("for proximity {:?} {:?} we took {:.02?} to find {} documents", eprintln!("for proximity {:?} {:?} we took {:.02?} to find {} documents",