Intersect document ids by inverse popularity of the words

This reduces the duration of our worst request ("the best of the do") from 56s to 3s.
This commit is contained in:
Kerollmops 2020-07-05 19:33:51 +02:00
parent cd7e64b2b3
commit ec1023e790
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -228,20 +228,32 @@ impl Index {
for positions in positions { for positions in positions {
let before = Instant::now(); let before = Instant::now();
let mut intersect_docids: Option<RoaringBitmap> = None; let mut to_intersect: Vec<_> = positions.iter()
for (word, pos) in positions.iter().enumerate() { .enumerate()
let before = Instant::now(); .map(|(word, pos)| {
let union_docids = union_cache.entry((word, *pos)).or_insert_with(|| unions_word_pos(word, *pos)); let docids = union_cache.entry((word, *pos)).or_insert_with(|| unions_word_pos(word, *pos));
// FIXME don't clone here
(docids.len(), docids.clone())
})
.collect();
to_intersect.sort_unstable_by_key(|(l, _)| *l);
let elapsed_retrieving = before.elapsed();
let before_intersect = Instant::now(); let before_intersect = Instant::now();
match &mut intersect_docids { let intersect_docids: Option<RoaringBitmap> = to_intersect.into_iter()
Some(left) => left.intersect_with(&union_docids), .fold(None, |acc, (_, union_docids)| {
None => intersect_docids = Some(union_docids.clone()), match acc {
Some(mut left) => {
left.intersect_with(&union_docids);
Some(left)
},
None => Some(union_docids.clone()),
} }
});
eprintln!("retrieving words took {:.02?} and took {:.02?} to intersect", eprintln!("retrieving words took {:.02?} and took {:.02?} to intersect",
before.elapsed(), before_intersect.elapsed()); elapsed_retrieving, before_intersect.elapsed());
}
eprintln!("for proximity {:?} {:?} we took {:.02?} to find {} documents", eprintln!("for proximity {:?} {:?} we took {:.02?} to find {} documents",
proximity, positions, before.elapsed(), proximity, positions, before.elapsed(),