From ec1023e790f6f1894f1e8da057f1a03f01a18bc9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 5 Jul 2020 19:33:51 +0200 Subject: [PATCH] Intersect document ids by inverse popularity of the words This reduces the worst request we had ("the best of the do"), which took 56s, to 3s. --- src/lib.rs | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 113203adb..a4eecbcaf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -228,20 +228,32 @@ impl Index { for positions in positions { let before = Instant::now(); - let mut intersect_docids: Option = None; - for (word, pos) in positions.iter().enumerate() { - let before = Instant::now(); - let union_docids = union_cache.entry((word, *pos)).or_insert_with(|| unions_word_pos(word, *pos)); + let mut to_intersect: Vec<_> = positions.iter() + .enumerate() + .map(|(word, pos)| { + let docids = union_cache.entry((word, *pos)).or_insert_with(|| unions_word_pos(word, *pos)); + // FIXME don't clone here + (docids.len(), docids.clone()) + }) + .collect(); - let before_intersect = Instant::now(); - match &mut intersect_docids { - Some(left) => left.intersect_with(&union_docids), - None => intersect_docids = Some(union_docids.clone()), - } + to_intersect.sort_unstable_by_key(|(l, _)| *l); + let elapsed_retrieving = before.elapsed(); - eprintln!("retrieving words took {:.02?} and took {:.02?} to intersect", - before.elapsed(), before_intersect.elapsed()); - } + let before_intersect = Instant::now(); + let intersect_docids: Option = to_intersect.into_iter() + .fold(None, |acc, (_, union_docids)| { + match acc { + Some(mut left) => { + left.intersect_with(&union_docids); + Some(left) + }, + None => Some(union_docids.clone()), + } + }); + + eprintln!("retrieving words took {:.02?} and took {:.02?} to intersect", + elapsed_retrieving, before_intersect.elapsed()); eprintln!("for proximity {:?} {:?} we took {:.02?} to find {} documents", proximity, positions, 
before.elapsed(),