mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-03 10:21:38 +01:00
Cache the unions of the derived words positions
This commit is contained in:
parent
115e0142d9
commit
1628a31efa
@ -43,7 +43,6 @@ criterion = "0.3"
|
|||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
intersect-to-csv = []
|
|
||||||
|
|
||||||
[[bench]]
|
[[bench]]
|
||||||
name = "search"
|
name = "search"
|
||||||
|
140
src/lib.rs
140
src/lib.rs
@ -123,68 +123,42 @@ impl Index {
|
|||||||
|
|
||||||
let mut documents = Vec::new();
|
let mut documents = Vec::new();
|
||||||
|
|
||||||
let mut debug_intersects = HashMap::new();
|
// Returns the union of the same position for all the derived words.
|
||||||
|
let unions_word_pos = |word: usize, pos: u32| {
|
||||||
|
let mut union_docids = RoaringBitmap::new();
|
||||||
|
for (word, attrs) in &words[word] {
|
||||||
|
if attrs.contains(pos) {
|
||||||
|
let mut key = word.clone();
|
||||||
|
key.extend_from_slice(&pos.to_be_bytes());
|
||||||
|
if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() {
|
||||||
|
let right = RoaringBitmap::deserialize_from_slice(attrs).unwrap();
|
||||||
|
union_docids.union_with(&right);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
union_docids
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut union_cache = HashMap::new();
|
||||||
let mut intersect_cache = HashMap::new();
|
let mut intersect_cache = HashMap::new();
|
||||||
let mut lunion_docids = RoaringBitmap::default();
|
// Returns `true` if there are documents in common between the two given words and positions.
|
||||||
let mut runion_docids = RoaringBitmap::default();
|
|
||||||
let contains_documents = |(lword, lpos): (usize, u32), (rword, rpos): (usize, u32)| {
|
let contains_documents = |(lword, lpos): (usize, u32), (rword, rpos): (usize, u32)| {
|
||||||
let proximity = best_proximity::positions_proximity(lpos, rpos);
|
let proximity = best_proximity::positions_proximity(lpos, rpos);
|
||||||
|
|
||||||
if proximity == 0 { return false }
|
if proximity == 0 { return false }
|
||||||
|
|
||||||
|
// We retrieve or compute the intersection between the two given words and positions.
|
||||||
*intersect_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| {
|
*intersect_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| {
|
||||||
let (nb_words, nb_docs_intersect, lnblookups, lnbbitmaps, rnblookups, rnbbitmaps) =
|
// We retrieve or compute the unions for the two words and positions.
|
||||||
debug_intersects.entry((lword, lpos, rword, rpos, proximity)).or_default();
|
union_cache.entry((lword, lpos)).or_insert_with(|| unions_word_pos(lword, lpos));
|
||||||
|
union_cache.entry((rword, rpos)).or_insert_with(|| unions_word_pos(rword, rpos));
|
||||||
|
|
||||||
let left = &words[lword];
|
// TODO: is there a way to avoid these double gets?
|
||||||
let right = &words[rword];
|
let lunion_docids = union_cache.get(&(lword, lpos)).unwrap();
|
||||||
|
let runion_docids = union_cache.get(&(rword, rpos)).unwrap();
|
||||||
|
|
||||||
*nb_words = left.len() + right.len();
|
!lunion_docids.is_disjoint(&runion_docids)
|
||||||
|
|
||||||
let mut l_lookups = 0;
|
|
||||||
let mut l_bitmaps = 0;
|
|
||||||
let mut r_lookups = 0;
|
|
||||||
let mut r_bitmaps = 0;
|
|
||||||
|
|
||||||
// This is for the left word
|
|
||||||
lunion_docids.clear();
|
|
||||||
for (word, attrs) in left {
|
|
||||||
if attrs.contains(lpos) {
|
|
||||||
l_lookups += 1;
|
|
||||||
let mut key = word.clone();
|
|
||||||
key.extend_from_slice(&lpos.to_be_bytes());
|
|
||||||
if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() {
|
|
||||||
l_bitmaps += 1;
|
|
||||||
let right = RoaringBitmap::deserialize_from_slice(attrs).unwrap();
|
|
||||||
lunion_docids.union_with(&right);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// This is for the right word
|
|
||||||
runion_docids.clear();
|
|
||||||
for (word, attrs) in right {
|
|
||||||
if attrs.contains(rpos) {
|
|
||||||
r_lookups += 1;
|
|
||||||
let mut key = word.clone();
|
|
||||||
key.extend_from_slice(&rpos.to_be_bytes());
|
|
||||||
if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() {
|
|
||||||
r_bitmaps += 1;
|
|
||||||
let right = RoaringBitmap::deserialize_from_slice(attrs).unwrap();
|
|
||||||
runion_docids.union_with(&right);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let intersect_docids = &mut lunion_docids;
|
|
||||||
intersect_docids.intersect_with(&runion_docids);
|
|
||||||
|
|
||||||
*lnblookups = l_lookups;
|
|
||||||
*lnbbitmaps = l_bitmaps;
|
|
||||||
*rnblookups = r_lookups;
|
|
||||||
*rnbbitmaps = r_bitmaps;
|
|
||||||
*nb_docs_intersect += intersect_docids.len();
|
|
||||||
|
|
||||||
!intersect_docids.is_empty()
|
|
||||||
})
|
})
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -262,65 +236,7 @@ impl Index {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if cfg!(feature = "intersect-to-csv") {
|
|
||||||
debug_intersects_to_csv(debug_intersects);
|
|
||||||
}
|
|
||||||
|
|
||||||
eprintln!("{} candidates", documents.iter().map(RoaringBitmap::len).sum::<u64>());
|
eprintln!("{} candidates", documents.iter().map(RoaringBitmap::len).sum::<u64>());
|
||||||
Ok(documents.iter().flatten().take(20).collect())
|
Ok(documents.iter().flatten().take(20).collect())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn debug_intersects_to_csv(intersects: HashMap<(usize, u32, usize, u32, u32), (usize, u64, usize, usize, usize, usize)>) {
|
|
||||||
let mut wrt = csv::Writer::from_path("intersects-stats.csv").unwrap();
|
|
||||||
wrt.write_record(&[
|
|
||||||
"proximity",
|
|
||||||
"lword",
|
|
||||||
"lpos",
|
|
||||||
"rword",
|
|
||||||
"rpos",
|
|
||||||
"nb_derived_words",
|
|
||||||
"nb_docs_intersect",
|
|
||||||
"lnblookups",
|
|
||||||
"lnbbitmaps",
|
|
||||||
"rnblookups",
|
|
||||||
"rnbbitmaps",
|
|
||||||
]).unwrap();
|
|
||||||
|
|
||||||
for ((lword, lpos, rword, rpos, proximity), vals) in intersects {
|
|
||||||
let (
|
|
||||||
nb_derived_words,
|
|
||||||
nb_docs_intersect,
|
|
||||||
lnblookups,
|
|
||||||
lnbbitmaps,
|
|
||||||
rnblookups,
|
|
||||||
rnbbitmaps,
|
|
||||||
) = vals;
|
|
||||||
|
|
||||||
let proximity = proximity.to_string();
|
|
||||||
let lword = lword.to_string();
|
|
||||||
let lpos = lpos.to_string();
|
|
||||||
let rword = rword.to_string();
|
|
||||||
let rpos = rpos.to_string();
|
|
||||||
let nb_derived_words = nb_derived_words.to_string();
|
|
||||||
let nb_docs_intersect = nb_docs_intersect.to_string();
|
|
||||||
let lnblookups = lnblookups.to_string();
|
|
||||||
let lnbbitmaps = lnbbitmaps.to_string();
|
|
||||||
let rnblookups = rnblookups.to_string();
|
|
||||||
let rnbbitmaps = rnbbitmaps.to_string();
|
|
||||||
|
|
||||||
wrt.write_record(&[
|
|
||||||
&proximity,
|
|
||||||
&lword,
|
|
||||||
&lpos,
|
|
||||||
&rword,
|
|
||||||
&rpos,
|
|
||||||
&nb_derived_words,
|
|
||||||
&nb_docs_intersect,
|
|
||||||
&lnblookups,
|
|
||||||
&lnbbitmaps,
|
|
||||||
&rnblookups,
|
|
||||||
&rnbbitmaps,
|
|
||||||
]).unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user