Always do the intersections with the universe

This commit is contained in:
Clément Renault 2024-06-21 14:26:05 +02:00
parent 50a7393c55
commit 0ca1a4e805
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
9 changed files with 201 additions and 107 deletions

View File

@ -46,34 +46,68 @@ pub struct DatabaseCache<'ctx> {
pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>, pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
} }
impl<'ctx> DatabaseCache<'ctx> { impl<'ctx> DatabaseCache<'ctx> {
fn get_value<'v, K1, KC, DC>( fn get_value<'v, K1, KC>(
txn: &'ctx RoTxn<'_>, txn: &'ctx RoTxn<'_>,
cache_key: K1, cache_key: K1,
db_key: &'v KC::EItem, db_key: &'v KC::EItem,
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>, cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
universe: Option<&RoaringBitmap>,
db: Database<KC, Bytes>, db: Database<KC, Bytes>,
) -> Result<Option<DC::DItem>> ) -> Result<Option<RoaringBitmap>>
where where
K1: Copy + Eq + Hash, K1: Copy + Eq + Hash,
KC: BytesEncode<'v>, KC: BytesEncode<'v>,
DC: BytesDecodeOwned,
{ {
if let Entry::Vacant(entry) = cache.entry(cache_key) { if let Entry::Vacant(entry) = cache.entry(cache_key) {
let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed); let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
entry.insert(bitmap_ptr); entry.insert(bitmap_ptr);
} }
match cache.get(&cache_key).unwrap() { let bitmap_bytes = match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes) Some(Cow::Borrowed(bytes)) => bytes,
Some(Cow::Owned(bytes)) => bytes.as_slice(),
None => return Ok(None),
};
match (bitmap_bytes, universe) {
(bytes, Some(universe)) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
.map(Some) .map(Some)
.map_err(heed::Error::Decoding) .map_err(Into::into)
.map_err(Into::into),
Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
None => Ok(None),
} }
(bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
}
}
fn get_value_length<'v, K1, KC>(
txn: &'ctx RoTxn<'_>,
cache_key: K1,
db_key: &'v KC::EItem,
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
db: Database<KC, Bytes>,
) -> Result<Option<u64>>
where
K1: Copy + Eq + Hash,
KC: BytesEncode<'v>,
{
if let Entry::Vacant(entry) = cache.entry(cache_key) {
let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
entry.insert(bitmap_ptr);
}
let bitmap_bytes = match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => bytes,
Some(Cow::Owned(bytes)) => bytes.as_slice(),
None => return Ok(None),
};
CboRoaringBitmapLenCodec::bytes_decode_owned(bitmap_bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into)
} }
fn get_value_from_keys<'v, K1, KC, DC>( fn get_value_from_keys<'v, K1, KC, DC>(
@ -137,11 +171,15 @@ impl<'ctx> SearchContext<'ctx> {
} }
} }
pub fn word_docids(&mut self, word: Word) -> Result<Option<RoaringBitmap>> { pub fn word_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word: Word,
) -> Result<Option<RoaringBitmap>> {
match word { match word {
Word::Original(word) => { Word::Original(word) => {
let exact = self.get_db_exact_word_docids(word)?; let exact = self.get_db_exact_word_docids(universe, word)?;
let tolerant = self.get_db_word_docids(word)?; let tolerant = self.get_db_word_docids(universe, word)?;
Ok(match (exact, tolerant) { Ok(match (exact, tolerant) {
(None, None) => None, (None, None) => None,
(None, Some(tolerant)) => Some(tolerant), (None, Some(tolerant)) => Some(tolerant),
@ -153,12 +191,16 @@ impl<'ctx> SearchContext<'ctx> {
} }
}) })
} }
Word::Derived(word) => self.get_db_word_docids(word), Word::Derived(word) => self.get_db_word_docids(universe, word),
} }
} }
/// Retrieve or insert the given value in the `word_docids` database. /// Retrieve or insert the given value in the `word_docids` database.
fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<RoaringBitmap>> { fn get_db_word_docids(
&mut self,
universe: Option<&RoaringBitmap>,
word: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids { match &self.restricted_fids {
Some(restricted_fids) => { Some(restricted_fids) => {
let interned = self.word_interner.get(word).as_str(); let interned = self.word_interner.get(word).as_str();
@ -174,11 +216,12 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
) )
} }
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( None => DatabaseCache::get_value::<_, _>(
self.txn, self.txn,
word, word,
self.word_interner.get(word).as_str(), self.word_interner.get(word).as_str(),
&mut self.db_cache.word_docids, &mut self.db_cache.word_docids,
universe,
self.index.word_docids.remap_data_type::<Bytes>(), self.index.word_docids.remap_data_type::<Bytes>(),
), ),
} }
@ -186,6 +229,7 @@ impl<'ctx> SearchContext<'ctx> {
fn get_db_exact_word_docids( fn get_db_exact_word_docids(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
word: Interned<String>, word: Interned<String>,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids { match &self.restricted_fids {
@ -203,21 +247,26 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
) )
} }
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( None => DatabaseCache::get_value::<_, _>(
self.txn, self.txn,
word, word,
self.word_interner.get(word).as_str(), self.word_interner.get(word).as_str(),
&mut self.db_cache.exact_word_docids, &mut self.db_cache.exact_word_docids,
universe,
self.index.exact_word_docids.remap_data_type::<Bytes>(), self.index.exact_word_docids.remap_data_type::<Bytes>(),
), ),
} }
} }
pub fn word_prefix_docids(&mut self, prefix: Word) -> Result<Option<RoaringBitmap>> { pub fn word_prefix_docids(
&mut self,
universe: Option<&RoaringBitmap>,
prefix: Word,
) -> Result<Option<RoaringBitmap>> {
match prefix { match prefix {
Word::Original(prefix) => { Word::Original(prefix) => {
let exact = self.get_db_exact_word_prefix_docids(prefix)?; let exact = self.get_db_exact_word_prefix_docids(universe, prefix)?;
let tolerant = self.get_db_word_prefix_docids(prefix)?; let tolerant = self.get_db_word_prefix_docids(universe, prefix)?;
Ok(match (exact, tolerant) { Ok(match (exact, tolerant) {
(None, None) => None, (None, None) => None,
(None, Some(tolerant)) => Some(tolerant), (None, Some(tolerant)) => Some(tolerant),
@ -229,13 +278,14 @@ impl<'ctx> SearchContext<'ctx> {
} }
}) })
} }
Word::Derived(prefix) => self.get_db_word_prefix_docids(prefix), Word::Derived(prefix) => self.get_db_word_prefix_docids(universe, prefix),
} }
} }
/// Retrieve or insert the given value in the `word_prefix_docids` database. /// Retrieve or insert the given value in the `word_prefix_docids` database.
fn get_db_word_prefix_docids( fn get_db_word_prefix_docids(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
prefix: Interned<String>, prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids { match &self.restricted_fids {
@ -253,11 +303,12 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
) )
} }
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( None => DatabaseCache::get_value::<_, _>(
self.txn, self.txn,
prefix, prefix,
self.word_interner.get(prefix).as_str(), self.word_interner.get(prefix).as_str(),
&mut self.db_cache.word_prefix_docids, &mut self.db_cache.word_prefix_docids,
universe,
self.index.word_prefix_docids.remap_data_type::<Bytes>(), self.index.word_prefix_docids.remap_data_type::<Bytes>(),
), ),
} }
@ -265,6 +316,7 @@ impl<'ctx> SearchContext<'ctx> {
fn get_db_exact_word_prefix_docids( fn get_db_exact_word_prefix_docids(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
prefix: Interned<String>, prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
match &self.restricted_fids { match &self.restricted_fids {
@ -282,11 +334,12 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps, merge_cbo_roaring_bitmaps,
) )
} }
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( None => DatabaseCache::get_value::<_, _>(
self.txn, self.txn,
prefix, prefix,
self.word_interner.get(prefix).as_str(), self.word_interner.get(prefix).as_str(),
&mut self.db_cache.exact_word_prefix_docids, &mut self.db_cache.exact_word_prefix_docids,
universe,
self.index.exact_word_prefix_docids.remap_data_type::<Bytes>(), self.index.exact_word_prefix_docids.remap_data_type::<Bytes>(),
), ),
} }
@ -294,6 +347,7 @@ impl<'ctx> SearchContext<'ctx> {
pub fn get_db_word_pair_proximity_docids( pub fn get_db_word_pair_proximity_docids(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
word1: Interned<String>, word1: Interned<String>,
word2: Interned<String>, word2: Interned<String>,
proximity: u8, proximity: u8,
@ -320,8 +374,8 @@ impl<'ctx> SearchContext<'ctx> {
for fid in fids { for fid in fids {
// for each field, intersect left word bitmap and right word bitmap, // for each field, intersect left word bitmap and right word bitmap,
// then merge the result in a global bitmap before storing it in the cache. // then merge the result in a global bitmap before storing it in the cache.
let word1_docids = self.get_db_word_fid_docids(word1, fid)?; let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?;
let word2_docids = self.get_db_word_fid_docids(word2, fid)?; let word2_docids = self.get_db_word_fid_docids(universe, word2, fid)?;
if let (Some(word1_docids), Some(word2_docids)) = if let (Some(word1_docids), Some(word2_docids)) =
(word1_docids, word2_docids) (word1_docids, word2_docids)
{ {
@ -341,7 +395,7 @@ impl<'ctx> SearchContext<'ctx> {
Ok(docids) Ok(docids)
} }
ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _>(
self.txn, self.txn,
(proximity, word1, word2), (proximity, word1, word2),
&( &(
@ -350,6 +404,7 @@ impl<'ctx> SearchContext<'ctx> {
self.word_interner.get(word2).as_str(), self.word_interner.get(word2).as_str(),
), ),
&mut self.db_cache.word_pair_proximity_docids, &mut self.db_cache.word_pair_proximity_docids,
universe,
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(), self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
), ),
} }
@ -357,16 +412,16 @@ impl<'ctx> SearchContext<'ctx> {
pub fn get_db_word_pair_proximity_docids_len( pub fn get_db_word_pair_proximity_docids_len(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
word1: Interned<String>, word1: Interned<String>,
word2: Interned<String>, word2: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<u64>> { ) -> Result<Option<u64>> {
match self.index.proximity_precision(self.txn)?.unwrap_or_default() { match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
ProximityPrecision::ByAttribute => Ok(self ProximityPrecision::ByAttribute => Ok(self
.get_db_word_pair_proximity_docids(word1, word2, proximity)? .get_db_word_pair_proximity_docids(universe, word1, word2, proximity)?
.map(|d| d.len())), .map(|d| d.len())),
ProximityPrecision::ByWord => { ProximityPrecision::ByWord => DatabaseCache::get_value_length::<_, _>(
DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>(
self.txn, self.txn,
(proximity, word1, word2), (proximity, word1, word2),
&( &(
@ -376,13 +431,13 @@ impl<'ctx> SearchContext<'ctx> {
), ),
&mut self.db_cache.word_pair_proximity_docids, &mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(), self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
) ),
}
} }
} }
pub fn get_db_word_prefix_pair_proximity_docids( pub fn get_db_word_prefix_pair_proximity_docids(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
word1: Interned<String>, word1: Interned<String>,
prefix2: Interned<String>, prefix2: Interned<String>,
mut proximity: u8, mut proximity: u8,
@ -409,8 +464,9 @@ impl<'ctx> SearchContext<'ctx> {
// for each field, intersect left word bitmap and right word bitmap, // for each field, intersect left word bitmap and right word bitmap,
// then merge the result in a global bitmap before storing it in the cache. // then merge the result in a global bitmap before storing it in the cache.
for fid in fids { for fid in fids {
let word1_docids = self.get_db_word_fid_docids(word1, fid)?; let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?;
let prefix2_docids = self.get_db_word_prefix_fid_docids(prefix2, fid)?; let prefix2_docids =
self.get_db_word_prefix_fid_docids(universe, prefix2, fid)?;
if let (Some(word1_docids), Some(prefix2_docids)) = if let (Some(word1_docids), Some(prefix2_docids)) =
(word1_docids, prefix2_docids) (word1_docids, prefix2_docids)
{ {
@ -452,16 +508,18 @@ impl<'ctx> SearchContext<'ctx> {
pub fn get_db_prefix_word_pair_proximity_docids( pub fn get_db_prefix_word_pair_proximity_docids(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
left_prefix: Interned<String>, left_prefix: Interned<String>,
right: Interned<String>, right: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
// only accept exact matches on reverted positions // only accept exact matches on reverted positions
self.get_db_word_pair_proximity_docids(left_prefix, right, proximity) self.get_db_word_pair_proximity_docids(universe, left_prefix, right, proximity)
} }
pub fn get_db_word_fid_docids( pub fn get_db_word_fid_docids(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
word: Interned<String>, word: Interned<String>,
fid: u16, fid: u16,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
@ -470,17 +528,19 @@ impl<'ctx> SearchContext<'ctx> {
return Ok(None); return Ok(None);
} }
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( DatabaseCache::get_value::<_, _>(
self.txn, self.txn,
(word, fid), (word, fid),
&(self.word_interner.get(word).as_str(), fid), &(self.word_interner.get(word).as_str(), fid),
&mut self.db_cache.word_fid_docids, &mut self.db_cache.word_fid_docids,
universe,
self.index.word_fid_docids.remap_data_type::<Bytes>(), self.index.word_fid_docids.remap_data_type::<Bytes>(),
) )
} }
pub fn get_db_word_prefix_fid_docids( pub fn get_db_word_prefix_fid_docids(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
word_prefix: Interned<String>, word_prefix: Interned<String>,
fid: u16, fid: u16,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
@ -489,11 +549,12 @@ impl<'ctx> SearchContext<'ctx> {
return Ok(None); return Ok(None);
} }
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( DatabaseCache::get_value::<_, _>(
self.txn, self.txn,
(word_prefix, fid), (word_prefix, fid),
&(self.word_interner.get(word_prefix).as_str(), fid), &(self.word_interner.get(word_prefix).as_str(), fid),
&mut self.db_cache.word_prefix_fid_docids, &mut self.db_cache.word_prefix_fid_docids,
universe,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(), self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
) )
} }
@ -554,28 +615,32 @@ impl<'ctx> SearchContext<'ctx> {
pub fn get_db_word_position_docids( pub fn get_db_word_position_docids(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
word: Interned<String>, word: Interned<String>,
position: u16, position: u16,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( DatabaseCache::get_value::<_, _>(
self.txn, self.txn,
(word, position), (word, position),
&(self.word_interner.get(word).as_str(), position), &(self.word_interner.get(word).as_str(), position),
&mut self.db_cache.word_position_docids, &mut self.db_cache.word_position_docids,
universe,
self.index.word_position_docids.remap_data_type::<Bytes>(), self.index.word_position_docids.remap_data_type::<Bytes>(),
) )
} }
pub fn get_db_word_prefix_position_docids( pub fn get_db_word_prefix_position_docids(
&mut self, &mut self,
universe: Option<&RoaringBitmap>,
word_prefix: Interned<String>, word_prefix: Interned<String>,
position: u16, position: u16,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( DatabaseCache::get_value::<_, _>(
self.txn, self.txn,
(word_prefix, position), (word_prefix, position),
&(self.word_interner.get(word_prefix).as_str(), position), &(self.word_interner.get(word_prefix).as_str(), position),
&mut self.db_cache.word_prefix_position_docids, &mut self.db_cache.word_prefix_position_docids,
universe,
self.index.word_prefix_position_docids.remap_data_type::<Bytes>(), self.index.word_prefix_position_docids.remap_data_type::<Bytes>(),
) )
} }

View File

@ -171,8 +171,9 @@ impl State {
// Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
// longer phrases we'll be losing on precision here. // longer phrases we'll be losing on precision here.
let bucketed_position = crate::bucketed_position(position + offset); let bucketed_position = crate::bucketed_position(position + offset);
let word_position_docids = let word_position_docids = ctx
ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default() .get_db_word_position_docids(Some(universe), *word, bucketed_position)?
.unwrap_or_default()
& universe; & universe;
candidates &= word_position_docids; candidates &= word_position_docids;
if candidates.is_empty() { if candidates.is_empty() {
@ -199,7 +200,9 @@ impl State {
// ignore stop words words in phrases // ignore stop words words in phrases
.flatten() .flatten()
.map(|word| -> Result<_> { .map(|word| -> Result<_> {
Ok(ctx.get_db_word_fid_docids(*word, fid)?.unwrap_or_default()) Ok(ctx
.get_db_word_fid_docids(Some(universe), *word, fid)?
.unwrap_or_default())
}), }),
)?; )?;
intersection &= &candidates; intersection &= &candidates;

View File

@ -232,11 +232,12 @@ fn resolve_universe(
#[tracing::instrument(level = "trace", skip_all, target = "search::query")] #[tracing::instrument(level = "trace", skip_all, target = "search::query")]
fn resolve_negative_words( fn resolve_negative_words(
ctx: &mut SearchContext<'_>, ctx: &mut SearchContext<'_>,
universe: Option<&RoaringBitmap>,
negative_words: &[Word], negative_words: &[Word],
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let mut negative_bitmap = RoaringBitmap::new(); let mut negative_bitmap = RoaringBitmap::new();
for &word in negative_words { for &word in negative_words {
if let Some(bitmap) = ctx.word_docids(word)? { if let Some(bitmap) = ctx.word_docids(universe, word)? {
negative_bitmap |= bitmap; negative_bitmap |= bitmap;
} }
} }
@ -246,13 +247,14 @@ fn resolve_negative_words(
#[tracing::instrument(level = "trace", skip_all, target = "search::query")] #[tracing::instrument(level = "trace", skip_all, target = "search::query")]
fn resolve_negative_phrases( fn resolve_negative_phrases(
ctx: &mut SearchContext<'_>, ctx: &mut SearchContext<'_>,
universe: Option<&RoaringBitmap>,
negative_phrases: &[LocatedQueryTerm], negative_phrases: &[LocatedQueryTerm],
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let mut negative_bitmap = RoaringBitmap::new(); let mut negative_bitmap = RoaringBitmap::new();
for term in negative_phrases { for term in negative_phrases {
let query_term = ctx.term_interner.get(term.value); let query_term = ctx.term_interner.get(term.value);
if let Some(phrase) = query_term.original_phrase() { if let Some(phrase) = query_term.original_phrase() {
negative_bitmap |= ctx.get_phrase_docids(phrase)?; negative_bitmap |= ctx.get_phrase_docids(universe, phrase)?;
} }
} }
Ok(negative_bitmap) Ok(negative_bitmap)
@ -686,8 +688,8 @@ pub fn execute_search(
located_query_terms_from_tokens(ctx, tokens, words_limit)?; located_query_terms_from_tokens(ctx, tokens, words_limit)?;
used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty(); used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty();
let ignored_documents = resolve_negative_words(ctx, &negative_words)?; let ignored_documents = resolve_negative_words(ctx, Some(&universe), &negative_words)?;
let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?; let ignored_phrases = resolve_negative_phrases(ctx, Some(&universe), &negative_phrases)?;
universe -= ignored_documents; universe -= ignored_documents;
universe -= ignored_phrases; universe -= ignored_phrases;

View File

@ -417,7 +417,7 @@ fn split_best_frequency(
let left = ctx.word_interner.insert(left.to_owned()); let left = ctx.word_interner.insert(left.to_owned());
let right = ctx.word_interner.insert(right.to_owned()); let right = ctx.word_interner.insert(right.to_owned());
if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? { if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(None, left, right, 1)? {
if best.map_or(true, |(old, _, _)| frequency > old) { if best.map_or(true, |(old, _, _)| frequency > old) {
best = Some((frequency, left, right)); best = Some((frequency, left, right));
} }

View File

@ -26,18 +26,15 @@ fn compute_docids(
} else { } else {
return Ok(Default::default()); return Ok(Default::default());
}; };
let mut candidates = match exact_term {
ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), let candidates = match exact_term {
// TODO I moved the intersection here
ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(Some(universe), phrase)? & universe,
ExactTerm::Word(word) => { ExactTerm::Word(word) => {
if let Some(word_candidates) = ctx.word_docids(Word::Original(word))? { ctx.word_docids(Some(universe), Word::Original(word))?.unwrap_or_default()
word_candidates
} else {
return Ok(Default::default());
}
} }
}; };
candidates &= universe;
Ok(candidates) Ok(candidates)
} }

View File

@ -30,8 +30,12 @@ impl RankingRuleGraphTrait for FidGraph {
let docids = if let Some(fid) = condition.fid { let docids = if let Some(fid) = condition.fid {
// maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument
let docids = let docids = compute_query_term_subset_docids_within_field_id(
compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?; ctx,
Some(universe),
&term.term_subset,
fid,
)?;
docids & universe docids & universe
} else { } else {
RoaringBitmap::new() RoaringBitmap::new()

View File

@ -33,6 +33,7 @@ impl RankingRuleGraphTrait for PositionGraph {
docids |= universe docids |= universe
& compute_query_term_subset_docids_within_position( & compute_query_term_subset_docids_within_position(
ctx, ctx,
Some(universe),
&term.term_subset, &term.term_subset,
*position, *position,
)?; )?;

View File

@ -74,10 +74,10 @@ pub fn compute_docids(
if right_derivs.len() > 1 { if right_derivs.len() > 1 {
let universe = &universe; let universe = &universe;
if let Some(left_phrase) = left_phrase { if let Some(left_phrase) = left_phrase {
if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) { if universe.is_disjoint(ctx.get_phrase_docids(Some(universe), left_phrase)?) {
continue; continue;
} }
} else if let Some(left_word_docids) = ctx.word_docids(left_word)? { } else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? {
if universe.is_disjoint(&left_word_docids) { if universe.is_disjoint(&left_word_docids) {
continue; continue;
} }
@ -123,7 +123,10 @@ fn compute_prefix_edges(
let mut universe = universe.clone(); let mut universe = universe.clone();
if let Some(phrase) = left_phrase { if let Some(phrase) = left_phrase {
let phrase_docids = ctx.get_phrase_docids(phrase)?; // TODO we can clearly give the universe to this method
// Unfortunately, it is deserializing/computing stuff and
// keeping the result as a materialized bitmap.
let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?;
if !phrase_docids.is_empty() { if !phrase_docids.is_empty() {
used_left_phrases.insert(phrase); used_left_phrases.insert(phrase);
} }
@ -133,9 +136,13 @@ fn compute_prefix_edges(
} }
} }
if let Some(new_docids) = // TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff.
ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)? if let Some(new_docids) = ctx.get_db_word_prefix_pair_proximity_docids(
{ Some(&universe),
left_word,
right_prefix,
forward_proximity,
)? {
let new_docids = &universe & new_docids; let new_docids = &universe & new_docids;
if !new_docids.is_empty() { if !new_docids.is_empty() {
used_left_words.insert(left_word); used_left_words.insert(left_word);
@ -147,6 +154,7 @@ fn compute_prefix_edges(
// No swapping when computing the proximity between a phrase and a word // No swapping when computing the proximity between a phrase and a word
if left_phrase.is_none() { if left_phrase.is_none() {
if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids( if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
Some(&universe),
right_prefix, right_prefix,
left_word, left_word,
backward_proximity, backward_proximity,
@ -177,26 +185,29 @@ fn compute_non_prefix_edges(
let mut universe = universe.clone(); let mut universe = universe.clone();
for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() { for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
let phrase_docids = ctx.get_phrase_docids(phrase)?; // TODO do the intersection in the method, again!
let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?;
universe &= phrase_docids; universe &= phrase_docids;
if universe.is_empty() { if universe.is_empty() {
return Ok(()); return Ok(());
} }
} }
// TODO check that it is not an issue to alter the universe
if let Some(new_docids) = if let Some(new_docids) =
ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)? ctx.get_db_word_pair_proximity_docids(Some(&universe), word1, word2, forward_proximity)?
{ {
let new_docids = &universe & new_docids;
if !new_docids.is_empty() { if !new_docids.is_empty() {
*docids |= new_docids; *docids |= new_docids;
} }
} }
if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() { if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() {
if let Some(new_docids) = if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(
ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)? Some(&universe),
{ word2,
let new_docids = &universe & new_docids; word1,
backward_proximity,
)? {
if !new_docids.is_empty() { if !new_docids.is_empty() {
*docids |= new_docids; *docids |= new_docids;
} }

View File

@ -19,11 +19,16 @@ pub struct PhraseDocIdsCache {
} }
impl<'ctx> SearchContext<'ctx> { impl<'ctx> SearchContext<'ctx> {
/// Get the document ids associated with the given phrase /// Get the document ids associated with the given phrase
pub fn get_phrase_docids(&mut self, phrase: Interned<Phrase>) -> Result<&RoaringBitmap> { pub fn get_phrase_docids(
&mut self,
universe: Option<&RoaringBitmap>,
phrase: Interned<Phrase>,
) -> Result<&RoaringBitmap> {
if self.phrase_docids.cache.contains_key(&phrase) { if self.phrase_docids.cache.contains_key(&phrase) {
return Ok(&self.phrase_docids.cache[&phrase]); return Ok(&self.phrase_docids.cache[&phrase]);
}; };
let docids = compute_phrase_docids(self, phrase)?; let docids = compute_phrase_docids(self, universe, phrase)?;
// TODO can we improve that? There is an issue: we keep the result in the cache keyed by phrase only, even though it was computed against a specific universe...
let _ = self.phrase_docids.cache.insert(phrase, docids); let _ = self.phrase_docids.cache.insert(phrase, docids);
let docids = &self.phrase_docids.cache[&phrase]; let docids = &self.phrase_docids.cache[&phrase];
Ok(docids) Ok(docids)
@ -35,17 +40,18 @@ pub fn compute_query_term_subset_docids(
term: &QueryTermSubset, term: &QueryTermSubset,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
// TODO use the MultiOps trait to do large intersections
for word in term.all_single_words_except_prefix_db(ctx)? { for word in term.all_single_words_except_prefix_db(ctx)? {
if let Some(word_docids) = ctx.word_docids(word)? { if let Some(word_docids) = ctx.word_docids(universe, word)? {
docids |= word_docids; docids |= word_docids;
} }
} }
for phrase in term.all_phrases(ctx)? { for phrase in term.all_phrases(ctx)? {
docids |= ctx.get_phrase_docids(phrase)?; docids |= ctx.get_phrase_docids(universe, phrase)?;
} }
if let Some(prefix) = term.use_prefix_db(ctx) { if let Some(prefix) = term.use_prefix_db(ctx) {
if let Some(prefix_docids) = ctx.word_prefix_docids(prefix)? { if let Some(prefix_docids) = ctx.word_prefix_docids(universe, prefix)? {
docids |= prefix_docids; docids |= prefix_docids;
} }
} }
@ -58,12 +64,13 @@ pub fn compute_query_term_subset_docids(
pub fn compute_query_term_subset_docids_within_field_id( pub fn compute_query_term_subset_docids_within_field_id(
ctx: &mut SearchContext<'_>, ctx: &mut SearchContext<'_>,
universe: Option<&RoaringBitmap>,
term: &QueryTermSubset, term: &QueryTermSubset,
fid: u16, fid: u16,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for word in term.all_single_words_except_prefix_db(ctx)? { for word in term.all_single_words_except_prefix_db(ctx)? {
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, word.interned(), fid)? {
docids |= word_fid_docids; docids |= word_fid_docids;
} }
} }
@ -72,15 +79,15 @@ pub fn compute_query_term_subset_docids_within_field_id(
// There may be false positives when resolving a phrase, so we're not // There may be false positives when resolving a phrase, so we're not
// guaranteed that all of its words are within a single fid. // guaranteed that all of its words are within a single fid.
if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word) = phrase.words(ctx).iter().flatten().next() {
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, *word, fid)? {
docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids; docids |= ctx.get_phrase_docids(Some(&word_fid_docids), phrase)?;
} }
} }
} }
if let Some(word_prefix) = term.use_prefix_db(ctx) { if let Some(word_prefix) = term.use_prefix_db(ctx) {
if let Some(word_fid_docids) = if let Some(word_fid_docids) =
ctx.get_db_word_prefix_fid_docids(word_prefix.interned(), fid)? ctx.get_db_word_prefix_fid_docids(universe, word_prefix.interned(), fid)?
{ {
docids |= word_fid_docids; docids |= word_fid_docids;
} }
@ -91,13 +98,14 @@ pub fn compute_query_term_subset_docids_within_field_id(
pub fn compute_query_term_subset_docids_within_position( pub fn compute_query_term_subset_docids_within_position(
ctx: &mut SearchContext<'_>, ctx: &mut SearchContext<'_>,
universe: Option<&RoaringBitmap>,
term: &QueryTermSubset, term: &QueryTermSubset,
position: u16, position: u16,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for word in term.all_single_words_except_prefix_db(ctx)? { for word in term.all_single_words_except_prefix_db(ctx)? {
if let Some(word_position_docids) = if let Some(word_position_docids) =
ctx.get_db_word_position_docids(word.interned(), position)? ctx.get_db_word_position_docids(universe, word.interned(), position)?
{ {
docids |= word_position_docids; docids |= word_position_docids;
} }
@ -107,15 +115,17 @@ pub fn compute_query_term_subset_docids_within_position(
// It's difficult to know the expected position of the words in the phrase, // It's difficult to know the expected position of the words in the phrase,
// so instead we just check the first one. // so instead we just check the first one.
if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word) = phrase.words(ctx).iter().flatten().next() {
if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? { if let Some(word_position_docids) =
docids |= ctx.get_phrase_docids(phrase)? & word_position_docids ctx.get_db_word_position_docids(universe, *word, position)?
{
docids |= ctx.get_phrase_docids(Some(&word_position_docids), phrase)?;
} }
} }
} }
if let Some(word_prefix) = term.use_prefix_db(ctx) { if let Some(word_prefix) = term.use_prefix_db(ctx) {
if let Some(word_position_docids) = if let Some(word_position_docids) =
ctx.get_db_word_prefix_position_docids(word_prefix.interned(), position)? ctx.get_db_word_prefix_position_docids(universe, word_prefix.interned(), position)?
{ {
docids |= word_position_docids; docids |= word_position_docids;
} }
@ -180,6 +190,7 @@ pub fn compute_query_graph_docids(
pub fn compute_phrase_docids( pub fn compute_phrase_docids(
ctx: &mut SearchContext<'_>, ctx: &mut SearchContext<'_>,
universe: Option<&RoaringBitmap>,
phrase: Interned<Phrase>, phrase: Interned<Phrase>,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let Phrase { words } = ctx.phrase_interner.get(phrase).clone(); let Phrase { words } = ctx.phrase_interner.get(phrase).clone();
@ -189,7 +200,7 @@ pub fn compute_phrase_docids(
} }
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
for word in words.iter().flatten().copied() { for word in words.iter().flatten().copied() {
if let Some(word_docids) = ctx.word_docids(Word::Original(word))? { if let Some(word_docids) = ctx.word_docids(universe, Word::Original(word))? {
candidates |= word_docids; candidates |= word_docids;
} else { } else {
return Ok(RoaringBitmap::new()); return Ok(RoaringBitmap::new());
@ -213,7 +224,7 @@ pub fn compute_phrase_docids(
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
{ {
if dist == 0 { if dist == 0 {
match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? { match ctx.get_db_word_pair_proximity_docids(universe, s1, s2, 1)? {
Some(m) => bitmaps.push(m), Some(m) => bitmaps.push(m),
// If there are no documents for this pair, there will be no // If there are no documents for this pair, there will be no
// results for the phrase query. // results for the phrase query.
@ -223,7 +234,7 @@ pub fn compute_phrase_docids(
let mut bitmap = RoaringBitmap::new(); let mut bitmap = RoaringBitmap::new();
for dist in 0..=dist { for dist in 0..=dist {
if let Some(m) = if let Some(m) =
ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)? ctx.get_db_word_pair_proximity_docids(universe, s1, s2, dist as u8 + 1)?
{ {
bitmap |= m; bitmap |= m;
} }