use std::borrow::Cow; use std::collections::hash_map::Entry; use std::hash::Hash; use fxhash::FxHashMap; use grenad::MergeFunction; use heed::types::Bytes; use heed::{BytesEncode, Database, RoTxn}; use roaring::RoaringBitmap; use super::interner::Interned; use super::Word; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; use crate::proximity::ProximityPrecision; use crate::update::MergeCboRoaringBitmaps; use crate::{ CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, }; /// A cache storing pointers to values in the LMDB databases. /// /// Used for performance reasons only. By using this cache, we avoid performing a /// database lookup and instead get a direct reference to the value using a fast /// local HashMap lookup. #[derive(Default)] pub struct DatabaseCache<'ctx> { pub word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>>, pub word_prefix_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>, pub prefix_word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>>, pub word_docids: FxHashMap, Option>>, pub exact_word_docids: FxHashMap, Option>>, pub word_prefix_docids: FxHashMap, Option>>, pub exact_word_prefix_docids: FxHashMap, Option>>, pub words_fst: Option>>, pub word_position_docids: FxHashMap<(Interned, u16), Option>>, pub word_prefix_position_docids: FxHashMap<(Interned, u16), Option>>, pub word_positions: FxHashMap, Vec>, pub word_prefix_positions: FxHashMap, Vec>, pub word_fid_docids: FxHashMap<(Interned, u16), Option>>, pub word_prefix_fid_docids: FxHashMap<(Interned, u16), Option>>, pub word_fids: FxHashMap, Vec>, pub word_prefix_fids: FxHashMap, Vec>, } impl<'ctx> DatabaseCache<'ctx> { fn get_value<'v, K1, KC>( txn: &'ctx RoTxn<'_>, cache_key: K1, db_key: &'v KC::EItem, cache: &mut FxHashMap>>, universe: Option<&RoaringBitmap>, db: Database, ) -> Result> where K1: Copy + Eq + Hash, KC: BytesEncode<'v>, { if let Entry::Vacant(entry) = cache.entry(cache_key) { let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed); entry.insert(bitmap_ptr); } let bitmap_bytes = match cache.get(&cache_key).unwrap() { Some(Cow::Borrowed(bytes)) => bytes, Some(Cow::Owned(bytes)) => bytes.as_slice(), None => return Ok(None), }; match (bitmap_bytes, universe) { (bytes, Some(universe)) => { CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe) .map(Some) .map_err(Into::into) } (bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes) .map(Some) .map_err(heed::Error::Decoding) .map_err(Into::into), } } fn get_value_length<'v, K1, KC>( txn: &'ctx RoTxn<'_>, cache_key: K1, db_key: &'v KC::EItem, cache: &mut FxHashMap>>, db: Database, ) -> Result> where K1: Copy + Eq + Hash, KC: BytesEncode<'v>, { if let Entry::Vacant(entry) = cache.entry(cache_key) { let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed); entry.insert(bitmap_ptr); } let bitmap_bytes = match cache.get(&cache_key).unwrap() { Some(Cow::Borrowed(bytes)) => bytes, Some(Cow::Owned(bytes)) => bytes.as_slice(), None => return Ok(None), }; CboRoaringBitmapLenCodec::bytes_decode_owned(bitmap_bytes) .map(Some) .map_err(heed::Error::Decoding) .map_err(Into::into) } fn get_value_from_keys<'v, K1, KC, MF>( txn: &'ctx RoTxn<'_>, cache_key: K1, db_keys: &'v [KC::EItem], cache: &mut FxHashMap>>, db: Database, universe: Option<&RoaringBitmap>, merger: MF, ) -> Result> where K1: Copy + Eq + Hash, KC: BytesEncode<'v>, KC::EItem: Sized, MF: MergeFunction, crate::Error: From, { if let Entry::Vacant(entry) = cache.entry(cache_key) { let bitmap_ptr: Option> = match db_keys { [] => None, [key] => db.get(txn, key)?.map(Cow::Borrowed), keys => { let bitmaps = keys .iter() .filter_map(|key| db.get(txn, key).transpose()) .map(|v| v.map(Cow::Borrowed)) .collect::>, _>>()?; if bitmaps.is_empty() { None } else { Some(merger.merge(&[], &bitmaps[..])?) } } }; entry.insert(bitmap_ptr); } let bitmap_bytes = match cache.get(&cache_key).unwrap() { Some(Cow::Borrowed(bytes)) => bytes, Some(Cow::Owned(bytes)) => bytes.as_slice(), None => return Ok(None), }; match (bitmap_bytes, universe) { (bytes, Some(universe)) => { CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe) .map(Some) .map_err(Into::into) } (bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes) .map(Some) .map_err(heed::Error::Decoding) .map_err(Into::into), } } } impl<'ctx> SearchContext<'ctx> { pub fn get_words_fst(&mut self) -> Result>> { if let Some(fst) = self.db_cache.words_fst.clone() { Ok(fst) } else { let fst = self.index.words_fst(self.txn)?; self.db_cache.words_fst = Some(fst.clone()); Ok(fst) } } pub fn word_docids( &mut self, universe: Option<&RoaringBitmap>, word: Word, ) -> Result> { match word { Word::Original(word) => { let exact = self.get_db_exact_word_docids(universe, word)?; let tolerant = self.get_db_word_docids(universe, word)?; Ok(match (exact, tolerant) { (None, None) => None, (None, Some(tolerant)) => Some(tolerant), (Some(exact), None) => Some(exact), (Some(exact), Some(tolerant)) => { let mut both = exact; both |= tolerant; Some(both) } }) } Word::Derived(word) => self.get_db_word_docids(universe, word), } } /// Retrieve or insert the given value in the `word_docids` database. fn get_db_word_docids( &mut self, universe: Option<&RoaringBitmap>, word: Interned, ) -> Result> { match &self.restricted_fids { Some(restricted_fids) => { let interned = self.word_interner.get(word).as_str(); let keys: Vec<_> = restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys( self.txn, word, &keys[..], &mut self.db_cache.word_docids, self.index.word_fid_docids.remap_data_type::(), universe, MergeCboRoaringBitmaps, ) } None => DatabaseCache::get_value( self.txn, word, self.word_interner.get(word).as_str(), &mut self.db_cache.word_docids, universe, self.index.word_docids.remap_data_type::(), ), } } fn get_db_exact_word_docids( &mut self, universe: Option<&RoaringBitmap>, word: Interned, ) -> Result> { match &self.restricted_fids { Some(restricted_fids) => { let interned = self.word_interner.get(word).as_str(); let keys: Vec<_> = restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys( self.txn, word, &keys[..], &mut self.db_cache.exact_word_docids, self.index.word_fid_docids.remap_data_type::(), universe, MergeCboRoaringBitmaps, ) } None => DatabaseCache::get_value( self.txn, word, self.word_interner.get(word).as_str(), &mut self.db_cache.exact_word_docids, universe, self.index.exact_word_docids.remap_data_type::(), ), } } pub fn word_prefix_docids( &mut self, universe: Option<&RoaringBitmap>, prefix: Word, ) -> Result> { match prefix { Word::Original(prefix) => { let exact = self.get_db_exact_word_prefix_docids(universe, prefix)?; let tolerant = self.get_db_word_prefix_docids(universe, prefix)?; Ok(match (exact, tolerant) { (None, None) => None, (None, Some(tolerant)) => Some(tolerant), (Some(exact), None) => Some(exact), (Some(exact), Some(tolerant)) => { let mut both = exact; both |= tolerant; Some(both) } }) } Word::Derived(prefix) => self.get_db_word_prefix_docids(universe, prefix), } } /// Retrieve or insert the given value in the `word_prefix_docids` database. fn get_db_word_prefix_docids( &mut self, universe: Option<&RoaringBitmap>, prefix: Interned, ) -> Result> { match &self.restricted_fids { Some(restricted_fids) => { let interned = self.word_interner.get(prefix).as_str(); let keys: Vec<_> = restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys( self.txn, prefix, &keys[..], &mut self.db_cache.word_prefix_docids, self.index.word_prefix_fid_docids.remap_data_type::(), universe, MergeCboRoaringBitmaps, ) } None => DatabaseCache::get_value( self.txn, prefix, self.word_interner.get(prefix).as_str(), &mut self.db_cache.word_prefix_docids, universe, self.index.word_prefix_docids.remap_data_type::(), ), } } fn get_db_exact_word_prefix_docids( &mut self, universe: Option<&RoaringBitmap>, prefix: Interned, ) -> Result> { match &self.restricted_fids { Some(restricted_fids) => { let interned = self.word_interner.get(prefix).as_str(); let keys: Vec<_> = restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); DatabaseCache::get_value_from_keys( self.txn, prefix, &keys[..], &mut self.db_cache.exact_word_prefix_docids, self.index.word_prefix_fid_docids.remap_data_type::(), universe, MergeCboRoaringBitmaps, ) } None => DatabaseCache::get_value( self.txn, prefix, self.word_interner.get(prefix).as_str(), &mut self.db_cache.exact_word_prefix_docids, universe, self.index.exact_word_prefix_docids.remap_data_type::(), ), } } pub fn get_db_word_pair_proximity_docids( &mut self, universe: Option<&RoaringBitmap>, word1: Interned, word2: Interned, proximity: u8, ) -> Result> { match self.index.proximity_precision(self.txn)?.unwrap_or_default() { ProximityPrecision::ByAttribute => { // Force proximity to 0 because: // in ByAttribute, there are only 2 possible distances: // 1. words in same attribute: in that the DB contains (0, word1, word2) // 2. words in different attributes: no DB entry for these two words. let proximity = 0; let docids = if let Some(docids) = self.db_cache.word_pair_proximity_docids.get(&(proximity, word1, word2)) { docids .as_ref() .map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d)) .transpose() .map_err(heed::Error::Decoding)? } else { // Compute the distance at the attribute level and store it in the cache. let fids = self.index.searchable_fields_ids(self.txn)?; let mut docids = RoaringBitmap::new(); for fid in fids { // for each field, intersect left word bitmap and right word bitmap, // then merge the result in a global bitmap before storing it in the cache. let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?; let word2_docids = self.get_db_word_fid_docids(universe, word2, fid)?; if let (Some(word1_docids), Some(word2_docids)) = (word1_docids, word2_docids) { docids |= word1_docids & word2_docids; } } let encoded = CboRoaringBitmapCodec::bytes_encode(&docids) .map(Cow::into_owned) .map(Cow::Owned) .map(Some) .map_err(heed::Error::Decoding)?; self.db_cache .word_pair_proximity_docids .insert((proximity, word1, word2), encoded); Some(docids) }; Ok(docids) } ProximityPrecision::ByWord => DatabaseCache::get_value( self.txn, (proximity, word1, word2), &( proximity, self.word_interner.get(word1).as_str(), self.word_interner.get(word2).as_str(), ), &mut self.db_cache.word_pair_proximity_docids, universe, self.index.word_pair_proximity_docids.remap_data_type::(), ), } } pub fn get_db_word_pair_proximity_docids_len( &mut self, universe: Option<&RoaringBitmap>, word1: Interned, word2: Interned, proximity: u8, ) -> Result> { match self.index.proximity_precision(self.txn)?.unwrap_or_default() { ProximityPrecision::ByAttribute => Ok(self .get_db_word_pair_proximity_docids(universe, word1, word2, proximity)? .map(|d| d.len())), ProximityPrecision::ByWord => DatabaseCache::get_value_length::<_, _>( self.txn, (proximity, word1, word2), &( proximity, self.word_interner.get(word1).as_str(), self.word_interner.get(word2).as_str(), ), &mut self.db_cache.word_pair_proximity_docids, self.index.word_pair_proximity_docids.remap_data_type::(), ), } } pub fn get_db_word_prefix_pair_proximity_docids( &mut self, universe: Option<&RoaringBitmap>, word1: Interned, prefix2: Interned, mut proximity: u8, ) -> Result> { let proximity_precision = self.index.proximity_precision(self.txn)?.unwrap_or_default(); if proximity_precision == ProximityPrecision::ByAttribute { // Force proximity to 0 because: // in ByAttribute, there are only 2 possible distances: // 1. words in same attribute: in that the DB contains (0, word1, word2) // 2. words in different attributes: no DB entry for these two words. proximity = 0; } let docids = if let Some(docids) = self.db_cache.word_prefix_pair_proximity_docids.get(&(proximity, word1, prefix2)) { docids.clone() } else { let prefix_docids = match proximity_precision { ProximityPrecision::ByAttribute => { // Compute the distance at the attribute level and store it in the cache. let fids = self.index.searchable_fields_ids(self.txn)?; let mut prefix_docids = RoaringBitmap::new(); // for each field, intersect left word bitmap and right word bitmap, // then merge the result in a global bitmap before storing it in the cache. for fid in fids { let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?; let prefix2_docids = self.get_db_word_prefix_fid_docids(universe, prefix2, fid)?; if let (Some(word1_docids), Some(prefix2_docids)) = (word1_docids, prefix2_docids) { prefix_docids |= word1_docids & prefix2_docids; } } prefix_docids } ProximityPrecision::ByWord => { // compute docids using prefix iter and store the result in the cache. let key = U8StrStrCodec::bytes_encode(&( proximity, self.word_interner.get(word1).as_str(), self.word_interner.get(prefix2).as_str(), )) .unwrap() .into_owned(); let mut prefix_docids = RoaringBitmap::new(); let remap_key_type = self .index .word_pair_proximity_docids .remap_key_type::() .prefix_iter(self.txn, &key)?; for result in remap_key_type { let (_, docids) = result?; prefix_docids |= docids; } prefix_docids } }; self.db_cache .word_prefix_pair_proximity_docids .insert((proximity, word1, prefix2), Some(prefix_docids.clone())); Some(prefix_docids) }; Ok(docids) } pub fn get_db_prefix_word_pair_proximity_docids( &mut self, universe: Option<&RoaringBitmap>, left_prefix: Interned, right: Interned, proximity: u8, ) -> Result> { // only accept exact matches on reverted positions self.get_db_word_pair_proximity_docids(universe, left_prefix, right, proximity) } pub fn get_db_word_fid_docids( &mut self, universe: Option<&RoaringBitmap>, word: Interned, fid: u16, ) -> Result> { // if the requested fid isn't in the restricted list, return None. if self.restricted_fids.as_ref().is_some_and(|fids| !fids.contains(&fid)) { return Ok(None); } DatabaseCache::get_value( self.txn, (word, fid), &(self.word_interner.get(word).as_str(), fid), &mut self.db_cache.word_fid_docids, universe, self.index.word_fid_docids.remap_data_type::(), ) } pub fn get_db_word_prefix_fid_docids( &mut self, universe: Option<&RoaringBitmap>, word_prefix: Interned, fid: u16, ) -> Result> { // if the requested fid isn't in the restricted list, return None. if self.restricted_fids.as_ref().is_some_and(|fids| !fids.contains(&fid)) { return Ok(None); } DatabaseCache::get_value( self.txn, (word_prefix, fid), &(self.word_interner.get(word_prefix).as_str(), fid), &mut self.db_cache.word_prefix_fid_docids, universe, self.index.word_prefix_fid_docids.remap_data_type::(), ) } pub fn get_db_word_fids(&mut self, word: Interned) -> Result> { let fids = match self.db_cache.word_fids.entry(word) { Entry::Occupied(fids) => fids.get().clone(), Entry::Vacant(entry) => { let mut key = self.word_interner.get(word).as_bytes().to_owned(); key.push(0); let mut fids = vec![]; let remap_key_type = self .index .word_fid_docids .remap_types::() .prefix_iter(self.txn, &key)? .remap_key_type::(); for result in remap_key_type { let ((_, fid), value) = result?; // filling other caches to avoid searching for them again self.db_cache.word_fid_docids.insert((word, fid), Some(Cow::Borrowed(value))); fids.push(fid); } entry.insert(fids.clone()); fids } }; Ok(fids) } pub fn get_db_word_prefix_fids(&mut self, word_prefix: Interned) -> Result> { let fids = match self.db_cache.word_prefix_fids.entry(word_prefix) { Entry::Occupied(fids) => fids.get().clone(), Entry::Vacant(entry) => { let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned(); key.push(0); let mut fids = vec![]; let remap_key_type = self .index .word_prefix_fid_docids .remap_types::() .prefix_iter(self.txn, &key)? .remap_key_type::(); for result in remap_key_type { let ((_, fid), value) = result?; // filling other caches to avoid searching for them again self.db_cache .word_prefix_fid_docids .insert((word_prefix, fid), Some(Cow::Borrowed(value))); fids.push(fid); } entry.insert(fids.clone()); fids } }; Ok(fids) } pub fn get_db_word_position_docids( &mut self, universe: Option<&RoaringBitmap>, word: Interned, position: u16, ) -> Result> { DatabaseCache::get_value( self.txn, (word, position), &(self.word_interner.get(word).as_str(), position), &mut self.db_cache.word_position_docids, universe, self.index.word_position_docids.remap_data_type::(), ) } pub fn get_db_word_prefix_position_docids( &mut self, universe: Option<&RoaringBitmap>, word_prefix: Interned, position: u16, ) -> Result> { DatabaseCache::get_value( self.txn, (word_prefix, position), &(self.word_interner.get(word_prefix).as_str(), position), &mut self.db_cache.word_prefix_position_docids, universe, self.index.word_prefix_position_docids.remap_data_type::(), ) } pub fn get_db_word_positions(&mut self, word: Interned) -> Result> { let positions = match self.db_cache.word_positions.entry(word) { Entry::Occupied(positions) => positions.get().clone(), Entry::Vacant(entry) => { let mut key = self.word_interner.get(word).as_bytes().to_owned(); key.push(0); let mut positions = vec![]; let remap_key_type = self .index .word_position_docids .remap_types::() .prefix_iter(self.txn, &key)? .remap_key_type::(); for result in remap_key_type { let ((_, position), value) = result?; // filling other caches to avoid searching for them again self.db_cache .word_position_docids .insert((word, position), Some(Cow::Borrowed(value))); positions.push(position); } entry.insert(positions.clone()); positions } }; Ok(positions) } pub fn get_db_word_prefix_positions( &mut self, word_prefix: Interned, ) -> Result> { let positions = match self.db_cache.word_prefix_positions.entry(word_prefix) { Entry::Occupied(positions) => positions.get().clone(), Entry::Vacant(entry) => { let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned(); key.push(0); let mut positions = vec![]; let remap_key_type = self .index .word_prefix_position_docids .remap_types::() .prefix_iter(self.txn, &key)? .remap_key_type::(); for result in remap_key_type { let ((_, position), value) = result?; // filling other caches to avoid searching for them again self.db_cache .word_prefix_position_docids .insert((word_prefix, position), Some(Cow::Borrowed(value))); positions.push(position); } entry.insert(positions.clone()); positions } }; Ok(positions) } }