From 934b73142d8c44e55b06d7cf9c59cfef1a8f90c5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 27 Mar 2025 17:57:57 +0100 Subject: [PATCH] reduce the number of computed prefix --- crates/milli/src/search/new/db_cache.rs | 114 +++++++++++++----- .../src/update/new/words_prefix_docids.rs | 3 + 2 files changed, 89 insertions(+), 28 deletions(-) diff --git a/crates/milli/src/search/new/db_cache.rs b/crates/milli/src/search/new/db_cache.rs index 243303ba2..e50da8d12 100644 --- a/crates/milli/src/search/new/db_cache.rs +++ b/crates/milli/src/search/new/db_cache.rs @@ -37,12 +37,12 @@ pub struct DatabaseCache<'ctx> { pub words_fst: Option>>, pub word_position_docids: FxHashMap<(Interned, u16), Option>>, - pub word_prefix_position_docids: FxHashMap<(Interned, u16), Option>>, + pub word_prefix_position_docids: FxHashMap<(Interned, u16), Option>, pub word_positions: FxHashMap, Vec>, pub word_prefix_positions: FxHashMap, Vec>, pub word_fid_docids: FxHashMap<(Interned, u16), Option>>, - pub word_prefix_fid_docids: FxHashMap<(Interned, u16), Option>>, + pub word_prefix_fid_docids: FxHashMap<(Interned, u16), Option>, pub word_fids: FxHashMap, Vec>, pub word_prefix_fids: FxHashMap, Vec>, } @@ -562,14 +562,46 @@ impl<'ctx> SearchContext<'ctx> { return Ok(None); } - DatabaseCache::get_value( - self.txn, - (word_prefix, fid), - &(self.word_interner.get(word_prefix).as_str(), fid), - &mut self.db_cache.word_prefix_fid_docids, - universe, - self.index.word_prefix_fid_docids.remap_data_type::(), - ) + let cache = &mut self.db_cache.word_prefix_fid_docids; + let prefix_db = &self.index.word_prefix_fid_docids; + let db = &self.index.word_fid_docids; + if let Entry::Vacant(entry) = cache.entry((word_prefix, fid)) { + let word_prefix_bytes = self.word_interner.get(word_prefix).as_bytes().to_owned(); + let word_prefix_str = std::str::from_utf8(&word_prefix_bytes).unwrap(); + match prefix_db.get(self.txn, &(word_prefix_str, fid))? { + Some(mut bitmap) => { + if let Some(universe) = universe { + bitmap &= universe; + } + entry.insert(Some(bitmap)); + } + None => { + let mut key = word_prefix_bytes.clone(); + key.push(0); + let remap_key_type = db + .remap_key_type::() + .prefix_iter(self.txn, &key)? + .remap_key_type::(); + + let mut bitmap = RoaringBitmap::new(); + for result in remap_key_type { + let ((_, pos), value) = result?; + + if pos == fid { + if let Some(universe) = universe { + bitmap |= value & universe; + } else { + bitmap |= value; + } + } + } + + entry.insert(Some(bitmap)); + } + } + } + + Ok(cache.get(&(word_prefix, fid)).unwrap().clone()) } pub fn get_db_word_fids(&mut self, word: Interned) -> Result> { @@ -605,6 +637,7 @@ impl<'ctx> SearchContext<'ctx> { let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned(); key.push(0); let mut fids = vec![]; + // TODO: This is no more exhaustive, we should iterate over all fids. let remap_key_type = self .index .word_prefix_fid_docids @@ -612,11 +645,7 @@ impl<'ctx> SearchContext<'ctx> { .prefix_iter(self.txn, &key)? .remap_key_type::(); for result in remap_key_type { - let ((_, fid), value) = result?; - // filling other caches to avoid searching for them again - self.db_cache - .word_prefix_fid_docids - .insert((word_prefix, fid), Some(Cow::Borrowed(value))); + let ((_, fid), _value) = result?; fids.push(fid); } entry.insert(fids.clone()); @@ -648,14 +677,46 @@ impl<'ctx> SearchContext<'ctx> { word_prefix: Interned, position: u16, ) -> Result> { - DatabaseCache::get_value( - self.txn, - (word_prefix, position), - &(self.word_interner.get(word_prefix).as_str(), position), - &mut self.db_cache.word_prefix_position_docids, - universe, - self.index.word_prefix_position_docids.remap_data_type::(), - ) + let cache = &mut self.db_cache.word_prefix_position_docids; + let prefix_db = &self.index.word_prefix_position_docids; + let db = &self.index.word_position_docids; + if let Entry::Vacant(entry) = cache.entry((word_prefix, position)) { + let word_prefix_bytes = self.word_interner.get(word_prefix).as_bytes().to_owned(); + let word_prefix_str = std::str::from_utf8(&word_prefix_bytes).unwrap(); + match prefix_db.get(self.txn, &(word_prefix_str, position))? { + Some(mut bitmap) => { + if let Some(universe) = universe { + bitmap &= universe; + } + entry.insert(Some(bitmap)); + } + None => { + let mut key = word_prefix_bytes.clone(); + key.push(0); + let remap_key_type = db + .remap_key_type::() + .prefix_iter(self.txn, &key)? + .remap_key_type::(); + + let mut bitmap = RoaringBitmap::new(); + for result in remap_key_type { + let ((_, pos), value) = result?; + + if pos == position { + if let Some(universe) = universe { + bitmap |= value & universe; + } else { + bitmap |= value; + } + } + } + + entry.insert(Some(bitmap)); + } + } + } + + Ok(cache.get(&(word_prefix, position)).unwrap().clone()) } pub fn get_db_word_positions(&mut self, word: Interned) -> Result> { @@ -696,6 +757,7 @@ impl<'ctx> SearchContext<'ctx> { let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned(); key.push(0); let mut positions = vec![]; + // TODO: This is no more exhaustive, we should iterate over all positions. let remap_key_type = self .index .word_prefix_position_docids @@ -703,11 +765,7 @@ impl<'ctx> SearchContext<'ctx> { .prefix_iter(self.txn, &key)? .remap_key_type::(); for result in remap_key_type { - let ((_, position), value) = result?; - // filling other caches to avoid searching for them again - self.db_cache - .word_prefix_position_docids - .insert((word_prefix, position), Some(Cow::Borrowed(value))); + let ((_, position), _value) = result?; positions.push(position); } entry.insert(positions.clone()); diff --git a/crates/milli/src/update/new/words_prefix_docids.rs b/crates/milli/src/update/new/words_prefix_docids.rs index 95e80fe6b..9b6728fac 100644 --- a/crates/milli/src/update/new/words_prefix_docids.rs +++ b/crates/milli/src/update/new/words_prefix_docids.rs @@ -291,6 +291,9 @@ impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> { let (_word, pos) = StrBEU16Codec::bytes_decode(key).map_err(Error::Decoding)?; positions.entry(pos).or_insert_with(Vec::new).push(bytes); } + + // We remove all the positions that have less than 100 bitmaps. + positions.retain(|_, bitmaps| bitmaps.len() > 100); assert!(prefixes_bitmaps.insert(prefix.as_str(), positions).is_none()); }