diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4f50fcd29..b3c828048 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -28,12 +28,13 @@ pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode, QueryNodeData}; use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; -use resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; +use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; use self::interner::Interner; use self::ranking_rules::{BoxRankingRule, RankingRule}; +use self::resolve_query_graph::compute_query_graph_docids; use self::sort::Sort; use crate::{ AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy, @@ -48,8 +49,7 @@ pub struct SearchContext<'ctx> { pub word_interner: DedupInterner, pub phrase_interner: DedupInterner, pub term_interner: Interner, - // think about memory usage of that field (roaring bitmaps in a hashmap) - pub term_docids: QueryTermDocIdsCache, + pub phrase_docids: PhraseDocIdsCache, } impl<'ctx> SearchContext<'ctx> { pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { @@ -60,7 +60,7 @@ impl<'ctx> SearchContext<'ctx> { word_interner: <_>::default(), phrase_interner: <_>::default(), term_interner: <_>::default(), - term_docids: <_>::default(), + phrase_docids: <_>::default(), } } } @@ -103,7 +103,7 @@ fn resolve_maximally_reduced_query_graph( } } logger.query_for_universe(&graph); - let docids = resolve_query_graph(ctx, &graph, universe)?; + let docids = compute_query_graph_docids(ctx, &graph, universe)?; Ok(docids) } @@ -319,7 +319,7 @@ pub fn execute_search( let tokens = tokenizer.tokenize(query); let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; - let graph = QueryGraph::from_query(ctx, query_terms)?; + let graph = QueryGraph::from_query(ctx, &query_terms)?; check_sort_criteria(ctx, sort_criteria.as_ref())?; diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 1b7057b51..707082cab 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -3,106 +3,63 @@ use std::collections::VecDeque; use fxhash::FxHashMap; -use heed::{BytesDecode, RoTxn}; +use heed::BytesDecode; use roaring::RoaringBitmap; -use super::db_cache::DatabaseCache; -use super::interner::{DedupInterner, Interned}; +use super::interner::Interned; use super::query_graph::QueryNodeData; -use super::query_term::{Phrase, QueryTerm}; +use super::query_term::{Phrase, QueryTermSubset}; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, SearchContext}; -use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; +use crate::search::new::query_term::LocatedQueryTermSubset; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; #[derive(Default)] -pub struct QueryTermDocIdsCache { - pub phrases: FxHashMap, RoaringBitmap>, - pub terms: FxHashMap, RoaringBitmap>, +pub struct PhraseDocIdsCache { + pub cache: FxHashMap, RoaringBitmap>, } -impl QueryTermDocIdsCache { +impl<'ctx> SearchContext<'ctx> { /// Get the document ids associated with the given phrase - pub fn get_phrase_docids<'s, 'ctx>( - &'s mut self, - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - word_interner: &DedupInterner, - phrase_interner: &DedupInterner, - phrase: Interned, - ) -> Result<&'s RoaringBitmap> { - if self.phrases.contains_key(&phrase) { - return Ok(&self.phrases[&phrase]); + pub fn get_phrase_docids(&mut self, phrase: Interned) -> Result<&RoaringBitmap> { + if self.phrase_docids.cache.contains_key(&phrase) { + return Ok(&self.phrase_docids.cache[&phrase]); }; - let docids = resolve_phrase(index, txn, db_cache, word_interner, phrase_interner, phrase)?; - let _ = self.phrases.insert(phrase, docids); - let docids = &self.phrases[&phrase]; - Ok(docids) - } - /// Get the document ids associated with the given term - pub fn get_query_term_docids<'s, 'ctx>( - &'s mut self, - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - word_interner: &DedupInterner, - term_interner: &DedupInterner, - phrase_interner: &DedupInterner, - term_interned: Interned, - ) -> Result<&'s RoaringBitmap> { - if self.terms.contains_key(&term_interned) { - return Ok(&self.terms[&term_interned]); - }; - let mut docids = RoaringBitmap::new(); - // TODO: use a MultiOps? - let term = term_interner.get(term_interned); - for word in term.all_single_words_except_prefix_db() { - if let Some(word_docids) = db_cache.get_word_docids(index, txn, word_interner, word)? { - docids |= - RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?; - } - } - for phrase in term.all_phrases() { - docids |= self.get_phrase_docids( - index, - txn, - db_cache, - word_interner, - phrase_interner, - phrase, - )?; - } - - if let Some(prefix) = term.use_prefix_db { - if let Some(prefix_docids) = - db_cache.get_word_prefix_docids(index, txn, word_interner, prefix)? - { - docids |= - RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?; - } - } - - let _ = self.terms.insert(term_interned, docids); - let docids = &self.terms[&term_interned]; + let docids = compute_phrase_docids(self, phrase)?; + let _ = self.phrase_docids.cache.insert(phrase, docids); + let docids = &self.phrase_docids.cache[&phrase]; Ok(docids) } } +pub fn compute_query_term_subset_docids( + ctx: &mut SearchContext, + term: &QueryTermSubset, +) -> Result { + let mut docids = RoaringBitmap::new(); + for word in term.all_single_words_except_prefix_db(ctx)? { + if let Some(word_docids) = ctx.get_db_word_docids(word)? { + docids |= RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?; + } + } + for phrase in term.all_phrases(ctx)? { + docids |= ctx.get_phrase_docids(phrase)?; + } -pub fn resolve_query_graph( + if let Some(prefix) = term.use_prefix_db(ctx) { + if let Some(prefix_docids) = ctx.get_db_word_prefix_docids(prefix)? { + docids |= + RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?; + } + } + + Ok(docids) +} + +pub fn compute_query_graph_docids( ctx: &mut SearchContext, q: &QueryGraph, universe: &RoaringBitmap, ) -> Result { - let SearchContext { - index, - txn, - db_cache, - word_interner, - phrase_interner, - term_interner, - term_docids: query_term_docids, - .. - } = ctx; - // TODO: there is a faster way to compute this big + // TODO: there must be a faster way to compute this big // roaring bitmap expression let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes); @@ -125,17 +82,13 @@ pub fn resolve_query_graph( } let node_docids = match &node.data { - QueryNodeData::Term(located_term) => { - let term_docids = query_term_docids.get_query_term_docids( - index, - txn, - db_cache, - word_interner, - term_interner, - phrase_interner, - located_term.value, - )?; - predecessors_docids & term_docids + QueryNodeData::Term(LocatedQueryTermSubset { + term_subset, + positions: _, + term_ids: _, + }) => { + let phrase_docids = compute_query_term_subset_docids(ctx, term_subset)?; + predecessors_docids & phrase_docids } QueryNodeData::Deleted => { panic!() @@ -163,15 +116,11 @@ pub fn resolve_query_graph( panic!() } -pub fn resolve_phrase<'ctx>( - index: &Index, - txn: &'ctx RoTxn, - db_cache: &mut DatabaseCache<'ctx>, - word_interner: &DedupInterner, - phrase_interner: &DedupInterner, +pub fn compute_phrase_docids( + ctx: &mut SearchContext, phrase: Interned, ) -> Result { - let Phrase { words } = phrase_interner.get(phrase).clone(); + let Phrase { words } = ctx.phrase_interner.get(phrase).clone(); let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = words.len().min(3); @@ -195,14 +144,7 @@ pub fn resolve_phrase<'ctx>( .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) { if dist == 0 { - match db_cache.get_word_pair_proximity_docids( - index, - txn, - word_interner, - s1, - s2, - 1, - )? { + match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? { Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?), // If there are no documents for this pair, there will be no // results for the phrase query. @@ -211,14 +153,9 @@ pub fn resolve_phrase<'ctx>( } else { let mut bitmap = RoaringBitmap::new(); for dist in 0..=dist { - if let Some(m) = db_cache.get_word_pair_proximity_docids( - index, - txn, - word_interner, - s1, - s2, - dist as u8 + 1, - )? { + if let Some(m) = + ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)? + { bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?; } } diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index fb2c62f11..dc798e55d 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -4,7 +4,7 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::query_graph::QueryNodeData; -use super::resolve_query_graph::resolve_query_graph; +use super::resolve_query_graph::compute_query_graph_docids; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::{Result, TermsMatchingStrategy}; @@ -80,7 +80,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { logger.log_words_state(query_graph); - let this_bucket = resolve_query_graph(ctx, query_graph, universe)?; + let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?; let child_query_graph = query_graph.clone(); loop {