MeiliSearch/milli/src/search/new/resolve_query_graph.rs

#![allow(clippy::too_many_arguments)]
2023-03-08 09:55:53 +01:00
use std::collections::VecDeque;
use fxhash::FxHashMap;
2023-03-09 11:12:31 +01:00
use heed::{BytesDecode, RoTxn};
use roaring::RoaringBitmap;
2023-03-08 09:55:53 +01:00
2023-03-09 11:12:31 +01:00
use super::db_cache::DatabaseCache;
use super::interner::{Interned, Interner};
use super::query_graph::QUERY_GRAPH_NODE_LENGTH_LIMIT;
use super::query_term::{Phrase, QueryTerm};
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, QueryNode, SearchContext};
2023-03-09 11:12:31 +01:00
use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
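
/// Caches the document ids associated with each interned query phrase and term.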
#[derive(Default)]
pub struct QueryTermDocIdsCache {
    pub phrases: FxHashMap<Interned<Phrase>, RoaringBitmap>,
    pub terms: FxHashMap<Interned<QueryTerm>, RoaringBitmap>,
}

impl QueryTermDocIdsCache {
    /// Get the document ids associated with the given phrase
    pub fn get_phrase_docids<'s, 'ctx>(
        &'s mut self,
        index: &Index,
        txn: &'ctx RoTxn,
        db_cache: &mut DatabaseCache<'ctx>,
        word_interner: &Interner<String>,
        phrase_interner: &Interner<Phrase>,
        phrase: Interned<Phrase>,
    ) -> Result<&'s RoaringBitmap> {
        if self.phrases.contains_key(&phrase) {
            return Ok(&self.phrases[&phrase]);
        };
        let docids = resolve_phrase(index, txn, db_cache, word_interner, phrase_interner, phrase)?;
        let _ = self.phrases.insert(phrase, docids);
        let docids = &self.phrases[&phrase];
        Ok(docids)
    }
    /// Get the document ids associated with the given term
    pub fn get_query_term_docids<'s, 'ctx>(
        &'s mut self,
        index: &Index,
        txn: &'ctx RoTxn,
        db_cache: &mut DatabaseCache<'ctx>,
        word_interner: &Interner<String>,
        term_interner: &Interner<QueryTerm>,
        phrase_interner: &Interner<Phrase>,
        term_interned: Interned<QueryTerm>,
    ) -> Result<&'s RoaringBitmap> {
        if self.terms.contains_key(&term_interned) {
            return Ok(&self.terms[&term_interned]);
        };
        let mut docids = RoaringBitmap::new();
        let term = term_interner.get(term_interned);
        for word in term.all_single_words_except_prefix_db() {
            if let Some(word_docids) = db_cache.get_word_docids(index, txn, word_interner, word)? {
                docids |=
                    RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?;
            }
        }
        for phrase in term.all_phrases() {
            docids |= self.get_phrase_docids(
                index,
                txn,
                db_cache,
                word_interner,
                phrase_interner,
                phrase,
            )?;
        }
        if let Some(prefix) = term.use_prefix_db {
            if let Some(prefix_docids) =
                db_cache.get_word_prefix_docids(index, txn, word_interner, prefix)?
            {
                docids |=
                    RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?;
            }
        }
        let _ = self.terms.insert(term_interned, docids);
        let docids = &self.terms[&term_interned];
        Ok(docids)
    }
}
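
/// Resolve a query graph into the set of document ids matching it, restricted to `universe`.
///
/// Nodes are visited in breadth-first order: each node's docids are intersected with the union
/// of its predecessors' docids, and the accumulated set is returned once the end node is reached.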
pub fn resolve_query_graph<'ctx>(
    ctx: &mut SearchContext<'ctx>,
    q: &QueryGraph,
    universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
    let SearchContext {
        index,
        txn,
        db_cache,
        word_interner,
        phrase_interner,
        term_interner,
        term_docids: query_term_docids,
        ..
    } = ctx;
    // TODO: there is a faster way to compute this big
    // roaring bitmap expression
    let mut nodes_resolved = SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT);
    let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()];
    let mut next_nodes_to_visit = VecDeque::new();
    next_nodes_to_visit.push_back(q.root_node);
    while let Some(node) = next_nodes_to_visit.pop_front() {
        let predecessors = &q.edges[node as usize].predecessors;
        if !predecessors.is_subset(&nodes_resolved) {
            next_nodes_to_visit.push_back(node);
            continue;
        }
        // Take union of all predecessors
        let mut predecessors_docids = RoaringBitmap::new();
        for p in predecessors.iter() {
            predecessors_docids |= &path_nodes_docids[p as usize];
        }
        let n = &q.nodes[node as usize];
        let node_docids = match n {
            QueryNode::Term(located_term) => {
                let term_docids = query_term_docids.get_query_term_docids(
                    index,
                    txn,
                    db_cache,
                    word_interner,
                    term_interner,
                    phrase_interner,
                    located_term.value,
                )?;
                predecessors_docids & term_docids
            }
            QueryNode::Deleted => {
                panic!()
            }
            QueryNode::Start => universe.clone(),
            QueryNode::End => {
                return Ok(predecessors_docids);
            }
        };
        nodes_resolved.insert(node);
        path_nodes_docids[node as usize] = node_docids;
        for succ in q.edges[node as usize].successors.iter() {
            if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) {
                next_nodes_to_visit.push_back(succ);
            }
        }
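        // Free the docids of any predecessor whose successors have all been resolved;
        // they are no longer needed.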
        for prec in q.edges[node as usize].predecessors.iter() {
            if q.edges[prec as usize].successors.is_subset(&nodes_resolved) {
                path_nodes_docids[prec as usize].clear();
            }
        }
    }
    panic!()
}
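
/// Compute the document ids matching an interned phrase by intersecting the proximity docids
/// of every word pair inside sliding windows of at most 3 consecutive words.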
pub fn resolve_phrase<'ctx>(
    index: &Index,
    txn: &'ctx RoTxn,
    db_cache: &mut DatabaseCache<'ctx>,
    word_interner: &Interner<String>,
    phrase_interner: &Interner<Phrase>,
    phrase: Interned<Phrase>,
) -> Result<RoaringBitmap> {
    let Phrase { words } = phrase_interner.get(phrase).clone();
    let mut candidates = RoaringBitmap::new();
    let mut first_iter = true;
    let winsize = words.len().min(3);
    if words.is_empty() {
        return Ok(candidates);
    }
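    // Resolve the phrase window by window, intersecting the results as we go.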
    for win in words.windows(winsize) {
        // Get all the documents with the matching distance for each word pair.
        let mut bitmaps = Vec::with_capacity(winsize.pow(2));
        for (offset, &s1) in win
            .iter()
            .enumerate()
            .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
        {
            for (dist, &s2) in win
                .iter()
                .skip(offset + 1)
                .enumerate()
                .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
            {
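                // Adjacent words must be found at proximity 1; words that are `dist + 1`
                // positions apart may be found at any proximity from 1 to `dist + 1`.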
                if dist == 0 {
                    match db_cache.get_word_pair_proximity_docids(
                        index,
                        txn,
                        word_interner,
                        s1,
                        s2,
                        1,
                    )? {
                        Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),
                        // If there are no documents for this pair, there will be no
                        // results for the phrase query.
                        None => return Ok(RoaringBitmap::new()),
                    }
                } else {
                    let mut bitmap = RoaringBitmap::new();
                    for dist in 0..=dist {
                        if let Some(m) = db_cache.get_word_pair_proximity_docids(
                            index,
                            txn,
                            word_interner,
                            s1,
                            s2,
                            dist as u8 + 1,
                        )? {
                            bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?;
                        }
                    }
                    if bitmap.is_empty() {
                        return Ok(bitmap);
                    } else {
                        bitmaps.push(bitmap);
                    }
                }
            }
        }
        // We sort the bitmaps so that we perform the small intersections first, which is faster.
        bitmaps.sort_unstable_by_key(|a| a.len());
        for bitmap in bitmaps {
            if first_iter {
                candidates = bitmap;
                first_iter = false;
            } else {
                candidates &= bitmap;
            }
            // There will be no match; return early.
            if candidates.is_empty() {
                break;
            }
        }
    }
    Ok(candidates)
}