2023-03-09 11:12:31 +01:00
|
|
|
#![allow(clippy::too_many_arguments)]
|
|
|
|
|
2023-03-08 09:55:53 +01:00
|
|
|
use std::collections::VecDeque;
|
|
|
|
|
|
|
|
use fxhash::FxHashMap;
|
2023-05-02 18:54:09 +02:00
|
|
|
use roaring::{MultiOps, RoaringBitmap};
|
2023-03-08 09:55:53 +01:00
|
|
|
|
2023-03-30 11:10:38 +02:00
|
|
|
use super::interner::Interned;
|
2023-03-14 16:37:47 +01:00
|
|
|
use super::query_graph::QueryNodeData;
|
2023-03-30 11:10:38 +02:00
|
|
|
use super::query_term::{Phrase, QueryTermSubset};
|
2023-03-07 14:42:58 +01:00
|
|
|
use super::small_bitmap::SmallBitmap;
|
2023-04-11 18:27:41 +02:00
|
|
|
use super::{QueryGraph, SearchContext, Word};
|
2023-03-30 11:10:38 +02:00
|
|
|
use crate::search::new::query_term::LocatedQueryTermSubset;
|
2023-04-11 15:31:40 +02:00
|
|
|
use crate::Result;
|
2023-02-21 09:45:17 +01:00
|
|
|
|
|
|
|
#[derive(Default)]
|
2023-03-30 11:10:38 +02:00
|
|
|
pub struct PhraseDocIdsCache {
|
|
|
|
pub cache: FxHashMap<Interned<Phrase>, RoaringBitmap>,
|
2023-02-21 13:21:41 +01:00
|
|
|
}
|
2023-03-30 11:10:38 +02:00
|
|
|
impl<'ctx> SearchContext<'ctx> {
|
2023-03-09 11:12:31 +01:00
|
|
|
/// Get the document ids associated with the given phrase
|
2023-03-30 11:10:38 +02:00
|
|
|
pub fn get_phrase_docids(&mut self, phrase: Interned<Phrase>) -> Result<&RoaringBitmap> {
|
|
|
|
if self.phrase_docids.cache.contains_key(&phrase) {
|
|
|
|
return Ok(&self.phrase_docids.cache[&phrase]);
|
2023-02-21 13:21:41 +01:00
|
|
|
};
|
2023-03-30 11:10:38 +02:00
|
|
|
let docids = compute_phrase_docids(self, phrase)?;
|
|
|
|
let _ = self.phrase_docids.cache.insert(phrase, docids);
|
|
|
|
let docids = &self.phrase_docids.cache[&phrase];
|
2023-03-09 11:12:31 +01:00
|
|
|
Ok(docids)
|
|
|
|
}
|
2023-03-30 11:10:38 +02:00
|
|
|
}
|
|
|
|
pub fn compute_query_term_subset_docids(
|
|
|
|
ctx: &mut SearchContext,
|
|
|
|
term: &QueryTermSubset,
|
|
|
|
) -> Result<RoaringBitmap> {
|
2023-04-12 11:52:56 +02:00
|
|
|
// TODO Use the roaring::MultiOps trait
|
|
|
|
|
2023-03-30 11:10:38 +02:00
|
|
|
let mut docids = RoaringBitmap::new();
|
|
|
|
for word in term.all_single_words_except_prefix_db(ctx)? {
|
2023-04-11 18:27:41 +02:00
|
|
|
if let Some(word_docids) = ctx.word_docids(word)? {
|
2023-04-11 15:31:40 +02:00
|
|
|
docids |= word_docids;
|
2023-03-09 11:12:31 +01:00
|
|
|
}
|
2023-03-30 11:10:38 +02:00
|
|
|
}
|
|
|
|
for phrase in term.all_phrases(ctx)? {
|
|
|
|
docids |= ctx.get_phrase_docids(phrase)?;
|
|
|
|
}
|
2023-03-09 11:12:31 +01:00
|
|
|
|
2023-03-30 11:10:38 +02:00
|
|
|
if let Some(prefix) = term.use_prefix_db(ctx) {
|
2023-04-11 22:06:10 +02:00
|
|
|
if let Some(prefix_docids) = ctx.word_prefix_docids(prefix)? {
|
2023-04-11 15:31:40 +02:00
|
|
|
docids |= prefix_docids;
|
2023-03-09 11:12:31 +01:00
|
|
|
}
|
|
|
|
}
|
2023-03-30 11:10:38 +02:00
|
|
|
|
|
|
|
Ok(docids)
|
2023-02-21 09:45:17 +01:00
|
|
|
}
|
|
|
|
|
2023-04-12 11:52:56 +02:00
|
|
|
pub fn compute_query_term_subset_docids_within_field_id(
|
|
|
|
ctx: &mut SearchContext,
|
|
|
|
term: &QueryTermSubset,
|
|
|
|
fid: u16,
|
|
|
|
) -> Result<RoaringBitmap> {
|
|
|
|
// TODO Use the roaring::MultiOps trait
|
|
|
|
|
|
|
|
let mut docids = RoaringBitmap::new();
|
|
|
|
for word in term.all_single_words_except_prefix_db(ctx)? {
|
2023-04-12 16:53:11 +02:00
|
|
|
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? {
|
|
|
|
docids |= word_fid_docids;
|
2023-04-12 11:52:56 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for phrase in term.all_phrases(ctx)? {
|
2023-04-24 12:11:25 +02:00
|
|
|
// There may be false positives when resolving a phrase, so we're not
|
|
|
|
// guaranteed that all of its words are within a single fid.
|
|
|
|
// TODO: fix this?
|
|
|
|
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
|
|
|
|
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
|
2023-05-01 16:26:01 +02:00
|
|
|
docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids;
|
2023-04-12 11:52:56 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if let Some(word_prefix) = term.use_prefix_db(ctx) {
|
2023-04-12 16:53:11 +02:00
|
|
|
if let Some(word_fid_docids) =
|
|
|
|
ctx.get_db_word_prefix_fid_docids(word_prefix.interned(), fid)?
|
|
|
|
{
|
|
|
|
docids |= word_fid_docids;
|
2023-04-12 11:52:56 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(docids)
|
|
|
|
}
|
|
|
|
|
2023-04-13 10:46:09 +02:00
|
|
|
pub fn compute_query_term_subset_docids_within_position(
|
|
|
|
ctx: &mut SearchContext,
|
|
|
|
term: &QueryTermSubset,
|
|
|
|
position: u16,
|
|
|
|
) -> Result<RoaringBitmap> {
|
|
|
|
// TODO Use the roaring::MultiOps trait
|
|
|
|
let mut docids = RoaringBitmap::new();
|
|
|
|
for word in term.all_single_words_except_prefix_db(ctx)? {
|
|
|
|
if let Some(word_position_docids) =
|
|
|
|
ctx.get_db_word_position_docids(word.interned(), position)?
|
|
|
|
{
|
|
|
|
docids |= word_position_docids;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for phrase in term.all_phrases(ctx)? {
|
2023-04-24 12:11:25 +02:00
|
|
|
// It's difficult to know the expected position of the words in the phrase,
|
|
|
|
// so instead we just check the first one.
|
|
|
|
// TODO: fix this?
|
|
|
|
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
|
|
|
|
if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
|
2023-05-01 16:26:01 +02:00
|
|
|
docids |= ctx.get_phrase_docids(phrase)? & word_position_docids
|
2023-04-13 10:46:09 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if let Some(word_prefix) = term.use_prefix_db(ctx) {
|
|
|
|
if let Some(word_position_docids) =
|
|
|
|
ctx.get_db_word_prefix_position_docids(word_prefix.interned(), position)?
|
|
|
|
{
|
|
|
|
docids |= word_position_docids;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(docids)
|
|
|
|
}
|
|
|
|
|
2023-05-02 18:54:09 +02:00
|
|
|
/// Returns the subset of the input universe that satisfies the contraints of the input query graph.
|
2023-03-30 11:10:38 +02:00
|
|
|
pub fn compute_query_graph_docids(
|
2023-03-23 09:15:57 +01:00
|
|
|
ctx: &mut SearchContext,
|
2023-02-21 09:45:17 +01:00
|
|
|
q: &QueryGraph,
|
|
|
|
universe: &RoaringBitmap,
|
|
|
|
) -> Result<RoaringBitmap> {
|
2023-03-30 11:10:38 +02:00
|
|
|
// TODO: there must be a faster way to compute this big
|
2023-02-21 09:45:17 +01:00
|
|
|
// roaring bitmap expression
|
|
|
|
|
2023-03-14 16:37:47 +01:00
|
|
|
let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes);
|
|
|
|
let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new());
|
2023-02-21 09:45:17 +01:00
|
|
|
|
|
|
|
let mut next_nodes_to_visit = VecDeque::new();
|
2023-03-09 11:12:31 +01:00
|
|
|
next_nodes_to_visit.push_back(q.root_node);
|
2023-02-21 09:45:17 +01:00
|
|
|
|
2023-03-14 16:37:47 +01:00
|
|
|
while let Some(node_id) = next_nodes_to_visit.pop_front() {
|
|
|
|
let node = q.nodes.get(node_id);
|
|
|
|
let predecessors = &node.predecessors;
|
2023-02-21 09:45:17 +01:00
|
|
|
if !predecessors.is_subset(&nodes_resolved) {
|
2023-03-14 16:37:47 +01:00
|
|
|
next_nodes_to_visit.push_back(node_id);
|
2023-02-21 09:45:17 +01:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// Take union of all predecessors
|
2023-05-02 18:54:09 +02:00
|
|
|
let predecessors_docids =
|
|
|
|
MultiOps::union(predecessors.iter().map(|p| path_nodes_docids.get(p)));
|
2023-02-21 09:45:17 +01:00
|
|
|
|
2023-03-14 16:37:47 +01:00
|
|
|
let node_docids = match &node.data {
|
2023-03-30 11:10:38 +02:00
|
|
|
QueryNodeData::Term(LocatedQueryTermSubset {
|
|
|
|
term_subset,
|
|
|
|
positions: _,
|
|
|
|
term_ids: _,
|
|
|
|
}) => {
|
2023-04-25 16:49:08 +02:00
|
|
|
let node_docids = compute_query_term_subset_docids(ctx, term_subset)?;
|
|
|
|
predecessors_docids & node_docids
|
2023-02-21 09:45:17 +01:00
|
|
|
}
|
2023-03-14 16:37:47 +01:00
|
|
|
QueryNodeData::Deleted => {
|
2023-02-21 13:21:41 +01:00
|
|
|
panic!()
|
2023-02-21 09:45:17 +01:00
|
|
|
}
|
2023-03-14 16:37:47 +01:00
|
|
|
QueryNodeData::Start => universe.clone(),
|
|
|
|
QueryNodeData::End => {
|
2023-02-21 09:45:17 +01:00
|
|
|
return Ok(predecessors_docids);
|
|
|
|
}
|
|
|
|
};
|
2023-03-14 16:37:47 +01:00
|
|
|
nodes_resolved.insert(node_id);
|
|
|
|
*path_nodes_docids.get_mut(node_id) = node_docids;
|
2023-02-21 09:45:17 +01:00
|
|
|
|
2023-03-14 16:37:47 +01:00
|
|
|
for succ in node.successors.iter() {
|
2023-02-21 12:55:44 +01:00
|
|
|
if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) {
|
|
|
|
next_nodes_to_visit.push_back(succ);
|
2023-02-21 09:45:17 +01:00
|
|
|
}
|
|
|
|
}
|
2023-02-21 13:57:34 +01:00
|
|
|
|
2023-03-14 16:37:47 +01:00
|
|
|
for prec in node.predecessors.iter() {
|
|
|
|
if q.nodes.get(prec).successors.is_subset(&nodes_resolved) {
|
|
|
|
path_nodes_docids.get_mut(prec).clear();
|
2023-02-21 09:45:17 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
panic!()
|
|
|
|
}
|
2023-03-02 21:27:57 +01:00
|
|
|
|
2023-03-30 11:10:38 +02:00
|
|
|
pub fn compute_phrase_docids(
|
|
|
|
ctx: &mut SearchContext,
|
2023-03-09 11:12:31 +01:00
|
|
|
phrase: Interned<Phrase>,
|
|
|
|
) -> Result<RoaringBitmap> {
|
2023-03-30 11:10:38 +02:00
|
|
|
let Phrase { words } = ctx.phrase_interner.get(phrase).clone();
|
2023-03-02 21:27:57 +01:00
|
|
|
|
|
|
|
if words.is_empty() {
|
2023-03-30 14:48:12 +02:00
|
|
|
return Ok(RoaringBitmap::new());
|
|
|
|
}
|
2023-04-27 13:30:09 +02:00
|
|
|
let mut candidates = RoaringBitmap::new();
|
|
|
|
for word in words.iter().flatten().copied() {
|
|
|
|
if let Some(word_docids) = ctx.word_docids(Word::Original(word))? {
|
|
|
|
candidates |= word_docids;
|
2023-03-30 14:48:12 +02:00
|
|
|
} else {
|
|
|
|
return Ok(RoaringBitmap::new());
|
|
|
|
}
|
2023-03-02 21:27:57 +01:00
|
|
|
}
|
|
|
|
|
2023-03-30 14:48:12 +02:00
|
|
|
let winsize = words.len().min(3);
|
|
|
|
|
2023-03-02 21:27:57 +01:00
|
|
|
for win in words.windows(winsize) {
|
|
|
|
// Get all the documents with the matching distance for each word pairs.
|
|
|
|
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
|
2023-03-06 19:21:55 +01:00
|
|
|
for (offset, &s1) in win
|
2023-03-02 21:27:57 +01:00
|
|
|
.iter()
|
|
|
|
.enumerate()
|
|
|
|
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
|
|
|
|
{
|
2023-03-06 19:21:55 +01:00
|
|
|
for (dist, &s2) in win
|
2023-03-02 21:27:57 +01:00
|
|
|
.iter()
|
|
|
|
.skip(offset + 1)
|
|
|
|
.enumerate()
|
|
|
|
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
|
|
|
|
{
|
|
|
|
if dist == 0 {
|
2023-03-30 11:10:38 +02:00
|
|
|
match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? {
|
2023-04-11 15:31:40 +02:00
|
|
|
Some(m) => bitmaps.push(m),
|
2023-03-02 21:27:57 +01:00
|
|
|
// If there are no documents for this pair, there will be no
|
|
|
|
// results for the phrase query.
|
|
|
|
None => return Ok(RoaringBitmap::new()),
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
let mut bitmap = RoaringBitmap::new();
|
|
|
|
for dist in 0..=dist {
|
2023-03-30 11:10:38 +02:00
|
|
|
if let Some(m) =
|
|
|
|
ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
|
|
|
|
{
|
2023-04-11 15:31:40 +02:00
|
|
|
bitmap |= m;
|
2023-03-02 21:27:57 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if bitmap.is_empty() {
|
|
|
|
return Ok(bitmap);
|
|
|
|
} else {
|
|
|
|
bitmaps.push(bitmap);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// We sort the bitmaps so that we perform the small intersections first, which is faster.
|
|
|
|
bitmaps.sort_unstable_by_key(|a| a.len());
|
|
|
|
|
|
|
|
for bitmap in bitmaps {
|
2023-04-27 13:30:09 +02:00
|
|
|
candidates &= bitmap;
|
|
|
|
|
2023-03-02 21:27:57 +01:00
|
|
|
// There will be no match, return early
|
|
|
|
if candidates.is_empty() {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(candidates)
|
|
|
|
}
|