MeiliSearch/milli/src/search/new/resolve_query_graph.rs

#![allow(clippy::too_many_arguments)]

use std::collections::VecDeque;

use fxhash::FxHashMap;
use roaring::{MultiOps, RoaringBitmap};

use super::interner::Interned;
use super::query_graph::QueryNodeData;
use super::query_term::{Phrase, QueryTermSubset};
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, SearchContext, Word};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::Result;

/// Cache of the document ids already computed for interned phrases.
///
/// It is filled lazily by `SearchContext::get_phrase_docids`, which stores the
/// result of `compute_phrase_docids` the first time a phrase is requested.
#[derive(Default)]
pub struct PhraseDocIdsCache {
    /// Maps an interned phrase to the bitmap of document ids computed for it.
    pub cache: FxHashMap<Interned<Phrase>, RoaringBitmap>,
}
impl<'ctx> SearchContext<'ctx> {
    /// Returns the document ids associated with the given phrase.
    ///
    /// The bitmap is computed with [`compute_phrase_docids`] the first time a
    /// phrase is requested and stored in `self.phrase_docids`; subsequent calls
    /// for the same phrase return the stored bitmap.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while computing the phrase docids. Nothing
    /// is inserted in the cache in that case.
    pub fn get_phrase_docids(&mut self, phrase: Interned<Phrase>) -> Result<&RoaringBitmap> {
        let already_cached = self.phrase_docids.cache.contains_key(&phrase);
        if !already_cached {
            // The cache cannot be borrowed while `self` is handed to
            // `compute_phrase_docids`, hence the separate check / insert / lookup.
            let computed = compute_phrase_docids(self, phrase)?;
            let _ = self.phrase_docids.cache.insert(phrase, computed);
        }
        Ok(&self.phrase_docids.cache[&phrase])
    }
}
/// Computes the ids of the documents matching the given query term subset.
///
/// The result is the union of:
/// - the docids of every word returned by `all_single_words_except_prefix_db`,
/// - the docids of every phrase returned by `all_phrases`,
/// - the prefix docids of the word returned by `use_prefix_db`, if any.
///
/// # Errors
///
/// Propagates any error returned by the term subset or by the search context.
pub fn compute_query_term_subset_docids(
    ctx: &mut SearchContext,
    term: &QueryTermSubset,
) -> Result<RoaringBitmap> {
    // TODO Use the roaring::MultiOps trait
    let mut matching = RoaringBitmap::new();

    // Single words: a word without any entry simply contributes nothing.
    let single_words = term.all_single_words_except_prefix_db(ctx)?;
    for single_word in single_words {
        if let Some(bitmap) = ctx.word_docids(single_word)? {
            matching |= bitmap;
        }
    }

    // Phrases: resolved through the phrase docids cache of the context.
    let phrases = term.all_phrases(ctx)?;
    for phrase_id in phrases {
        let phrase_bitmap = ctx.get_phrase_docids(phrase_id)?;
        matching |= phrase_bitmap;
    }

    // Prefix: only looked up when the subset relies on the prefix database.
    let prefix_bitmap = match term.use_prefix_db(ctx) {
        Some(prefix) => ctx.word_prefix_docids(prefix)?,
        None => None,
    };
    if let Some(bitmap) = prefix_bitmap {
        matching |= bitmap;
    }

    Ok(matching)
}

/// Computes the ids of the documents matching the given query term subset
/// within the field identified by `fid`.
///
/// The result is the union of:
/// - the word/fid docids of every word returned by `all_single_words_except_prefix_db`,
/// - for every phrase, the phrase docids intersected with the word/fid docids
///   of the first word of the phrase that is not `None`,
/// - the word-prefix/fid docids of the word returned by `use_prefix_db`, if any.
///
/// # Errors
///
/// Propagates any error returned by the term subset or by the search context.
pub fn compute_query_term_subset_docids_within_field_id(
    ctx: &mut SearchContext,
    term: &QueryTermSubset,
    fid: u16,
) -> Result<RoaringBitmap> {
    // TODO Use the roaring::MultiOps trait
    let mut matching = RoaringBitmap::new();

    let single_words = term.all_single_words_except_prefix_db(ctx)?;
    for single_word in single_words {
        if let Some(bitmap) = ctx.get_db_word_fid_docids(single_word.interned(), fid)? {
            matching |= bitmap;
        }
    }

    let phrases = term.all_phrases(ctx)?;
    for phrase_id in phrases {
        // There may be false positives when resolving a phrase, so we're not
        // guaranteed that all of its words are within a single fid.
        // TODO: fix this?
        let first_word = phrase_id.words(ctx).iter().flatten().next().copied();
        if let Some(first_word) = first_word {
            // The phrase docids are only fetched when the first word has an
            // entry for this fid.
            if let Some(fid_bitmap) = ctx.get_db_word_fid_docids(first_word, fid)? {
                let phrase_bitmap = ctx.get_phrase_docids(phrase_id)?;
                matching |= phrase_bitmap & fid_bitmap;
            }
        }
    }

    let prefix_bitmap = match term.use_prefix_db(ctx) {
        Some(word_prefix) => ctx.get_db_word_prefix_fid_docids(word_prefix.interned(), fid)?,
        None => None,
    };
    if let Some(bitmap) = prefix_bitmap {
        matching |= bitmap;
    }

    Ok(matching)
}

/// Computes the ids of the documents matching the given query term subset
/// at the given `position`.
///
/// The result is the union of:
/// - the word/position docids of every word returned by `all_single_words_except_prefix_db`,
/// - for every phrase, the phrase docids intersected with the word/position
///   docids of the first word of the phrase that is not `None`,
/// - the word-prefix/position docids of the word returned by `use_prefix_db`, if any.
///
/// # Errors
///
/// Propagates any error returned by the term subset or by the search context.
pub fn compute_query_term_subset_docids_within_position(
    ctx: &mut SearchContext,
    term: &QueryTermSubset,
    position: u16,
) -> Result<RoaringBitmap> {
    // TODO Use the roaring::MultiOps trait
    let mut matching = RoaringBitmap::new();

    let single_words = term.all_single_words_except_prefix_db(ctx)?;
    for single_word in single_words {
        if let Some(bitmap) = ctx.get_db_word_position_docids(single_word.interned(), position)? {
            matching |= bitmap;
        }
    }

    let phrases = term.all_phrases(ctx)?;
    for phrase_id in phrases {
        // It's difficult to know the expected position of the words in the phrase,
        // so instead we just check the first one.
        // TODO: fix this?
        let first_word = phrase_id.words(ctx).iter().flatten().next().copied();
        if let Some(first_word) = first_word {
            // The phrase docids are only fetched when the first word has an
            // entry for this position.
            if let Some(position_bitmap) = ctx.get_db_word_position_docids(first_word, position)? {
                let phrase_bitmap = ctx.get_phrase_docids(phrase_id)?;
                matching |= phrase_bitmap & position_bitmap;
            }
        }
    }

    let prefix_bitmap = match term.use_prefix_db(ctx) {
        Some(word_prefix) => {
            ctx.get_db_word_prefix_position_docids(word_prefix.interned(), position)?
        }
        None => None,
    };
    if let Some(bitmap) = prefix_bitmap {
        matching |= bitmap;
    }

    Ok(matching)
}

/// Returns the subset of the input universe that satisfies the constraints of the input query graph.
///
/// The graph is walked from `q.root_node`. A node is resolved only once all of
/// its predecessors are resolved; its docids are then the union of the docids
/// of its predecessors, intersected with the docids of its term subset. The
/// start node is assigned a copy of `universe`, and reaching the end node
/// returns the union of the docids of its predecessors.
///
/// # Errors
///
/// Propagates any error returned while computing the docids of a term subset.
///
/// # Panics
///
/// Panics if a deleted node is visited, or if the queue of nodes to visit is
/// exhausted before the end node is reached.
pub fn compute_query_graph_docids(
    ctx: &mut SearchContext,
    q: &QueryGraph,
    universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
    // TODO: there must be a faster way to compute this big
    // roaring bitmap expression

    // Set of the nodes whose docids have already been computed.
    let mut resolved = SmallBitmap::for_interned_values_in(&q.nodes);
    // Docids computed for each node, empty until the node is resolved.
    let mut docids_per_node = q.nodes.map(|_| RoaringBitmap::new());

    let mut queue = VecDeque::new();
    queue.push_back(q.root_node);

    while let Some(current) = queue.pop_front() {
        let node = q.nodes.get(current);

        // Some predecessors are still pending: put the node back at the end of
        // the queue and try again later.
        if !node.predecessors.is_subset(&resolved) {
            queue.push_back(current);
            continue;
        }

        // Take union of all predecessors
        let incoming_bitmaps = node.predecessors.iter().map(|pred| docids_per_node.get(pred));
        let incoming = MultiOps::union(incoming_bitmaps);

        let current_docids = match &node.data {
            QueryNodeData::Start => universe.clone(),
            QueryNodeData::End => return Ok(incoming),
            QueryNodeData::Deleted => panic!(),
            QueryNodeData::Term(LocatedQueryTermSubset {
                term_subset,
                positions: _,
                term_ids: _,
            }) => incoming & compute_query_term_subset_docids(ctx, term_subset)?,
        };
        resolved.insert(current);
        *docids_per_node.get_mut(current) = current_docids;

        // Schedule the successors that are neither queued nor resolved yet.
        for successor in node.successors.iter() {
            let already_handled = queue.contains(&successor) || resolved.contains(successor);
            if !already_handled {
                queue.push_back(successor);
            }
        }

        // A predecessor whose successors are all resolved is never read again,
        // so its bitmap can be emptied.
        for predecessor in node.predecessors.iter() {
            let fully_consumed = q.nodes.get(predecessor).successors.is_subset(&resolved);
            if fully_consumed {
                docids_per_node.get_mut(predecessor).clear();
            }
        }
    }
    panic!()
}

/// Computes the ids of the documents matching the given phrase.
///
/// The candidates start as the union of the docids of every word of the
/// phrase. They are then intersected, for each window of at most three
/// consecutive phrase slots, with the word-pair-proximity docids of every
/// ordered pair of words in that window.
///
/// An empty bitmap is returned when:
/// - the phrase contains no words,
/// - one of its words has no entry in the word docids database,
/// - one of the word pairs has no document at the required proximity,
/// - the intersection becomes empty.
///
/// `None` slots of the phrase are skipped but still count towards the distance
/// between the surrounding words.
///
/// # Errors
///
/// Propagates any error returned by the search context while reading the
/// databases. Once the candidates are empty, the remaining windows are not
/// resolved, so errors they would have raised are not reported.
pub fn compute_phrase_docids(
    ctx: &mut SearchContext,
    phrase: Interned<Phrase>,
) -> Result<RoaringBitmap> {
    let Phrase { words } = ctx.phrase_interner.get(phrase).clone();

    if words.is_empty() {
        return Ok(RoaringBitmap::new());
    }
    // Start from every document containing at least one word of the phrase.
    // A word that is absent from the database makes the phrase unmatchable.
    let mut candidates = RoaringBitmap::new();
    for word in words.iter().flatten().copied() {
        if let Some(word_docids) = ctx.word_docids(Word::Original(word))? {
            candidates |= word_docids;
        } else {
            return Ok(RoaringBitmap::new());
        }
    }

    let winsize = words.len().min(3);

    for win in words.windows(winsize) {
        // Get all the documents with the matching distance for each word pairs.
        let mut bitmaps = Vec::with_capacity(winsize.pow(2));
        for (offset, &s1) in win
            .iter()
            .enumerate()
            .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
        {
            // `dist` is the number of slots between `s1` and `s2` in the window
            // (0 when `s2` directly follows `s1`).
            for (dist, &s2) in win
                .iter()
                .skip(offset + 1)
                .enumerate()
                .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
            {
                if dist == 0 {
                    match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? {
                        Some(m) => bitmaps.push(m),
                        // If there are no documents for this pair, there will be no
                        // results for the phrase query.
                        None => return Ok(RoaringBitmap::new()),
                    }
                } else {
                    // Accept every proximity from 1 up to `dist + 1` for this pair.
                    let mut bitmap = RoaringBitmap::new();
                    for dist in 0..=dist {
                        if let Some(m) =
                            ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
                        {
                            bitmap |= m;
                        }
                    }
                    if bitmap.is_empty() {
                        return Ok(bitmap);
                    } else {
                        bitmaps.push(bitmap);
                    }
                }
            }
        }

        // We sort the bitmaps so that we perform the small intersections first, which is faster.
        bitmaps.sort_unstable_by_key(|a| a.len());

        for bitmap in bitmaps {
            candidates &= bitmap;

            // There will be no match: return right away instead of resolving the
            // remaining windows, which could only yield an empty result.
            if candidates.is_empty() {
                return Ok(candidates);
            }
        }
    }
    Ok(candidates)
}