MeiliSearch/milli/src/search/new/resolve_query_graph.rs

use std::collections::VecDeque;

use fxhash::FxHashMap;
use heed::{BytesDecode, RoTxn};
use roaring::{MultiOps, RoaringBitmap};

use super::db_cache::DatabaseCache;
use super::query_term::{Phrase, QueryTerm, WordDerivations};
use super::{QueryGraph, QueryNode};

use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};

// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
/// Cache of the document ids computed for the term nodes of a query graph.
///
/// It is filled lazily by `get_docids`, so that the docids of a given node
/// are only computed once.
#[derive(Default)]
pub struct NodeDocIdsCache {
    /// Maps a query graph node index to the docids matching the term of that node.
    pub cache: FxHashMap<u32, RoaringBitmap>,
}
impl NodeDocIdsCache {
    fn get_docids<'cache, 'transaction>(
        &'cache mut self,
        index: &Index,
        txn: &'transaction RoTxn,
        db_cache: &mut DatabaseCache<'transaction>,
        term: &QueryTerm,
        node_idx: u32,
    ) -> Result<&'cache RoaringBitmap> {
        if self.cache.contains_key(&node_idx) {
            return Ok(&self.cache[&node_idx]);
        };
        let docids = match term {
            QueryTerm::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase)?,
            QueryTerm::Word {
                derivations:
                    WordDerivations {
                        original,
                        zero_typo,
                        one_typo,
                        two_typos,
                        use_prefix_db,
                        synonyms,
                        split_words,
                    },
            } => {
                let mut or_docids = vec![];
                for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) {
                    if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? {
                        or_docids.push(word_docids);
                    }
                }
                if *use_prefix_db {
                    if let Some(prefix_docids) =
                        db_cache.get_prefix_docids(index, txn, original.as_str())?
                    {
                        or_docids.push(prefix_docids);
                    }
                }
                let mut docids = or_docids
                    .into_iter()
                    .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap())
                    .collect::<Vec<_>>();
                for synonym in synonyms {
                    // TODO: cache resolve_phrase?
                    docids.push(resolve_phrase(index, txn, db_cache, synonym)?);
                }
                if let Some((left, right)) = split_words {
                    if let Some(split_word_docids) =
                        db_cache.get_word_pair_proximity_docids(index, txn, left, right, 1)?
                    {
                        docids.push(CboRoaringBitmapCodec::deserialize_from(split_word_docids)?);
                    }
                }

                MultiOps::union(docids)
            }
        };
        let _ = self.cache.insert(node_idx, docids);
        let docids = &self.cache[&node_idx];
        Ok(docids)
    }
}

/// Computes the docids reaching the end node of the query graph `q`.
///
/// The graph is traversed from `q.root_node`. A node is only processed once all
/// of its predecessors have been resolved; until then it is put back at the end
/// of the queue. The docids of a node are the union of the docids of its
/// predecessors, further restricted as follows:
/// - a start node yields `universe`,
/// - a term node intersects that union with the docids of its term, obtained
///   through `node_docids_cache`,
/// - the end node returns that union as the final result.
///
/// # Errors
///
/// Returns an error if the docids of a term node cannot be computed.
///
/// # Panics
///
/// Panics if a deleted node is visited, or if the traversal runs out of nodes
/// without reaching the end node.
///
/// NOTE(review): a node whose predecessors can never all be resolved is
/// re-queued forever; this relies on the graph being well-formed.
pub fn resolve_query_graph<'transaction>(
    index: &Index,
    txn: &'transaction RoTxn,
    db_cache: &mut DatabaseCache<'transaction>,
    node_docids_cache: &mut NodeDocIdsCache,
    q: &QueryGraph,
    universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
    // TODO: there is definitely a faster way to compute this big
    // roaring bitmap expression

    // Indices of the nodes whose docids have already been computed.
    let mut nodes_resolved = RoaringBitmap::new();
    // For each node index, the docids that can reach that node from the root.
    let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()];

    let mut next_nodes_to_visit = VecDeque::new();
    next_nodes_to_visit.push_back(q.root_node);

    while let Some(node) = next_nodes_to_visit.pop_front() {
        let predecessors = &q.edges[node as usize].predecessors;
        if !predecessors.is_subset(&nodes_resolved) {
            // Some predecessors are not resolved yet, retry this node later.
            next_nodes_to_visit.push_back(node);
            continue;
        }
        // Take union of all predecessors
        let predecessors_iter = predecessors.iter().map(|p| &path_nodes_docids[p as usize]);
        let predecessors_docids = MultiOps::union(predecessors_iter);

        let node_docids = match &q.nodes[node as usize] {
            QueryNode::Term(located_term) => {
                let term = &located_term.value;
                let derivations_docids =
                    node_docids_cache.get_docids(index, txn, db_cache, term, node)?;
                predecessors_docids & derivations_docids
            }
            QueryNode::Deleted => {
                panic!("visited deleted node {node} while resolving the query graph")
            }
            QueryNode::Start => universe.clone(),
            QueryNode::End => {
                return Ok(predecessors_docids);
            }
        };
        nodes_resolved.insert(node);
        path_nodes_docids[node as usize] = node_docids;

        // Schedule the successors that are neither resolved nor already queued.
        for succ in q.edges[node as usize].successors.iter() {
            if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) {
                next_nodes_to_visit.push_back(succ);
            }
        }

        // The docids of a predecessor are no longer needed once all of its
        // successors are resolved, so its bitmap is emptied.
        // This is currently slow but could easily be implemented very efficiently
        for prec in q.edges[node as usize].predecessors.iter() {
            if q.edges[prec as usize].successors.is_subset(&nodes_resolved) {
                path_nodes_docids[prec as usize].clear();
            }
        }
    }

    panic!("the end node of the query graph was never reached")
}

pub fn resolve_phrase<'transaction>(
    index: &Index,
    txn: &'transaction RoTxn,
    db_cache: &mut DatabaseCache<'transaction>,
    phrase: &Phrase,
) -> Result<RoaringBitmap> {
    let Phrase { words } = phrase;
    let mut candidates = RoaringBitmap::new();
    let mut first_iter = true;
    let winsize = words.len().min(3);

    if words.is_empty() {
        return Ok(candidates);
    }

    for win in words.windows(winsize) {
        // Get all the documents with the matching distance for each word pairs.
        let mut bitmaps = Vec::with_capacity(winsize.pow(2));
        for (offset, s1) in win
            .iter()
            .enumerate()
            .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
        {
            for (dist, s2) in win
                .iter()
                .skip(offset + 1)
                .enumerate()
                .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
            {
                if dist == 0 {
                    match db_cache.get_word_pair_proximity_docids(index, txn, s1, s2, 1)? {
                        Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),
                        // If there are no documents for this pair, there will be no
                        // results for the phrase query.
                        None => return Ok(RoaringBitmap::new()),
                    }
                } else {
                    let mut bitmap = RoaringBitmap::new();
                    for dist in 0..=dist {
                        if let Some(m) = db_cache.get_word_pair_proximity_docids(
                            index,
                            txn,
                            s1,
                            s2,
                            dist as u8 + 1,
                        )? {
                            bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?;
                        }
                    }
                    if bitmap.is_empty() {
                        return Ok(bitmap);
                    } else {
                        bitmaps.push(bitmap);
                    }
                }
            }
        }

        // We sort the bitmaps so that we perform the small intersections first, which is faster.
        bitmaps.sort_unstable_by_key(|a| a.len());

        for bitmap in bitmaps {
            if first_iter {
                candidates = bitmap;
                first_iter = false;
            } else {
                candidates &= bitmap;
            }
            // There will be no match, return early
            if candidates.is_empty() {
                break;
            }
        }
    }
    Ok(candidates)
}
Remove warnings 2023-02-28 11:49:24 +01:00			`use std::collections::VecDeque;`
Remove noise in codebase 2023-02-21 13:57:34 +01:00
Remove EdgeIndex and NodeIndex types, prefer u32 instead 2023-02-21 12:55:44 +01:00			`use fxhash::FxHashMap;`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`use heed::{BytesDecode, RoTxn};`
			`use roaring::{MultiOps, RoaringBitmap};`

			`use super::db_cache::DatabaseCache;`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00			`use super::query_term::{Phrase, QueryTerm, WordDerivations};`
			`use super::{QueryGraph, QueryNode};`

			`use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00
			`// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.`
			`#[derive(Default)]`
			`pub struct NodeDocIdsCache {`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`pub cache: FxHashMap<u32, RoaringBitmap>,`
			`}`
			`impl NodeDocIdsCache {`
			`fn get_docids<'cache, 'transaction>(`
			`&'cache mut self,`
			`index: &Index,`
			`txn: &'transaction RoTxn,`
			`db_cache: &mut DatabaseCache<'transaction>,`
			`term: &QueryTerm,`
			`node_idx: u32,`
			`) -> Result<&'cache RoaringBitmap> {`
			`if self.cache.contains_key(&node_idx) {`
			`return Ok(&self.cache[&node_idx]);`
			`};`
			`let docids = match term {`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00			`QueryTerm::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase)?,`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`QueryTerm::Word {`
			`derivations:`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00			`WordDerivations {`
			`original,`
			`zero_typo,`
			`one_typo,`
			`two_typos,`
			`use_prefix_db,`
			`synonyms,`
			`split_words,`
			`},`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`} => {`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00			`let mut or_docids = vec![];`
			`for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) {`
			`if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? {`
			`or_docids.push(word_docids);`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`}`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00			`}`
			`if *use_prefix_db {`
			`if let Some(prefix_docids) =`
			`db_cache.get_prefix_docids(index, txn, original.as_str())?`
			`{`
			`or_docids.push(prefix_docids);`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`}`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00			`}`
			`let mut docids = or_docids`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`.into_iter()`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00			`.map(\|slice\| RoaringBitmapCodec::bytes_decode(slice).unwrap())`
			`.collect::<Vec<_>>();`
			`for synonym in synonyms {`
			`// TODO: cache resolve_phrase?`
			`docids.push(resolve_phrase(index, txn, db_cache, synonym)?);`
			`}`
			`if let Some((left, right)) = split_words {`
			`if let Some(split_word_docids) =`
			`db_cache.get_word_pair_proximity_docids(index, txn, left, right, 1)?`
			`{`
			`docids.push(CboRoaringBitmapCodec::deserialize_from(split_word_docids)?);`
			`}`
			`}`

			`MultiOps::union(docids)`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`}`
			`};`
			`let _ = self.cache.insert(node_idx, docids);`
			`let docids = &self.cache[&node_idx];`
			`Ok(docids)`
			`}`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`}`

			`pub fn resolve_query_graph<'transaction>(`
			`index: &Index,`
			`txn: &'transaction RoTxn,`
			`db_cache: &mut DatabaseCache<'transaction>,`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`node_docids_cache: &mut NodeDocIdsCache,`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`q: &QueryGraph,`
			`universe: &RoaringBitmap,`
			`) -> Result<RoaringBitmap> {`
			`// TODO: there is definitely a faster way to compute this big`
			`// roaring bitmap expression`

Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00			`let mut nodes_resolved = RoaringBitmap::new();`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()];`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00
			`let mut next_nodes_to_visit = VecDeque::new();`
			`next_nodes_to_visit.push_front(q.root_node);`

			`while let Some(node) = next_nodes_to_visit.pop_front() {`
Remove EdgeIndex and NodeIndex types, prefer u32 instead 2023-02-21 12:55:44 +01:00			`let predecessors = &q.edges[node as usize].predecessors;`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`if !predecessors.is_subset(&nodes_resolved) {`
			`next_nodes_to_visit.push_back(node);`
			`continue;`
			`}`
			`// Take union of all predecessors`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`let predecessors_iter = predecessors.iter().map(\|p\| &path_nodes_docids[p as usize]);`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`let predecessors_docids = MultiOps::union(predecessors_iter);`

Remove EdgeIndex and NodeIndex types, prefer u32 instead 2023-02-21 12:55:44 +01:00			`let n = &q.nodes[node as usize];`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`let node_docids = match n {`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00			`QueryNode::Term(located_term) => {`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`let term = &located_term.value;`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`let derivations_docids =`
			`node_docids_cache.get_docids(index, txn, db_cache, term, node)?;`
			`predecessors_docids & derivations_docids`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`}`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00			`QueryNode::Deleted => {`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`panic!()`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`}`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00			`QueryNode::Start => universe.clone(),`
			`QueryNode::End => {`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`return Ok(predecessors_docids);`
			`}`
			`};`
Remove EdgeIndex and NodeIndex types, prefer u32 instead 2023-02-21 12:55:44 +01:00			`nodes_resolved.insert(node);`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`path_nodes_docids[node as usize] = node_docids;`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00
Remove EdgeIndex and NodeIndex types, prefer u32 instead 2023-02-21 12:55:44 +01:00			`for succ in q.edges[node as usize].successors.iter() {`
			`if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) {`
			`next_nodes_to_visit.push_back(succ);`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`}`
			`}`
Remove noise in codebase 2023-02-21 13:57:34 +01:00
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`// This is currently slow but could easily be implemented very efficiently`
Remove EdgeIndex and NodeIndex types, prefer u32 instead 2023-02-21 12:55:44 +01:00			`for prec in q.edges[node as usize].predecessors.iter() {`
Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00			`if q.edges[prec as usize].successors.is_subset(&nodes_resolved) {`
Use a cache when resolving the query graph 2023-02-21 13:21:41 +01:00			`path_nodes_docids[prec as usize].clear();`
Implement a function to find a QueryGraph's docids 2023-02-21 09:45:17 +01:00			`}`
			`}`
			`}`

			`panic!()`
			`}`
WIP on split words and synonyms support 2023-03-02 21:27:57 +01:00
			`pub fn resolve_phrase<'transaction>(`
			`index: &Index,`
			`txn: &'transaction RoTxn,`
			`db_cache: &mut DatabaseCache<'transaction>,`
			`phrase: &Phrase,`
			`) -> Result<RoaringBitmap> {`
			`let Phrase { words } = phrase;`
			`let mut candidates = RoaringBitmap::new();`
			`let mut first_iter = true;`
			`let winsize = words.len().min(3);`

			`if words.is_empty() {`
			`return Ok(candidates);`
			`}`

			`for win in words.windows(winsize) {`
			`// Get all the documents with the matching distance for each word pairs.`
			`let mut bitmaps = Vec::with_capacity(winsize.pow(2));`
			`for (offset, s1) in win`
			`.iter()`
			`.enumerate()`
			`.filter_map(\|(index, word)\| word.as_ref().map(\|word\| (index, word)))`
			`{`
			`for (dist, s2) in win`
			`.iter()`
			`.skip(offset + 1)`
			`.enumerate()`
			`.filter_map(\|(index, word)\| word.as_ref().map(\|word\| (index, word)))`
			`{`
			`if dist == 0 {`
			`match db_cache.get_word_pair_proximity_docids(index, txn, s1, s2, 1)? {`
			`Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),`
			`// If there are no documents for this pair, there will be no`
			`// results for the phrase query.`
			`None => return Ok(RoaringBitmap::new()),`
			`}`
			`} else {`
			`let mut bitmap = RoaringBitmap::new();`
			`for dist in 0..=dist {`
			`if let Some(m) = db_cache.get_word_pair_proximity_docids(`
			`index,`
			`txn,`
			`s1,`
			`s2,`
			`dist as u8 + 1,`
			`)? {`
			`bitmap \|= CboRoaringBitmapCodec::deserialize_from(m)?;`
			`}`
			`}`
			`if bitmap.is_empty() {`
			`return Ok(bitmap);`
			`} else {`
			`bitmaps.push(bitmap);`
			`}`
			`}`
			`}`
			`}`

			`// We sort the bitmaps so that we perform the small intersections first, which is faster.`
			`bitmaps.sort_unstable_by_key(\|a\| a.len());`

			`for bitmap in bitmaps {`
			`if first_iter {`
			`candidates = bitmap;`
			`first_iter = false;`
			`} else {`
			`candidates &= bitmap;`
			`}`
			`// There will be no match, return early`
			`if candidates.is_empty() {`
			`break;`
			`}`
			`}`
			`}`
			`Ok(candidates)`
			`}`