MeiliSearch/milli/src/search/new/ranking_rules.rs

use roaring::RoaringBitmap;

use super::logger::SearchLogger;
use super::{QueryGraph, SearchContext};
// use crate::search::new::sort::Sort;
use crate::Result;

/// A marker trait for the query types that ranking rules can operate on.
///
/// It declares no methods: it only bundles the `Sized + Clone + 'static` bounds and
/// serves as the bound on the query type parameter of [`RankingRule`] and [`bucket_sort`].
///
/// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`]
pub trait RankingRuleQueryTrait: Sized + Clone + 'static {}

/// A type describing a placeholder search.
///
/// It is a unit struct and carries no data.
// NOTE(review): presumably a "placeholder search" is a search without any query
// terms — confirm against the callers.
#[derive(Clone)]
pub struct PlaceholderQuery;
// Both query kinds opt into the marker trait with empty impls.
impl RankingRuleQueryTrait for PlaceholderQuery {}
impl RankingRuleQueryTrait for QueryGraph {}

/// A trait that must be implemented by all ranking rules.
///
/// It is generic over `'search`, the lifetime of the search context
/// (i.e. the read transaction and the cache) and over `Query`, which
/// can be either [`PlaceholderQuery`] or [`QueryGraph`].
///
/// The methods are driven by [`bucket_sort`], which calls
/// [`start_iteration`](RankingRule::start_iteration) first, then
/// [`next_bucket`](RankingRule::next_bucket) repeatedly, and
/// [`end_iteration`](RankingRule::end_iteration) when it goes back to the parent ranking rule.
pub trait RankingRule<'search, Query: RankingRuleQueryTrait> {
    /// Return an identifier for this ranking rule as an owned string.
    // NOTE(review): presumably used by the logger to label the rule — confirm.
    fn id(&self) -> String;

    /// Prepare the ranking rule such that it can start iterating over its
    /// buckets using [`next_bucket`](RankingRule::next_bucket).
    ///
    /// The given universe is the universe that will be given to [`next_bucket`](RankingRule::next_bucket).
    ///
    /// The given `query` is the query this ranking rule has to work with. In
    /// [`bucket_sort`], the first ranking rule receives the initial query and every
    /// other ranking rule receives the `query` of the bucket produced by its parent.
    ///
    /// # Errors
    /// Any error returned here is propagated by [`bucket_sort`] and aborts the sort.
    fn start_iteration(
        &mut self,
        ctx: &mut SearchContext<'search>,
        logger: &mut dyn SearchLogger<Query>,
        universe: &RoaringBitmap,
        query: &Query,
    ) -> Result<()>;

    /// Return the next bucket of this ranking rule.
    ///
    /// The returned candidates MUST be a subset of the given universe
    /// ([`bucket_sort`] asserts it and panics otherwise).
    ///
    /// The universe given as argument is either:
    /// - a subset of the universe given to the previous call to [`next_bucket`](RankingRule::next_bucket); OR
    /// - the universe given to [`start_iteration`](RankingRule::start_iteration)
    ///
    /// Returning `Ok(None)` signals that there are no more buckets. [`bucket_sort`]
    /// then ends the iteration of this ranking rule and asserts that the universe
    /// it passed was empty.
    ///
    /// # Errors
    /// Any error returned here is propagated by [`bucket_sort`] and aborts the sort.
    fn next_bucket(
        &mut self,
        ctx: &mut SearchContext<'search>,
        logger: &mut dyn SearchLogger<Query>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<Query>>>;

    /// Finish iterating over the buckets, which yields control to the parent ranking rule.
    /// The next call to this ranking rule, if any, will be [`start_iteration`](RankingRule::start_iteration).
    ///
    /// This method is infallible: it returns nothing and cannot report an error.
    fn end_iteration(
        &mut self,
        ctx: &mut SearchContext<'search>,
        logger: &mut dyn SearchLogger<Query>,
    );
}

/// Output of a ranking rule, consisting of the query to be used
/// by the child ranking rule and a set of document ids.
///
/// One value of this type is one "bucket" returned by
/// [`next_bucket`](RankingRule::next_bucket).
#[derive(Debug)]
pub struct RankingRuleOutput<Q> {
    /// The query corresponding to the current bucket for the child ranking rule.
    /// [`bucket_sort`] passes it to the child's
    /// [`start_iteration`](RankingRule::start_iteration).
    pub query: Q,
    /// The allowed candidates for the child ranking rule.
    /// They must be a subset of the universe that was given to the ranking rule
    /// which produced this output.
    pub candidates: RoaringBitmap,
}

pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>(
    ctx: &mut SearchContext<'search>,
    mut ranking_rules: Vec<&mut dyn RankingRule<'search, Q>>,
    query_graph: &Q,
    universe: &RoaringBitmap,
    from: usize,
    length: usize,
    logger: &mut dyn SearchLogger<Q>,
) -> Result<Vec<u32>> {
    logger.initial_query(query_graph);

    logger.ranking_rules(&ranking_rules);

    if universe.len() < from as u64 {
        return Ok(vec![]);
    }

    let ranking_rules_len = ranking_rules.len();
    logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe);
    ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?;

    let mut candidates: Vec<RoaringBitmap> = vec![RoaringBitmap::default(); ranking_rules_len];
    candidates[0] = universe.clone();

    let mut cur_ranking_rule_index = 0;

    /// Finish iterating over the current ranking rule, yielding
    /// control to the parent (or finishing the search if not possible).
    /// Update the candidates accordingly and inform the logger.
    macro_rules! back {
        () => {
            assert!(candidates[cur_ranking_rule_index].is_empty());
            logger.end_iteration_ranking_rule(
                cur_ranking_rule_index,
                ranking_rules[cur_ranking_rule_index],
                &candidates[cur_ranking_rule_index],
            );
            candidates[cur_ranking_rule_index].clear();
            ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
            if cur_ranking_rule_index == 0 {
                break;
            } else {
                cur_ranking_rule_index -= 1;
            }
        };
    }

    let mut results = vec![];
    let mut cur_offset = 0usize;

    /// Add the candidates to the results. Take the `from`, `limit`, and `cur_offset`
    /// into account and inform the logger.
    macro_rules! maybe_add_to_results {
        ($candidates:expr) => {
            let candidates = $candidates;
            let len = candidates.len();
            // if the candidates are empty, there is nothing to do;
            if !candidates.is_empty() {
                if cur_offset < from {
                    if cur_offset + (candidates.len() as usize) < from {
                        logger.skip_bucket_ranking_rule(
                            cur_ranking_rule_index,
                            ranking_rules[cur_ranking_rule_index],
                            &candidates,
                        );
                    } else {
                        let all_candidates = candidates.iter().collect::<Vec<_>>();
                        let (skipped_candidates, candidates) =
                            all_candidates.split_at(from - cur_offset);
                        logger.skip_bucket_ranking_rule(
                            cur_ranking_rule_index,
                            ranking_rules[cur_ranking_rule_index],
                            &skipped_candidates.into_iter().collect(),
                        );
                        let candidates = candidates
                            .iter()
                            .take(length - results.len())
                            .copied()
                            .collect::<Vec<_>>();
                        logger.add_to_results(&candidates);
                        results.extend(&candidates);
                    }
                } else {
                    let candidates =
                        candidates.iter().take(length - results.len()).collect::<Vec<u32>>();
                    logger.add_to_results(&candidates);
                    results.extend(&candidates);
                }
            }
            cur_offset += len as usize;
        };
    }
    while results.len() < length {
        // The universe for this bucket is zero or one element, so we don't need to sort
        // anything, just extend the results and go back to the parent ranking rule.
        if candidates[cur_ranking_rule_index].len() <= 1 {
            maybe_add_to_results!(&candidates[cur_ranking_rule_index]);
            candidates[cur_ranking_rule_index].clear();
            back!();
            continue;
        }

        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &candidates[cur_ranking_rule_index])? else {
            back!();
            continue;
        };

        logger.next_bucket_ranking_rule(
            cur_ranking_rule_index,
            ranking_rules[cur_ranking_rule_index],
            &candidates[cur_ranking_rule_index],
            &next_bucket.candidates,
        );

        assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates));
        candidates[cur_ranking_rule_index] -= &next_bucket.candidates;

        if cur_ranking_rule_index == ranking_rules_len - 1
            || next_bucket.candidates.len() <= 1
            || cur_offset + (next_bucket.candidates.len() as usize) < from
        {
            maybe_add_to_results!(&next_bucket.candidates);
            continue;
        }

        cur_ranking_rule_index += 1;
        candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();
        logger.start_iteration_ranking_rule(
            cur_ranking_rule_index,
            ranking_rules[cur_ranking_rule_index],
            &next_bucket.query,
            &candidates[cur_ranking_rule_index],
        );
        ranking_rules[cur_ranking_rule_index].start_iteration(
            ctx,
            logger,
            &next_bucket.candidates,
            &next_bucket.query,
        )?;
    }

    Ok(results)
}
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`use roaring::RoaringBitmap;`
Cargo fmt 2023-03-08 09:55:53 +01:00
			`use super::logger::SearchLogger;`
			`use super::{QueryGraph, SearchContext};`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`// use crate::search::new::sort::Sort;`
Intern more values 2023-03-09 11:12:31 +01:00			`use crate::Result;`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00
Intern more values 2023-03-09 11:12:31 +01:00			/// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`]
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`pub trait RankingRuleQueryTrait: Sized + Clone + 'static {}`
Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00
Intern more values 2023-03-09 11:12:31 +01:00			`/// A type describing a placeholder search`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`#[derive(Clone)]`
			`pub struct PlaceholderQuery;`
			`impl RankingRuleQueryTrait for PlaceholderQuery {}`
			`impl RankingRuleQueryTrait for QueryGraph {}`

Intern more values 2023-03-09 11:12:31 +01:00			`/// A trait that must be implemented by all ranking rules.`
			`///`
			/// It is generic over `'search`, the lifetime of the search context
			/// (i.e. the read transaction and the cache) and over `Query`, which
			/// can be either [`PlaceholderQuery`] or [`QueryGraph`].
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`pub trait RankingRule<'search, Query: RankingRuleQueryTrait> {`
Add a search logger 2023-02-22 15:34:37 +01:00			`fn id(&self) -> String;`

Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00			`/// Prepare the ranking rule such that it can start iterating over its`
			/// buckets using [`next_bucket`](RankingRule::next_bucket).
			`///`
			/// The given universe is the universe that will be given to [`next_bucket`](RankingRule::next_bucket).
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`fn start_iteration(`
			`&mut self,`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ctx: &mut SearchContext<'search>,`
Add a search logger 2023-02-22 15:34:37 +01:00			`logger: &mut dyn SearchLogger<Query>,`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`universe: &RoaringBitmap,`
			`query: &Query,`
			`) -> Result<()>;`

Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00			`/// Return the next bucket of this ranking rule.`
			`///`
			`/// The returned candidates MUST be a subset of the given universe.`
			`///`
			`/// The universe given as argument is either:`
			/// - a subset of the universe given to the previous call to [`next_bucket`](RankingRule::next_bucket); OR
			/// - the universe given to [`start_iteration`](RankingRule::start_iteration)
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`fn next_bucket(`
			`&mut self,`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ctx: &mut SearchContext<'search>,`
Add a search logger 2023-02-22 15:34:37 +01:00			`logger: &mut dyn SearchLogger<Query>,`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`universe: &RoaringBitmap,`
			`) -> Result<Option<RankingRuleOutput<Query>>>;`

Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00			`/// Finish iterating over the buckets, which yields control to the parent ranking rule`
			/// The next call to this ranking rule, if any, will be [`start_iteration`](RankingRule::start_iteration).
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`fn end_iteration(`
			`&mut self,`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ctx: &mut SearchContext<'search>,`
Add a search logger 2023-02-22 15:34:37 +01:00			`logger: &mut dyn SearchLogger<Query>,`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`);`
			`}`

Intern more values 2023-03-09 11:12:31 +01:00			`/// Output of a ranking rule, consisting of the query to be used`
			`/// by the child ranking rule and a set of document ids.`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`#[derive(Debug)]`
			`pub struct RankingRuleOutput<Q> {`
Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00			`/// The query corresponding to the current bucket for the child ranking rule`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`pub query: Q,`
			`/// The allowed candidates for the child ranking rule`
			`pub candidates: RoaringBitmap,`
			`}`

Intern more values 2023-03-09 11:12:31 +01:00			`pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>(`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ctx: &mut SearchContext<'search>,`
Intern more values 2023-03-09 11:12:31 +01:00			`mut ranking_rules: Vec<&mut dyn RankingRule<'search, Q>>,`
			`query_graph: &Q,`
Fix: computation of initial universe, code organisation 2023-03-06 08:35:01 +01:00			`universe: &RoaringBitmap,`
Add support for search offset and limit 2023-02-27 16:14:53 +01:00			`from: usize,`
			`length: usize,`
Intern more values 2023-03-09 11:12:31 +01:00			`logger: &mut dyn SearchLogger<Q>,`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`) -> Result<Vec<u32>> {`
Fix: computation of initial universe, code organisation 2023-03-06 08:35:01 +01:00			`logger.initial_query(query_graph);`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00
Add a search logger 2023-02-22 15:34:37 +01:00			`logger.ranking_rules(&ranking_rules);`

Add support for filters 2023-02-27 16:45:07 +01:00			`if universe.len() < from as u64 {`
			`return Ok(vec![]);`
			`}`

Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`let ranking_rules_len = ranking_rules.len();`
Fix: computation of initial universe, code organisation 2023-03-06 08:35:01 +01:00			`logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe);`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?;`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00
Add documentation 2023-03-08 13:26:29 +01:00			`let mut candidates: Vec<RoaringBitmap> = vec![RoaringBitmap::default(); ranking_rules_len];`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`candidates[0] = universe.clone();`

			`let mut cur_ranking_rule_index = 0;`
Fix more bugs + visual empty path cache logging 2023-02-27 15:04:40 +01:00
Intern more values 2023-03-09 11:12:31 +01:00			`/// Finish iterating over the current ranking rule, yielding`
			`/// control to the parent (or finishing the search if not possible).`
			`/// Update the candidates accordingly and inform the logger.`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`macro_rules! back {`
			`() => {`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`assert!(candidates[cur_ranking_rule_index].is_empty());`
Add a search logger 2023-02-22 15:34:37 +01:00			`logger.end_iteration_ranking_rule(`
			`cur_ranking_rule_index,`
Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`ranking_rules[cur_ranking_rule_index],`
Add a search logger 2023-02-22 15:34:37 +01:00			`&candidates[cur_ranking_rule_index],`
			`);`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`candidates[cur_ranking_rule_index].clear();`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`if cur_ranking_rule_index == 0 {`
			`break;`
			`} else {`
			`cur_ranking_rule_index -= 1;`
			`}`
			`};`
			`}`

			`let mut results = vec![];`
Add support for search offset and limit 2023-02-27 16:14:53 +01:00			`let mut cur_offset = 0usize;`

Intern more values 2023-03-09 11:12:31 +01:00			/// Add the candidates to the results. Take the `from`, `limit`, and `cur_offset`
			`/// into account and inform the logger.`
Add support for filters 2023-02-27 16:45:07 +01:00			`macro_rules! maybe_add_to_results {`
Fix more bugs + visual empty path cache logging 2023-02-27 15:04:40 +01:00			`($candidates:expr) => {`
Add support for search offset and limit 2023-02-27 16:14:53 +01:00			`let candidates = $candidates;`
			`let len = candidates.len();`
Add support for filters 2023-02-27 16:45:07 +01:00			`// if the candidates are empty, there is nothing to do;`
Add support for search offset and limit 2023-02-27 16:14:53 +01:00			`if !candidates.is_empty() {`
			`if cur_offset < from {`
			`if cur_offset + (candidates.len() as usize) < from {`
			`logger.skip_bucket_ranking_rule(`
			`cur_ranking_rule_index,`
Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`ranking_rules[cur_ranking_rule_index],`
Add support for search offset and limit 2023-02-27 16:14:53 +01:00			`&candidates,`
			`);`
			`} else {`
			`let all_candidates = candidates.iter().collect::<Vec<_>>();`
			`let (skipped_candidates, candidates) =`
			`all_candidates.split_at(from - cur_offset);`
			`logger.skip_bucket_ranking_rule(`
			`cur_ranking_rule_index,`
Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`ranking_rules[cur_ranking_rule_index],`
Add support for search offset and limit 2023-02-27 16:14:53 +01:00			`&skipped_candidates.into_iter().collect(),`
			`);`
			`let candidates = candidates`
			`.iter()`
			`.take(length - results.len())`
			`.copied()`
			`.collect::<Vec<_>>();`
			`logger.add_to_results(&candidates);`
			`results.extend(&candidates);`
			`}`
			`} else {`
			`let candidates =`
Add documentation 2023-03-08 13:26:29 +01:00			`candidates.iter().take(length - results.len()).collect::<Vec<u32>>();`
Add support for search offset and limit 2023-02-27 16:14:53 +01:00			`logger.add_to_results(&candidates);`
			`results.extend(&candidates);`
			`}`
			`}`
			`cur_offset += len as usize;`
Fix more bugs + visual empty path cache logging 2023-02-27 15:04:40 +01:00			`};`
			`}`
Add support for search offset and limit 2023-02-27 16:14:53 +01:00			`while results.len() < length {`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`// The universe for this bucket is zero or one element, so we don't need to sort`
			`// anything, just extend the results and go back to the parent ranking rule.`
			`if candidates[cur_ranking_rule_index].len() <= 1 {`
Add support for filters 2023-02-27 16:45:07 +01:00			`maybe_add_to_results!(&candidates[cur_ranking_rule_index]);`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`candidates[cur_ranking_rule_index].clear();`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`back!();`
			`continue;`
			`}`
Add a search logger 2023-02-22 15:34:37 +01:00
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &candidates[cur_ranking_rule_index])? else {`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`back!();`
			`continue;`
			`};`

Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`logger.next_bucket_ranking_rule(`
			`cur_ranking_rule_index,`
			`ranking_rules[cur_ranking_rule_index],`
			`&candidates[cur_ranking_rule_index],`
			`&next_bucket.candidates,`
			`);`

Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates));`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`candidates[cur_ranking_rule_index] -= &next_bucket.candidates;`

Add support for filters 2023-02-27 16:45:07 +01:00			`if cur_ranking_rule_index == ranking_rules_len - 1`
			`\|\| next_bucket.candidates.len() <= 1`
			`\|\| cur_offset + (next_bucket.candidates.len() as usize) < from`
			`{`
			`maybe_add_to_results!(&next_bucket.candidates);`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`continue;`
			`}`
Add support for filters 2023-02-27 16:45:07 +01:00
			`cur_ranking_rule_index += 1;`
			`candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();`
			`logger.start_iteration_ranking_rule(`
			`cur_ranking_rule_index,`
Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`ranking_rules[cur_ranking_rule_index],`
Add support for filters 2023-02-27 16:45:07 +01:00			`&next_bucket.query,`
			`&candidates[cur_ranking_rule_index],`
			`);`
			`ranking_rules[cur_ranking_rule_index].start_iteration(`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`ctx,`
Add support for filters 2023-02-27 16:45:07 +01:00			`logger,`
			`&next_bucket.candidates,`
			`&next_bucket.query,`
			`)?;`
Introduce a common way to manage the coordination between ranking rules 2023-02-21 09:44:03 +01:00			`}`

			`Ok(results)`
			`}`