MeiliSearch/milli/src/search/new/sort.rs

use heed::BytesDecode;
use roaring::RoaringBitmap;

use super::logger::SearchLogger;
use super::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait, SearchContext};
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
use crate::heed_codec::{BytesRefCodec, StrRefCodec};
use crate::score_details::{self, ScoreDetails};
use crate::search::facet::{ascending_facet_sort, descending_facet_sort};
use crate::{FieldId, Index, Result};

/// A fallible, pull-based source of ranking rule buckets.
///
/// The only implementor visible in this file, [`RankingRuleOutputIterWrapper`],
/// returns `Ok(Some(bucket))` for each item of its inner iterator, `Ok(None)` once
/// that iterator is exhausted, and `Err(_)` when the inner iterator yields an error.
pub trait RankingRuleOutputIter<'ctx, Query> {
    /// Produces the next bucket, if any.
    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;
}

/// Adapts a boxed iterator of `Result<RankingRuleOutput<Query>>` to the
/// [`RankingRuleOutputIter`] trait.
pub struct RankingRuleOutputIterWrapper<'ctx, Query> {
    // Type-erased so that differently typed iterator chains can be stored in
    // the same field; the boxed iterator may borrow data living for `'ctx`.
    iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'ctx>,
}
impl<'ctx, Query> RankingRuleOutputIterWrapper<'ctx, Query> {
    pub fn new(iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'ctx>) -> Self {
        Self { iter }
    }
}
impl<'ctx, Query> RankingRuleOutputIter<'ctx, Query> for RankingRuleOutputIterWrapper<'ctx, Query> {
    /// Advances the wrapped iterator by one item.
    ///
    /// An exhausted iterator maps to `Ok(None)`, a yielded bucket to `Ok(Some(_))`,
    /// and a yielded error is returned as `Err(_)`.
    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>> {
        // `Option<Result<T, E>>` -> `Result<Option<T>, E>`.
        self.iter.next().transpose()
    }
}

// `Query` type parameter: the same as the type parameter to bucket_sort.
// It implements the `RankingRuleQueryTrait` trait and is either a query graph or a placeholder search.
// The sort ranking rule doesn't need the query parameter, it is doing the same thing
// whether we're doing a querygraph or placeholder search.
//
// The query is stored anyway because every ranking rule must return a query from next_bucket.
// ---
// "Mismatch" between new/old impl.:
// - old impl: roaring bitmap as input, ranking rule iterates over all the buckets
// - new impl: still works like that but it shouldn't, because the universe may change for every call to next_bucket, itself due to:
//    1. elements that were already returned by the ranking rule are subtracted from the universe, also done in the old impl (subtracted from the candidates)
//    2. NEW in the new impl.: distinct rule might have been applied btwn calls to next_bucket
// new impl ignores docs removed in (2), which is a missed perf opt issue, see `next_bucket`
// this perf problem is P2
// mostly happens when many documents map to the same distinct attribute value.
/// Ranking rule that buckets documents by the facet values of a single field,
/// in ascending or descending order.
pub struct Sort<'ctx, Query> {
    /// Name of the field to sort on; also reported in the score details and in `id()`.
    field_name: String,
    /// Id of `field_name` as looked up in the index's fields-ids map by `new`;
    /// `None` when the map has no id for that name.
    field_id: Option<FieldId>,
    /// `true` for ascending order, `false` for descending order.
    is_ascending: bool,
    /// Clone of the parent query; set by `start_iteration`, cleared by `end_iteration`.
    original_query: Option<Query>,
    /// Bucket iterator; set by `start_iteration`, cleared by `end_iteration`.
    iter: Option<RankingRuleOutputIterWrapper<'ctx, Query>>,
    /// Copied into the `redacted` flag of the score details. `true` when the index
    /// has a displayed-fields list that does not contain `field_name`.
    must_redact: bool,
}
impl<'ctx, Query> Sort<'ctx, Query> {
    pub fn new(
        index: &Index,
        rtxn: &'ctx heed::RoTxn,
        field_name: String,
        is_ascending: bool,
    ) -> Result<Self> {
        let fields_ids_map = index.fields_ids_map(rtxn)?;
        let field_id = fields_ids_map.id(&field_name);
        let must_redact = Self::must_redact(index, rtxn, &field_name)?;

        Ok(Self {
            field_name,
            field_id,
            is_ascending,
            original_query: None,
            iter: None,
            must_redact,
        })
    }

    fn must_redact(index: &Index, rtxn: &'ctx heed::RoTxn, field_name: &str) -> Result<bool> {
        let Some(displayed_fields) = index.displayed_fields(rtxn)? else {
            return Ok(false);
        };

        Ok(!displayed_fields.iter().any(|&field| field == field_name))
    }
}

impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, Query> {
    fn id(&self) -> String {
        let Self { field_name, is_ascending, .. } = self;
        format!("{field_name}:{}", if *is_ascending { "asc" } else { "desc" })
    }
    fn start_iteration(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<Query>,
        parent_candidates: &RoaringBitmap,
        parent_query: &Query,
    ) -> Result<()> {
        let iter: RankingRuleOutputIterWrapper<Query> = match self.field_id {
            Some(field_id) => {
                let number_db = ctx
                    .index
                    .facet_id_f64_docids
                    .remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
                let string_db = ctx
                    .index
                    .facet_id_string_docids
                    .remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();

                let (number_iter, string_iter) = if self.is_ascending {
                    let number_iter = ascending_facet_sort(
                        ctx.txn,
                        number_db,
                        field_id,
                        parent_candidates.clone(),
                    )?;
                    let string_iter = ascending_facet_sort(
                        ctx.txn,
                        string_db,
                        field_id,
                        parent_candidates.clone(),
                    )?;

                    (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter))
                } else {
                    let number_iter = descending_facet_sort(
                        ctx.txn,
                        number_db,
                        field_id,
                        parent_candidates.clone(),
                    )?;
                    let string_iter = descending_facet_sort(
                        ctx.txn,
                        string_db,
                        field_id,
                        parent_candidates.clone(),
                    )?;

                    (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter))
                };
                let number_iter = number_iter.map(|r| -> Result<_> {
                    let (docids, bytes) = r?;
                    Ok((
                        docids,
                        serde_json::Value::Number(
                            serde_json::Number::from_f64(
                                OrderedF64Codec::bytes_decode(bytes).expect("some number"),
                            )
                            .expect("too big float"),
                        ),
                    ))
                });
                let string_iter = string_iter.map(|r| -> Result<_> {
                    let (docids, bytes) = r?;
                    Ok((
                        docids,
                        serde_json::Value::String(
                            StrRefCodec::bytes_decode(bytes).expect("some string").to_owned(),
                        ),
                    ))
                });

                let query_graph = parent_query.clone();
                let ascending = self.is_ascending;
                let field_name = self.field_name.clone();
                let must_redact = self.must_redact;
                RankingRuleOutputIterWrapper::new(Box::new(number_iter.chain(string_iter).map(
                    move |r| {
                        let (docids, value) = r?;
                        Ok(RankingRuleOutput {
                            query: query_graph.clone(),
                            candidates: docids,
                            score: ScoreDetails::Sort(score_details::Sort {
                                field_name: field_name.clone(),
                                ascending,
                                redacted: must_redact,
                                value,
                            }),
                        })
                    },
                )))
            }
            None => RankingRuleOutputIterWrapper::new(Box::new(std::iter::empty())),
        };
        self.original_query = Some(parent_query.clone());
        self.iter = Some(iter);
        Ok(())
    }

    fn next_bucket(
        &mut self,
        _ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<Query>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<Query>>> {
        let iter = self.iter.as_mut().unwrap();
        if let Some(mut bucket) = iter.next_bucket()? {
            bucket.candidates &= universe;
            Ok(Some(bucket))
        } else {
            let query = self.original_query.as_ref().unwrap().clone();
            Ok(Some(RankingRuleOutput {
                query,
                candidates: universe.clone(),
                score: ScoreDetails::Sort(score_details::Sort {
                    field_name: self.field_name.clone(),
                    ascending: self.is_ascending,
                    redacted: self.must_redact,
                    value: serde_json::Value::Null,
                }),
            }))
        }
    }

    fn end_iteration(
        &mut self,
        _ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<Query>,
    ) {
        self.original_query = None;
        self.iter = None;
    }
}
Score for sort 2023-06-15 17:36:40 +02:00			`use heed::BytesDecode;`
Cargo fmt 2023-03-08 09:55:53 +01:00			`use roaring::RoaringBitmap;`

Add a search logger 2023-02-22 15:34:37 +01:00			`use super::logger::SearchLogger;`
Intern more values 2023-03-09 11:12:31 +01:00			`use super::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait, SearchContext};`
Score for sort 2023-06-15 17:36:40 +02:00			`use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};`
Move to the v0.20.0-alpha.9 of heed 2023-11-27 11:52:22 +01:00			`use crate::heed_codec::{BytesRefCodec, StrRefCodec};`
Score for sort 2023-06-15 17:36:40 +02:00			`use crate::score_details::{self, ScoreDetails};`
cargo fmt 2023-03-20 09:30:10 +01:00			`use crate::search::facet::{ascending_facet_sort, descending_facet_sort};`
			`use crate::{FieldId, Index, Result};`
Intern more values 2023-03-09 11:12:31 +01:00
Rename lifetime 2023-03-13 14:03:48 +01:00			`pub trait RankingRuleOutputIter<'ctx, Query> {`
Intern more values 2023-03-09 11:12:31 +01:00			`fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;`
			`}`

Rename lifetime 2023-03-13 14:03:48 +01:00			`pub struct RankingRuleOutputIterWrapper<'ctx, Query> {`
			`iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'ctx>,`
Intern more values 2023-03-09 11:12:31 +01:00			`}`
Rename lifetime 2023-03-13 14:03:48 +01:00			`impl<'ctx, Query> RankingRuleOutputIterWrapper<'ctx, Query> {`
			`pub fn new(iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'ctx>) -> Self {`
Intern more values 2023-03-09 11:12:31 +01:00			`Self { iter }`
			`}`
			`}`
Rename lifetime 2023-03-13 14:03:48 +01:00			`impl<'ctx, Query> RankingRuleOutputIter<'ctx, Query> for RankingRuleOutputIterWrapper<'ctx, Query> {`
Intern more values 2023-03-09 11:12:31 +01:00			`fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>> {`
			`match self.iter.next() {`
			`Some(x) => x.map(Some),`
			`None => Ok(None),`
			`}`
			`}`
			`}`

Add TODO notes 2023-03-27 11:04:04 +02:00			// `Query` type parameter: the same as the type parameter to bucket_sort
			`// implements RankingRuleQuery trait, either querygraph or placeholdersearch`
			`// The sort ranking rule doesn't need the query parameter, it is doing the same thing`
			`// whether we're doing a querygraph or placeholder search.`
			`//`
			`// Query Stored anyway because every ranking rule must return a query from next_bucket`
			`// ---`
			`// "Mismatch" between new/old impl.:`
			`// - old impl: roaring bitmap as input, ranking rule iterates other all the buckets`
			`// - new impl: still works like that but it shouldn't, because the universe may change for every call to next_bucket, itself due to:`
			`// 1. elements that were already returned by the ranking rule are subtracted from the universe, also done in the old impl (subtracted from the candidates)`
			`// 2. NEW in the new impl.: distinct rule might have been applied btwn calls to next_bucket`
			// new impl ignores docs removed in (2), which is a missed perf opt issue, see `next_bucket`
			`// this perf problem is P2`
			`// mostly happens when many documents map to the same distinct attribute value.`
Rename lifetime 2023-03-13 14:03:48 +01:00			`pub struct Sort<'ctx, Query> {`
Add a search logger 2023-02-22 15:34:37 +01:00			`field_name: String,`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`field_id: Option<FieldId>,`
			`is_ascending: bool,`
Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`original_query: Option<Query>,`
Rename lifetime 2023-03-13 14:03:48 +01:00			`iter: Option<RankingRuleOutputIterWrapper<'ctx, Query>>,`
Score for sort 2023-06-15 17:36:40 +02:00			`must_redact: bool,`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`}`
Rename lifetime 2023-03-13 14:03:48 +01:00			`impl<'ctx, Query> Sort<'ctx, Query> {`
Initialize query time ranking rule for query search 2023-03-28 12:40:52 +02:00			`pub fn new(`
Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`index: &Index,`
Rename lifetime 2023-03-13 14:03:48 +01:00			`rtxn: &'ctx heed::RoTxn,`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`field_name: String,`
			`is_ascending: bool,`
			`) -> Result<Self> {`
			`let fields_ids_map = index.fields_ids_map(rtxn)?;`
			`let field_id = fields_ids_map.id(&field_name);`
Score for sort 2023-06-15 17:36:40 +02:00			`let must_redact = Self::must_redact(index, rtxn, &field_name)?;`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00
Score for sort 2023-06-15 17:36:40 +02:00			`Ok(Self {`
			`field_name,`
			`field_id,`
			`is_ascending,`
			`original_query: None,`
			`iter: None,`
			`must_redact,`
			`})`
			`}`

			`fn must_redact(index: &Index, rtxn: &'ctx heed::RoTxn, field_name: &str) -> Result<bool> {`
Format let-else ❤️ 🎉 2023-07-03 10:20:28 +02:00			`let Some(displayed_fields) = index.displayed_fields(rtxn)? else {`
			`return Ok(false);`
			`};`
Score for sort 2023-06-15 17:36:40 +02:00
			`Ok(!displayed_fields.iter().any(\|&field\| field == field_name))`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`}`
			`}`

Rename lifetime 2023-03-13 14:03:48 +01:00			`impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, Query> {`
Add a search logger 2023-02-22 15:34:37 +01:00			`fn id(&self) -> String {`
			`let Self { field_name, is_ascending, .. } = self;`
Fix sort id 2023-06-06 18:21:31 +02:00			`format!("{field_name}:{}", if *is_ascending { "asc" } else { "desc" })`
Add a search logger 2023-02-22 15:34:37 +01:00			`}`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`fn start_iteration(`
			`&mut self,`
Rename lifetime 2023-03-13 14:03:48 +01:00			`ctx: &mut SearchContext<'ctx>,`
Add a search logger 2023-02-22 15:34:37 +01:00			`_logger: &mut dyn SearchLogger<Query>,`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`parent_candidates: &RoaringBitmap,`
cargo fmt 2023-03-20 09:30:10 +01:00			`parent_query: &Query,`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`) -> Result<()> {`
			`let iter: RankingRuleOutputIterWrapper<Query> = match self.field_id {`
			`Some(field_id) => {`
Update new sort ranking rule after rebasing 2023-03-20 09:26:11 +01:00			`let number_db = ctx`
			`.index`
			`.facet_id_f64_docids`
Move to the v0.20.0-alpha.9 of heed 2023-11-27 11:52:22 +01:00			`.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();`
Update new sort ranking rule after rebasing 2023-03-20 09:26:11 +01:00			`let string_db = ctx`
			`.index`
			`.facet_id_string_docids`
Move to the v0.20.0-alpha.9 of heed 2023-11-27 11:52:22 +01:00			`.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00
Update new sort ranking rule after rebasing 2023-03-20 09:26:11 +01:00			`let (number_iter, string_iter) = if self.is_ascending {`
			`let number_iter = ascending_facet_sort(`
			`ctx.txn,`
			`number_db,`
			`field_id,`
			`parent_candidates.clone(),`
			`)?;`
			`let string_iter = ascending_facet_sort(`
			`ctx.txn,`
			`string_db,`
			`field_id,`
			`parent_candidates.clone(),`
			`)?;`

			`(itertools::Either::Left(number_iter), itertools::Either::Left(string_iter))`
			`} else {`
			`let number_iter = descending_facet_sort(`
			`ctx.txn,`
			`number_db,`
			`field_id,`
			`parent_candidates.clone(),`
			`)?;`
			`let string_iter = descending_facet_sort(`
			`ctx.txn,`
			`string_db,`
			`field_id,`
			`parent_candidates.clone(),`
			`)?;`

			`(itertools::Either::Right(number_iter), itertools::Either::Right(string_iter))`
			`};`
Score for sort 2023-06-15 17:36:40 +02:00			`let number_iter = number_iter.map(\|r\| -> Result<_> {`
			`let (docids, bytes) = r?;`
			`Ok((`
			`docids,`
			`serde_json::Value::Number(`
			`serde_json::Number::from_f64(`
			`OrderedF64Codec::bytes_decode(bytes).expect("some number"),`
			`)`
			`.expect("too big float"),`
			`),`
			`))`
			`});`
			`let string_iter = string_iter.map(\|r\| -> Result<_> {`
			`let (docids, bytes) = r?;`
			`Ok((`
			`docids,`
			`serde_json::Value::String(`
			`StrRefCodec::bytes_decode(bytes).expect("some string").to_owned(),`
			`),`
			`))`
			`});`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00
cargo fmt 2023-03-20 09:30:10 +01:00			`let query_graph = parent_query.clone();`
Score for sort 2023-06-15 17:36:40 +02:00			`let ascending = self.is_ascending;`
			`let field_name = self.field_name.clone();`
			`let must_redact = self.must_redact;`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`RankingRuleOutputIterWrapper::new(Box::new(number_iter.chain(string_iter).map(`
Update new sort ranking rule after rebasing 2023-03-20 09:26:11 +01:00			`move \|r\| {`
Score for sort 2023-06-15 17:36:40 +02:00			`let (docids, value) = r?;`
			`Ok(RankingRuleOutput {`
			`query: query_graph.clone(),`
			`candidates: docids,`
			`score: ScoreDetails::Sort(score_details::Sort {`
			`field_name: field_name.clone(),`
			`ascending,`
			`redacted: must_redact,`
			`value,`
			`}),`
			`})`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`},`
			`)))`
			`}`
			`None => RankingRuleOutputIterWrapper::new(Box::new(std::iter::empty())),`
			`};`
cargo fmt 2023-03-20 09:30:10 +01:00			`self.original_query = Some(parent_query.clone());`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`self.iter = Some(iter);`
			`Ok(())`
			`}`

			`fn next_bucket(`
			`&mut self,`
Rename lifetime 2023-03-13 14:03:48 +01:00			`_ctx: &mut SearchContext<'ctx>,`
Add a search logger 2023-02-22 15:34:37 +01:00			`_logger: &mut dyn SearchLogger<Query>,`
Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`universe: &RoaringBitmap,`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`) -> Result<Option<RankingRuleOutput<Query>>> {`
			`let iter = self.iter.as_mut().unwrap();`
Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`if let Some(mut bucket) = iter.next_bucket()? {`
			`bucket.candidates &= universe;`
			`Ok(Some(bucket))`
			`} else {`
			`let query = self.original_query.as_ref().unwrap().clone();`
Score for sort 2023-06-15 17:36:40 +02:00			`Ok(Some(RankingRuleOutput {`
			`query,`
			`candidates: universe.clone(),`
			`score: ScoreDetails::Sort(score_details::Sort {`
			`field_name: self.field_name.clone(),`
			`ascending: self.is_ascending,`
			`redacted: self.must_redact,`
			`value: serde_json::Value::Null,`
			`}),`
			`}))`
Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`}`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`}`

			`fn end_iteration(`
			`&mut self,`
Rename lifetime 2023-03-13 14:03:48 +01:00			`_ctx: &mut SearchContext<'ctx>,`
Add a search logger 2023-02-22 15:34:37 +01:00			`_logger: &mut dyn SearchLogger<Query>,`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`) {`
Add sort ranking rule to new search impl 2023-02-28 12:42:29 +01:00			`self.original_query = None;`
Introduce the sort ranking rule working with the new search structures 2023-02-21 09:49:43 +01:00			`self.iter = None;`
			`}`
			`}`