Introduce a common way to manage the coordination between ranking rules

2025-07-04 12:27:13 +02:00 · 2023-02-21 09:44:03 +01:00 · 2023-02-21 09:44:03 +01:00 · ce0d1e0e13
commit ce0d1e0e13
parent 5065d8b0c1
1 changed files with 523 additions and 0 deletions
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@ -0,0 +1,523 @@
 use heed::RoTxn;
 use roaring::RoaringBitmap;
 use super::db_cache::DatabaseCache;
 use super::resolve_query_graph::resolve_query_graph;
 use super::QueryGraph;
 use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
 use crate::new::ranking_rule_graph::proximity::ProximityGraph;
 use crate::new::words::Words;
 // use crate::search::new::sort::Sort;
 use crate::{Index, Result, TermsMatchingStrategy};
 pub trait RankingRuleOutputIter<'transaction, Query> {
    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;
 }
 pub struct RankingRuleOutputIterWrapper<'transaction, Query> {
    iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
 }
 impl<'transaction, Query> RankingRuleOutputIterWrapper<'transaction, Query> {
    pub fn new(
        iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
    ) -> Self {
        Self { iter }
    }
 }
 impl<'transaction, Query> RankingRuleOutputIter<'transaction, Query>
    for RankingRuleOutputIterWrapper<'transaction, Query>
 {
    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>> {
        match self.iter.next() {
            Some(x) => x.map(Some),
            None => Ok(None),
        }
    }
 }
 pub trait RankingRuleQueryTrait: Sized + Clone + 'static {}
 #[derive(Clone)]
 pub struct PlaceholderQuery;
 impl RankingRuleQueryTrait for PlaceholderQuery {}
 impl RankingRuleQueryTrait for QueryGraph {}
 pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
    // TODO: add an update_candidates function to deal with distinct
    // attributes?
    fn start_iteration(
        &mut self,
        index: &Index,
        txn: &'transaction RoTxn,
        db_cache: &mut DatabaseCache<'transaction>,
        universe: &RoaringBitmap,
        query: &Query,
    ) -> Result<()>;
    fn next_bucket(
        &mut self,
        index: &Index,
        txn: &'transaction RoTxn,
        db_cache: &mut DatabaseCache<'transaction>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<Query>>>;
    fn end_iteration(
        &mut self,
        index: &Index,
        txn: &'transaction RoTxn,
        db_cache: &mut DatabaseCache<'transaction>,
    );
 }
 #[derive(Debug)]
 pub struct RankingRuleOutput<Q> {
    /// The query tree that must be used by the child ranking rule to fetch candidates.
    pub query: Q,
    /// The allowed candidates for the child ranking rule
    pub candidates: RoaringBitmap,
 }
 #[allow(unused)]
 pub fn get_start_universe<'transaction>(
    index: &Index,
    txn: &'transaction RoTxn,
    db_cache: &mut DatabaseCache<'transaction>,
    query_graph: &QueryGraph,
    term_matching_strategy: TermsMatchingStrategy,
    // filters: Filters,
    // mut distinct: Option<D>,
 ) -> Result<RoaringBitmap> {
    // NOTE:
    //
    // There is a performance problem when using `distinct` + exhaustive number of hits,
    // especially for search that yield many results (many ~= almost all of the
    // dataset).
    //
    // We'll solve it later. Maybe there are smart ways to go about it.
    //
    // For example, if there are millions of possible values for the distinct attribute,
    // then we could just look at the documents which share any distinct attribute with
    // another one, and remove the later docids them from the universe.
    // => NO! because we don't know which one to remove, only after the sorting is done can we know it
    // => this kind of computation can be done, but only in the evaluation of the number
    // of hits for the documents that aren't returned by the search.
    //
    // `Distinct` otherwise should always be computed during
    let universe = index.documents_ids(txn).unwrap();
    // resolve the whole query tree to retrieve an exhaustive list of documents matching the query.
    // NOTE: this is wrong
    // Instead, we should only compute the documents corresponding to the last remaining
    // word, 2-gram, and 3-gran.
    // let candidates = resolve_query_graph(index, txn, db_cache, query_graph, &universe)?;
    // Distinct should be lazy if placeholder?
    //
    // // because the initial_candidates should be an exhaustive count of the matching documents,
    // // we precompute the distinct attributes.
    // let initial_candidates = match &mut distinct {
    //     Some(distinct) => {
    //         let mut initial_candidates = RoaringBitmap::new();
    //         for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) {
    //             initial_candidates.insert(c?);
    //         }
    //         initial_candidates
    //     }
    //     None => candidates.clone(),
    // };
    Ok(/*candidates*/ universe)
 }
 pub fn execute_search<'transaction>(
    index: &Index,
    txn: &'transaction heed::RoTxn,
    // TODO: ranking rules parameter
    db_cache: &mut DatabaseCache<'transaction>,
    universe: &RoaringBitmap,
    query_graph: &QueryGraph,
    // _from: usize,
    // _length: usize,
 ) -> Result<Vec<u32>> {
    let words = Words::new(TermsMatchingStrategy::Last);
    // let sort = Sort::new(index, txn, "sort1".to_owned(), true)?;
    let proximity = GraphBasedRankingRule::<ProximityGraph>::default();
    // TODO: ranking rules given as argument
    let mut ranking_rules: Vec<Box<dyn RankingRule<'transaction, QueryGraph>>> =
        vec![Box::new(words), Box::new(proximity) /*  Box::new(sort) */];
    let ranking_rules_len = ranking_rules.len();
    ranking_rules[0].start_iteration(index, txn, db_cache, universe, query_graph)?;
    // TODO: parent_candidates could be used only during debugging?
    let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
    candidates[0] = universe.clone();
    let mut cur_ranking_rule_index = 0;
    macro_rules! back {
        () => {
            candidates[cur_ranking_rule_index].clear();
            ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache);
            if cur_ranking_rule_index == 0 {
                break;
            } else {
                cur_ranking_rule_index -= 1;
            }
        };
    }
    let mut results = vec![];
    // TODO: skip buckets when we want to start from an offset
    while results.len() < 20 {
        // The universe for this bucket is zero or one element, so we don't need to sort
        // anything, just extend the results and go back to the parent ranking rule.
        if candidates[cur_ranking_rule_index].len() <= 1 {
            results.extend(&candidates[cur_ranking_rule_index]);
            back!();
            continue;
        }
        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, &candidates[cur_ranking_rule_index])? else {
            back!();
            continue;
        };
        candidates[cur_ranking_rule_index] -= &next_bucket.candidates;
        if next_bucket.candidates.len() <= 1 {
            // Only zero or one candidate, no need to sort through the child ranking rule.
            results.extend(next_bucket.candidates);
            continue;
        } else {
            // many candidates, give to next ranking rule, if any
            if cur_ranking_rule_index == ranking_rules_len - 1 {
                // TODO: don't extend too much, up to the limit only
                results.extend(next_bucket.candidates);
            } else {
                cur_ranking_rule_index += 1;
                candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();
                ranking_rules[cur_ranking_rule_index].start_iteration(
                    index,
                    txn,
                    db_cache,
                    &next_bucket.candidates,
                    &next_bucket.query,
                )?;
            }
        }
    }
    Ok(results)
 }
 #[cfg(test)]
 mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader, Cursor, Seek};
    use std::time::Instant;
    use heed::EnvOpenOptions;
    use super::{execute_search, get_start_universe};
    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
    use crate::index::tests::TempIndex;
    use crate::new::db_cache::DatabaseCache;
    use crate::new::make_query_graph;
    use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
    use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};
    #[test]
    fn execute_new_search() {
        let index = TempIndex::new();
        index
            .add_documents(documents!([
                {
                    "id": 7,
                    "text": "the super quick super brown fox jumps over",
                },
                {
                    "id": 8,
                    "text": "the super quick brown fox jumps over",
                },
                {
                    "id": 9,
                    "text": "the quick super brown fox jumps over",
                },
                {
                    "id": 10,
                    "text": "the quick brown fox jumps over",
                },
                {
                    "id": 11,
                    "text": "the quick brown fox jumps over the lazy dog",
                },
                {
                    "id": 12,
                    "text": "the quick brown cat jumps over the lazy dog",
                },
            ]))
            .unwrap();
        let txn = index.read_txn().unwrap();
        let mut db_cache = DatabaseCache::default();
        let query_graph =
            make_query_graph(&index, &txn, &mut db_cache, "the quick brown fox jumps over")
                .unwrap();
        println!("{}", query_graph.graphviz());
        // TODO: filters + maybe distinct attributes?
        let universe = get_start_universe(
            &index,
            &txn,
            &mut db_cache,
            &query_graph,
            TermsMatchingStrategy::Last,
        )
        .unwrap();
        println!("universe: {universe:?}");
        let results =
            execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /*  0, 20 */)
                .unwrap();
        println!("{results:?}")
    }
    #[test]
    fn search_movies_new() {
        let mut options = EnvOpenOptions::new();
        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
        let index = Index::new(options, "data_movies").unwrap();
        let txn = index.read_txn().unwrap();
        let primary_key = index.primary_key(&txn).unwrap().unwrap();
        let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
        // loop {
        //     let start = Instant::now();
        //     let mut db_cache = DatabaseCache::default();
        //     let query_graph = make_query_graph(
        //         &index,
        //         &txn,
        //         &mut db_cache,
        //         "released from prison by the government",
        //     )
        //     .unwrap();
        //     // println!("{}", query_graph.graphviz());
        //     // TODO: filters + maybe distinct attributes?
        //     let universe = get_start_universe(
        //         &index,
        //         &txn,
        //         &mut db_cache,
        //         &query_graph,
        //         TermsMatchingStrategy::Last,
        //     )
        //     .unwrap();
        //     // println!("universe: {universe:?}");
        //     let results = execute_search(
        //         &index,
        //         &txn,
        //         &mut db_cache,
        //         &universe,
        //         &query_graph, /*  0, 20 */
        //     )
        //     .unwrap();
        //     let elapsed = start.elapsed();
        //     println!("{}us: {results:?}", elapsed.as_micros());
        // }
        let start = Instant::now();
        let mut db_cache = DatabaseCache::default();
        let query_graph =
            make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government")
                .unwrap();
        // println!("{}", query_graph.graphviz());
        // TODO: filters + maybe distinct attributes?
        let universe = get_start_universe(
            &index,
            &txn,
            &mut db_cache,
            &query_graph,
            TermsMatchingStrategy::Last,
        )
        .unwrap();
        // println!("universe: {universe:?}");
        let results =
            execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /*  0, 20 */)
                .unwrap();
        let elapsed = start.elapsed();
        let ids = index
            .documents(&txn, results.iter().copied())
            .unwrap()
            .into_iter()
            .map(|x| {
                let obkv = &x.1;
                let id = obkv.get(primary_key).unwrap();
                let id: serde_json::Value = serde_json::from_slice(id).unwrap();
                id.as_str().unwrap().to_owned()
            })
            .collect::<Vec<_>>();
        println!("{}us: {results:?}", elapsed.as_micros());
        println!("external ids: {ids:?}");
    }
    #[test]
    fn search_movies_old() {
        let mut options = EnvOpenOptions::new();
        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
        let index = Index::new(options, "data_movies").unwrap();
        let txn = index.read_txn().unwrap();
        let start = Instant::now();
        let mut s = Search::new(&txn, &index);
        s.query("released from prison by the government");
        s.terms_matching_strategy(TermsMatchingStrategy::Last);
        // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
        let docs = s.execute().unwrap();
        let elapsed = start.elapsed();
        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
    }
    #[test]
    fn _settings_movies() {
        let mut options = EnvOpenOptions::new();
        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
        let index = Index::new(options, "data_movies").unwrap();
        let mut wtxn = index.write_txn().unwrap();
        // let primary_key = "id";
        // let searchable_fields = vec!["title", "overview"];
        // let filterable_fields = vec!["release_date", "genres"];
        // let sortable_fields = vec[];
        let config = IndexerConfig::default();
        let mut builder = Settings::new(&mut wtxn, &index, &config);
        builder.set_min_word_len_one_typo(5);
        builder.set_min_word_len_two_typos(100);
        // builder.set_primary_key(primary_key.to_owned());
        // let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
        // builder.set_searchable_fields(searchable_fields);
        // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
        // builder.set_filterable_fields(filterable_fields);
        builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
        // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect();
        // builder.set_sortable_fields(sortable_fields);
        builder.execute(|_| (), || false).unwrap();
    }
    // #[test]
    fn _index_movies() {
        let mut options = EnvOpenOptions::new();
        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
        let index = Index::new(options, "data_movies").unwrap();
        let mut wtxn = index.write_txn().unwrap();
        let primary_key = "id";
        let searchable_fields = vec!["title", "overview"];
        let filterable_fields = vec!["release_date", "genres"];
        // let sortable_fields = vec[];
        let config = IndexerConfig::default();
        let mut builder = Settings::new(&mut wtxn, &index, &config);
        builder.set_primary_key(primary_key.to_owned());
        let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
        builder.set_searchable_fields(searchable_fields);
        let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
        builder.set_filterable_fields(filterable_fields);
        builder.set_criteria(vec![Criterion::Words]);
        // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect();
        // builder.set_sortable_fields(sortable_fields);
        builder.execute(|_| (), || false).unwrap();
        let config = IndexerConfig::default();
        let indexing_config = IndexDocumentsConfig::default();
        let builder =
            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
                .unwrap();
        let documents = documents_from(
            "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
            "json",
        );
        let (builder, user_error) = builder.add_documents(documents).unwrap();
        user_error.unwrap();
        builder.execute().unwrap();
        wtxn.commit().unwrap();
        index.prepare_for_closing().wait();
    }
    fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
        let reader = File::open(filename)
            .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
        let reader = BufReader::new(reader);
        let documents = match filetype {
            "csv" => documents_from_csv(reader).unwrap(),
            "json" => documents_from_json(reader).unwrap(),
            "jsonl" => documents_from_jsonl(reader).unwrap(),
            otherwise => panic!("invalid update format {:?}", otherwise),
        };
        DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
    }
    fn documents_from_jsonl(reader: impl BufRead) -> crate::Result<Vec<u8>> {
        let mut documents = DocumentsBatchBuilder::new(Vec::new());
        for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
            let object = result.unwrap();
            documents.append_json_object(&object)?;
        }
        documents.into_inner().map_err(Into::into)
    }
    fn documents_from_json(reader: impl BufRead) -> crate::Result<Vec<u8>> {
        let mut documents = DocumentsBatchBuilder::new(Vec::new());
        documents.append_json_array(reader)?;
        documents.into_inner().map_err(Into::into)
    }
    fn documents_from_csv(reader: impl BufRead) -> crate::Result<Vec<u8>> {
        let csv = csv::Reader::from_reader(reader);
        let mut documents = DocumentsBatchBuilder::new(Vec::new());
        documents.append_csv(csv)?;
        documents.into_inner().map_err(Into::into)
    }
 }