From 4a467739cdfd6c6eafb73ae57649199729bf8d7b Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Mar 2024 11:21:46 +0100 Subject: [PATCH] implements a first version of the cutoff without settings --- .../src/analytics/segment_analytics.rs | 9 ++ meilisearch/src/search.rs | 39 ++++++--- milli/examples/search.rs | 3 +- milli/src/index.rs | 1 + milli/src/lib.rs | 35 ++++++++ milli/src/search/hybrid.rs | 2 + milli/src/search/mod.rs | 82 +++++++++++-------- milli/src/search/new/bucket_sort.rs | 27 +++++- milli/src/search/new/matches/mod.rs | 3 +- milli/src/search/new/mod.rs | 16 +++- milli/tests/search/mod.rs | 45 +++++++++- 11 files changed, 210 insertions(+), 52 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 7dfc52900..99298bd43 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -579,6 +579,7 @@ pub struct SearchAggregator { // requests total_received: usize, total_succeeded: usize, + total_degraded: usize, time_spent: BinaryHeap, // sort @@ -758,9 +759,13 @@ impl SearchAggregator { hits_info: _, facet_distribution: _, facet_stats: _, + degraded, } = result; self.total_succeeded = self.total_succeeded.saturating_add(1); + if *degraded { + self.total_degraded = self.total_degraded.saturating_add(1); + } self.time_spent.push(*processing_time_ms as usize); } @@ -802,6 +807,7 @@ impl SearchAggregator { semantic_ratio, embedder, hybrid, + total_degraded, } = other; if self.timestamp.is_none() { @@ -816,6 +822,7 @@ impl SearchAggregator { // request self.total_received = self.total_received.saturating_add(total_received); self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); + self.total_degraded = self.total_degraded.saturating_add(total_degraded); self.time_spent.append(time_spent); // sort @@ -921,6 +928,7 @@ impl SearchAggregator { semantic_ratio, embedder, hybrid, + total_degraded, } = self; if 
total_received == 0 { @@ -940,6 +948,7 @@ impl SearchAggregator { "total_succeeded": total_succeeded, "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics "total_received": total_received, + "total_degraded": total_degraded, }, "sort": { "with_geoPoint": sort_with_geo_point, diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index e65192d16..9bc7b69fc 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1,7 +1,7 @@ use std::cmp::min; use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::str::FromStr; -use std::time::Instant; +use std::time::{Duration, Instant}; use deserr::Deserr; use either::Either; @@ -14,7 +14,7 @@ use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::score_details::{self, ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::vector::DistributionShift; -use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues}; +use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use meilisearch_types::{milli, Document}; use milli::tokenizer::TokenizerBuilder; @@ -323,6 +323,9 @@ pub struct SearchResult { pub facet_distribution: Option>>, #[serde(skip_serializing_if = "Option::is_none")] pub facet_stats: Option>, + + #[serde(skip_serializing_if = "std::ops::Not::not")] + pub degraded: bool, } #[derive(Serialize, Debug, Clone, PartialEq)] @@ -382,8 +385,10 @@ fn prepare_search<'t>( query: &'t SearchQuery, features: RoFeatures, distribution: Option, + time_budget: TimeBudget, ) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> { let mut search = index.search(rtxn); + search.time_budget(time_budget); if query.vector.is_some() { features.check_vector("Passing `vector` as a query parameter")?; @@ -491,19 +496,26 @@ pub fn perform_search( distribution: Option, ) -> 
Result { let before_search = Instant::now(); + let time_budget = TimeBudget::new(Duration::from_millis(150)); let rtxn = index.read_txn()?; let (search, is_finite_pagination, max_total_hits, offset) = - prepare_search(index, &rtxn, &query, features, distribution)?; + prepare_search(index, &rtxn, &query, features, distribution, time_budget)?; - let milli::SearchResult { documents_ids, matching_words, candidates, document_scores, .. } = - match &query.hybrid { - Some(hybrid) => match *hybrid.semantic_ratio { - ratio if ratio == 0.0 || ratio == 1.0 => search.execute()?, - ratio => search.execute_hybrid(ratio)?, - }, - None => search.execute()?, - }; + let milli::SearchResult { + documents_ids, + matching_words, + candidates, + document_scores, + degraded, + .. + } = match &query.hybrid { + Some(hybrid) => match *hybrid.semantic_ratio { + ratio if ratio == 0.0 || ratio == 1.0 => search.execute()?, + ratio => search.execute_hybrid(ratio)?, + }, + None => search.execute()?, + }; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -700,6 +712,7 @@ pub fn perform_search( processing_time_ms: before_search.elapsed().as_millis(), facet_distribution, facet_stats, + degraded, }; Ok(result) } @@ -712,9 +725,11 @@ pub fn perform_facet_search( features: RoFeatures, ) -> Result { let before_search = Instant::now(); + let time_budget = TimeBudget::new(Duration::from_millis(150)); let rtxn = index.read_txn()?; - let (search, _, _, _) = prepare_search(index, &rtxn, &search_query, features, None)?; + let (search, _, _, _) = + prepare_search(index, &rtxn, &search_query, features, None, time_budget)?; let mut facet_search = SearchForFacetValues::new(facet_name, search, search_query.hybrid.is_some()); if let Some(facet_query) = &facet_query { diff --git a/milli/examples/search.rs b/milli/examples/search.rs index a94677771..8640acf42 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -6,7 +6,7 @@ use std::time::Instant; use heed::EnvOpenOptions; use milli::{ 
execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext, - SearchLogger, TermsMatchingStrategy, + SearchLogger, TermsMatchingStrategy, TimeBudget, }; #[global_allocator] @@ -65,6 +65,7 @@ fn main() -> Result<(), Box> { None, &mut DefaultSearchLogger, logger, + TimeBudget::max(), )?; if let Some((logger, dir)) = detailed_logger { logger.finish(&mut ctx, Path::new(dir))?; diff --git a/milli/src/index.rs b/milli/src/index.rs index 2c3977403..e79c137e7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2421,6 +2421,7 @@ pub(crate) mod tests { candidates: _, document_scores: _, mut documents_ids, + degraded: _, } = search.execute().unwrap(); let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap(); documents_ids.sort_unstable(); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 5effcea3d..eedd25f7e 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -30,6 +30,7 @@ pub mod snapshot_tests; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; +use std::fmt; use std::hash::BuildHasherDefault; use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer}; @@ -104,6 +105,40 @@ pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; +#[derive(Clone, Copy)] +pub struct TimeBudget { + started_at: std::time::Instant, + budget: std::time::Duration, +} + +impl fmt::Debug for TimeBudget { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("TimeBudget") + .field("started_at", &self.started_at) + .field("budget", &self.budget) + .field("left", &self.budget.checked_sub(self.started_at.elapsed())) + .finish() + } +} + +impl TimeBudget { + pub fn new(budget: std::time::Duration) -> Self { + Self { started_at: std::time::Instant::now(), budget } + } + + pub fn max() -> Self { + Self::new(std::time::Duration::from_secs(u64::MAX)) + } + + pub fn exceeded(&self) -> bool { + 
self.must_stop() + } + + pub fn must_stop(&self) -> bool { + self.started_at.elapsed() > self.budget + } +} + // Convert an absolute word position into a relative position. // Return the field id of the attribute related to the absolute position // and the relative position in the attribute. diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index b4c79f7f5..9d8b3860d 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -106,6 +106,7 @@ impl ScoreWithRatioResult { candidates: left.candidates | right.candidates, documents_ids, document_scores, + degraded: false, } } } @@ -131,6 +132,7 @@ impl<'a> Search<'a> { index: self.index, distribution_shift: self.distribution_shift, embedder_name: self.embedder_name.clone(), + time_budget: self.time_budget, }; let vector_query = search.vector.take(); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index dc8354486..b14d88d03 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,7 +11,7 @@ use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::vector::DistributionShift; use crate::{ execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, - SearchContext, + SearchContext, TimeBudget, }; // Building these factories is not free. 
@@ -43,6 +43,8 @@ pub struct Search<'a> { index: &'a Index, distribution_shift: Option, embedder_name: Option, + + time_budget: TimeBudget, } impl<'a> Search<'a> { @@ -64,6 +66,7 @@ impl<'a> Search<'a> { index, distribution_shift: None, embedder_name: None, + time_budget: TimeBudget::max(), } } @@ -143,6 +146,11 @@ impl<'a> Search<'a> { self } + pub fn time_budget(&mut self, time_budget: TimeBudget) -> &mut Search<'a> { + self.time_budget = time_budget; + self + } + pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result { if has_vector_search { let ctx = SearchContext::new(self.index, self.rtxn); @@ -169,36 +177,43 @@ impl<'a> Search<'a> { } let universe = filtered_universe(&ctx, &self.filter)?; - let PartialSearchResult { located_query_terms, candidates, documents_ids, document_scores } = - match self.vector.as_ref() { - Some(vector) => execute_vector_search( - &mut ctx, - vector, - self.scoring_strategy, - universe, - &self.sort_criteria, - self.geo_strategy, - self.offset, - self.limit, - self.distribution_shift, - embedder_name, - )?, - None => execute_search( - &mut ctx, - self.query.as_deref(), - self.terms_matching_strategy, - self.scoring_strategy, - self.exhaustive_number_hits, - universe, - &self.sort_criteria, - self.geo_strategy, - self.offset, - self.limit, - Some(self.words_limit), - &mut DefaultSearchLogger, - &mut DefaultSearchLogger, - )?, - }; + let PartialSearchResult { + located_query_terms, + candidates, + documents_ids, + document_scores, + degraded, + } = match self.vector.as_ref() { + Some(vector) => execute_vector_search( + &mut ctx, + vector, + self.scoring_strategy, + universe, + &self.sort_criteria, + self.geo_strategy, + self.offset, + self.limit, + self.distribution_shift, + embedder_name, + self.time_budget, + )?, + None => execute_search( + &mut ctx, + self.query.as_deref(), + self.terms_matching_strategy, + self.scoring_strategy, + self.exhaustive_number_hits, + universe, + &self.sort_criteria, + self.geo_strategy, 
+ self.offset, + self.limit, + Some(self.words_limit), + &mut DefaultSearchLogger, + &mut DefaultSearchLogger, + self.time_budget, + )?, + }; // consume context and located_query_terms to build MatchingWords. let matching_words = match located_query_terms { @@ -206,7 +221,7 @@ impl<'a> Search<'a> { None => MatchingWords::default(), }; - Ok(SearchResult { matching_words, candidates, document_scores, documents_ids }) + Ok(SearchResult { matching_words, candidates, document_scores, documents_ids, degraded }) } } @@ -229,6 +244,7 @@ impl fmt::Debug for Search<'_> { index: _, distribution_shift, embedder_name, + time_budget, } = self; f.debug_struct("Search") .field("query", query) @@ -244,6 +260,7 @@ impl fmt::Debug for Search<'_> { .field("words_limit", words_limit) .field("distribution_shift", distribution_shift) .field("embedder_name", embedder_name) + .field("time_budget", time_budget) .finish() } } @@ -254,6 +271,7 @@ pub struct SearchResult { pub candidates: RoaringBitmap, pub documents_ids: Vec, pub document_scores: Vec>, + pub degraded: bool, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index 02528e378..7fc830c1f 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -5,12 +5,14 @@ use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait}; use super::SearchContext; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput}; -use crate::Result; +use crate::{Result, TimeBudget}; pub struct BucketSortOutput { pub docids: Vec, pub scores: Vec>, pub all_candidates: RoaringBitmap, + + pub degraded: bool, } // TODO: would probably be good to regroup some of these inside of a struct? 
@@ -25,6 +27,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( length: usize, scoring_strategy: ScoringStrategy, logger: &mut dyn SearchLogger, + time_budget: TimeBudget, ) -> Result { logger.initial_query(query); logger.ranking_rules(&ranking_rules); @@ -41,6 +44,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( docids: vec![], scores: vec![], all_candidates: universe.clone(), + degraded: false, }); } if ranking_rules.is_empty() { @@ -74,6 +78,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( scores: vec![Default::default(); results.len()], docids: results, all_candidates, + degraded: false, }); } else { let docids: Vec = universe.iter().skip(from).take(length).collect(); @@ -81,6 +86,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( scores: vec![Default::default(); docids.len()], docids, all_candidates: universe.clone(), + degraded: false, }); }; } @@ -154,6 +160,18 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( } while valid_docids.len() < length { + if time_budget.exceeded() { + let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]); + maybe_add_to_results!(bucket); + + return Ok(BucketSortOutput { + scores: vec![Default::default(); valid_docids.len()], + docids: valid_docids, + all_candidates, + degraded: true, + }); + } + // The universe for this bucket is zero, so we don't need to sort // anything, just go back to the parent ranking rule. if ranking_rule_universes[cur_ranking_rule_index].is_empty() @@ -219,7 +237,12 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( )?; } - Ok(BucketSortOutput { docids: valid_docids, scores: valid_scores, all_candidates }) + Ok(BucketSortOutput { + docids: valid_docids, + scores: valid_scores, + all_candidates, + degraded: false, + }) } /// Add the candidates to the results. 
Take `distinct`, `from`, `length`, and `cur_offset` diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 8de1d9262..2913f206d 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -502,7 +502,7 @@ mod tests { use super::*; use crate::index::tests::TempIndex; - use crate::{execute_search, filtered_universe, SearchContext}; + use crate::{execute_search, filtered_universe, SearchContext, TimeBudget}; impl<'a> MatcherBuilder<'a> { fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self { @@ -522,6 +522,7 @@ mod tests { Some(10), &mut crate::DefaultSearchLogger, &mut crate::DefaultSearchLogger, + TimeBudget::max(), ) .unwrap(); diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index ae661e3f6..ad996f363 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -52,7 +52,8 @@ use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; use crate::vector::DistributionShift; use crate::{ - AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, + AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget, + UserError, }; /// A structure used throughout the execution of a search query. 
@@ -518,6 +519,7 @@ pub fn execute_vector_search( length: usize, distribution_shift: Option, embedder_name: &str, + time_budget: TimeBudget, ) -> Result { check_sort_criteria(ctx, sort_criteria.as_ref())?; @@ -537,7 +539,7 @@ pub fn execute_vector_search( let placeholder_search_logger: &mut dyn SearchLogger = &mut placeholder_search_logger; - let BucketSortOutput { docids, scores, all_candidates } = bucket_sort( + let BucketSortOutput { docids, scores, all_candidates, degraded } = bucket_sort( ctx, ranking_rules, &PlaceholderQuery, @@ -546,6 +548,7 @@ pub fn execute_vector_search( length, scoring_strategy, placeholder_search_logger, + time_budget, )?; Ok(PartialSearchResult { @@ -553,6 +556,7 @@ pub fn execute_vector_search( document_scores: scores, documents_ids: docids, located_query_terms: None, + degraded, }) } @@ -572,6 +576,7 @@ pub fn execute_search( words_limit: Option, placeholder_search_logger: &mut dyn SearchLogger, query_graph_logger: &mut dyn SearchLogger, + time_budget: TimeBudget, ) -> Result { check_sort_criteria(ctx, sort_criteria.as_ref())?; @@ -648,6 +653,7 @@ pub fn execute_search( length, scoring_strategy, query_graph_logger, + time_budget, )? } else { let ranking_rules = @@ -661,10 +667,11 @@ pub fn execute_search( length, scoring_strategy, placeholder_search_logger, + time_budget, )? 
}; - let BucketSortOutput { docids, scores, mut all_candidates } = bucket_sort_output; + let BucketSortOutput { docids, scores, mut all_candidates, degraded } = bucket_sort_output; let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?; // The candidates is the universe unless the exhaustive number of hits @@ -682,6 +689,7 @@ pub fn execute_search( document_scores: scores, documents_ids: docids, located_query_terms, + degraded, }) } @@ -742,4 +750,6 @@ pub struct PartialSearchResult { pub candidates: RoaringBitmap, pub documents_ids: Vec, pub document_scores: Vec>, + + pub degraded: bool, } diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 9193ab762..ab6befa60 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -1,14 +1,19 @@ use std::cmp::Reverse; use std::collections::HashSet; use std::io::Cursor; +use std::time::Duration; use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{btreemap, hashset}; +use meili_snap::snapshot; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy}; +use milli::{ + AscDesc, Criterion, DocumentId, Filter, Index, Member, Object, Search, TermsMatchingStrategy, + TimeBudget, +}; use serde::{Deserialize, Deserializer}; use slice_group_by::GroupBy; @@ -349,3 +354,41 @@ where let result = serde_json::Value::deserialize(deserializer)?; Ok(Some(result)) } + +#[test] +fn basic_degraded_search() { + use Criterion::*; + let criteria = vec![Words, Typo, Proximity, Attribute, Exactness]; + let index = setup_search_index_with_criteria(&criteria); + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.query(TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.time_budget(TimeBudget::new(Duration::from_millis(0))); + + let result = 
search.execute().unwrap(); + assert!(result.degraded); +} + +#[test] +fn degraded_search_cannot_skip_filter() { + use Criterion::*; + let criteria = vec![Words, Typo, Proximity, Attribute, Exactness]; + let index = setup_search_index_with_criteria(&criteria); + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.query(TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.time_budget(TimeBudget::new(Duration::from_millis(0))); + let filter_condition = Filter::from_str("tag = etiopia").unwrap().unwrap(); + search.filter(filter_condition); + + let result = search.execute().unwrap(); + assert!(result.degraded); + snapshot!(format!("{:?}\n{:?}", result.candidates, result.documents_ids), @r###" + RoaringBitmap<[0, 2, 5, 8, 11, 14]> + [0, 2, 5, 8, 11, 14] + "###); +}