From 6e0526090aa827ffd302d57d4a78f0fc25010589 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 25 Jun 2025 15:36:12 +0200 Subject: [PATCH 01/23] Implement sorting documents --- .../src/routes/indexes/documents.rs | 13 ++++ .../milli/src/facet/facet_sort_recursive.rs | 68 +++++++++++++++++++ crates/milli/src/facet/mod.rs | 1 + 3 files changed, 82 insertions(+) create mode 100644 crates/milli/src/facet/facet_sort_recursive.rs diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 50eec46fe..99ca2b7df 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -17,6 +17,10 @@ use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; +use meilisearch_types::milli::facet::facet_sort_recursive::recursive_facet_sort; +use meilisearch_types::milli::facet::{ascending_facet_sort, descending_facet_sort}; +use meilisearch_types::milli::heed_codec::facet::FacetGroupKeyCodec; +use meilisearch_types::milli::heed_codec::BytesRefCodec; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::DocumentId; @@ -1533,6 +1537,15 @@ fn retrieve_documents>( })? } + let fields = vec![(0, true)]; + let number_db = index + .facet_id_f64_docids + .remap_key_type::>(); + let string_db = index + .facet_id_string_docids + .remap_key_type::>(); + candidates = recursive_facet_sort(&rtxn, number_db, string_db, &fields, candidates)?; + let (it, number_of_documents) = { let number_of_documents = candidates.len(); ( diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs new file mode 100644 index 000000000..a6bbad906 --- /dev/null +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -0,0 +1,68 @@ +use roaring::RoaringBitmap; +use heed::Database; +use crate::{facet::{ascending_facet_sort, descending_facet_sort}, heed_codec::{facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, BytesRefCodec}}; + +pub fn recursive_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + number_db: Database, FacetGroupValueCodec>, + string_db: Database, FacetGroupValueCodec>, + fields: &[(u16, bool)], + candidates: RoaringBitmap, +) -> heed::Result { + let (field_id, ascending) = match fields.first() { + Some(first) => *first, + None => return Ok(candidates), + }; + + let (number_iter, string_iter) = if ascending { + let number_iter = ascending_facet_sort( + rtxn, + number_db, + field_id, + candidates.clone(), + )?; + let string_iter = ascending_facet_sort( + rtxn, + string_db, + field_id, + candidates, + )?; + + (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter)) + } else { + let number_iter = descending_facet_sort( + rtxn, + number_db, + field_id, + candidates.clone(), + )?; + let string_iter = descending_facet_sort( + rtxn, + string_db, + field_id, + candidates, + )?; + + (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) + }; + + let chained_iter = number_iter.chain(string_iter); + let mut result = RoaringBitmap::new(); + for part in chained_iter { + let (inner_candidates, _) = part?; + if inner_candidates.len() <= 1 || fields.len() <= 1 { + result |= inner_candidates; + } else { + let inner_candidates = recursive_facet_sort( + rtxn, + number_db, + string_db, + &fields[1..], + inner_candidates, + )?; + result |= inner_candidates; + } + } + + Ok(result) +} diff --git a/crates/milli/src/facet/mod.rs b/crates/milli/src/facet/mod.rs index 274d2588d..a6351b42c 100644 --- a/crates/milli/src/facet/mod.rs +++ b/crates/milli/src/facet/mod.rs @@ -1,6 +1,7 @@ mod facet_type; mod facet_value; pub mod value_encoding; +pub mod facet_sort_recursive; pub use self::facet_type::FacetType; pub use self::facet_value::FacetValue; From b05cb80803873d4a17829ba15296b2ad33f3e856 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 25 Jun 2025 16:41:08 +0200 Subject: [PATCH 02/23] Take sort criteria from the request --- .../src/routes/indexes/documents.rs | 50 ++++++++++++------- .../milli/src/facet/facet_sort_recursive.rs | 39 +++++++++++++-- crates/milli/src/search/new/mod.rs | 19 +++---- 3 files changed, 79 insertions(+), 29 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 99ca2b7df..d91f43d21 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use std::io::{ErrorKind, Seek as _}; use std::marker::PhantomData; +use std::str::FromStr; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; @@ -18,12 +19,9 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::facet::facet_sort_recursive::recursive_facet_sort; -use meilisearch_types::milli::facet::{ascending_facet_sort, descending_facet_sort}; -use meilisearch_types::milli::heed_codec::facet::FacetGroupKeyCodec; -use meilisearch_types::milli::heed_codec::BytesRefCodec; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; -use meilisearch_types::milli::DocumentId; +use meilisearch_types::milli::{AscDesc, DocumentId}; use meilisearch_types::serde_cs::vec::CS; use meilisearch_types::star_or::OptionStarOrList; use meilisearch_types::tasks::KindWithContent; @@ -46,6 +44,7 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::payload::Payload; use crate::extractors::sequential_extractor::SeqHandler; +use crate::routes::indexes::search::fix_sort_query_parameters; use crate::routes::{ get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; @@ -410,6 +409,8 @@ pub struct BrowseQueryGet { #[param(default, value_type = Option, example = "popularity > 1000")] #[deserr(default, error = DeserrQueryParamError)] filter: Option, + #[deserr(default, error = DeserrQueryParamError)] + sort: Option, // TODO: change deser error } #[derive(Debug, Deserr, ToSchema)] @@ -434,6 +435,9 @@ pub struct BrowseQuery { #[schema(default, value_type = Option, example = "popularity > 1000")] #[deserr(default, error = DeserrJsonError)] filter: Option, + #[schema(default, value_type = Option>, example = json!(["title:asc", "rating:desc"]))] + #[deserr(default, error = DeserrJsonError)] // TODO: Change error + pub sort: Option>, } /// Get documents with POST @@ -575,7 +579,7 @@ pub async fn get_documents( ) -> Result { debug!(parameters = ?params, "Get documents GET"); - let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter, ids } = + let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter, ids, sort } = params.into_inner(); let filter = match filter { @@ -586,15 +590,14 @@ pub async fn get_documents( None => None, }; - let ids = ids.map(|ids| ids.into_iter().map(Into::into).collect()); - let query = BrowseQuery { offset: offset.0, limit: limit.0, fields: fields.merge_star_and_none(), retrieve_vectors: retrieve_vectors.0, filter, - ids, + ids: ids.map(|ids| ids.into_iter().map(Into::into).collect()), + sort: sort.map(|attr| fix_sort_query_parameters(&attr)), }; analytics.publish( @@ -619,7 +622,7 @@ fn documents_by_query( query: BrowseQuery, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let BrowseQuery { offset, limit, fields, retrieve_vectors, filter, ids } = query; + let BrowseQuery { offset, limit, fields, retrieve_vectors, filter, ids, sort } = query; let retrieve_vectors = RetrieveVectors::new(retrieve_vectors); @@ -637,6 +640,22 @@ fn documents_by_query( None }; + let sort_criteria = if let Some(sort) = &sort { + let sorts: Vec<_> = + match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { + Ok(sorts) => sorts, + Err(asc_desc_error) => { + return Err(milli::Error::from(milli::SortError::from( + asc_desc_error, + )) + .into()) + } + }; + Some(sorts) + } else { + None + }; + let index = index_scheduler.index(&index_uid)?; let (total, documents) = retrieve_documents( &index, @@ -647,6 +666,7 @@ fn documents_by_query( fields, retrieve_vectors, index_scheduler.features(), + sort_criteria, )?; let ret = PaginationView::new(offset, limit, total as usize, documents); @@ -1505,6 +1525,7 @@ fn retrieve_documents>( attributes_to_retrieve: Option>, retrieve_vectors: RetrieveVectors, features: RoFeatures, + sort_criteria: Option>, ) -> Result<(u64, Vec), ResponseError> { let rtxn = index.read_txn()?; let filter = &filter; @@ -1537,14 +1558,9 @@ fn retrieve_documents>( })? } - let fields = vec![(0, true)]; - let number_db = index - .facet_id_f64_docids - .remap_key_type::>(); - let string_db = index - .facet_id_string_docids - .remap_key_type::>(); - candidates = recursive_facet_sort(&rtxn, number_db, string_db, &fields, candidates)?; + if let Some(sort) = sort_criteria { + candidates = recursive_facet_sort(index, &rtxn, &sort, candidates)?; + } let (it, number_of_documents) = { let number_of_documents = candidates.len(); diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index a6bbad906..c0fd6ca6f 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -1,8 +1,8 @@ use roaring::RoaringBitmap; use heed::Database; -use crate::{facet::{ascending_facet_sort, descending_facet_sort}, heed_codec::{facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, BytesRefCodec}}; +use crate::{heed_codec::{facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, BytesRefCodec}, search::{facet::{ascending_facet_sort, descending_facet_sort}, new::check_sort_criteria}, AscDesc, Member}; -pub fn recursive_facet_sort<'t>( +fn recursive_facet_sort_inner<'t>( rtxn: &'t heed::RoTxn<'t>, number_db: Database, FacetGroupValueCodec>, string_db: Database, FacetGroupValueCodec>, @@ -53,7 +53,7 @@ pub fn recursive_facet_sort<'t>( if inner_candidates.len() <= 1 || fields.len() <= 1 { result |= inner_candidates; } else { - let inner_candidates = recursive_facet_sort( + let inner_candidates = recursive_facet_sort_inner( rtxn, number_db, string_db, @@ -66,3 +66,36 @@ pub fn recursive_facet_sort<'t>( Ok(result) } + +pub fn recursive_facet_sort<'t>( + index: &crate::Index, + rtxn: &'t heed::RoTxn<'t>, + sort: &[AscDesc], + candidates: RoaringBitmap, +) -> crate::Result { + check_sort_criteria(index, rtxn, Some(sort))?; + + let mut fields = Vec::new(); + let fields_ids_map = index.fields_ids_map(rtxn)?; + for sort in sort { + let (field_id, ascending) = match sort { + AscDesc::Asc(Member::Field(field)) => (fields_ids_map.id(field), true), + AscDesc::Desc(Member::Field(field)) => (fields_ids_map.id(field), false), + AscDesc::Asc(Member::Geo(_)) => todo!(), + AscDesc::Desc(Member::Geo(_)) => todo!(), + }; + if let Some(field_id) = field_id { + fields.push((field_id, ascending)); // FIXME: Should this return an error if the field is not found? + } + } + + let number_db = index + .facet_id_f64_docids + .remap_key_type::>(); + let string_db = index + .facet_id_string_docids + .remap_key_type::>(); + + let candidates = recursive_facet_sort_inner(rtxn, number_db, string_db, &fields, candidates)?; + Ok(candidates) +} diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index a65b4076b..5cb4c9fd5 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -638,7 +638,7 @@ pub fn execute_vector_search( time_budget: TimeBudget, ranking_score_threshold: Option, ) -> Result { - check_sort_criteria(ctx, sort_criteria.as_ref())?; + check_sort_criteria(ctx.index, ctx.txn, sort_criteria.as_deref())?; // FIXME: input universe = universe & documents_with_vectors // for now if we're computing embeddings for ALL documents, we can assume that this is just universe @@ -702,7 +702,7 @@ pub fn execute_search( ranking_score_threshold: Option, locales: Option<&Vec>, ) -> Result { - check_sort_criteria(ctx, sort_criteria.as_ref())?; + check_sort_criteria(ctx.index, ctx.txn, sort_criteria.as_deref())?; let mut used_negative_operator = false; let mut located_query_terms = None; @@ -872,9 +872,10 @@ pub fn execute_search( }) } -fn check_sort_criteria( - ctx: &SearchContext<'_>, - sort_criteria: Option<&Vec>, +pub(crate) fn check_sort_criteria( + index: &Index, + rtxn: &RoTxn<'_>, + sort_criteria: Option<&[AscDesc]>, ) -> Result<()> { let sort_criteria = if let Some(sort_criteria) = sort_criteria { sort_criteria @@ -888,19 +889,19 @@ fn check_sort_criteria( // We check that the sort ranking rule exists and throw an // error if we try to use it and that it doesn't. - let sort_ranking_rule_missing = !ctx.index.criteria(ctx.txn)?.contains(&crate::Criterion::Sort); + let sort_ranking_rule_missing = !index.criteria(rtxn)?.contains(&crate::Criterion::Sort); if sort_ranking_rule_missing { return Err(UserError::SortRankingRuleMissing.into()); } // We check that we are allowed to use the sort criteria, we check // that they are declared in the sortable fields. - let sortable_fields = ctx.index.sortable_fields(ctx.txn)?; + let sortable_fields = index.sortable_fields(rtxn)?; for asc_desc in sort_criteria { match asc_desc.member() { Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => { let (valid_fields, hidden_fields) = - ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?; + index.remove_hidden_fields(rtxn, sortable_fields)?; return Err(UserError::InvalidSortableAttribute { field: field.to_string(), @@ -911,7 +912,7 @@ fn check_sort_criteria( } Member::Geo(_) if !sortable_fields.contains(RESERVED_GEO_FIELD_NAME) => { let (valid_fields, hidden_fields) = - ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?; + index.remove_hidden_fields(rtxn, sortable_fields)?; return Err(UserError::InvalidSortableAttribute { field: RESERVED_GEO_FIELD_NAME.to_string(), From 4534dc2cab1ede94527dc3c58690a3d0798b21c6 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 25 Jun 2025 16:45:32 +0200 Subject: [PATCH 03/23] Create another deserr error --- crates/meilisearch-types/src/error.rs | 1 + crates/meilisearch/src/routes/indexes/documents.rs | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index d2500b7e1..2eb22035e 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -237,6 +237,7 @@ InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQU MissingDocumentFilter , InvalidRequest , BAD_REQUEST ; MissingDocumentEditionFunction , InvalidRequest , BAD_REQUEST ; InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ; +InvalidDocumentSort , InvalidRequest , BAD_REQUEST ; InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ; InvalidVectorDimensions , InvalidRequest , BAD_REQUEST ; InvalidVectorsType , InvalidRequest , BAD_REQUEST ; diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index d91f43d21..425930ced 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -409,8 +409,8 @@ pub struct BrowseQueryGet { #[param(default, value_type = Option, example = "popularity > 1000")] #[deserr(default, error = DeserrQueryParamError)] filter: Option, - #[deserr(default, error = DeserrQueryParamError)] - sort: Option, // TODO: change deser error + #[deserr(default, error = DeserrQueryParamError)] + sort: Option, } #[derive(Debug, Deserr, ToSchema)] @@ -436,8 +436,8 @@ pub struct BrowseQuery { #[deserr(default, error = DeserrJsonError)] filter: Option, #[schema(default, value_type = Option>, example = json!(["title:asc", "rating:desc"]))] - #[deserr(default, error = DeserrJsonError)] // TODO: Change error - pub sort: Option>, + #[deserr(default, error = DeserrJsonError)] + sort: Option>, } /// Get documents with POST From 340d9e6edc71621b3c4dba7e95bbd7cf101453ff Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Fri, 27 Jun 2025 14:40:55 +0200 Subject: [PATCH 04/23] Optimize facet sort 5 to 10x speedup --- .../src/routes/indexes/documents.rs | 21 +- .../milli/src/facet/facet_sort_recursive.rs | 295 ++++++++++++++---- 2 files changed, 256 insertions(+), 60 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 425930ced..bcd227300 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1558,19 +1558,32 @@ fn retrieve_documents>( })? } + let mut facet_sort = None; if let Some(sort) = sort_criteria { - candidates = recursive_facet_sort(index, &rtxn, &sort, candidates)?; + facet_sort = Some(recursive_facet_sort(index, &rtxn, &sort, &candidates)?) } - let (it, number_of_documents) = { + let (it, number_of_documents) = if let Some(facet_sort) = &facet_sort { + let number_of_documents = candidates.len(); + let iter = facet_sort.iter()?; + ( + itertools::Either::Left(some_documents( + index, + &rtxn, + iter.map(|d| d.unwrap()).skip(offset).take(limit), + retrieve_vectors, + )?), + number_of_documents, + ) + } else { let number_of_documents = candidates.len(); ( - some_documents( + itertools::Either::Right(some_documents( index, &rtxn, candidates.into_iter().skip(offset).take(limit), retrieve_vectors, - )?, + )?), number_of_documents, ) }; diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index c0fd6ca6f..47c3696f3 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -1,78 +1,256 @@ use roaring::RoaringBitmap; use heed::Database; -use crate::{heed_codec::{facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, BytesRefCodec}, search::{facet::{ascending_facet_sort, descending_facet_sort}, new::check_sort_criteria}, AscDesc, Member}; +use crate::{heed_codec::{facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, BytesRefCodec}, search::{facet::{ascending_facet_sort, descending_facet_sort}, new::check_sort_criteria}, AscDesc, DocumentId, Member}; -fn recursive_facet_sort_inner<'t>( - rtxn: &'t heed::RoTxn<'t>, +/// Builder for a [`SortedDocumentsIterator`]. +/// Most builders won't ever be built, because pagination will skip them. +pub struct SortedDocumentsIteratorBuilder<'ctx> { + rtxn: &'ctx heed::RoTxn<'ctx>, number_db: Database, FacetGroupValueCodec>, string_db: Database, FacetGroupValueCodec>, - fields: &[(u16, bool)], + fields: &'ctx [(u16, bool)], candidates: RoaringBitmap, -) -> heed::Result { - let (field_id, ascending) = match fields.first() { - Some(first) => *first, - None => return Ok(candidates), - }; +} - let (number_iter, string_iter) = if ascending { - let number_iter = ascending_facet_sort( +impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { + /// Performs the sort and builds a [`SortedDocumentsIterator`]. + fn build(self) -> heed::Result> { + let SortedDocumentsIteratorBuilder { rtxn, number_db, - field_id, - candidates.clone(), - )?; - let string_iter = ascending_facet_sort( - rtxn, string_db, - field_id, + fields, candidates, - )?; + } = self; + let size = candidates.len() as usize; - (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter)) - } else { - let number_iter = descending_facet_sort( - rtxn, - number_db, - field_id, - candidates.clone(), - )?; - let string_iter = descending_facet_sort( - rtxn, - string_db, - field_id, - candidates, - )?; + // There is no point sorting a 1-element array + if size <= 1 { + return Ok(SortedDocumentsIterator::Leaf { + size, + values: Box::new(candidates.into_iter()), + }); + } - (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) - }; + // There is no variable to sort on + let Some((field_id, ascending)) = fields.first().copied() else { + return Ok(SortedDocumentsIterator::Leaf { + size, + values: Box::new(candidates.into_iter()), + }); + }; - let chained_iter = number_iter.chain(string_iter); - let mut result = RoaringBitmap::new(); - for part in chained_iter { - let (inner_candidates, _) = part?; - if inner_candidates.len() <= 1 || fields.len() <= 1 { - result |= inner_candidates; + // Perform the sort on the first field + let (number_iter, string_iter) = if ascending { + let number_iter = ascending_facet_sort( + rtxn, + number_db, + field_id, + candidates.clone(), + )?; + let string_iter = ascending_facet_sort( + rtxn, + string_db, + field_id, + candidates, + )?; + + (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter)) } else { - let inner_candidates = recursive_facet_sort_inner( + let number_iter = descending_facet_sort( + rtxn, + number_db, + field_id, + candidates.clone(), + )?; + let string_iter = descending_facet_sort( + rtxn, + string_db, + field_id, + candidates, + )?; + + (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) + }; + + // Create builders for the next level of the tree + let number_db2 = number_db; + let string_db2 = string_db; + let number_iter = number_iter.map(move |r| -> heed::Result { + let (docids, _bytes) = r?; + Ok(SortedDocumentsIteratorBuilder { rtxn, number_db, string_db, - &fields[1..], - inner_candidates, - )?; - result |= inner_candidates; - } - } + fields: &fields[1..], + candidates: docids, + }) + }); + let string_iter = string_iter.map(move |r| -> heed::Result { + let (docids, _bytes) = r?; + Ok(SortedDocumentsIteratorBuilder { + rtxn, + number_db: number_db2, + string_db: string_db2, + fields: &fields[1..], + candidates: docids, + }) + }); - Ok(result) + Ok(SortedDocumentsIterator::Branch { + current_child: None, + next_children_size: size, + next_children: Box::new(number_iter.chain(string_iter)), + }) + } } -pub fn recursive_facet_sort<'t>( - index: &crate::Index, - rtxn: &'t heed::RoTxn<'t>, +/// A [`SortedDocumentsIterator`] allows efficient access to a continuous range of sorted documents. +/// This is ideal in the context of paginated queries in which only a small number of documents are needed at a time. +/// Search operations will only be performed upon access. +pub enum SortedDocumentsIterator<'ctx> { + Leaf { + /// The exact number of documents remaining + size: usize, + values: Box + 'ctx> + }, + Branch { + /// The current child, got from the children iterator + current_child: Option>>, + /// The exact number of documents remaining, excluding documents in the current child + next_children_size: usize, + /// Iterators to become the current child once it is exhausted + next_children: Box>> + 'ctx>, + } +} + +impl SortedDocumentsIterator<'_> { + /// Takes care of updating the current child if it is `None`, and also updates the size + fn update_current<'ctx>(current_child: &mut Option>>, next_children_size: &mut usize, next_children: &mut Box>> + 'ctx>) -> heed::Result<()> { + if current_child.is_none() { + *current_child = match next_children.next() { + Some(Ok(builder)) => { + let next_child = Box::new(builder.build()?); + *next_children_size -= next_child.size_hint().0; + Some(next_child) + }, + Some(Err(e)) => return Err(e), + None => return Ok(()), + }; + } + Ok(()) + } +} + +impl Iterator for SortedDocumentsIterator<'_> { + type Item = heed::Result; + + fn nth(&mut self, n: usize) -> Option { + // If it's at the leaf level, just forward the call to the values iterator + let (current_child, next_children, next_children_size) = match self { + SortedDocumentsIterator::Leaf { values, size } => { + *size = size.saturating_sub(n); + return values.nth(n).map(Ok) + }, + SortedDocumentsIterator::Branch { current_child, next_children, next_children_size } => (current_child, next_children, next_children_size), + }; + + // Otherwise don't directly iterate over children, skip them if we know we will go further + let mut to_skip = n - 1; + while to_skip > 0 { + if let Err(e) = SortedDocumentsIterator::update_current(current_child, next_children_size, next_children) { + return Some(Err(e)); + } + let Some(inner) = current_child else { + return None; // No more inner iterators, everything has been consumed. + }; + + if to_skip >= inner.size_hint().0 { + // The current child isn't large enough to contain the nth element. + // Skip it and continue with the next one. + to_skip -= inner.size_hint().0; + *current_child = None; + continue; + } else { + // The current iterator is large enough, so we can forward the call to it. + return inner.nth(to_skip + 1); + } + } + + self.next() + } + + fn size_hint(&self) -> (usize, Option) { + let size = match self { + SortedDocumentsIterator::Leaf { size, .. } => *size, + SortedDocumentsIterator::Branch { next_children_size, current_child: Some(current_child), .. } => current_child.size_hint().0 + next_children_size, + SortedDocumentsIterator::Branch { next_children_size, current_child: None, .. } => *next_children_size, + }; + + (size, Some(size)) + } + + fn next(&mut self) -> Option { + match self { + SortedDocumentsIterator::Leaf { values, size } => { + let result = values.next().map(Ok); + if result.is_some() { + *size -= 1; + } + result + }, + SortedDocumentsIterator::Branch { current_child, next_children_size, next_children } => { + let mut result = None; + while result.is_none() { + // Ensure we have selected an iterator to work with + if let Err(e) = SortedDocumentsIterator::update_current(current_child, next_children_size, next_children) { + return Some(Err(e)); + } + let Some(inner) = current_child else { + return None; + }; + + result = inner.next(); + + // If the current iterator is exhausted, we need to try the next one + if result.is_none() { + *current_child = None; + } + } + result + } + } + } +} + +/// A structure owning the data needed during the lifetime of a [`SortedDocumentsIterator`]. +pub struct SortedDocuments<'ctx> { + rtxn: &'ctx heed::RoTxn<'ctx>, + fields: Vec<(u16, bool)>, + number_db: Database, FacetGroupValueCodec>, + string_db: Database, FacetGroupValueCodec>, + candidates: &'ctx RoaringBitmap, +} + +impl <'ctx> SortedDocuments<'ctx> { + pub fn iter(&'ctx self) -> heed::Result> { + let builder = SortedDocumentsIteratorBuilder { + rtxn: self.rtxn, + number_db: self.number_db, + string_db: self.string_db, + fields: &self.fields, + candidates: self.candidates.clone(), + }; + builder.build() + } +} + +pub fn recursive_facet_sort<'ctx>( + index: &'ctx crate::Index, + rtxn: &'ctx heed::RoTxn<'ctx>, sort: &[AscDesc], - candidates: RoaringBitmap, -) -> crate::Result { + candidates: &'ctx RoaringBitmap, +) -> crate::Result> { check_sort_criteria(index, rtxn, Some(sort))?; let mut fields = Vec::new(); @@ -88,7 +266,7 @@ pub fn recursive_facet_sort<'t>( fields.push((field_id, ascending)); // FIXME: Should this return an error if the field is not found? } } - + let number_db = index .facet_id_f64_docids .remap_key_type::>(); @@ -96,6 +274,11 @@ pub fn recursive_facet_sort<'t>( .facet_id_string_docids .remap_key_type::>(); - let candidates = recursive_facet_sort_inner(rtxn, number_db, string_db, &fields, candidates)?; - Ok(candidates) + Ok(SortedDocuments { + rtxn, + fields, + number_db, + string_db, + candidates, + }) } From 63827bbee04e7a98e444901d2d0b2e83e46f7fe3 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 30 Jun 2025 11:59:59 +0200 Subject: [PATCH 05/23] Move sorting code out of search --- .gitignore | 8 +- crates/milli/src/documents/geo_sort.rs | 182 ++++++++++++++++++++ crates/milli/src/documents/mod.rs | 1 + crates/milli/src/search/new/distinct.rs | 2 +- crates/milli/src/search/new/geo_sort.rs | 210 ++++-------------------- crates/milli/src/search/new/mod.rs | 2 +- 6 files changed, 227 insertions(+), 178 deletions(-) create mode 100644 crates/milli/src/documents/geo_sort.rs diff --git a/.gitignore b/.gitignore index 07453a58f..d28baee77 100644 --- a/.gitignore +++ b/.gitignore @@ -11,12 +11,18 @@ /bench /_xtask_benchmark.ms /benchmarks +.DS_Store # Snapshots ## ... large *.full.snap -## ... unreviewed +## ... unreviewed *.snap.new +## ... pending +*.pending-snap + +# Tmp files +.tmp* # Fuzzcheck data for the facet indexing fuzz test crates/milli/fuzz/update::facet::incremental::fuzz::fuzz/ diff --git a/crates/milli/src/documents/geo_sort.rs b/crates/milli/src/documents/geo_sort.rs new file mode 100644 index 000000000..5b3968b39 --- /dev/null +++ b/crates/milli/src/documents/geo_sort.rs @@ -0,0 +1,182 @@ +use std::collections::VecDeque; + +use heed::RoTxn; +use roaring::RoaringBitmap; +use rstar::RTree; + +use crate::{ + distance_between_two_points, lat_lng_to_xyz, + search::new::geo_sort::{geo_value, opposite_of}, + GeoPoint, GeoSortStrategy, Index, +}; + +// TODO: Make it take a mut reference to cache +#[allow(clippy::too_many_arguments)] +pub fn fill_cache( + index: &Index, + txn: &RoTxn, + strategy: GeoSortStrategy, + ascending: bool, + target_point: [f64; 2], + field_ids: &Option<[u16; 2]>, + rtree: &mut Option>, + geo_candidates: &RoaringBitmap, + cached_sorted_docids: &mut VecDeque<(u32, [f64; 2])>, +) -> crate::Result<()> { + debug_assert!(cached_sorted_docids.is_empty()); + + // lazily initialize the rtree if needed by the strategy, and cache it in `self.rtree` + let rtree = if strategy.use_rtree(geo_candidates.len() as usize) { + if let Some(rtree) = rtree.as_ref() { + // get rtree from cache + Some(rtree) + } else { + let rtree2 = index.geo_rtree(txn)?.expect("geo candidates but no rtree"); + // insert rtree in cache and returns it. + // Can't use `get_or_insert_with` because getting the rtree from the DB is a fallible operation. + Some(&*rtree.insert(rtree2)) + } + } else { + None + }; + + let cache_size = strategy.cache_size(); + if let Some(rtree) = rtree { + if ascending { + let point = lat_lng_to_xyz(&target_point); + for point in rtree.nearest_neighbor_iter(&point) { + if geo_candidates.contains(point.data.0) { + cached_sorted_docids.push_back(point.data); + if cached_sorted_docids.len() >= cache_size { + break; + } + } + } + } else { + // in the case of the desc geo sort we look for the closest point to the opposite of the queried point + // and we insert the points in reverse order they get reversed when emptying the cache later on + let point = lat_lng_to_xyz(&opposite_of(target_point)); + for point in rtree.nearest_neighbor_iter(&point) { + if geo_candidates.contains(point.data.0) { + cached_sorted_docids.push_front(point.data); + if cached_sorted_docids.len() >= cache_size { + break; + } + } + } + } + } else { + // the iterative version + let [lat, lng] = field_ids.expect("fill_buffer can't be called without the lat&lng"); + + let mut documents = geo_candidates + .iter() + .map(|id| -> crate::Result<_> { Ok((id, geo_value(id, lat, lng, index, txn)?)) }) + .collect::>>()?; + // computing the distance between two points is expensive thus we cache the result + documents.sort_by_cached_key(|(_, p)| distance_between_two_points(&target_point, p) as usize); + cached_sorted_docids.extend(documents); + }; + + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +pub fn next_bucket( + index: &Index, + txn: &RoTxn, + universe: &RoaringBitmap, + strategy: GeoSortStrategy, + ascending: bool, + target_point: [f64; 2], + field_ids: &Option<[u16; 2]>, + rtree: &mut Option>, + + cached_sorted_docids: &mut VecDeque<(u32, [f64; 2])>, + geo_candidates: &RoaringBitmap, + + // Limit the number of docs in a single bucket to avoid unexpectedly large overhead + max_bucket_size: u64, + // Considering the errors of GPS and geographical calculations, distances less than distance_error_margin will be treated as equal + distance_error_margin: f64, +) -> crate::Result)>> { + let mut geo_candidates = geo_candidates & universe; + + if geo_candidates.is_empty() { + return Ok(Some((universe.clone(), None))); + } + + let next = |cache: &mut VecDeque<_>| { + if ascending { + cache.pop_front() + } else { + cache.pop_back() + } + }; + let put_back = |cache: &mut VecDeque<_>, x: _| { + if ascending { + cache.push_front(x) + } else { + cache.push_back(x) + } + }; + + let mut current_bucket = RoaringBitmap::new(); + // current_distance stores the first point and distance in current bucket + let mut current_distance: Option<([f64; 2], f64)> = None; + loop { + // The loop will only exit when we have found all points with equal distance or have exhausted the candidates. + if let Some((id, point)) = next(cached_sorted_docids) { + if geo_candidates.contains(id) { + let distance = distance_between_two_points(&target_point, &point); + if let Some((point0, bucket_distance)) = current_distance.as_ref() { + if (bucket_distance - distance).abs() > distance_error_margin { + // different distance, point belongs to next bucket + put_back(cached_sorted_docids, (id, point)); + return Ok(Some((current_bucket, Some(point0.to_owned())))); + } else { + // same distance, point belongs to current bucket + current_bucket.insert(id); + // remove from candidates to prevent it from being added to the cache again + geo_candidates.remove(id); + // current bucket size reaches limit, force return + if current_bucket.len() == max_bucket_size { + return Ok(Some((current_bucket, Some(point0.to_owned())))); + } + } + } else { + // first doc in current bucket + current_distance = Some((point, distance)); + current_bucket.insert(id); + geo_candidates.remove(id); + // current bucket size reaches limit, force return + if current_bucket.len() == max_bucket_size { + return Ok(Some((current_bucket, Some(point.to_owned())))); + } + } + } + } else { + // cache exhausted, we need to refill it + fill_cache( + index, + txn, + strategy, + ascending, + target_point, + field_ids, + rtree, + &geo_candidates, + cached_sorted_docids, + )?; + + if cached_sorted_docids.is_empty() { + // candidates exhausted, exit + if let Some((point0, _)) = current_distance.as_ref() { + return Ok(Some((current_bucket, Some(point0.to_owned())))); + } else { + return Ok(Some((universe.clone(), None))); + } + } + } + } +} diff --git a/crates/milli/src/documents/mod.rs b/crates/milli/src/documents/mod.rs index f43f7e842..6a05f61a5 100644 --- a/crates/milli/src/documents/mod.rs +++ b/crates/milli/src/documents/mod.rs @@ -3,6 +3,7 @@ mod enriched; mod primary_key; mod reader; mod serde_impl; +pub mod geo_sort; use std::fmt::Debug; use std::io; diff --git a/crates/milli/src/search/new/distinct.rs b/crates/milli/src/search/new/distinct.rs index 36172302a..48ad152ee 100644 --- a/crates/milli/src/search/new/distinct.rs +++ b/crates/milli/src/search/new/distinct.rs @@ -82,7 +82,7 @@ fn facet_value_docids( } /// Return an iterator over each number value in the given field of the given document. -fn facet_number_values<'a>( +pub(crate) fn facet_number_values<'a>( docid: u32, field_id: u16, index: &Index, diff --git a/crates/milli/src/search/new/geo_sort.rs b/crates/milli/src/search/new/geo_sort.rs index 3e7fe3458..a52a84575 100644 --- a/crates/milli/src/search/new/geo_sort.rs +++ b/crates/milli/src/search/new/geo_sort.rs @@ -7,12 +7,10 @@ use rstar::RTree; use super::facet_string_values; use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; +use crate::documents::geo_sort::{fill_cache, next_bucket}; use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec}; use crate::score_details::{self, ScoreDetails}; -use crate::{ - distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext, - SearchLogger, -}; +use crate::{GeoPoint, Index, Result, SearchContext, SearchLogger}; const FID_SIZE: usize = 2; const DOCID_SIZE: usize = 4; @@ -134,62 +132,17 @@ impl GeoSort { ctx: &mut SearchContext<'_>, geo_candidates: &RoaringBitmap, ) -> Result<()> { - debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng"); - debug_assert!(self.cached_sorted_docids.is_empty()); - - // lazily initialize the rtree if needed by the strategy, and cache it in `self.rtree` - let rtree = if self.strategy.use_rtree(geo_candidates.len() as usize) { - if let Some(rtree) = self.rtree.as_ref() { - // get rtree from cache - Some(rtree) - } else { - let rtree = ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"); - // insert rtree in cache and returns it. - // Can't use `get_or_insert_with` because getting the rtree from the DB is a fallible operation. - Some(&*self.rtree.insert(rtree)) - } - } else { - None - }; - - let cache_size = self.strategy.cache_size(); - if let Some(rtree) = rtree { - if self.ascending { - let point = lat_lng_to_xyz(&self.point); - for point in rtree.nearest_neighbor_iter(&point) { - if geo_candidates.contains(point.data.0) { - self.cached_sorted_docids.push_back(point.data); - if self.cached_sorted_docids.len() >= cache_size { - break; - } - } - } - } else { - // in the case of the desc geo sort we look for the closest point to the opposite of the queried point - // and we insert the points in reverse order they get reversed when emptying the cache later on - let point = lat_lng_to_xyz(&opposite_of(self.point)); - for point in rtree.nearest_neighbor_iter(&point) { - if geo_candidates.contains(point.data.0) { - self.cached_sorted_docids.push_front(point.data); - if self.cached_sorted_docids.len() >= cache_size { - break; - } - } - } - } - } else { - // the iterative version - let [lat, lng] = self.field_ids.unwrap(); - - let mut documents = geo_candidates - .iter() - .map(|id| -> Result<_> { Ok((id, geo_value(id, lat, lng, ctx.index, ctx.txn)?)) }) - .collect::>>()?; - // computing the distance between two points is expensive thus we cache the result - documents - .sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize); - self.cached_sorted_docids.extend(documents); - }; + fill_cache( + ctx.index, + ctx.txn, + self.strategy, + self.ascending, + self.point, + &self.field_ids, + &mut self.rtree, + geo_candidates, + &mut self.cached_sorted_docids, + )?; Ok(()) } @@ -199,7 +152,7 @@ impl GeoSort { /// /// If it is not able to find it in the facet number index it will extract it /// from the facet string index and parse it as f64 (as the geo extraction behaves). -fn geo_value( +pub(crate) fn geo_value( docid: u32, field_lat: u16, field_lng: u16, @@ -267,124 +220,31 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { ) -> Result>> { let query = self.query.as_ref().unwrap().clone(); - let mut geo_candidates = &self.geo_candidates & universe; - - if geo_candidates.is_empty() { - return Ok(Some(RankingRuleOutput { + next_bucket( + ctx.index, + ctx.txn, + universe, + self.strategy, + self.ascending, + self.point, + &self.field_ids, + &mut self.rtree, + &mut self.cached_sorted_docids, + &self.geo_candidates, + self.max_bucket_size, + self.distance_error_margin, + ) + .map(|o| { + o.map(|(candidates, point)| RankingRuleOutput { query, - candidates: universe.clone(), + candidates, score: ScoreDetails::GeoSort(score_details::GeoSort { target_point: self.point, ascending: self.ascending, - value: None, + value: point, }), - })); - } - - let ascending = self.ascending; - let next = |cache: &mut VecDeque<_>| { - if ascending { - cache.pop_front() - } else { - cache.pop_back() - } - }; - let put_back = |cache: &mut VecDeque<_>, x: _| { - if ascending { - cache.push_front(x) - } else { - cache.push_back(x) - } - }; - - let mut current_bucket = RoaringBitmap::new(); - // current_distance stores the first point and distance in current bucket - let mut current_distance: Option<([f64; 2], f64)> = None; - loop { - // The loop will only exit when we have found all points with equal distance or have exhausted the candidates. - if let Some((id, point)) = next(&mut self.cached_sorted_docids) { - if geo_candidates.contains(id) { - let distance = distance_between_two_points(&self.point, &point); - if let Some((point0, bucket_distance)) = current_distance.as_ref() { - if (bucket_distance - distance).abs() > self.distance_error_margin { - // different distance, point belongs to next bucket - put_back(&mut self.cached_sorted_docids, (id, point)); - return Ok(Some(RankingRuleOutput { - query, - candidates: current_bucket, - score: ScoreDetails::GeoSort(score_details::GeoSort { - target_point: self.point, - ascending: self.ascending, - value: Some(point0.to_owned()), - }), - })); - } else { - // same distance, point belongs to current bucket - current_bucket.insert(id); - // remove from cadidates to prevent it from being added to the cache again - geo_candidates.remove(id); - // current bucket size reaches limit, force return - if current_bucket.len() == self.max_bucket_size { - return Ok(Some(RankingRuleOutput { - query, - candidates: current_bucket, - score: ScoreDetails::GeoSort(score_details::GeoSort { - target_point: self.point, - ascending: self.ascending, - value: Some(point0.to_owned()), - }), - })); - } - } - } else { - // first doc in current bucket - current_distance = Some((point, distance)); - current_bucket.insert(id); - geo_candidates.remove(id); - // current bucket size reaches limit, force return - if current_bucket.len() == self.max_bucket_size { - return Ok(Some(RankingRuleOutput { - query, - candidates: current_bucket, - score: ScoreDetails::GeoSort(score_details::GeoSort { - target_point: self.point, - ascending: self.ascending, - value: Some(point.to_owned()), - }), - })); - } - } - } - } else { - // cache exhausted, we need to refill it - self.fill_buffer(ctx, &geo_candidates)?; - - if self.cached_sorted_docids.is_empty() { - // candidates exhausted, exit - if let Some((point0, _)) = current_distance.as_ref() { - return Ok(Some(RankingRuleOutput { - query, - candidates: current_bucket, - score: ScoreDetails::GeoSort(score_details::GeoSort { - target_point: self.point, - ascending: self.ascending, - value: Some(point0.to_owned()), - }), - })); - } else { - return Ok(Some(RankingRuleOutput { - query, - candidates: universe.clone(), - score: ScoreDetails::GeoSort(score_details::GeoSort { - target_point: self.point, - ascending: self.ascending, - value: None, - }), - })); - } - } - } - } + }) + }) } #[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")] @@ -396,7 +256,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { } /// Compute the antipodal coordinate of `coord` -fn opposite_of(mut coord: [f64; 2]) -> [f64; 2] { +pub(crate) fn opposite_of(mut coord: [f64; 2]) -> [f64; 2] { coord[0] *= -1.; // in the case of x,0 we want to return x,180 if coord[1] > 0. { diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index 5cb4c9fd5..da5e971af 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -1,7 +1,7 @@ mod bucket_sort; mod db_cache; mod distinct; -mod geo_sort; +pub(crate) mod geo_sort; mod graph_based_ranking_rule; mod interner; mod limits; From e35d58b531d5753aa47371bc056831b6edf4b2c9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 30 Jun 2025 13:12:00 +0200 Subject: [PATCH 06/23] Move geosort code out of search --- .../src/routes/indexes/documents.rs | 16 +- crates/milli/src/documents/geo_sort.rs | 153 ++++++++++++++-- crates/milli/src/documents/mod.rs | 3 +- .../milli/src/facet/facet_sort_recursive.rs | 171 +++++++++--------- crates/milli/src/facet/mod.rs | 2 +- crates/milli/src/lib.rs | 5 +- crates/milli/src/search/mod.rs | 8 +- crates/milli/src/search/new/distinct.rs | 2 +- crates/milli/src/search/new/geo_sort.rs | 124 +------------ crates/milli/src/search/new/mod.rs | 16 +- 10 files changed, 257 insertions(+), 243 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index bcd227300..5545c870e 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -641,16 +641,12 @@ fn documents_by_query( }; let sort_criteria = if let Some(sort) = &sort { - let sorts: Vec<_> = - match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { - Ok(sorts) => sorts, - Err(asc_desc_error) => { - return Err(milli::Error::from(milli::SortError::from( - asc_desc_error, - )) - .into()) - } - }; + let sorts: Vec<_> = match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { + Ok(sorts) => sorts, + Err(asc_desc_error) => { + return Err(milli::Error::from(milli::SortError::from(asc_desc_error)).into()) + } + }; Some(sorts) } else { None diff --git a/crates/milli/src/documents/geo_sort.rs b/crates/milli/src/documents/geo_sort.rs index 5b3968b39..5b899e6d5 100644 --- a/crates/milli/src/documents/geo_sort.rs +++ b/crates/milli/src/documents/geo_sort.rs @@ -1,14 +1,70 @@ -use std::collections::VecDeque; - -use heed::RoTxn; +use crate::{ + distance_between_two_points, + heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec}, + lat_lng_to_xyz, + search::new::{facet_string_values, facet_values_prefix_key}, + GeoPoint, Index, +}; +use heed::{ + types::{Bytes, Unit}, + RoPrefix, RoTxn, +}; use roaring::RoaringBitmap; use rstar::RTree; +use std::collections::VecDeque; -use crate::{ - distance_between_two_points, lat_lng_to_xyz, - search::new::geo_sort::{geo_value, opposite_of}, - GeoPoint, GeoSortStrategy, Index, -}; +#[derive(Debug, Clone, Copy)] +pub struct GeoSortParameter { + // Define the strategy used by the geo sort + pub strategy: GeoSortStrategy, + // Limit the number of docs in a single bucket to avoid unexpectedly large overhead + pub max_bucket_size: u64, + // Considering the errors of GPS and geographical calculations, distances less than distance_error_margin will be treated as equal + pub distance_error_margin: f64, +} + +impl Default for GeoSortParameter { + fn default() -> Self { + Self { + strategy: GeoSortStrategy::default(), + max_bucket_size: 1000, + distance_error_margin: 1.0, + } + } +} +/// Define the strategy used by the geo sort. +/// The parameter represents the cache size, and, in the case of the Dynamic strategy, +/// the point where we move from using the iterative strategy to the rtree. +#[derive(Debug, Clone, Copy)] +pub enum GeoSortStrategy { + AlwaysIterative(usize), + AlwaysRtree(usize), + Dynamic(usize), +} + +impl Default for GeoSortStrategy { + fn default() -> Self { + GeoSortStrategy::Dynamic(1000) + } +} + +impl GeoSortStrategy { + pub fn use_rtree(&self, candidates: usize) -> bool { + match self { + GeoSortStrategy::AlwaysIterative(_) => false, + GeoSortStrategy::AlwaysRtree(_) => true, + GeoSortStrategy::Dynamic(i) => candidates >= *i, + } + } + + pub fn cache_size(&self) -> usize { + match self { + GeoSortStrategy::AlwaysIterative(i) + | GeoSortStrategy::AlwaysRtree(i) + | GeoSortStrategy::Dynamic(i) => *i, + } + } +} // TODO: Make it take a mut reference to cache #[allow(clippy::too_many_arguments)] @@ -74,7 +130,8 @@ pub fn fill_cache( .map(|id| -> crate::Result<_> { Ok((id, geo_value(id, lat, lng, index, txn)?)) }) .collect::>>()?; // computing the distance between two points is expensive thus we cache the result - documents.sort_by_cached_key(|(_, p)| distance_between_two_points(&target_point, p) as usize); + documents + .sort_by_cached_key(|(_, p)| distance_between_two_points(&target_point, p) as usize); cached_sorted_docids.extend(documents); }; @@ -86,19 +143,13 @@ pub fn next_bucket( index: &Index, txn: &RoTxn, universe: &RoaringBitmap, - strategy: GeoSortStrategy, ascending: bool, target_point: [f64; 2], field_ids: &Option<[u16; 2]>, rtree: &mut Option>, - cached_sorted_docids: &mut VecDeque<(u32, [f64; 2])>, geo_candidates: &RoaringBitmap, - - // Limit the number of docs in a single bucket to avoid unexpectedly large overhead - max_bucket_size: u64, - // Considering the errors of GPS and geographical calculations, distances less than distance_error_margin will be treated as equal - distance_error_margin: f64, + parameter: GeoSortParameter, ) -> crate::Result)>> { let mut geo_candidates = geo_candidates & universe; @@ -130,7 +181,7 @@ pub fn next_bucket( if geo_candidates.contains(id) { let distance = distance_between_two_points(&target_point, &point); if let Some((point0, bucket_distance)) = current_distance.as_ref() { - if (bucket_distance - distance).abs() > distance_error_margin { + if (bucket_distance - distance).abs() > parameter.distance_error_margin { // different distance, point belongs to next bucket put_back(cached_sorted_docids, (id, point)); return Ok(Some((current_bucket, Some(point0.to_owned())))); @@ -140,7 +191,7 @@ pub fn next_bucket( // remove from candidates to prevent it from being added to the cache again geo_candidates.remove(id); // current bucket size reaches limit, force return - if current_bucket.len() == max_bucket_size { + if current_bucket.len() == parameter.max_bucket_size { return Ok(Some((current_bucket, Some(point0.to_owned())))); } } @@ -150,7 +201,7 @@ pub fn next_bucket( current_bucket.insert(id); geo_candidates.remove(id); // current bucket size reaches limit, force return - if current_bucket.len() == max_bucket_size { + if current_bucket.len() == parameter.max_bucket_size { return Ok(Some((current_bucket, Some(point.to_owned())))); } } @@ -160,7 +211,7 @@ pub fn next_bucket( fill_cache( index, txn, - strategy, + parameter.strategy, ascending, target_point, field_ids, @@ -180,3 +231,65 @@ pub fn next_bucket( } } } + +/// Return an iterator over each number value in the given field of the given document. +fn facet_number_values<'a>( + docid: u32, + field_id: u16, + index: &Index, + txn: &'a RoTxn<'a>, +) -> crate::Result, Unit>> { + let key = facet_values_prefix_key(field_id, docid); + + let iter = index + .field_id_docid_facet_f64s + .remap_key_type::() + .prefix_iter(txn, &key)? + .remap_key_type(); + + Ok(iter) +} + +/// Extracts the lat and long values from a single document. +/// +/// If it is not able to find it in the facet number index it will extract it +/// from the facet string index and parse it as f64 (as the geo extraction behaves). +pub(crate) fn geo_value( + docid: u32, + field_lat: u16, + field_lng: u16, + index: &Index, + rtxn: &RoTxn<'_>, +) -> crate::Result<[f64; 2]> { + let extract_geo = |geo_field: u16| -> crate::Result { + match facet_number_values(docid, geo_field, index, rtxn)?.next() { + Some(Ok(((_, _, geo), ()))) => Ok(geo), + Some(Err(e)) => Err(e.into()), + None => match facet_string_values(docid, geo_field, index, rtxn)?.next() { + Some(Ok((_, geo))) => { + Ok(geo.parse::().expect("cannot parse geo field as f64")) + } + Some(Err(e)) => Err(e.into()), + None => panic!("A geo faceted document doesn't contain any lat or lng"), + }, + } + }; + + let lat = extract_geo(field_lat)?; + let lng = extract_geo(field_lng)?; + + Ok([lat, lng]) +} + +/// Compute the antipodal coordinate of `coord` +pub(crate) fn opposite_of(mut coord: [f64; 2]) -> [f64; 2] { + coord[0] *= -1.; + // in the case of x,0 we want to return x,180 + if coord[1] > 0. { + coord[1] -= 180.; + } else { + coord[1] += 180.; + } + + coord +} diff --git a/crates/milli/src/documents/mod.rs b/crates/milli/src/documents/mod.rs index 6a05f61a5..b515c4e98 100644 --- a/crates/milli/src/documents/mod.rs +++ b/crates/milli/src/documents/mod.rs @@ -1,9 +1,9 @@ mod builder; mod enriched; +pub mod geo_sort; mod primary_key; mod reader; mod serde_impl; -pub mod geo_sort; use std::fmt::Debug; use std::io; @@ -20,6 +20,7 @@ pub use primary_key::{ pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; +pub use self::geo_sort::{GeoSortParameter, GeoSortStrategy}; use crate::error::{FieldIdMapMissingEntry, InternalError}; use crate::{FieldId, Object, Result}; diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index 47c3696f3..7342114ef 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -1,6 +1,16 @@ -use roaring::RoaringBitmap; +use crate::{ + heed_codec::{ + facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, + BytesRefCodec, + }, + search::{ + facet::{ascending_facet_sort, descending_facet_sort}, + new::check_sort_criteria, + }, + AscDesc, DocumentId, Member, +}; use heed::Database; -use crate::{heed_codec::{facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, BytesRefCodec}, search::{facet::{ascending_facet_sort, descending_facet_sort}, new::check_sort_criteria}, AscDesc, DocumentId, Member}; +use roaring::RoaringBitmap; /// Builder for a [`SortedDocumentsIterator`]. /// Most builders won't ever be built, because pagination will skip them. @@ -15,13 +25,8 @@ pub struct SortedDocumentsIteratorBuilder<'ctx> { impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { /// Performs the sort and builds a [`SortedDocumentsIterator`]. fn build(self) -> heed::Result> { - let SortedDocumentsIteratorBuilder { - rtxn, - number_db, - string_db, - fields, - candidates, - } = self; + let SortedDocumentsIteratorBuilder { rtxn, number_db, string_db, fields, candidates } = + self; let size = candidates.len() as usize; // There is no point sorting a 1-element array @@ -42,33 +47,13 @@ impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { // Perform the sort on the first field let (number_iter, string_iter) = if ascending { - let number_iter = ascending_facet_sort( - rtxn, - number_db, - field_id, - candidates.clone(), - )?; - let string_iter = ascending_facet_sort( - rtxn, - string_db, - field_id, - candidates, - )?; + let number_iter = ascending_facet_sort(rtxn, number_db, field_id, candidates.clone())?; + let string_iter = ascending_facet_sort(rtxn, string_db, field_id, candidates)?; (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter)) } else { - let number_iter = descending_facet_sort( - rtxn, - number_db, - field_id, - candidates.clone(), - )?; - let string_iter = descending_facet_sort( - rtxn, - string_db, - field_id, - candidates, - )?; + let number_iter = descending_facet_sort(rtxn, number_db, field_id, candidates.clone())?; + let string_iter = descending_facet_sort(rtxn, string_db, field_id, candidates)?; (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) }; @@ -76,26 +61,28 @@ impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { // Create builders for the next level of the tree let number_db2 = number_db; let string_db2 = string_db; - let number_iter = number_iter.map(move |r| -> heed::Result { - let (docids, _bytes) = r?; - Ok(SortedDocumentsIteratorBuilder { - rtxn, - number_db, - string_db, - fields: &fields[1..], - candidates: docids, - }) - }); - let string_iter = string_iter.map(move |r| -> heed::Result { - let (docids, _bytes) = r?; - Ok(SortedDocumentsIteratorBuilder { - rtxn, - number_db: number_db2, - string_db: string_db2, - fields: &fields[1..], - candidates: docids, - }) - }); + let number_iter = + number_iter.map(move |r| -> heed::Result { + let (docids, _bytes) = r?; + Ok(SortedDocumentsIteratorBuilder { + rtxn, + number_db, + string_db, + fields: &fields[1..], + candidates: docids, + }) + }); + let string_iter = + string_iter.map(move |r| -> heed::Result { + let (docids, _bytes) = r?; + Ok(SortedDocumentsIteratorBuilder { + rtxn, + number_db: number_db2, + string_db: string_db2, + fields: &fields[1..], + candidates: docids, + }) + }); Ok(SortedDocumentsIterator::Branch { current_child: None, @@ -112,7 +99,7 @@ pub enum SortedDocumentsIterator<'ctx> { Leaf { /// The exact number of documents remaining size: usize, - values: Box + 'ctx> + values: Box + 'ctx>, }, Branch { /// The current child, got from the children iterator @@ -120,20 +107,27 @@ pub enum SortedDocumentsIterator<'ctx> { /// The exact number of documents remaining, excluding documents in the current child next_children_size: usize, /// Iterators to become the current child once it is exhausted - next_children: Box>> + 'ctx>, - } + next_children: + Box>> + 'ctx>, + }, } impl SortedDocumentsIterator<'_> { /// Takes care of updating the current child if it is `None`, and also updates the size - fn update_current<'ctx>(current_child: &mut Option>>, next_children_size: &mut usize, next_children: &mut Box>> + 'ctx>) -> heed::Result<()> { + fn update_current<'ctx>( + current_child: &mut Option>>, + next_children_size: &mut usize, + next_children: &mut Box< + dyn Iterator>> + 'ctx, + >, + ) -> heed::Result<()> { if current_child.is_none() { *current_child = match next_children.next() { Some(Ok(builder)) => { let next_child = Box::new(builder.build()?); *next_children_size -= next_child.size_hint().0; Some(next_child) - }, + } Some(Err(e)) => return Err(e), None => return Ok(()), }; @@ -150,15 +144,23 @@ impl Iterator for SortedDocumentsIterator<'_> { let (current_child, next_children, next_children_size) = match self { SortedDocumentsIterator::Leaf { values, size } => { *size = size.saturating_sub(n); - return values.nth(n).map(Ok) - }, - SortedDocumentsIterator::Branch { current_child, next_children, next_children_size } => (current_child, next_children, next_children_size), + return values.nth(n).map(Ok); + } + SortedDocumentsIterator::Branch { + current_child, + next_children, + next_children_size, + } => (current_child, next_children, next_children_size), }; // Otherwise don't directly iterate over children, skip them if we know we will go further let mut to_skip = n - 1; while to_skip > 0 { - if let Err(e) = SortedDocumentsIterator::update_current(current_child, next_children_size, next_children) { + if let Err(e) = SortedDocumentsIterator::update_current( + current_child, + next_children_size, + next_children, + ) { return Some(Err(e)); } let Some(inner) = current_child else { @@ -183,8 +185,14 @@ impl Iterator for SortedDocumentsIterator<'_> { fn size_hint(&self) -> (usize, Option) { let size = match self { SortedDocumentsIterator::Leaf { size, .. } => *size, - SortedDocumentsIterator::Branch { next_children_size, current_child: Some(current_child), .. } => current_child.size_hint().0 + next_children_size, - SortedDocumentsIterator::Branch { next_children_size, current_child: None, .. } => *next_children_size, + SortedDocumentsIterator::Branch { + next_children_size, + current_child: Some(current_child), + .. + } => current_child.size_hint().0 + next_children_size, + SortedDocumentsIterator::Branch { next_children_size, current_child: None, .. } => { + *next_children_size + } }; (size, Some(size)) @@ -198,12 +206,20 @@ impl Iterator for SortedDocumentsIterator<'_> { *size -= 1; } result - }, - SortedDocumentsIterator::Branch { current_child, next_children_size, next_children } => { + } + SortedDocumentsIterator::Branch { + current_child, + next_children_size, + next_children, + } => { let mut result = None; while result.is_none() { // Ensure we have selected an iterator to work with - if let Err(e) = SortedDocumentsIterator::update_current(current_child, next_children_size, next_children) { + if let Err(e) = SortedDocumentsIterator::update_current( + current_child, + next_children_size, + next_children, + ) { return Some(Err(e)); } let Some(inner) = current_child else { @@ -232,7 +248,7 @@ pub struct SortedDocuments<'ctx> { candidates: &'ctx RoaringBitmap, } -impl <'ctx> SortedDocuments<'ctx> { +impl<'ctx> SortedDocuments<'ctx> { pub fn iter(&'ctx self) -> heed::Result> { let builder = SortedDocumentsIteratorBuilder { rtxn: self.rtxn, @@ -266,19 +282,10 @@ pub fn recursive_facet_sort<'ctx>( fields.push((field_id, ascending)); // FIXME: Should this return an error if the field is not found? } } - - let number_db = index - .facet_id_f64_docids - .remap_key_type::>(); - let string_db = index - .facet_id_string_docids - .remap_key_type::>(); - Ok(SortedDocuments { - rtxn, - fields, - number_db, - string_db, - candidates, - }) + let number_db = index.facet_id_f64_docids.remap_key_type::>(); + let string_db = + index.facet_id_string_docids.remap_key_type::>(); + + Ok(SortedDocuments { rtxn, fields, number_db, string_db, candidates }) } diff --git a/crates/milli/src/facet/mod.rs b/crates/milli/src/facet/mod.rs index a6351b42c..8b0b9a25e 100644 --- a/crates/milli/src/facet/mod.rs +++ b/crates/milli/src/facet/mod.rs @@ -1,7 +1,7 @@ +pub mod facet_sort_recursive; mod facet_type; mod facet_value; pub mod value_encoding; -pub mod facet_sort_recursive; pub use self::facet_type::FacetType; pub use self::facet_value::FacetValue; diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 504b4c68d..6fdae86b3 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -43,12 +43,13 @@ use std::fmt; use std::hash::BuildHasherDefault; use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer}; +pub use documents::GeoSortStrategy; pub use filter_parser::{Condition, FilterCondition, Span, Token}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; pub use search::new::{ - execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, SearchContext, - SearchLogger, VisualSearchLogger, + execute_search, filtered_universe, DefaultSearchLogger, SearchContext, SearchLogger, + VisualSearchLogger, }; use serde_json::Value; pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; diff --git a/crates/milli/src/search/mod.rs b/crates/milli/src/search/mod.rs index 62183afc3..48013b2ee 100644 --- a/crates/milli/src/search/mod.rs +++ b/crates/milli/src/search/mod.rs @@ -9,6 +9,8 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords}; use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats}; +use crate::documents::GeoSortParameter; +use crate::documents::GeoSortStrategy; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::index::MatchingStrategy; use crate::score_details::{ScoreDetails, ScoringStrategy}; @@ -46,7 +48,7 @@ pub struct Search<'a> { sort_criteria: Option>, distinct: Option, searchable_attributes: Option<&'a [String]>, - geo_param: new::GeoSortParameter, + geo_param: GeoSortParameter, terms_matching_strategy: TermsMatchingStrategy, scoring_strategy: ScoringStrategy, words_limit: usize, @@ -69,7 +71,7 @@ impl<'a> Search<'a> { sort_criteria: None, distinct: None, searchable_attributes: None, - geo_param: new::GeoSortParameter::default(), + geo_param: GeoSortParameter::default(), terms_matching_strategy: TermsMatchingStrategy::default(), scoring_strategy: Default::default(), exhaustive_number_hits: false, @@ -145,7 +147,7 @@ impl<'a> Search<'a> { } #[cfg(test)] - pub fn geo_sort_strategy(&mut self, strategy: new::GeoSortStrategy) -> &mut Search<'a> { + pub fn geo_sort_strategy(&mut self, strategy: GeoSortStrategy) -> &mut Search<'a> { self.geo_param.strategy = strategy; self } diff --git a/crates/milli/src/search/new/distinct.rs b/crates/milli/src/search/new/distinct.rs index 48ad152ee..455b495f5 100644 --- a/crates/milli/src/search/new/distinct.rs +++ b/crates/milli/src/search/new/distinct.rs @@ -118,7 +118,7 @@ pub fn facet_string_values<'a>( } #[allow(clippy::drop_non_drop)] -fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] { +pub(crate) fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] { concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) } diff --git a/crates/milli/src/search/new/geo_sort.rs b/crates/milli/src/search/new/geo_sort.rs index a52a84575..47001267d 100644 --- a/crates/milli/src/search/new/geo_sort.rs +++ b/crates/milli/src/search/new/geo_sort.rs @@ -8,6 +8,7 @@ use rstar::RTree; use super::facet_string_values; use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use crate::documents::geo_sort::{fill_cache, next_bucket}; +use crate::documents::{GeoSortParameter, GeoSortStrategy}; use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec}; use crate::score_details::{self, ScoreDetails}; use crate::{GeoPoint, Index, Result, SearchContext, SearchLogger}; @@ -20,75 +21,10 @@ fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) } -/// Return an iterator over each number value in the given field of the given document. -fn facet_number_values<'a>( - docid: u32, - field_id: u16, - index: &Index, - txn: &'a RoTxn<'a>, -) -> Result, Unit>> { - let key = facet_values_prefix_key(field_id, docid); - - let iter = index - .field_id_docid_facet_f64s - .remap_key_type::() - .prefix_iter(txn, &key)? - .remap_key_type(); - - Ok(iter) -} - -#[derive(Debug, Clone, Copy)] -pub struct Parameter { - // Define the strategy used by the geo sort - pub strategy: Strategy, - // Limit the number of docs in a single bucket to avoid unexpectedly large overhead - pub max_bucket_size: u64, - // Considering the errors of GPS and geographical calculations, distances less than distance_error_margin will be treated as equal - pub distance_error_margin: f64, -} - -impl Default for Parameter { - fn default() -> Self { - Self { strategy: Strategy::default(), max_bucket_size: 1000, distance_error_margin: 1.0 } - } -} -/// Define the strategy used by the geo sort. -/// The parameter represents the cache size, and, in the case of the Dynamic strategy, -/// the point where we move from using the iterative strategy to the rtree. -#[derive(Debug, Clone, Copy)] -pub enum Strategy { - AlwaysIterative(usize), - AlwaysRtree(usize), - Dynamic(usize), -} - -impl Default for Strategy { - fn default() -> Self { - Strategy::Dynamic(1000) - } -} - -impl Strategy { - pub fn use_rtree(&self, candidates: usize) -> bool { - match self { - Strategy::AlwaysIterative(_) => false, - Strategy::AlwaysRtree(_) => true, - Strategy::Dynamic(i) => candidates >= *i, - } - } - - pub fn cache_size(&self) -> usize { - match self { - Strategy::AlwaysIterative(i) | Strategy::AlwaysRtree(i) | Strategy::Dynamic(i) => *i, - } - } -} - pub struct GeoSort { query: Option, - strategy: Strategy, + strategy: GeoSortStrategy, ascending: bool, point: [f64; 2], field_ids: Option<[u16; 2]>, @@ -105,12 +41,12 @@ pub struct GeoSort { impl GeoSort { pub fn new( - parameter: Parameter, + parameter: GeoSortParameter, geo_faceted_docids: RoaringBitmap, point: [f64; 2], ascending: bool, ) -> Result { - let Parameter { strategy, max_bucket_size, distance_error_margin } = parameter; + let GeoSortParameter { strategy, max_bucket_size, distance_error_margin } = parameter; Ok(Self { query: None, strategy, @@ -148,37 +84,6 @@ impl GeoSort { } } -/// Extracts the lat and long values from a single document. -/// -/// If it is not able to find it in the facet number index it will extract it -/// from the facet string index and parse it as f64 (as the geo extraction behaves). -pub(crate) fn geo_value( - docid: u32, - field_lat: u16, - field_lng: u16, - index: &Index, - rtxn: &RoTxn<'_>, -) -> Result<[f64; 2]> { - let extract_geo = |geo_field: u16| -> Result { - match facet_number_values(docid, geo_field, index, rtxn)?.next() { - Some(Ok(((_, _, geo), ()))) => Ok(geo), - Some(Err(e)) => Err(e.into()), - None => match facet_string_values(docid, geo_field, index, rtxn)?.next() { - Some(Ok((_, geo))) => { - Ok(geo.parse::().expect("cannot parse geo field as f64")) - } - Some(Err(e)) => Err(e.into()), - None => panic!("A geo faceted document doesn't contain any lat or lng"), - }, - } - }; - - let lat = extract_geo(field_lat)?; - let lng = extract_geo(field_lng)?; - - Ok([lat, lng]) -} - impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { fn id(&self) -> String { "geo_sort".to_owned() @@ -224,15 +129,17 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { ctx.index, ctx.txn, universe, - self.strategy, self.ascending, self.point, &self.field_ids, &mut self.rtree, &mut self.cached_sorted_docids, &self.geo_candidates, - self.max_bucket_size, - self.distance_error_margin, + GeoSortParameter { + strategy: self.strategy, + max_bucket_size: self.max_bucket_size, + distance_error_margin: self.distance_error_margin, + }, ) .map(|o| { o.map(|(candidates, point)| RankingRuleOutput { @@ -254,16 +161,3 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { self.cached_sorted_docids.clear(); } } - -/// Compute the antipodal coordinate of `coord` -pub(crate) fn opposite_of(mut coord: [f64; 2]) -> [f64; 2] { - coord[0] *= -1.; - // in the case of x,0 we want to return x,180 - if coord[1] > 0. { - coord[1] -= 180.; - } else { - coord[1] += 180.; - } - - coord -} diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index da5e971af..b5258413e 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -46,14 +46,14 @@ use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache}; use roaring::RoaringBitmap; use sort::Sort; -use self::distinct::facet_string_values; +pub(crate) use self::distinct::{facet_string_values, facet_values_prefix_key}; use self::geo_sort::GeoSort; -pub use self::geo_sort::{Parameter as GeoSortParameter, Strategy as GeoSortStrategy}; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; use crate::attribute_patterns::{match_pattern, PatternMatch}; use crate::constants::RESERVED_GEO_FIELD_NAME; +use crate::documents::GeoSortParameter; use crate::index::PrefixSearch; use crate::localized_attributes_rules::LocalizedFieldIds; use crate::score_details::{ScoreDetails, ScoringStrategy}; @@ -319,7 +319,7 @@ fn resolve_negative_phrases( fn get_ranking_rules_for_placeholder_search<'ctx>( ctx: &SearchContext<'ctx>, sort_criteria: &Option>, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, ) -> Result>> { let mut sort = false; let mut sorted_fields = HashSet::new(); @@ -371,7 +371,7 @@ fn get_ranking_rules_for_placeholder_search<'ctx>( fn get_ranking_rules_for_vector<'ctx>( ctx: &SearchContext<'ctx>, sort_criteria: &Option>, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, limit_plus_offset: usize, target: &[f32], embedder_name: &str, @@ -448,7 +448,7 @@ fn get_ranking_rules_for_vector<'ctx>( fn get_ranking_rules_for_query_graph_search<'ctx>( ctx: &SearchContext<'ctx>, sort_criteria: &Option>, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, terms_matching_strategy: TermsMatchingStrategy, ) -> Result>> { // query graph search @@ -559,7 +559,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( ranking_rules: &mut Vec>, sorted_fields: &mut HashSet, geo_sorted: &mut bool, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, ) -> Result<()> { let sort_criteria = sort_criteria.clone().unwrap_or_default(); ranking_rules.reserve(sort_criteria.len()); @@ -629,7 +629,7 @@ pub fn execute_vector_search( universe: RoaringBitmap, sort_criteria: &Option>, distinct: &Option, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, from: usize, length: usize, embedder_name: &str, @@ -692,7 +692,7 @@ pub fn execute_search( mut universe: RoaringBitmap, sort_criteria: &Option>, distinct: &Option, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, from: usize, length: usize, words_limit: Option, From f86f4f619f2b69d408347b85c631f0e3942e888c Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 30 Jun 2025 13:57:30 +0200 Subject: [PATCH 07/23] Implement geo sort on documents --- .../src/routes/indexes/documents.rs | 2 +- crates/milli/src/documents/geo_sort.rs | 1 - .../milli/src/facet/facet_sort_recursive.rs | 180 +++++++++++++++--- crates/milli/src/search/mod.rs | 3 +- crates/milli/src/search/new/geo_sort.rs | 14 +- 5 files changed, 152 insertions(+), 48 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 5545c870e..be6d647f7 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1556,7 +1556,7 @@ fn retrieve_documents>( let mut facet_sort = None; if let Some(sort) = sort_criteria { - facet_sort = Some(recursive_facet_sort(index, &rtxn, &sort, &candidates)?) + facet_sort = Some(recursive_facet_sort(index, &rtxn, sort, &candidates)?) } let (it, number_of_documents) = if let Some(facet_sort) = &facet_sort { diff --git a/crates/milli/src/documents/geo_sort.rs b/crates/milli/src/documents/geo_sort.rs index 5b899e6d5..0750dfe5c 100644 --- a/crates/milli/src/documents/geo_sort.rs +++ b/crates/milli/src/documents/geo_sort.rs @@ -66,7 +66,6 @@ impl GeoSortStrategy { } } -// TODO: Make it take a mut reference to cache #[allow(clippy::too_many_arguments)] pub fn fill_cache( index: &Index, diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index 7342114ef..87da20391 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -1,4 +1,7 @@ +use std::collections::VecDeque; + use crate::{ + documents::{geo_sort::next_bucket, GeoSortParameter}, heed_codec::{ facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, BytesRefCodec, @@ -12,38 +15,64 @@ use crate::{ use heed::Database; use roaring::RoaringBitmap; +#[derive(Debug, Clone, Copy)] +enum AscDescId { + Facet { field_id: u16, ascending: bool }, + Geo { field_ids: [u16; 2], target_point: [f64; 2], ascending: bool }, +} + /// Builder for a [`SortedDocumentsIterator`]. /// Most builders won't ever be built, because pagination will skip them. pub struct SortedDocumentsIteratorBuilder<'ctx> { + index: &'ctx crate::Index, rtxn: &'ctx heed::RoTxn<'ctx>, number_db: Database, FacetGroupValueCodec>, string_db: Database, FacetGroupValueCodec>, - fields: &'ctx [(u16, bool)], + fields: &'ctx [AscDescId], candidates: RoaringBitmap, + geo_candidates: &'ctx RoaringBitmap, } impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { /// Performs the sort and builds a [`SortedDocumentsIterator`]. - fn build(self) -> heed::Result> { - let SortedDocumentsIteratorBuilder { rtxn, number_db, string_db, fields, candidates } = - self; - let size = candidates.len() as usize; + fn build(self) -> crate::Result> { + let size = self.candidates.len() as usize; // There is no point sorting a 1-element array if size <= 1 { return Ok(SortedDocumentsIterator::Leaf { size, - values: Box::new(candidates.into_iter()), + values: Box::new(self.candidates.into_iter()), }); } - // There is no variable to sort on - let Some((field_id, ascending)) = fields.first().copied() else { - return Ok(SortedDocumentsIterator::Leaf { + match self.fields.first().copied() { + Some(AscDescId::Facet { field_id, ascending }) => self.build_facet(field_id, ascending), + Some(AscDescId::Geo { field_ids, target_point, ascending }) => { + self.build_geo(field_ids, target_point, ascending) + } + None => Ok(SortedDocumentsIterator::Leaf { size, - values: Box::new(candidates.into_iter()), - }); - }; + values: Box::new(self.candidates.into_iter()), + }), + } + } + + fn build_facet( + self, + field_id: u16, + ascending: bool, + ) -> crate::Result> { + let SortedDocumentsIteratorBuilder { + index, + rtxn, + number_db, + string_db, + fields, + candidates, + geo_candidates, + } = self; + let size = candidates.len() as usize; // Perform the sort on the first field let (number_iter, string_iter) = if ascending { @@ -62,25 +91,29 @@ impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { let number_db2 = number_db; let string_db2 = string_db; let number_iter = - number_iter.map(move |r| -> heed::Result { + number_iter.map(move |r| -> crate::Result { let (docids, _bytes) = r?; Ok(SortedDocumentsIteratorBuilder { + index, rtxn, number_db, string_db, fields: &fields[1..], candidates: docids, + geo_candidates, }) }); let string_iter = - string_iter.map(move |r| -> heed::Result { + string_iter.map(move |r| -> crate::Result { let (docids, _bytes) = r?; Ok(SortedDocumentsIteratorBuilder { + index, rtxn, number_db: number_db2, string_db: string_db2, fields: &fields[1..], candidates: docids, + geo_candidates, }) }); @@ -90,6 +123,60 @@ impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { next_children: Box::new(number_iter.chain(string_iter)), }) } + + fn build_geo( + self, + field_ids: [u16; 2], + target_point: [f64; 2], + ascending: bool, + ) -> crate::Result> { + let SortedDocumentsIteratorBuilder { + index, + rtxn, + number_db, + string_db, + fields, + candidates, + geo_candidates, + } = self; + + let mut cache = VecDeque::new(); + let mut rtree = None; + let size = candidates.len() as usize; + + let next_children = std::iter::from_fn(move || { + match next_bucket( + index, + rtxn, + &candidates, + ascending, + target_point, + &Some(field_ids), + &mut rtree, + &mut cache, + geo_candidates, + GeoSortParameter::default(), + ) { + Ok(Some((docids, _point))) => Some(Ok(SortedDocumentsIteratorBuilder { + index, + rtxn, + number_db, + string_db, + fields: &fields[1..], + candidates: docids, + geo_candidates, + })), + Ok(None) => None, + Err(e) => Some(Err(e)), + } + }); + + Ok(SortedDocumentsIterator::Branch { + current_child: None, + next_children_size: size, // TODO: confirm all candidates will be included + next_children: Box::new(next_children), + }) + } } /// A [`SortedDocumentsIterator`] allows efficient access to a continuous range of sorted documents. @@ -108,7 +195,7 @@ pub enum SortedDocumentsIterator<'ctx> { next_children_size: usize, /// Iterators to become the current child once it is exhausted next_children: - Box>> + 'ctx>, + Box>> + 'ctx>, }, } @@ -118,9 +205,9 @@ impl SortedDocumentsIterator<'_> { current_child: &mut Option>>, next_children_size: &mut usize, next_children: &mut Box< - dyn Iterator>> + 'ctx, + dyn Iterator>> + 'ctx, >, - ) -> heed::Result<()> { + ) -> crate::Result<()> { if current_child.is_none() { *current_child = match next_children.next() { Some(Ok(builder)) => { @@ -137,7 +224,7 @@ impl SortedDocumentsIterator<'_> { } impl Iterator for SortedDocumentsIterator<'_> { - type Item = heed::Result; + type Item = crate::Result; fn nth(&mut self, n: usize) -> Option { // If it's at the leaf level, just forward the call to the values iterator @@ -241,21 +328,25 @@ impl Iterator for SortedDocumentsIterator<'_> { /// A structure owning the data needed during the lifetime of a [`SortedDocumentsIterator`]. pub struct SortedDocuments<'ctx> { + index: &'ctx crate::Index, rtxn: &'ctx heed::RoTxn<'ctx>, - fields: Vec<(u16, bool)>, + fields: Vec, number_db: Database, FacetGroupValueCodec>, string_db: Database, FacetGroupValueCodec>, candidates: &'ctx RoaringBitmap, + geo_candidates: RoaringBitmap, } impl<'ctx> SortedDocuments<'ctx> { - pub fn iter(&'ctx self) -> heed::Result> { + pub fn iter(&'ctx self) -> crate::Result> { let builder = SortedDocumentsIteratorBuilder { + index: self.index, rtxn: self.rtxn, number_db: self.number_db, string_db: self.string_db, fields: &self.fields, candidates: self.candidates.clone(), + geo_candidates: &self.geo_candidates, }; builder.build() } @@ -264,28 +355,55 @@ impl<'ctx> SortedDocuments<'ctx> { pub fn recursive_facet_sort<'ctx>( index: &'ctx crate::Index, rtxn: &'ctx heed::RoTxn<'ctx>, - sort: &[AscDesc], + sort: Vec, candidates: &'ctx RoaringBitmap, ) -> crate::Result> { - check_sort_criteria(index, rtxn, Some(sort))?; + check_sort_criteria(index, rtxn, Some(&sort))?; let mut fields = Vec::new(); let fields_ids_map = index.fields_ids_map(rtxn)?; + let geo_candidates = index.geo_faceted_documents_ids(rtxn)?; // TODO: skip when no geo sort for sort in sort { - let (field_id, ascending) = match sort { - AscDesc::Asc(Member::Field(field)) => (fields_ids_map.id(field), true), - AscDesc::Desc(Member::Field(field)) => (fields_ids_map.id(field), false), - AscDesc::Asc(Member::Geo(_)) => todo!(), - AscDesc::Desc(Member::Geo(_)) => todo!(), + match sort { + AscDesc::Asc(Member::Field(field)) => { + if let Some(field_id) = fields_ids_map.id(&field) { + fields.push(AscDescId::Facet { field_id, ascending: true }); + } + } + AscDesc::Desc(Member::Field(field)) => { + if let Some(field_id) = fields_ids_map.id(&field) { + fields.push(AscDescId::Facet { field_id, ascending: false }); + } + } + AscDesc::Asc(Member::Geo(target_point)) => { + if let (Some(lat), Some(lng)) = + (fields_ids_map.id("_geo.lat"), fields_ids_map.id("_geo.lng")) + { + fields.push(AscDescId::Geo { + field_ids: [lat, lng], + target_point, + ascending: true, + }); + } + } + AscDesc::Desc(Member::Geo(target_point)) => { + if let (Some(lat), Some(lng)) = + (fields_ids_map.id("_geo.lat"), fields_ids_map.id("_geo.lng")) + { + fields.push(AscDescId::Geo { + field_ids: [lat, lng], + target_point, + ascending: false, + }); + } + } }; - if let Some(field_id) = field_id { - fields.push((field_id, ascending)); // FIXME: Should this return an error if the field is not found? - } + // FIXME: Should this return an error if the field is not found? } let number_db = index.facet_id_f64_docids.remap_key_type::>(); let string_db = index.facet_id_string_docids.remap_key_type::>(); - Ok(SortedDocuments { rtxn, fields, number_db, string_db, candidates }) + Ok(SortedDocuments { index, rtxn, fields, number_db, string_db, candidates, geo_candidates }) } diff --git a/crates/milli/src/search/mod.rs b/crates/milli/src/search/mod.rs index 48013b2ee..b073d271c 100644 --- a/crates/milli/src/search/mod.rs +++ b/crates/milli/src/search/mod.rs @@ -10,7 +10,6 @@ pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FAC pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords}; use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats}; use crate::documents::GeoSortParameter; -use crate::documents::GeoSortStrategy; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::index::MatchingStrategy; use crate::score_details::{ScoreDetails, ScoringStrategy}; @@ -147,7 +146,7 @@ impl<'a> Search<'a> { } #[cfg(test)] - pub fn geo_sort_strategy(&mut self, strategy: GeoSortStrategy) -> &mut Search<'a> { + pub fn geo_sort_strategy(&mut self, strategy: crate::GeoSortStrategy) -> &mut Search<'a> { self.geo_param.strategy = strategy; self } diff --git a/crates/milli/src/search/new/geo_sort.rs b/crates/milli/src/search/new/geo_sort.rs index 47001267d..6c7d7b03b 100644 --- a/crates/milli/src/search/new/geo_sort.rs +++ b/crates/milli/src/search/new/geo_sort.rs @@ -1,25 +1,13 @@ use std::collections::VecDeque; -use heed::types::{Bytes, Unit}; -use heed::{RoPrefix, RoTxn}; use roaring::RoaringBitmap; use rstar::RTree; -use super::facet_string_values; use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use crate::documents::geo_sort::{fill_cache, next_bucket}; use crate::documents::{GeoSortParameter, GeoSortStrategy}; -use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec}; use crate::score_details::{self, ScoreDetails}; -use crate::{GeoPoint, Index, Result, SearchContext, SearchLogger}; - -const FID_SIZE: usize = 2; -const DOCID_SIZE: usize = 4; - -#[allow(clippy::drop_non_drop)] -fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] { - concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) -} +use crate::{GeoPoint, Result, SearchContext, SearchLogger}; pub struct GeoSort { query: Option, From f6803dd7d100b9d419747c69fc03f2207a64dfa9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 30 Jun 2025 14:05:23 +0200 Subject: [PATCH 08/23] Simplify iterator chaining in facet sort --- .../milli/src/facet/facet_sort_recursive.rs | 43 ++++++------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index 87da20391..6f26ad16f 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -88,39 +88,24 @@ impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { }; // Create builders for the next level of the tree - let number_db2 = number_db; - let string_db2 = string_db; - let number_iter = - number_iter.map(move |r| -> crate::Result { - let (docids, _bytes) = r?; - Ok(SortedDocumentsIteratorBuilder { - index, - rtxn, - number_db, - string_db, - fields: &fields[1..], - candidates: docids, - geo_candidates, - }) - }); - let string_iter = - string_iter.map(move |r| -> crate::Result { - let (docids, _bytes) = r?; - Ok(SortedDocumentsIteratorBuilder { - index, - rtxn, - number_db: number_db2, - string_db: string_db2, - fields: &fields[1..], - candidates: docids, - geo_candidates, - }) - }); + let number_iter = number_iter.map(|r| r.map(|(d, _)| d)); + let string_iter = string_iter.map(|r| r.map(|(d, _)| d)); + let next_children = number_iter.chain(string_iter).map(move |r| { + Ok(SortedDocumentsIteratorBuilder { + index, + rtxn, + number_db, + string_db, + fields: &fields[1..], + candidates: r?, + geo_candidates, + }) + }); Ok(SortedDocumentsIterator::Branch { current_child: None, next_children_size: size, - next_children: Box::new(number_iter.chain(string_iter)), + next_children: Box::new(next_children), }) } From 29e9c74a49dd152c1db37e6b54d547dfeebc3746 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 30 Jun 2025 16:17:04 +0200 Subject: [PATCH 09/23] Merge two ifs --- crates/meilisearch/src/routes/indexes/documents.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index be6d647f7..d9b3f106f 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1554,13 +1554,10 @@ fn retrieve_documents>( })? } - let mut facet_sort = None; - if let Some(sort) = sort_criteria { - facet_sort = Some(recursive_facet_sort(index, &rtxn, sort, &candidates)?) - } - - let (it, number_of_documents) = if let Some(facet_sort) = &facet_sort { + let facet_sort; + let (it, number_of_documents) = if let Some(sort) = sort_criteria { let number_of_documents = candidates.len(); + facet_sort = recursive_facet_sort(index, &rtxn, sort, &candidates)?; let iter = facet_sort.iter()?; ( itertools::Either::Left(some_documents( From eb2c2815b63c049d5e2aeabe6b67283bb2759ca6 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 10:00:10 +0200 Subject: [PATCH 10/23] Fix panic --- .../milli/src/facet/facet_sort_recursive.rs | 65 +++++++++++++------ 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index 6f26ad16f..213d18624 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -128,37 +128,60 @@ impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { let mut cache = VecDeque::new(); let mut rtree = None; let size = candidates.len() as usize; + let not_geo_candidates = candidates.clone() - geo_candidates; + let mut geo_remaining = size - not_geo_candidates.len() as usize; + let mut not_geo_candidates = Some(not_geo_candidates); let next_children = std::iter::from_fn(move || { - match next_bucket( - index, - rtxn, - &candidates, - ascending, - target_point, - &Some(field_ids), - &mut rtree, - &mut cache, - geo_candidates, - GeoSortParameter::default(), - ) { - Ok(Some((docids, _point))) => Some(Ok(SortedDocumentsIteratorBuilder { + // Find the next bucket of geo-sorted documents. + // next_bucket loops and will go back to the beginning so we use a variable to track how many are left. + if geo_remaining > 0 { + if let Ok(Some((docids, _point))) = next_bucket( index, rtxn, - number_db, - string_db, - fields: &fields[1..], - candidates: docids, + &candidates, + ascending, + target_point, + &Some(field_ids), + &mut rtree, + &mut cache, geo_candidates, - })), - Ok(None) => None, - Err(e) => Some(Err(e)), + GeoSortParameter::default(), + ) { + geo_remaining -= docids.len() as usize; + return Some(Ok(SortedDocumentsIteratorBuilder { + index, + rtxn, + number_db, + string_db, + fields: &fields[1..], + candidates: docids, + geo_candidates, + })); + } } + + // Once all geo candidates have been processed, we can return the others + if let Some(not_geo_candidates) = not_geo_candidates.take() { + if !not_geo_candidates.is_empty() { + return Some(Ok(SortedDocumentsIteratorBuilder { + index, + rtxn, + number_db, + string_db, + fields: &fields[1..], + candidates: not_geo_candidates, + geo_candidates, + })); + } + } + + None }); Ok(SortedDocumentsIterator::Branch { current_child: None, - next_children_size: size, // TODO: confirm all candidates will be included + next_children_size: size, next_children: Box::new(next_children), }) } From f4a908669cb4db3df0ce9ff68c5cd13d0b26cf5c Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 10:02:15 +0200 Subject: [PATCH 11/23] Add tests --- crates/meilisearch/tests/common/index.rs | 2 + .../tests/documents/get_documents.rs | 289 +++++++++++++++++- crates/meilisearch/tests/vector/settings.rs | 9 +- .../milli/src/facet/facet_sort_recursive.rs | 2 +- 4 files changed, 291 insertions(+), 11 deletions(-) diff --git a/crates/meilisearch/tests/common/index.rs b/crates/meilisearch/tests/common/index.rs index e324d2ff5..f1fdeba91 100644 --- a/crates/meilisearch/tests/common/index.rs +++ b/crates/meilisearch/tests/common/index.rs @@ -562,5 +562,7 @@ pub struct GetAllDocumentsOptions { pub offset: Option, #[serde(skip_serializing_if = "Option::is_none")] pub fields: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub sort: Option>, pub retrieve_vectors: bool, } diff --git a/crates/meilisearch/tests/documents/get_documents.rs b/crates/meilisearch/tests/documents/get_documents.rs index 63dc224c2..2267b8f5d 100644 --- a/crates/meilisearch/tests/documents/get_documents.rs +++ b/crates/meilisearch/tests/documents/get_documents.rs @@ -5,8 +5,8 @@ use urlencoding::encode as urlencode; use crate::common::encoder::Encoder; use crate::common::{ - shared_does_not_exists_index, shared_empty_index, shared_index_with_test_set, - GetAllDocumentsOptions, Server, Value, + shared_does_not_exists_index, shared_empty_index, shared_index_with_geo_documents, + shared_index_with_test_set, GetAllDocumentsOptions, Server, Value, }; use crate::json; @@ -83,6 +83,291 @@ async fn get_document() { ); } +#[actix_rt::test] +async fn get_document_sorted() { + let server = Server::new_shared(); + let index = server.unique_index(); + index.load_test_set().await; + + let (task, _status_code) = + index.update_settings_sortable_attributes(json!(["age", "email", "gender", "name"])).await; + server.wait_task(task.uid()).await.succeeded(); + + let (response, _code) = index + .get_all_documents(GetAllDocumentsOptions { + fields: Some(vec!["id", "age", "email"]), + sort: Some(vec!["age:asc", "email:desc"]), + ..Default::default() + }) + .await; + let results = response["results"].as_array().unwrap(); + snapshot!(json_string!(results), @r#" + [ + { + "id": 5, + "age": 20, + "email": "warrenwatson@chorizon.com" + }, + { + "id": 6, + "age": 20, + "email": "sheliaberry@chorizon.com" + }, + { + "id": 57, + "age": 20, + "email": "kaitlinconner@chorizon.com" + }, + { + "id": 45, + "age": 20, + "email": "irenebennett@chorizon.com" + }, + { + "id": 40, + "age": 21, + "email": "staffordemerson@chorizon.com" + }, + { + "id": 41, + "age": 21, + "email": "salinasgamble@chorizon.com" + }, + { + "id": 63, + "age": 21, + "email": "knowleshebert@chorizon.com" + }, + { + "id": 50, + "age": 21, + "email": "guerramcintyre@chorizon.com" + }, + { + "id": 44, + "age": 22, + "email": "jonispears@chorizon.com" + }, + { + "id": 56, + "age": 23, + "email": "tuckerbarry@chorizon.com" + }, + { + "id": 51, + "age": 23, + "email": "keycervantes@chorizon.com" + }, + { + "id": 60, + "age": 23, + "email": "jodyherrera@chorizon.com" + }, + { + "id": 70, + "age": 23, + "email": "glassperkins@chorizon.com" + }, + { + "id": 75, + "age": 24, + "email": "emmajacobs@chorizon.com" + }, + { + "id": 68, + "age": 24, + "email": "angelinadyer@chorizon.com" + }, + { + "id": 17, + "age": 25, + "email": "ortegabrennan@chorizon.com" + }, + { + "id": 76, + "age": 25, + "email": "claricegardner@chorizon.com" + }, + { + "id": 43, + "age": 25, + "email": "arnoldbender@chorizon.com" + }, + { + "id": 12, + "age": 25, + "email": "aidakirby@chorizon.com" + }, + { + "id": 9, + "age": 26, + "email": "kellimendez@chorizon.com" + } + ] + "#); + + let (response, _code) = index + .get_all_documents(GetAllDocumentsOptions { + fields: Some(vec!["id", "gender", "name"]), + sort: Some(vec!["gender:asc", "name:asc"]), + ..Default::default() + }) + .await; + let results = response["results"].as_array().unwrap(); + snapshot!(json_string!(results), @r#" + [ + { + "id": 3, + "name": "Adeline Flynn", + "gender": "female" + }, + { + "id": 12, + "name": "Aida Kirby", + "gender": "female" + }, + { + "id": 68, + "name": "Angelina Dyer", + "gender": "female" + }, + { + "id": 15, + "name": "Aurelia Contreras", + "gender": "female" + }, + { + "id": 36, + "name": "Barbra Valenzuela", + "gender": "female" + }, + { + "id": 23, + "name": "Blanca Mcclain", + "gender": "female" + }, + { + "id": 53, + "name": "Caitlin Burnett", + "gender": "female" + }, + { + "id": 71, + "name": "Candace Sawyer", + "gender": "female" + }, + { + "id": 65, + "name": "Carole Rowland", + "gender": "female" + }, + { + "id": 33, + "name": "Cecilia Greer", + "gender": "female" + }, + { + "id": 1, + "name": "Cherry Orr", + "gender": "female" + }, + { + "id": 38, + "name": "Christina Short", + "gender": "female" + }, + { + "id": 7, + "name": "Chrystal Boyd", + "gender": "female" + }, + { + "id": 76, + "name": "Clarice Gardner", + "gender": "female" + }, + { + "id": 73, + "name": "Eleanor Shepherd", + "gender": "female" + }, + { + "id": 75, + "name": "Emma Jacobs", + "gender": "female" + }, + { + "id": 16, + "name": "Estella Bass", + "gender": "female" + }, + { + "id": 62, + "name": "Estelle Ramirez", + "gender": "female" + }, + { + "id": 20, + "name": "Florence Long", + "gender": "female" + }, + { + "id": 42, + "name": "Graciela Russell", + "gender": "female" + } + ] + "#); +} + +#[actix_rt::test] +async fn get_document_geosorted() { + let index = shared_index_with_geo_documents().await; + + let (response, _code) = index + .get_all_documents(GetAllDocumentsOptions { + sort: Some(vec!["_geoPoint(45.4777599, 9.1967508):asc"]), + ..Default::default() + }) + .await; + let results = response["results"].as_array().unwrap(); + snapshot!(json_string!(results), @r#" + [ + { + "id": 2, + "name": "La Bella Italia", + "address": "456 Elm Street, Townsville", + "type": "Italian", + "rating": 9, + "_geo": { + "lat": "45.4777599", + "lng": "9.1967508" + } + }, + { + "id": 1, + "name": "Taco Truck", + "address": "444 Salsa Street, Burritoville", + "type": "Mexican", + "rating": 9, + "_geo": { + "lat": 34.0522, + "lng": -118.2437 + } + }, + { + "id": 3, + "name": "Crêpe Truck", + "address": "2 Billig Avenue, Rouenville", + "type": "French", + "rating": 10 + } + ] + "#); +} + +// TODO test on not sortable attributes + #[actix_rt::test] async fn error_get_unexisting_index_all_documents() { let index = shared_does_not_exists_index().await; diff --git a/crates/meilisearch/tests/vector/settings.rs b/crates/meilisearch/tests/vector/settings.rs index 50253f930..d26174faf 100644 --- a/crates/meilisearch/tests/vector/settings.rs +++ b/crates/meilisearch/tests/vector/settings.rs @@ -101,14 +101,7 @@ async fn reset_embedder_documents() { server.wait_task(response.uid()).await; // Make sure the documents are still present - let (documents, _code) = index - .get_all_documents(GetAllDocumentsOptions { - limit: None, - offset: None, - retrieve_vectors: false, - fields: None, - }) - .await; + let (documents, _code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; snapshot!(json_string!(documents), @r###" { "results": [ diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index 213d18624..504f80d12 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -160,7 +160,7 @@ impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { })); } } - + // Once all geo candidates have been processed, we can return the others if let Some(not_geo_candidates) = not_geo_candidates.take() { if !not_geo_candidates.is_empty() { From 8326f34ad127629bf33f81945fa621bfc1e16fc0 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 11:35:28 +0200 Subject: [PATCH 12/23] Add analytics --- .../src/routes/indexes/documents.rs | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index d9b3f106f..c8198d9a7 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -138,6 +138,8 @@ pub struct DocumentsFetchAggregator { per_document_id: bool, // if a filter was used per_filter: bool, + // if documents were sorted + sort: bool, #[serde(rename = "vector.retrieve_vectors")] retrieve_vectors: bool, @@ -156,16 +158,28 @@ pub struct DocumentsFetchAggregator { #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DocumentFetchKind { - PerDocumentId { retrieve_vectors: bool }, - Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool, ids: usize }, + PerDocumentId { + retrieve_vectors: bool, + sort: bool, + }, + Normal { + with_filter: bool, + limit: usize, + offset: usize, + retrieve_vectors: bool, + sort: bool, + ids: usize, + }, } impl DocumentsFetchAggregator { pub fn from_query(query: &DocumentFetchKind) -> Self { - let (limit, offset, retrieve_vectors) = match query { - DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), - DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => { - (*limit, *offset, *retrieve_vectors) + let (limit, offset, retrieve_vectors, sort) = match query { + DocumentFetchKind::PerDocumentId { retrieve_vectors, sort } => { + (1, 0, *retrieve_vectors, *sort) + } + DocumentFetchKind::Normal { limit, offset, retrieve_vectors, sort, .. } => { + (*limit, *offset, *retrieve_vectors, *sort) } }; @@ -179,6 +193,7 @@ impl DocumentsFetchAggregator { per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), max_limit: limit, max_offset: offset, + sort, retrieve_vectors, max_document_ids: ids, @@ -196,6 +211,7 @@ impl Aggregate for DocumentsFetchAggregator { Box::new(Self { per_document_id: self.per_document_id | new.per_document_id, per_filter: self.per_filter | new.per_filter, + sort: self.sort | new.sort, retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors, max_limit: self.max_limit.max(new.max_limit), max_offset: self.max_offset.max(new.max_offset), @@ -279,6 +295,7 @@ pub async fn get_document( retrieve_vectors: param_retrieve_vectors.0, per_document_id: true, per_filter: false, + sort: false, max_limit: 0, max_offset: 0, max_document_ids: 0, @@ -503,6 +520,7 @@ pub async fn documents_by_query_post( analytics.publish( DocumentsFetchAggregator:: { per_filter: body.filter.is_some(), + sort: body.sort.is_some(), retrieve_vectors: body.retrieve_vectors, max_limit: body.limit, max_offset: body.offset, @@ -603,6 +621,7 @@ pub async fn get_documents( analytics.publish( DocumentsFetchAggregator:: { per_filter: query.filter.is_some(), + sort: query.sort.is_some(), retrieve_vectors: query.retrieve_vectors, max_limit: query.limit, max_offset: query.offset, From 8aacd6374acd2387d2c66f46a09105ec2867e441 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 11:50:01 +0200 Subject: [PATCH 13/23] Optimize geo sort --- crates/milli/src/facet/facet_sort_recursive.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index 504f80d12..a4d65f91d 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -370,7 +370,7 @@ pub fn recursive_facet_sort<'ctx>( let mut fields = Vec::new(); let fields_ids_map = index.fields_ids_map(rtxn)?; - let geo_candidates = index.geo_faceted_documents_ids(rtxn)?; // TODO: skip when no geo sort + let mut need_geo_candidates = false; for sort in sort { match sort { AscDesc::Asc(Member::Field(field)) => { @@ -387,6 +387,7 @@ pub fn recursive_facet_sort<'ctx>( if let (Some(lat), Some(lng)) = (fields_ids_map.id("_geo.lat"), fields_ids_map.id("_geo.lng")) { + need_geo_candidates = true; fields.push(AscDescId::Geo { field_ids: [lat, lng], target_point, @@ -398,6 +399,7 @@ pub fn recursive_facet_sort<'ctx>( if let (Some(lat), Some(lng)) = (fields_ids_map.id("_geo.lat"), fields_ids_map.id("_geo.lng")) { + need_geo_candidates = true; fields.push(AscDescId::Geo { field_ids: [lat, lng], target_point, @@ -409,6 +411,12 @@ pub fn recursive_facet_sort<'ctx>( // FIXME: Should this return an error if the field is not found? } + let geo_candidates = if need_geo_candidates { + index.geo_faceted_documents_ids(rtxn)? + } else { + RoaringBitmap::new() + }; + let number_db = index.facet_id_f64_docids.remap_key_type::>(); let string_db = index.facet_id_string_docids.remap_key_type::>(); From 283944ea8979e007afb47f6dcd63e245d1542fcb Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 12:03:50 +0200 Subject: [PATCH 14/23] Differentiate between document sort error and search sort error --- crates/meilisearch-types/src/error.rs | 3 ++- .../meilisearch/src/routes/indexes/documents.rs | 2 +- .../meilisearch/src/search/federated/perform.rs | 7 +++---- crates/meilisearch/src/search/mod.rs | 2 +- crates/milli/src/asc_desc.rs | 16 ++++++++++------ crates/milli/src/error.rs | 4 ++-- crates/milli/src/facet/facet_sort_recursive.rs | 2 +- 7 files changed, 20 insertions(+), 16 deletions(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 2eb22035e..9cf1b93a0 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -483,7 +483,8 @@ impl ErrorCode for milli::Error { UserError::InvalidVectorsMapType { .. } | UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType, UserError::TooManyVectors(_, _) => Code::TooManyVectors, - UserError::SortError(_) => Code::InvalidSearchSort, + UserError::SortError { search: true, .. } => Code::InvalidSearchSort, + UserError::SortError { search: false, .. } => Code::InvalidDocumentSort, UserError::InvalidMinTypoWordLenSetting(_, _) => { Code::InvalidSettingsTypoTolerance } diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index c8198d9a7..b66eec535 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -663,7 +663,7 @@ fn documents_by_query( let sorts: Vec<_> = match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { Ok(sorts) => sorts, Err(asc_desc_error) => { - return Err(milli::Error::from(milli::SortError::from(asc_desc_error)).into()) + return Err(milli::SortError::from(asc_desc_error).into_documents_error().into()) } }; Some(sorts) diff --git a/crates/meilisearch/src/search/federated/perform.rs b/crates/meilisearch/src/search/federated/perform.rs index 5ad64d63c..c0fec01e8 100644 --- a/crates/meilisearch/src/search/federated/perform.rs +++ b/crates/meilisearch/src/search/federated/perform.rs @@ -745,10 +745,9 @@ impl SearchByIndex { match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { Ok(sorts) => sorts, Err(asc_desc_error) => { - return Err(milli::Error::from(milli::SortError::from( - asc_desc_error, - )) - .into()) + return Err(milli::SortError::from(asc_desc_error) + .into_search_error() + .into()) } }; Some(sorts) diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 5e543c53f..f57bc9b9a 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -1060,7 +1060,7 @@ pub fn prepare_search<'t>( let sort = match sort.iter().map(|s| AscDesc::from_str(s)).collect() { Ok(sorts) => sorts, Err(asc_desc_error) => { - return Err(milli::Error::from(SortError::from(asc_desc_error)).into()) + return Err(SortError::from(asc_desc_error).into_search_error().into()) } }; diff --git a/crates/milli/src/asc_desc.rs b/crates/milli/src/asc_desc.rs index e75adf83d..999b02511 100644 --- a/crates/milli/src/asc_desc.rs +++ b/crates/milli/src/asc_desc.rs @@ -168,6 +168,16 @@ pub enum SortError { ReservedNameForFilter { name: String }, } +impl SortError { + pub fn into_search_error(self) -> Error { + Error::UserError(UserError::SortError { error: self, search: true }) + } + + pub fn into_documents_error(self) -> Error { + Error::UserError(UserError::SortError { error: self, search: false }) + } +} + impl From for SortError { fn from(error: AscDescError) -> Self { match error { @@ -190,12 +200,6 @@ impl From for SortError { } } -impl From for Error { - fn from(error: SortError) -> Self { - Self::UserError(UserError::SortError(error)) - } -} - #[cfg(test)] mod tests { use big_s::S; diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 2136ec97e..2624a9824 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -272,8 +272,8 @@ and can not be more than 511 bytes.", .document_id.to_string() PrimaryKeyCannotBeChanged(String), #[error(transparent)] SerdeJson(serde_json::Error), - #[error(transparent)] - SortError(#[from] SortError), + #[error("{error}")] + SortError { error: SortError, search: bool }, #[error("An unknown internal document id have been used: `{document_id}`.")] UnknownInternalDocumentId { document_id: DocumentId }, #[error("`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {0}` and twoTypos: {1}`.")] diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index a4d65f91d..19bf5afb9 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -411,7 +411,7 @@ pub fn recursive_facet_sort<'ctx>( // FIXME: Should this return an error if the field is not found? } - let geo_candidates = if need_geo_candidates { + let geo_candidates = if need_geo_candidates { index.geo_faceted_documents_ids(rtxn)? } else { RoaringBitmap::new() From 8419fd9b3b8dbadf8edfed87c6dca3bb1d798f07 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 13:42:38 +0200 Subject: [PATCH 15/23] Ditch usage of check_sort_criteria --- crates/meilisearch-types/src/error.rs | 3 +- crates/milli/src/error.rs | 18 +++- .../milli/src/facet/facet_sort_recursive.rs | 89 +++++++++---------- crates/milli/src/search/new/mod.rs | 4 +- 4 files changed, 63 insertions(+), 51 deletions(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 9cf1b93a0..e43b28fc6 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -467,7 +467,8 @@ impl ErrorCode for milli::Error { UserError::InvalidDistinctAttribute { .. } => Code::InvalidSearchDistinct, UserError::SortRankingRuleMissing => Code::InvalidSearchSort, UserError::InvalidFacetsDistribution { .. } => Code::InvalidSearchFacets, - UserError::InvalidSortableAttribute { .. } => Code::InvalidSearchSort, + UserError::InvalidSearchSortableAttribute { .. } => Code::InvalidSearchSort, + UserError::InvalidDocumentSortableAttribute { .. } => Code::InvalidDocumentSort, UserError::InvalidSearchableAttribute { .. } => { Code::InvalidSearchAttributesToSearchOn } diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 2624a9824..f3b390690 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -191,7 +191,21 @@ and can not be more than 511 bytes.", .document_id.to_string() ), } )] - InvalidSortableAttribute { field: String, valid_fields: BTreeSet, hidden_fields: bool }, + InvalidSearchSortableAttribute { + field: String, + valid_fields: BTreeSet, + hidden_fields: bool, + }, + #[error("Attribute `{}` is not sortable. {}", + .field, + match .sortable_fields.is_empty() { + true => "This index does not have configured sortable attributes.".to_string(), + false => format!("Available sortable attributes are: `{}`.", + sortable_fields.iter().map(AsRef::as_ref).collect::>().join(", ") + ), + } + )] + InvalidDocumentSortableAttribute { field: String, sortable_fields: BTreeSet }, #[error("Attribute `{}` is not filterable and thus, cannot be used as distinct attribute. {}", .field, match (.valid_patterns.is_empty(), .matching_rule_index) { @@ -614,7 +628,7 @@ fn conditionally_lookup_for_error_message() { ]; for (list, suffix) in messages { - let err = UserError::InvalidSortableAttribute { + let err = UserError::InvalidSearchSortableAttribute { field: "name".to_string(), valid_fields: list, hidden_fields: false, diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index 19bf5afb9..ab62ebcfd 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -1,16 +1,15 @@ -use std::collections::VecDeque; +use std::collections::{BTreeSet, VecDeque}; use crate::{ + constants::RESERVED_GEO_FIELD_NAME, documents::{geo_sort::next_bucket, GeoSortParameter}, heed_codec::{ facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, BytesRefCodec, }, - search::{ - facet::{ascending_facet_sort, descending_facet_sort}, - new::check_sort_criteria, - }, - AscDesc, DocumentId, Member, + is_faceted, + search::facet::{ascending_facet_sort, descending_facet_sort}, + AscDesc, DocumentId, Member, UserError, }; use heed::Database; use roaring::RoaringBitmap; @@ -366,49 +365,47 @@ pub fn recursive_facet_sort<'ctx>( sort: Vec, candidates: &'ctx RoaringBitmap, ) -> crate::Result> { - check_sort_criteria(index, rtxn, Some(&sort))?; - - let mut fields = Vec::new(); + let sortable_fields: BTreeSet<_> = index.sortable_fields(rtxn)?.into_iter().collect(); let fields_ids_map = index.fields_ids_map(rtxn)?; + + let mut fields = Vec::new(); let mut need_geo_candidates = false; - for sort in sort { - match sort { - AscDesc::Asc(Member::Field(field)) => { - if let Some(field_id) = fields_ids_map.id(&field) { - fields.push(AscDescId::Facet { field_id, ascending: true }); - } - } - AscDesc::Desc(Member::Field(field)) => { - if let Some(field_id) = fields_ids_map.id(&field) { - fields.push(AscDescId::Facet { field_id, ascending: false }); - } - } - AscDesc::Asc(Member::Geo(target_point)) => { - if let (Some(lat), Some(lng)) = - (fields_ids_map.id("_geo.lat"), fields_ids_map.id("_geo.lng")) - { - need_geo_candidates = true; - fields.push(AscDescId::Geo { - field_ids: [lat, lng], - target_point, - ascending: true, - }); - } - } - AscDesc::Desc(Member::Geo(target_point)) => { - if let (Some(lat), Some(lng)) = - (fields_ids_map.id("_geo.lat"), fields_ids_map.id("_geo.lng")) - { - need_geo_candidates = true; - fields.push(AscDescId::Geo { - field_ids: [lat, lng], - target_point, - ascending: false, - }); - } - } + for asc_desc in sort { + let (field, geofield) = match asc_desc { + AscDesc::Asc(Member::Field(field)) => (Some((field, true)), None), + AscDesc::Desc(Member::Field(field)) => (Some((field, false)), None), + AscDesc::Asc(Member::Geo(target_point)) => (None, Some((target_point, true))), + AscDesc::Desc(Member::Geo(target_point)) => (None, Some((target_point, false))), }; - // FIXME: Should this return an error if the field is not found? + if let Some((field, ascending)) = field { + if is_faceted(&field, &sortable_fields) { + if let Some(field_id) = fields_ids_map.id(&field) { + fields.push(AscDescId::Facet { field_id, ascending }); + continue; + } + } + return Err(UserError::InvalidDocumentSortableAttribute { + field: field.to_string(), + sortable_fields: sortable_fields.clone(), + } + .into()); + } + if let Some((target_point, ascending)) = geofield { + if sortable_fields.contains(RESERVED_GEO_FIELD_NAME) { + if let (Some(lat), Some(lng)) = + (fields_ids_map.id("_geo.lat"), fields_ids_map.id("_geo.lng")) + { + need_geo_candidates = true; + fields.push(AscDescId::Geo { field_ids: [lat, lng], target_point, ascending }); + continue; + } + } + return Err(UserError::InvalidDocumentSortableAttribute { + field: RESERVED_GEO_FIELD_NAME.to_string(), + sortable_fields: sortable_fields.clone(), + } + .into()); + } } let geo_candidates = if need_geo_candidates { diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index b5258413e..3983aa07a 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -903,7 +903,7 @@ pub(crate) fn check_sort_criteria( let (valid_fields, hidden_fields) = index.remove_hidden_fields(rtxn, sortable_fields)?; - return Err(UserError::InvalidSortableAttribute { + return Err(UserError::InvalidSearchSortableAttribute { field: field.to_string(), valid_fields, hidden_fields, @@ -914,7 +914,7 @@ pub(crate) fn check_sort_criteria( let (valid_fields, hidden_fields) = index.remove_hidden_fields(rtxn, sortable_fields)?; - return Err(UserError::InvalidSortableAttribute { + return Err(UserError::InvalidSearchSortableAttribute { field: RESERVED_GEO_FIELD_NAME.to_string(), valid_fields, hidden_fields, From 280c3907bebc9ad0077e4e3372428c8c2ce5b810 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 13:58:37 +0200 Subject: [PATCH 16/23] Add test to sort the unsortable --- .../tests/documents/get_documents.rs | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/documents/get_documents.rs b/crates/meilisearch/tests/documents/get_documents.rs index 2267b8f5d..5ee838232 100644 --- a/crates/meilisearch/tests/documents/get_documents.rs +++ b/crates/meilisearch/tests/documents/get_documents.rs @@ -366,7 +366,27 @@ async fn get_document_geosorted() { "#); } -// TODO test on not sortable attributes +#[actix_rt::test] +async fn get_document_sort_the_unsortable() { + let index = shared_index_with_test_set().await; + + let (response, _code) = index + .get_all_documents(GetAllDocumentsOptions { + fields: Some(vec!["id", "name"]), + sort: Some(vec!["name:asc"]), + ..Default::default() + }) + .await; + + snapshot!(json_string!(response), @r#" + { + "message": "Attribute `name` is not sortable. This index does not have configured sortable attributes.", + "code": "invalid_document_sort", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_sort" + } + "#); +} #[actix_rt::test] async fn error_get_unexisting_index_all_documents() { From 9f55708d84ddd55cdcca5e442a97ab32fa34ad66 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 13:58:56 +0200 Subject: [PATCH 17/23] Format --- crates/milli/src/facet/facet_sort_recursive.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index ab62ebcfd..596ce6335 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -367,7 +367,7 @@ pub fn recursive_facet_sort<'ctx>( ) -> crate::Result> { let sortable_fields: BTreeSet<_> = index.sortable_fields(rtxn)?.into_iter().collect(); let fields_ids_map = index.fields_ids_map(rtxn)?; - + let mut fields = Vec::new(); let mut need_geo_candidates = false; for asc_desc in sort { From d85480de898d5f6b1c829fdd1fbed2bc381a1864 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:05:47 +0200 Subject: [PATCH 18/23] Move sort code out of facet --- crates/meilisearch/src/routes/indexes/documents.rs | 4 ++-- crates/milli/src/documents/mod.rs | 1 + .../src/{facet/facet_sort_recursive.rs => documents/sort.rs} | 2 +- crates/milli/src/facet/mod.rs | 1 - 4 files changed, 4 insertions(+), 4 deletions(-) rename crates/milli/src/{facet/facet_sort_recursive.rs => documents/sort.rs} (99%) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index b66eec535..e8499a789 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -18,7 +18,7 @@ use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; -use meilisearch_types::milli::facet::facet_sort_recursive::recursive_facet_sort; +use meilisearch_types::milli::documents::sort::recursive_sort; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::{AscDesc, DocumentId}; @@ -1576,7 +1576,7 @@ fn retrieve_documents>( let facet_sort; let (it, number_of_documents) = if let Some(sort) = sort_criteria { let number_of_documents = candidates.len(); - facet_sort = recursive_facet_sort(index, &rtxn, sort, &candidates)?; + facet_sort = recursive_sort(index, &rtxn, sort, &candidates)?; let iter = facet_sort.iter()?; ( itertools::Either::Left(some_documents( diff --git a/crates/milli/src/documents/mod.rs b/crates/milli/src/documents/mod.rs index b515c4e98..7a4babfa8 100644 --- a/crates/milli/src/documents/mod.rs +++ b/crates/milli/src/documents/mod.rs @@ -4,6 +4,7 @@ pub mod geo_sort; mod primary_key; mod reader; mod serde_impl; +pub mod sort; use std::fmt::Debug; use std::io; diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/documents/sort.rs similarity index 99% rename from crates/milli/src/facet/facet_sort_recursive.rs rename to crates/milli/src/documents/sort.rs index 596ce6335..4008a37a4 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/documents/sort.rs @@ -359,7 +359,7 @@ impl<'ctx> SortedDocuments<'ctx> { } } -pub fn recursive_facet_sort<'ctx>( +pub fn recursive_sort<'ctx>( index: &'ctx crate::Index, rtxn: &'ctx heed::RoTxn<'ctx>, sort: Vec, diff --git a/crates/milli/src/facet/mod.rs b/crates/milli/src/facet/mod.rs index 8b0b9a25e..274d2588d 100644 --- a/crates/milli/src/facet/mod.rs +++ b/crates/milli/src/facet/mod.rs @@ -1,4 +1,3 @@ -pub mod facet_sort_recursive; mod facet_type; mod facet_value; pub mod value_encoding; From 73dfeefc7ce3fbab172730bc1bfc37404dc21e87 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:08:46 +0200 Subject: [PATCH 19/23] Remove plural form --- crates/meilisearch/src/routes/indexes/documents.rs | 2 +- crates/milli/src/asc_desc.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index e8499a789..9c8d28e04 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -663,7 +663,7 @@ fn documents_by_query( let sorts: Vec<_> = match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { Ok(sorts) => sorts, Err(asc_desc_error) => { - return Err(milli::SortError::from(asc_desc_error).into_documents_error().into()) + return Err(milli::SortError::from(asc_desc_error).into_document_error().into()) } }; Some(sorts) diff --git a/crates/milli/src/asc_desc.rs b/crates/milli/src/asc_desc.rs index 999b02511..d7288faa3 100644 --- a/crates/milli/src/asc_desc.rs +++ b/crates/milli/src/asc_desc.rs @@ -173,7 +173,7 @@ impl SortError { Error::UserError(UserError::SortError { error: self, search: true }) } - pub fn into_documents_error(self) -> Error { + pub fn into_document_error(self) -> Error { Error::UserError(UserError::SortError { error: self, search: false }) } } From 27cc3573624d6c3bcd80066c4a95b4d455690ba3 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:21:55 +0200 Subject: [PATCH 20/23] Document code --- crates/milli/src/documents/sort.rs | 302 +++++++++++++++-------------- 1 file changed, 155 insertions(+), 147 deletions(-) diff --git a/crates/milli/src/documents/sort.rs b/crates/milli/src/documents/sort.rs index 4008a37a4..59858caad 100644 --- a/crates/milli/src/documents/sort.rs +++ b/crates/milli/src/documents/sort.rs @@ -20,6 +20,158 @@ enum AscDescId { Geo { field_ids: [u16; 2], target_point: [f64; 2], ascending: bool }, } +/// A [`SortedDocumentsIterator`] allows efficient access to a continuous range of sorted documents. +/// This is ideal in the context of paginated queries in which only a small number of documents are needed at a time. +/// Search operations will only be performed upon access. +pub enum SortedDocumentsIterator<'ctx> { + Leaf { + /// The exact number of documents remaining + size: usize, + values: Box + 'ctx>, + }, + Branch { + /// The current child, got from the children iterator + current_child: Option>>, + /// The exact number of documents remaining, excluding documents in the current child + next_children_size: usize, + /// Iterators to become the current child once it is exhausted + next_children: + Box>> + 'ctx>, + }, +} + +impl SortedDocumentsIterator<'_> { + /// Takes care of updating the current child if it is `None`, and also updates the size + fn update_current<'ctx>( + current_child: &mut Option>>, + next_children_size: &mut usize, + next_children: &mut Box< + dyn Iterator>> + 'ctx, + >, + ) -> crate::Result<()> { + if current_child.is_none() { + *current_child = match next_children.next() { + Some(Ok(builder)) => { + let next_child = Box::new(builder.build()?); + *next_children_size -= next_child.size_hint().0; + Some(next_child) + } + Some(Err(e)) => return Err(e), + None => return Ok(()), + }; + } + Ok(()) + } +} + +impl Iterator for SortedDocumentsIterator<'_> { + type Item = crate::Result; + + /// Implementing the `nth` method allows for efficient access to the nth document in the sorted order. + /// It's used by `skip` internally. + /// The default implementation of `nth` would iterate over all children, which is inefficient for large datasets. + /// This implementation will jump over whole chunks of children until it gets close. + fn nth(&mut self, n: usize) -> Option { + // If it's at the leaf level, just forward the call to the values iterator + let (current_child, next_children, next_children_size) = match self { + SortedDocumentsIterator::Leaf { values, size } => { + *size = size.saturating_sub(n); + return values.nth(n).map(Ok); + } + SortedDocumentsIterator::Branch { + current_child, + next_children, + next_children_size, + } => (current_child, next_children, next_children_size), + }; + + // Otherwise don't directly iterate over children, skip them if we know we will go further + let mut to_skip = n - 1; + while to_skip > 0 { + if let Err(e) = SortedDocumentsIterator::update_current( + current_child, + next_children_size, + next_children, + ) { + return Some(Err(e)); + } + let Some(inner) = current_child else { + return None; // No more inner iterators, everything has been consumed. + }; + + if to_skip >= inner.size_hint().0 { + // The current child isn't large enough to contain the nth element. + // Skip it and continue with the next one. + to_skip -= inner.size_hint().0; + *current_child = None; + continue; + } else { + // The current iterator is large enough, so we can forward the call to it. + return inner.nth(to_skip + 1); + } + } + + self.next() + } + + /// Iterators need to keep track of their size so that they can be skipped efficiently by the `nth` method. + fn size_hint(&self) -> (usize, Option) { + let size = match self { + SortedDocumentsIterator::Leaf { size, .. } => *size, + SortedDocumentsIterator::Branch { + next_children_size, + current_child: Some(current_child), + .. + } => current_child.size_hint().0 + next_children_size, + SortedDocumentsIterator::Branch { next_children_size, current_child: None, .. } => { + *next_children_size + } + }; + + (size, Some(size)) + } + + fn next(&mut self) -> Option { + match self { + SortedDocumentsIterator::Leaf { values, size } => { + let result = values.next().map(Ok); + if result.is_some() { + *size -= 1; + } + result + } + SortedDocumentsIterator::Branch { + current_child, + next_children_size, + next_children, + } => { + let mut result = None; + while result.is_none() { + // Ensure we have selected an iterator to work with + if let Err(e) = SortedDocumentsIterator::update_current( + current_child, + next_children_size, + next_children, + ) { + return Some(Err(e)); + } + let Some(inner) = current_child else { + return None; + }; + + result = inner.next(); + + // If the current iterator is exhausted, we need to try the next one + if result.is_none() { + *current_child = None; + } + } + result + } + } + } +} + /// Builder for a [`SortedDocumentsIterator`]. /// Most builders won't ever be built, because pagination will skip them. pub struct SortedDocumentsIteratorBuilder<'ctx> { @@ -57,6 +209,7 @@ impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { } } + /// Builds a [`SortedDocumentsIterator`] based on the results of a facet sort. fn build_facet( self, field_id: u16, @@ -108,6 +261,7 @@ impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { }) } + /// Builds a [`SortedDocumentsIterator`] based on the (lazy) results of a geo sort. fn build_geo( self, field_ids: [u16; 2], @@ -186,153 +340,6 @@ impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { } } -/// A [`SortedDocumentsIterator`] allows efficient access to a continuous range of sorted documents. -/// This is ideal in the context of paginated queries in which only a small number of documents are needed at a time. -/// Search operations will only be performed upon access. -pub enum SortedDocumentsIterator<'ctx> { - Leaf { - /// The exact number of documents remaining - size: usize, - values: Box + 'ctx>, - }, - Branch { - /// The current child, got from the children iterator - current_child: Option>>, - /// The exact number of documents remaining, excluding documents in the current child - next_children_size: usize, - /// Iterators to become the current child once it is exhausted - next_children: - Box>> + 'ctx>, - }, -} - -impl SortedDocumentsIterator<'_> { - /// Takes care of updating the current child if it is `None`, and also updates the size - fn update_current<'ctx>( - current_child: &mut Option>>, - next_children_size: &mut usize, - next_children: &mut Box< - dyn Iterator>> + 'ctx, - >, - ) -> crate::Result<()> { - if current_child.is_none() { - *current_child = match next_children.next() { - Some(Ok(builder)) => { - let next_child = Box::new(builder.build()?); - *next_children_size -= next_child.size_hint().0; - Some(next_child) - } - Some(Err(e)) => return Err(e), - None => return Ok(()), - }; - } - Ok(()) - } -} - -impl Iterator for SortedDocumentsIterator<'_> { - type Item = crate::Result; - - fn nth(&mut self, n: usize) -> Option { - // If it's at the leaf level, just forward the call to the values iterator - let (current_child, next_children, next_children_size) = match self { - SortedDocumentsIterator::Leaf { values, size } => { - *size = size.saturating_sub(n); - return values.nth(n).map(Ok); - } - SortedDocumentsIterator::Branch { - current_child, - next_children, - next_children_size, - } => (current_child, next_children, next_children_size), - }; - - // Otherwise don't directly iterate over children, skip them if we know we will go further - let mut to_skip = n - 1; - while to_skip > 0 { - if let Err(e) = SortedDocumentsIterator::update_current( - current_child, - next_children_size, - next_children, - ) { - return Some(Err(e)); - } - let Some(inner) = current_child else { - return None; // No more inner iterators, everything has been consumed. - }; - - if to_skip >= inner.size_hint().0 { - // The current child isn't large enough to contain the nth element. - // Skip it and continue with the next one. - to_skip -= inner.size_hint().0; - *current_child = None; - continue; - } else { - // The current iterator is large enough, so we can forward the call to it. - return inner.nth(to_skip + 1); - } - } - - self.next() - } - - fn size_hint(&self) -> (usize, Option) { - let size = match self { - SortedDocumentsIterator::Leaf { size, .. } => *size, - SortedDocumentsIterator::Branch { - next_children_size, - current_child: Some(current_child), - .. - } => current_child.size_hint().0 + next_children_size, - SortedDocumentsIterator::Branch { next_children_size, current_child: None, .. } => { - *next_children_size - } - }; - - (size, Some(size)) - } - - fn next(&mut self) -> Option { - match self { - SortedDocumentsIterator::Leaf { values, size } => { - let result = values.next().map(Ok); - if result.is_some() { - *size -= 1; - } - result - } - SortedDocumentsIterator::Branch { - current_child, - next_children_size, - next_children, - } => { - let mut result = None; - while result.is_none() { - // Ensure we have selected an iterator to work with - if let Err(e) = SortedDocumentsIterator::update_current( - current_child, - next_children_size, - next_children, - ) { - return Some(Err(e)); - } - let Some(inner) = current_child else { - return None; - }; - - result = inner.next(); - - // If the current iterator is exhausted, we need to try the next one - if result.is_none() { - *current_child = None; - } - } - result - } - } - } -} - /// A structure owning the data needed during the lifetime of a [`SortedDocumentsIterator`]. pub struct SortedDocuments<'ctx> { index: &'ctx crate::Index, @@ -368,6 +375,7 @@ pub fn recursive_sort<'ctx>( let sortable_fields: BTreeSet<_> = index.sortable_fields(rtxn)?.into_iter().collect(); let fields_ids_map = index.fields_ids_map(rtxn)?; + // Retrieve the field ids that are used for sorting let mut fields = Vec::new(); let mut need_geo_candidates = false; for asc_desc in sort { From e92b6beb2011f8488d197d9a5e31d3e9b3f88dd6 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:26:55 +0200 Subject: [PATCH 21/23] Revert making check_sort_criteria usable without a search context --- crates/milli/src/search/new/mod.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index 3983aa07a..ecc51161d 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -638,7 +638,7 @@ pub fn execute_vector_search( time_budget: TimeBudget, ranking_score_threshold: Option, ) -> Result { - check_sort_criteria(ctx.index, ctx.txn, sort_criteria.as_deref())?; + check_sort_criteria(ctx, sort_criteria.as_ref())?; // FIXME: input universe = universe & documents_with_vectors // for now if we're computing embeddings for ALL documents, we can assume that this is just universe @@ -702,7 +702,7 @@ pub fn execute_search( ranking_score_threshold: Option, locales: Option<&Vec>, ) -> Result { - check_sort_criteria(ctx.index, ctx.txn, sort_criteria.as_deref())?; + check_sort_criteria(ctx, sort_criteria.as_ref())?; let mut used_negative_operator = false; let mut located_query_terms = None; @@ -873,9 +873,8 @@ pub fn execute_search( } pub(crate) fn check_sort_criteria( - index: &Index, - rtxn: &RoTxn<'_>, - sort_criteria: Option<&[AscDesc]>, + ctx: &SearchContext<'_>, + sort_criteria: Option<&Vec>, ) -> Result<()> { let sort_criteria = if let Some(sort_criteria) = sort_criteria { sort_criteria @@ -889,19 +888,19 @@ pub(crate) fn check_sort_criteria( // We check that the sort ranking rule exists and throw an // error if we try to use it and that it doesn't. - let sort_ranking_rule_missing = !index.criteria(rtxn)?.contains(&crate::Criterion::Sort); + let sort_ranking_rule_missing = !ctx.index.criteria(ctx.txn)?.contains(&crate::Criterion::Sort); if sort_ranking_rule_missing { return Err(UserError::SortRankingRuleMissing.into()); } // We check that we are allowed to use the sort criteria, we check // that they are declared in the sortable fields. - let sortable_fields = index.sortable_fields(rtxn)?; + let sortable_fields = ctx.index.sortable_fields(ctx.txn)?; for asc_desc in sort_criteria { match asc_desc.member() { Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => { let (valid_fields, hidden_fields) = - index.remove_hidden_fields(rtxn, sortable_fields)?; + ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?; return Err(UserError::InvalidSearchSortableAttribute { field: field.to_string(), @@ -912,7 +911,7 @@ pub(crate) fn check_sort_criteria( } Member::Geo(_) if !sortable_fields.contains(RESERVED_GEO_FIELD_NAME) => { let (valid_fields, hidden_fields) = - index.remove_hidden_fields(rtxn, sortable_fields)?; + ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?; return Err(UserError::InvalidSearchSortableAttribute { field: RESERVED_GEO_FIELD_NAME.to_string(), From 5a675bcb827300632262223b0c9d16525de17320 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 2 Jul 2025 11:50:32 +0200 Subject: [PATCH 22/23] Add benchmarks --- crates/benchmarks/Cargo.toml | 5 ++ crates/benchmarks/benches/sort.rs | 108 +++++++++++++++++++++++++++++ crates/benchmarks/benches/utils.rs | 98 +++++++++++++++++++++----- 3 files changed, 195 insertions(+), 16 deletions(-) create mode 100644 crates/benchmarks/benches/sort.rs diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml index 9dccc444b..68ed5aff4 100644 --- a/crates/benchmarks/Cargo.toml +++ b/crates/benchmarks/Cargo.toml @@ -51,3 +51,8 @@ harness = false [[bench]] name = "indexing" harness = false + +[[bench]] +name = "sort" +harness = false + diff --git a/crates/benchmarks/benches/sort.rs b/crates/benchmarks/benches/sort.rs new file mode 100644 index 000000000..0dd392cb2 --- /dev/null +++ b/crates/benchmarks/benches/sort.rs @@ -0,0 +1,108 @@ +//! This benchmark module is used to compare the performance of sorting documents in /search VS /documents +//! +//! The tests/benchmarks were designed in the context of a query returning only 20 documents. + +mod datasets_paths; +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::Settings; +use utils::Conf; + +#[cfg(not(windows))] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = + ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let sortable_fields = + ["_geo", "name", "population", "elevation", "timezone", "modification-date"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_sortable_fields(sortable_fields); +} + +#[rustfmt::skip] +const BASE_CONF: Conf = Conf { + dataset: datasets_paths::SMOL_ALL_COUNTRIES, + dataset_format: "jsonl", + configure: base_conf, + primary_key: Some("geonameid"), + queries: &[""], + offsets: &[ + Some((0, 20)), // The most common query in the real world + Some((0, 500)), // A query that ranges over many documents + Some((980, 20)), // The worst query that could happen in the real world + Some((800_000, 20)) // The worst query + ], + get_documents: true, + ..Conf::BASE +}; + +fn bench_sort(c: &mut criterion::Criterion) { + #[rustfmt::skip] + let confs = &[ + // utils::Conf { + // group_name: "without sort", + // sort: None, + // ..BASE_CONF + // }, + + // utils::Conf { + // group_name: "sort on many different values", + // sort: Some(vec!["name:asc"]), + // ..BASE_CONF + // }, + + // utils::Conf { + // group_name: "sort on many similar values", + // sort: Some(vec!["timezone:desc"]), + // ..BASE_CONF + // }, + + // utils::Conf { + // group_name: "sort on many similar then different values", + // sort: Some(vec!["timezone:desc", "name:asc"]), + // ..BASE_CONF + // }, + + // utils::Conf { + // group_name: "sort on many different then similar values", + // sort: Some(vec!["timezone:desc", "name:asc"]), + // ..BASE_CONF + // }, + + utils::Conf { + group_name: "geo sort", + sample_size: Some(10), + sort: Some(vec!["_geoPoint(45.4777599, 9.1967508):asc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many similar values then geo sort", + sample_size: Some(10), + sort: Some(vec!["timezone:desc", "_geoPoint(45.4777599, 9.1967508):asc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many different values then geo sort", + sample_size: Some(10), + sort: Some(vec!["name:desc", "_geoPoint(45.4777599, 9.1967508):asc"]), + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_sort); +criterion_main!(benches); diff --git a/crates/benchmarks/benches/utils.rs b/crates/benchmarks/benches/utils.rs index aaa2d50a0..93fa7506f 100644 --- a/crates/benchmarks/benches/utils.rs +++ b/crates/benchmarks/benches/utils.rs @@ -9,6 +9,7 @@ use anyhow::Context; use bumpalo::Bump; use criterion::BenchmarkId; use memmap2::Mmap; +use milli::documents::sort::recursive_sort; use milli::heed::EnvOpenOptions; use milli::progress::Progress; use milli::update::new::indexer; @@ -35,6 +36,12 @@ pub struct Conf<'a> { pub configure: fn(&mut Settings), pub filter: Option<&'a str>, pub sort: Option>, + /// set to skip documents (offset, limit) + pub offsets: &'a [Option<(usize, usize)>], + /// enable if you want to bench getting documents without querying + pub get_documents: bool, + /// configure the benchmark sample size + pub sample_size: Option, /// enable or disable the optional words on the query pub optional_words: bool, /// primary key, if there is None we'll auto-generate docids for every documents @@ -52,6 +59,9 @@ impl Conf<'_> { configure: |_| (), filter: None, sort: None, + offsets: &[None], + get_documents: false, + sample_size: None, optional_words: true, primary_key: None, }; @@ -144,25 +154,81 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let file_name = Path::new(conf.dataset).file_name().and_then(|f| f.to_str()).unwrap(); let name = format!("{}: {}", file_name, conf.group_name); let mut group = c.benchmark_group(&name); + if let Some(sample_size) = conf.sample_size { + group.sample_size(sample_size); + } for &query in conf.queries { - group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { - b.iter(|| { - let rtxn = index.read_txn().unwrap(); - let mut search = index.search(&rtxn); - search.query(query).terms_matching_strategy(TermsMatchingStrategy::default()); - if let Some(filter) = conf.filter { - let filter = Filter::from_str(filter).unwrap().unwrap(); - search.filter(filter); - } - if let Some(sort) = &conf.sort { - let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); - search.sort_criteria(sort); - } - let _ids = search.execute().unwrap(); - }); - }); + for offset in conf.offsets { + let parameter = match (query.is_empty(), offset) { + (true, None) => String::from("placeholder"), + (true, Some((offset, limit))) => format!("placeholder[{offset}:{limit}]"), + (false, None) => query.to_string(), + (false, Some((offset, limit))) => format!("{query}[{offset}:{limit}]"), + }; + group.bench_with_input( + BenchmarkId::from_parameter(parameter), + &query, + |b, &query| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + search + .query(query) + .terms_matching_strategy(TermsMatchingStrategy::default()); + if let Some(filter) = conf.filter { + let filter = Filter::from_str(filter).unwrap().unwrap(); + search.filter(filter); + } + if let Some(sort) = &conf.sort { + let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); + search.sort_criteria(sort); + } + if let Some((offset, limit)) = offset { + search.offset(*offset).limit(*limit); + } + + let _ids = search.execute().unwrap(); + }); + }, + ); + } } + + if conf.get_documents { + for offset in conf.offsets { + let parameter = match offset { + None => String::from("get_documents"), + Some((offset, limit)) => format!("get_documents[{offset}:{limit}]"), + }; + group.bench_with_input(BenchmarkId::from_parameter(parameter), &(), |b, &()| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + if let Some(sort) = &conf.sort { + let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); + let all_docs = index.documents_ids(&rtxn).unwrap(); + let facet_sort = + recursive_sort(&index, &rtxn, sort, &all_docs).unwrap(); + let iter = facet_sort.iter().unwrap(); + if let Some((offset, limit)) = offset { + let _results = iter.skip(*offset).take(*limit).collect::>(); + } else { + let _results = iter.collect::>(); + } + } else { + let all_docs = index.documents_ids(&rtxn).unwrap(); + if let Some((offset, limit)) = offset { + let _results = + all_docs.iter().skip(*offset).take(*limit).collect::>(); + } else { + let _results = all_docs.iter().collect::>(); + } + } + }); + }); + } + } + group.finish(); index.prepare_for_closing().wait(); From f60814b319b97beec0e2c98594d4c2cc55c2281f Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 2 Jul 2025 12:06:00 +0200 Subject: [PATCH 23/23] Add benchmark --- crates/benchmarks/benches/sort.rs | 60 +++++++++++++++++-------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/crates/benchmarks/benches/sort.rs b/crates/benchmarks/benches/sort.rs index 0dd392cb2..c3e934432 100644 --- a/crates/benchmarks/benches/sort.rs +++ b/crates/benchmarks/benches/sort.rs @@ -49,35 +49,35 @@ const BASE_CONF: Conf = Conf { fn bench_sort(c: &mut criterion::Criterion) { #[rustfmt::skip] let confs = &[ - // utils::Conf { - // group_name: "without sort", - // sort: None, - // ..BASE_CONF - // }, + utils::Conf { + group_name: "without sort", + sort: None, + ..BASE_CONF + }, - // utils::Conf { - // group_name: "sort on many different values", - // sort: Some(vec!["name:asc"]), - // ..BASE_CONF - // }, + utils::Conf { + group_name: "sort on many different values", + sort: Some(vec!["name:asc"]), + ..BASE_CONF + }, - // utils::Conf { - // group_name: "sort on many similar values", - // sort: Some(vec!["timezone:desc"]), - // ..BASE_CONF - // }, + utils::Conf { + group_name: "sort on many similar values", + sort: Some(vec!["timezone:desc"]), + ..BASE_CONF + }, - // utils::Conf { - // group_name: "sort on many similar then different values", - // sort: Some(vec!["timezone:desc", "name:asc"]), - // ..BASE_CONF - // }, + utils::Conf { + group_name: "sort on many similar then different values", + sort: Some(vec!["timezone:desc", "name:asc"]), + ..BASE_CONF + }, - // utils::Conf { - // group_name: "sort on many different then similar values", - // sort: Some(vec!["timezone:desc", "name:asc"]), - // ..BASE_CONF - // }, + utils::Conf { + group_name: "sort on many different then similar values", + sort: Some(vec!["timezone:desc", "name:asc"]), + ..BASE_CONF + }, utils::Conf { group_name: "geo sort", @@ -88,17 +88,23 @@ fn bench_sort(c: &mut criterion::Criterion) { utils::Conf { group_name: "sort on many similar values then geo sort", - sample_size: Some(10), + sample_size: Some(50), sort: Some(vec!["timezone:desc", "_geoPoint(45.4777599, 9.1967508):asc"]), ..BASE_CONF }, utils::Conf { group_name: "sort on many different values then geo sort", - sample_size: Some(10), + sample_size: Some(50), sort: Some(vec!["name:desc", "_geoPoint(45.4777599, 9.1967508):asc"]), ..BASE_CONF }, + + utils::Conf { + group_name: "sort on many fields", + sort: Some(vec!["population:asc", "name:asc", "elevation:asc", "timezone:asc"]), + ..BASE_CONF + }, ]; utils::run_benches(c, confs);