From 340d9e6edc71621b3c4dba7e95bbd7cf101453ff Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Fri, 27 Jun 2025 14:40:55 +0200 Subject: [PATCH] Optimize facet sort 5 to 10x speedup --- .../src/routes/indexes/documents.rs | 21 +- .../milli/src/facet/facet_sort_recursive.rs | 295 ++++++++++++++---- 2 files changed, 256 insertions(+), 60 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 425930ced..bcd227300 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1558,19 +1558,32 @@ fn retrieve_documents>( })? } + let mut facet_sort = None; if let Some(sort) = sort_criteria { - candidates = recursive_facet_sort(index, &rtxn, &sort, candidates)?; + facet_sort = Some(recursive_facet_sort(index, &rtxn, &sort, &candidates)?) } - let (it, number_of_documents) = { + let (it, number_of_documents) = if let Some(facet_sort) = &facet_sort { + let number_of_documents = candidates.len(); + let iter = facet_sort.iter()?; + ( + itertools::Either::Left(some_documents( + index, + &rtxn, + iter.map(|d| d.unwrap()).skip(offset).take(limit), + retrieve_vectors, + )?), + number_of_documents, + ) + } else { let number_of_documents = candidates.len(); ( - some_documents( + itertools::Either::Right(some_documents( index, &rtxn, candidates.into_iter().skip(offset).take(limit), retrieve_vectors, - )?, + )?), number_of_documents, ) }; diff --git a/crates/milli/src/facet/facet_sort_recursive.rs b/crates/milli/src/facet/facet_sort_recursive.rs index c0fd6ca6f..47c3696f3 100644 --- a/crates/milli/src/facet/facet_sort_recursive.rs +++ b/crates/milli/src/facet/facet_sort_recursive.rs @@ -1,78 +1,256 @@ use roaring::RoaringBitmap; use heed::Database; -use crate::{heed_codec::{facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, BytesRefCodec}, search::{facet::{ascending_facet_sort, descending_facet_sort}, new::check_sort_criteria}, AscDesc, Member}; +use crate::{heed_codec::{facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, BytesRefCodec}, search::{facet::{ascending_facet_sort, descending_facet_sort}, new::check_sort_criteria}, AscDesc, DocumentId, Member}; -fn recursive_facet_sort_inner<'t>( - rtxn: &'t heed::RoTxn<'t>, +/// Builder for a [`SortedDocumentsIterator`]. +/// Most builders won't ever be built, because pagination will skip them. +pub struct SortedDocumentsIteratorBuilder<'ctx> { + rtxn: &'ctx heed::RoTxn<'ctx>, number_db: Database, FacetGroupValueCodec>, string_db: Database, FacetGroupValueCodec>, - fields: &[(u16, bool)], + fields: &'ctx [(u16, bool)], candidates: RoaringBitmap, -) -> heed::Result { - let (field_id, ascending) = match fields.first() { - Some(first) => *first, - None => return Ok(candidates), - }; +} - let (number_iter, string_iter) = if ascending { - let number_iter = ascending_facet_sort( +impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { + /// Performs the sort and builds a [`SortedDocumentsIterator`]. + fn build(self) -> heed::Result> { + let SortedDocumentsIteratorBuilder { rtxn, number_db, - field_id, - candidates.clone(), - )?; - let string_iter = ascending_facet_sort( - rtxn, string_db, - field_id, + fields, candidates, - )?; + } = self; + let size = candidates.len() as usize; - (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter)) - } else { - let number_iter = descending_facet_sort( - rtxn, - number_db, - field_id, - candidates.clone(), - )?; - let string_iter = descending_facet_sort( - rtxn, - string_db, - field_id, - candidates, - )?; + // There is no point sorting a 1-element array + if size <= 1 { + return Ok(SortedDocumentsIterator::Leaf { + size, + values: Box::new(candidates.into_iter()), + }); + } - (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) - }; + // There is no variable to sort on + let Some((field_id, ascending)) = fields.first().copied() else { + return Ok(SortedDocumentsIterator::Leaf { + size, + values: Box::new(candidates.into_iter()), + }); + }; - let chained_iter = number_iter.chain(string_iter); - let mut result = RoaringBitmap::new(); - for part in chained_iter { - let (inner_candidates, _) = part?; - if inner_candidates.len() <= 1 || fields.len() <= 1 { - result |= inner_candidates; + // Perform the sort on the first field + let (number_iter, string_iter) = if ascending { + let number_iter = ascending_facet_sort( + rtxn, + number_db, + field_id, + candidates.clone(), + )?; + let string_iter = ascending_facet_sort( + rtxn, + string_db, + field_id, + candidates, + )?; + + (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter)) } else { - let inner_candidates = recursive_facet_sort_inner( + let number_iter = descending_facet_sort( + rtxn, + number_db, + field_id, + candidates.clone(), + )?; + let string_iter = descending_facet_sort( + rtxn, + string_db, + field_id, + candidates, + )?; + + (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) + }; + + // Create builders for the next level of the tree + let number_db2 = number_db; + let string_db2 = string_db; + let number_iter = number_iter.map(move |r| -> heed::Result { + let (docids, _bytes) = r?; + Ok(SortedDocumentsIteratorBuilder { rtxn, number_db, string_db, - &fields[1..], - inner_candidates, - )?; - result |= inner_candidates; - } - } + fields: &fields[1..], + candidates: docids, + }) + }); + let string_iter = string_iter.map(move |r| -> heed::Result { + let (docids, _bytes) = r?; + Ok(SortedDocumentsIteratorBuilder { + rtxn, + number_db: number_db2, + string_db: string_db2, + fields: &fields[1..], + candidates: docids, + }) + }); - Ok(result) + Ok(SortedDocumentsIterator::Branch { + current_child: None, + next_children_size: size, + next_children: Box::new(number_iter.chain(string_iter)), + }) + } } -pub fn recursive_facet_sort<'t>( - index: &crate::Index, - rtxn: &'t heed::RoTxn<'t>, +/// A [`SortedDocumentsIterator`] allows efficient access to a continuous range of sorted documents. +/// This is ideal in the context of paginated queries in which only a small number of documents are needed at a time. +/// Search operations will only be performed upon access. +pub enum SortedDocumentsIterator<'ctx> { + Leaf { + /// The exact number of documents remaining + size: usize, + values: Box + 'ctx> + }, + Branch { + /// The current child, got from the children iterator + current_child: Option>>, + /// The exact number of documents remaining, excluding documents in the current child + next_children_size: usize, + /// Iterators to become the current child once it is exhausted + next_children: Box>> + 'ctx>, + } +} + +impl SortedDocumentsIterator<'_> { + /// Takes care of updating the current child if it is `None`, and also updates the size + fn update_current<'ctx>(current_child: &mut Option>>, next_children_size: &mut usize, next_children: &mut Box>> + 'ctx>) -> heed::Result<()> { + if current_child.is_none() { + *current_child = match next_children.next() { + Some(Ok(builder)) => { + let next_child = Box::new(builder.build()?); + *next_children_size -= next_child.size_hint().0; + Some(next_child) + }, + Some(Err(e)) => return Err(e), + None => return Ok(()), + }; + } + Ok(()) + } +} + +impl Iterator for SortedDocumentsIterator<'_> { + type Item = heed::Result; + + fn nth(&mut self, n: usize) -> Option { + // If it's at the leaf level, just forward the call to the values iterator + let (current_child, next_children, next_children_size) = match self { + SortedDocumentsIterator::Leaf { values, size } => { + *size = size.saturating_sub(n); + return values.nth(n).map(Ok) + }, + SortedDocumentsIterator::Branch { current_child, next_children, next_children_size } => (current_child, next_children, next_children_size), + }; + + // Otherwise don't directly iterate over children, skip them if we know we will go further + let mut to_skip = n - 1; + while to_skip > 0 { + if let Err(e) = SortedDocumentsIterator::update_current(current_child, next_children_size, next_children) { + return Some(Err(e)); + } + let Some(inner) = current_child else { + return None; // No more inner iterators, everything has been consumed. + }; + + if to_skip >= inner.size_hint().0 { + // The current child isn't large enough to contain the nth element. + // Skip it and continue with the next one. + to_skip -= inner.size_hint().0; + *current_child = None; + continue; + } else { + // The current iterator is large enough, so we can forward the call to it. + return inner.nth(to_skip + 1); + } + } + + self.next() + } + + fn size_hint(&self) -> (usize, Option) { + let size = match self { + SortedDocumentsIterator::Leaf { size, .. } => *size, + SortedDocumentsIterator::Branch { next_children_size, current_child: Some(current_child), .. } => current_child.size_hint().0 + next_children_size, + SortedDocumentsIterator::Branch { next_children_size, current_child: None, .. } => *next_children_size, + }; + + (size, Some(size)) + } + + fn next(&mut self) -> Option { + match self { + SortedDocumentsIterator::Leaf { values, size } => { + let result = values.next().map(Ok); + if result.is_some() { + *size -= 1; + } + result + }, + SortedDocumentsIterator::Branch { current_child, next_children_size, next_children } => { + let mut result = None; + while result.is_none() { + // Ensure we have selected an iterator to work with + if let Err(e) = SortedDocumentsIterator::update_current(current_child, next_children_size, next_children) { + return Some(Err(e)); + } + let Some(inner) = current_child else { + return None; + }; + + result = inner.next(); + + // If the current iterator is exhausted, we need to try the next one + if result.is_none() { + *current_child = None; + } + } + result + } + } + } +} + +/// A structure owning the data needed during the lifetime of a [`SortedDocumentsIterator`]. +pub struct SortedDocuments<'ctx> { + rtxn: &'ctx heed::RoTxn<'ctx>, + fields: Vec<(u16, bool)>, + number_db: Database, FacetGroupValueCodec>, + string_db: Database, FacetGroupValueCodec>, + candidates: &'ctx RoaringBitmap, +} + +impl <'ctx> SortedDocuments<'ctx> { + pub fn iter(&'ctx self) -> heed::Result> { + let builder = SortedDocumentsIteratorBuilder { + rtxn: self.rtxn, + number_db: self.number_db, + string_db: self.string_db, + fields: &self.fields, + candidates: self.candidates.clone(), + }; + builder.build() + } +} + +pub fn recursive_facet_sort<'ctx>( + index: &'ctx crate::Index, + rtxn: &'ctx heed::RoTxn<'ctx>, sort: &[AscDesc], - candidates: RoaringBitmap, -) -> crate::Result { + candidates: &'ctx RoaringBitmap, +) -> crate::Result> { check_sort_criteria(index, rtxn, Some(sort))?; let mut fields = Vec::new(); @@ -88,7 +266,7 @@ pub fn recursive_facet_sort<'t>( fields.push((field_id, ascending)); // FIXME: Should this return an error if the field is not found? } } - + let number_db = index .facet_id_f64_docids .remap_key_type::>(); @@ -96,6 +274,11 @@ pub fn recursive_facet_sort<'t>( .facet_id_string_docids .remap_key_type::>(); - let candidates = recursive_facet_sort_inner(rtxn, number_db, string_db, &fields, candidates)?; - Ok(candidates) + Ok(SortedDocuments { + rtxn, + fields, + number_db, + string_db, + candidates, + }) }