From c2517e7d5f277e24b9dd83cedf571e6a04d22f68 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Thu, 2 Sep 2021 19:41:19 +0300 Subject: [PATCH 1/2] fix(facet): string fields sorting --- milli/src/search/facet/facet_string.rs | 91 +++++++++++++------------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 747b7fd3c..c55430cf1 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -243,24 +243,27 @@ impl<'t> Iterator for FacetStringGroupRevRange<'t> { type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), - Err(e) => Some(Err(e)), + loop { + match self.iter.next() { + Some(Ok(((_fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok((bounds, docids)) => { + return Some(Ok(((level, left, right), (bounds, docids)))) + } + Err(e) => return Some(Err(e)), + } } - } else { - None + continue; } + Some(Err(e)) => return Some(Err(e)), + None => return None, } - Some(Err(e)) => Some(Err(e)), - None => None, } } } @@ -545,18 +548,18 @@ impl<'t> Iterator for FacetStringIter<'t> { // the algorithm less complex to understand. let last = match last { Left(ascending) => match ascending { - Left(last) => Left(Left(last)), - Right(last) => Right(Left(last)), + Left(group) => Left(Left(group)), + Right(zero_level) => Right(Left(zero_level)), }, Right(descending) => match descending { - Left(last) => Left(Right(last)), - Right(last) => Right(Right(last)), + Left(group) => Left(Right(group)), + Right(zero_level) => Right(Right(zero_level)), }, }; match last { - Left(last) => { - for result in last { + Left(group) => { + for result in group { match result { Ok(((level, left, right), (string_bounds, mut docids))) => { docids &= &*documents_ids; @@ -566,6 +569,27 @@ impl<'t> Iterator for FacetStringIter<'t> { } let result = if is_ascending { + match string_bounds { + Some((left, right)) => FacetStringLevelZeroRange::new( + self.rtxn, + self.db, + self.field_id, + Included(left), + Included(right), + ) + .map(Right), + None => FacetStringGroupRange::new( + self.rtxn, + self.db, + self.field_id, + NonZeroU8::new(level.get() - 1).unwrap(), + Included(left), + Included(right), + ) + .map(Left), + } + .map(Left) + } else { match string_bounds { Some((left, right)) => { FacetStringLevelZeroRevRange::new( @@ -588,27 +612,6 @@ impl<'t> Iterator for FacetStringIter<'t> { .map(Left), } .map(Right) - } else { - match string_bounds { - Some((left, right)) => FacetStringLevelZeroRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right), - None => FacetStringGroupRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Left) }; match result { @@ -624,9 +627,9 @@ impl<'t> Iterator for FacetStringIter<'t> { } } } - Right(last) => { + Right(zero_level) => { // level zero only - for result in last { + for result in zero_level { match result { Ok((normalized, original, mut docids)) => { docids &= &*documents_ids; From 0be09555f1ec7724ac2789618d0f5c1132a30001 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Fri, 3 Sep 2021 13:00:48 +0300 Subject: [PATCH 2/2] test(search): asc/desc criteria for large datasets --- milli/tests/search/query_criteria.rs | 101 ++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 2 deletions(-) diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 1723c1d6f..c9720d652 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,6 +1,12 @@ +use std::cmp::Reverse; + use big_s::S; -use milli::update::Settings; -use milli::{AscDesc, Criterion, Search, SearchResult}; +use heed::EnvOpenOptions; +use itertools::Itertools; +use maplit::hashset; +use milli::update::{Settings, UpdateBuilder, UpdateFormat}; +use milli::{AscDesc, Criterion, Index, Search, SearchResult}; +use rand::Rng; use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -9,6 +15,7 @@ const ALLOW_TYPOS: bool = true; const DISALLOW_TYPOS: bool = false; const ALLOW_OPTIONAL_WORDS: bool = true; const DISALLOW_OPTIONAL_WORDS: bool = false; +const ASC_DESC_CANDIDATES_THRESHOLD: usize = 1000; macro_rules! test_criterion { ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr, $sort_criteria:expr) => { @@ -357,3 +364,93 @@ fn criteria_mixup() { assert_eq!(documents_ids, expected_external_ids); } } + +#[test] +fn criteria_ascdesc() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut builder = Settings::new(&mut wtxn, &index, 0); + + builder.set_sortable_fields(hashset! { + S("name"), + S("age"), + }); + builder.execute(|_, _| ()).unwrap(); + + // index documents + let mut builder = UpdateBuilder::new(0); + builder.max_memory(10 * 1024 * 1024); // 10MiB + let mut builder = builder.index_documents(&mut wtxn, &index); + builder.update_format(UpdateFormat::Csv); + builder.enable_autogenerate_docids(); + + let content = [ + vec![S("name,age")], + (0..ASC_DESC_CANDIDATES_THRESHOLD + 1) + .map(|_| { + let mut rng = rand::thread_rng(); + + let age = rng.gen::().to_string(); + let name = rng + .sample_iter(&rand::distributions::Alphanumeric) + .map(char::from) + .filter(|c| *c >= 'a' && *c <= 'z') + .take(10) + .collect::(); + + format!("{},{}", name, age) + }) + .collect::>(), + ] + .iter() + .flatten() + .join("\n"); + builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let documents = index.all_documents(&rtxn).unwrap().map(|doc| doc.unwrap()).collect::>(); + + for criterion in [Asc(S("name")), Desc(S("name")), Asc(S("age")), Desc(S("age"))] { + eprintln!("Testing with criterion: {:?}", &criterion); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_criteria(vec![criterion.to_string()]); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.limit(ASC_DESC_CANDIDATES_THRESHOLD + 1); + + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + + let expected_document_ids = match criterion { + Asc(field_name) if field_name == "name" => { + documents.iter().sorted_by_key(|(_, obkv)| obkv.get(0).unwrap()) + } + Desc(field_name) if field_name == "name" => { + documents.iter().sorted_by_key(|(_, obkv)| Reverse(obkv.get(0).unwrap())) + } + Asc(field_name) if field_name == "name" => { + documents.iter().sorted_by_key(|(_, obkv)| obkv.get(1).unwrap()) + } + Desc(field_name) if field_name == "name" => { + documents.iter().sorted_by_key(|(_, obkv)| Reverse(obkv.get(1).unwrap())) + } + _ => continue, + } + .map(|(id, _)| *id) + .collect::>(); + + assert_eq!(documents_ids, expected_document_ids); + } +}