338: Fix string fields sorting r=Kerollmops a=shekhirin

Resolves https://github.com/meilisearch/milli/issues/333

<details>
  <summary>curl checks</summary>
  
 ```console
➜  ~ curl -s 'localhost:7700/indexes/movies/search' -d '{"sort": ["title:asc"], "limit": 30}' | jq -r '.hits | map(.title)[]'
#1 Cheerleader Camp
#Horror
#RealityHigh
#Roxy
#SquadGoals
$5 a Day
$9.99
'71
(2)
(500) Days of Summer
(Girl)Friend
*batteries not included
...And God Created Woman
...And Justice for All
...E fuori nevica!
.45
1
1 Mile To You
1 Night
10
10 Cloverfield Lane
10 giorni senza mamma
10 Items or Less
10 Rillington Place
10 Rules for Sleeping Around
10 Things I Hate About You
10 to Midnight
10 Years
10,000 BC
10,000 Saints

➜  ~ curl -s 'localhost:7700/indexes/movies/search' -d '{"sort": ["title:desc"], "limit": 30}' | jq -r '.hits | map(.title)[]'
크게 될 놈
왓칭
뷰티플 마인드
노무현과 바보들
ハニー
Счастье – это… Часть 2
СОТКА
Смотри мою любовь
Позивний 'Бандерас'
Лошо момиче
Күлүк Хомус
Куда течет море
Каникулы президента
Ακίνητο Ποτάμι
Üç Harfliler: Beddua
È nata una Star?
Æon Flux
Ága
À propos de Nice
À Nos Amours
À l'aventure
¡Three Amigos!
Zulu Dawn
Zulu
Zulu
Zu: Warriors from the Magic Mountain
Zu Warriors
Zorro
Zorba the Greek
Zootopia
```
</details>

Co-authored-by: Alexey Shekhirin <a.shekhirin@gmail.com>
This commit is contained in:
bors[bot] 2021-09-07 08:28:23 +00:00 committed by GitHub
commit 446ed17589
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 146 additions and 46 deletions

View File

@ -243,24 +243,27 @@ impl<'t> Iterator for FacetStringGroupRevRange<'t> {
type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok(((_fid, level, left, right), docids))) => {
let must_be_returned = match self.end {
Included(end) => right <= end,
Excluded(end) => right < end,
Unbounded => true,
};
if must_be_returned {
match docids.decode() {
Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))),
Err(e) => Some(Err(e)),
loop {
match self.iter.next() {
Some(Ok(((_fid, level, left, right), docids))) => {
let must_be_returned = match self.end {
Included(end) => right <= end,
Excluded(end) => right < end,
Unbounded => true,
};
if must_be_returned {
match docids.decode() {
Ok((bounds, docids)) => {
return Some(Ok(((level, left, right), (bounds, docids))))
}
Err(e) => return Some(Err(e)),
}
}
} else {
None
continue;
}
Some(Err(e)) => return Some(Err(e)),
None => return None,
}
Some(Err(e)) => Some(Err(e)),
None => None,
}
}
}
@ -545,18 +548,18 @@ impl<'t> Iterator for FacetStringIter<'t> {
// the algorithm less complex to understand.
let last = match last {
Left(ascending) => match ascending {
Left(last) => Left(Left(last)),
Right(last) => Right(Left(last)),
Left(group) => Left(Left(group)),
Right(zero_level) => Right(Left(zero_level)),
},
Right(descending) => match descending {
Left(last) => Left(Right(last)),
Right(last) => Right(Right(last)),
Left(group) => Left(Right(group)),
Right(zero_level) => Right(Right(zero_level)),
},
};
match last {
Left(last) => {
for result in last {
Left(group) => {
for result in group {
match result {
Ok(((level, left, right), (string_bounds, mut docids))) => {
docids &= &*documents_ids;
@ -566,6 +569,27 @@ impl<'t> Iterator for FacetStringIter<'t> {
}
let result = if is_ascending {
match string_bounds {
Some((left, right)) => FacetStringLevelZeroRange::new(
self.rtxn,
self.db,
self.field_id,
Included(left),
Included(right),
)
.map(Right),
None => FacetStringGroupRange::new(
self.rtxn,
self.db,
self.field_id,
NonZeroU8::new(level.get() - 1).unwrap(),
Included(left),
Included(right),
)
.map(Left),
}
.map(Left)
} else {
match string_bounds {
Some((left, right)) => {
FacetStringLevelZeroRevRange::new(
@ -588,27 +612,6 @@ impl<'t> Iterator for FacetStringIter<'t> {
.map(Left),
}
.map(Right)
} else {
match string_bounds {
Some((left, right)) => FacetStringLevelZeroRange::new(
self.rtxn,
self.db,
self.field_id,
Included(left),
Included(right),
)
.map(Right),
None => FacetStringGroupRange::new(
self.rtxn,
self.db,
self.field_id,
NonZeroU8::new(level.get() - 1).unwrap(),
Included(left),
Included(right),
)
.map(Left),
}
.map(Left)
};
match result {
@ -624,9 +627,9 @@ impl<'t> Iterator for FacetStringIter<'t> {
}
}
}
Right(last) => {
Right(zero_level) => {
// level zero only
for result in last {
for result in zero_level {
match result {
Ok((normalized, original, mut docids)) => {
docids &= &*documents_ids;

View File

@ -1,6 +1,12 @@
use std::cmp::Reverse;
use big_s::S;
use milli::update::Settings;
use milli::{AscDesc, Criterion, Search, SearchResult};
use heed::EnvOpenOptions;
use itertools::Itertools;
use maplit::hashset;
use milli::update::{Settings, UpdateBuilder, UpdateFormat};
use milli::{AscDesc, Criterion, Index, Search, SearchResult};
use rand::Rng;
use Criterion::*;
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
@ -9,6 +15,7 @@ const ALLOW_TYPOS: bool = true;
const DISALLOW_TYPOS: bool = false;
const ALLOW_OPTIONAL_WORDS: bool = true;
const DISALLOW_OPTIONAL_WORDS: bool = false;
const ASC_DESC_CANDIDATES_THRESHOLD: usize = 1000;
macro_rules! test_criterion {
($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr, $sort_criteria:expr) => {
@ -357,3 +364,93 @@ fn criteria_mixup() {
assert_eq!(documents_ids, expected_external_ids);
}
}
#[test]
fn criteria_ascdesc() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_sortable_fields(hashset! {
S("name"),
S("age"),
});
builder.execute(|_, _| ()).unwrap();
// index documents
let mut builder = UpdateBuilder::new(0);
builder.max_memory(10 * 1024 * 1024); // 10MiB
let mut builder = builder.index_documents(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
builder.enable_autogenerate_docids();
let content = [
vec![S("name,age")],
(0..ASC_DESC_CANDIDATES_THRESHOLD + 1)
.map(|_| {
let mut rng = rand::thread_rng();
let age = rng.gen::<u32>().to_string();
let name = rng
.sample_iter(&rand::distributions::Alphanumeric)
.map(char::from)
.filter(|c| *c >= 'a' && *c <= 'z')
.take(10)
.collect::<String>();
format!("{},{}", name, age)
})
.collect::<Vec<_>>(),
]
.iter()
.flatten()
.join("\n");
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap();
let documents = index.all_documents(&rtxn).unwrap().map(|doc| doc.unwrap()).collect::<Vec<_>>();
for criterion in [Asc(S("name")), Desc(S("name")), Asc(S("age")), Desc(S("age"))] {
eprintln!("Testing with criterion: {:?}", &criterion);
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_criteria(vec![criterion.to_string()]);
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
let mut rtxn = index.read_txn().unwrap();
let mut search = Search::new(&mut rtxn, &index);
search.limit(ASC_DESC_CANDIDATES_THRESHOLD + 1);
let SearchResult { documents_ids, .. } = search.execute().unwrap();
let expected_document_ids = match criterion {
Asc(field_name) if field_name == "name" => {
documents.iter().sorted_by_key(|(_, obkv)| obkv.get(0).unwrap())
}
Desc(field_name) if field_name == "name" => {
documents.iter().sorted_by_key(|(_, obkv)| Reverse(obkv.get(0).unwrap()))
}
Asc(field_name) if field_name == "name" => {
documents.iter().sorted_by_key(|(_, obkv)| obkv.get(1).unwrap())
}
Desc(field_name) if field_name == "name" => {
documents.iter().sorted_by_key(|(_, obkv)| Reverse(obkv.get(1).unwrap()))
}
_ => continue,
}
.map(|(id, _)| *id)
.collect::<Vec<_>>();
assert_eq!(documents_ids, expected_document_ids);
}
}