//! Shared fixtures and helpers for the search integration tests
//! (`milli/tests/search/mod.rs`).

use std::cmp::Reverse;
2021-06-17 13:56:09 +02:00
use std::collections::HashSet;
use std::io::Cursor;
2021-06-17 13:56:09 +02:00
2021-06-03 14:44:53 +02:00
use big_s::S;
2021-06-17 13:56:09 +02:00
use either::{Either, Left, Right};
2021-06-03 14:44:53 +02:00
use heed::EnvOpenOptions;
use maplit::{hashmap, hashset};
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
2022-08-18 17:36:08 +02:00
use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
use serde::{Deserialize, Deserializer};
2021-06-03 14:44:53 +02:00
use slice_group_by::GroupBy;
2021-06-17 14:24:59 +02:00
mod distinct;
mod facet_distribution;
2021-06-17 13:56:09 +02:00
mod filters;
2022-10-13 20:04:17 +02:00
mod phrase_search;
2021-06-03 14:44:53 +02:00
mod query_criteria;
mod sort;
2022-04-01 10:50:01 +02:00
mod typo_tolerance;
2021-06-03 14:44:53 +02:00
2022-10-10 15:28:03 +02:00
/// Query string made of the three words (`hello`, `world`, `america`) for which
/// `setup_search_index_with_criteria` registers synonyms.
pub const TEST_QUERY: &str = "hello world america";

/// External document ids (`"A"` through `"Q"`) that `internal_to_external_ids` resolves
/// against the index.
pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] =
    &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"];

/// Test documents, embedded at compile time from `../assets/test_set.ndjson` and parsed
/// as a stream of JSON values by the helpers in this module.
pub const CONTENT: &str = include_str!("../assets/test_set.ndjson");
pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let config = IndexerConfig::default();
2021-06-03 14:44:53 +02:00
let mut builder = Settings::new(&mut wtxn, &index, &config);
2021-06-03 14:44:53 +02:00
2023-01-11 12:14:17 +01:00
builder.set_criteria(criteria.to_vec());
2021-06-16 18:33:33 +02:00
builder.set_filterable_fields(hashset! {
2021-06-03 14:44:53 +02:00
S("tag"),
S("asc_desc_rank"),
2021-09-08 13:08:48 +02:00
S("_geo"),
S("opt1"),
S("opt1.opt2"),
S("tag_in")
2021-06-03 14:44:53 +02:00
});
2021-08-23 11:37:18 +02:00
builder.set_sortable_fields(hashset! {
S("tag"),
S("asc_desc_rank"),
});
2021-06-16 18:33:33 +02:00
builder.set_synonyms(hashmap! {
2021-06-03 14:44:53 +02:00
S("hello") => vec![S("good morning")],
S("world") => vec![S("earth")],
S("america") => vec![S("the united states")],
});
2021-06-16 18:33:33 +02:00
builder.set_searchable_fields(vec![S("title"), S("description")]);
builder.execute(|_| (), || false).unwrap();
2021-06-03 14:44:53 +02:00
// index documents
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let builder =
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
let reader = Cursor::new(CONTENT.as_bytes());
2021-10-24 14:41:36 +02:00
2022-07-21 14:51:41 +02:00
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
let object = result.unwrap();
documents_builder.append_json_object(&object).unwrap();
2021-10-24 14:41:36 +02:00
}
let vector = documents_builder.into_inner().unwrap();
// index documents
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
2022-06-15 14:35:19 +02:00
let (builder, user_error) = builder.add_documents(content).unwrap();
user_error.unwrap();
builder.execute().unwrap();
2021-06-03 14:44:53 +02:00
wtxn.commit().unwrap();
index
}
pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec<String> {
2023-01-17 18:01:26 +01:00
let rtxn = index.read_txn().unwrap();
let docid_map = index.external_documents_ids(&rtxn).unwrap();
2021-06-16 18:33:33 +02:00
let docid_map: std::collections::HashMap<_, _> =
EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect();
2021-06-03 14:44:53 +02:00
internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect()
}
2021-06-16 18:33:33 +02:00
pub fn expected_order(
criteria: &[Criterion],
authorize_typo: bool,
2022-08-18 17:36:08 +02:00
optional_words: TermsMatchingStrategy,
2021-08-23 11:37:18 +02:00
sort_by: &[AscDesc],
2021-06-16 18:33:33 +02:00
) -> Vec<TestDocument> {
let dataset =
serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect();
2021-06-03 14:44:53 +02:00
let mut groups: Vec<Vec<TestDocument>> = vec![dataset];
for criterion in criteria {
let mut new_groups = Vec::new();
for group in groups.iter_mut() {
match criterion {
Criterion::Attribute => {
group.sort_by_key(|d| d.attribute_rank);
2021-06-16 18:33:33 +02:00
new_groups
.extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from));
}
2021-06-03 14:44:53 +02:00
Criterion::Exactness => {
group.sort_by_key(|d| d.exact_rank);
new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from));
2021-06-16 18:33:33 +02:00
}
2021-06-03 14:44:53 +02:00
Criterion::Proximity => {
group.sort_by_key(|d| d.proximity_rank);
2021-06-16 18:33:33 +02:00
new_groups
.extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from));
}
2021-09-01 17:43:18 +02:00
Criterion::Sort if sort_by == [AscDesc::Asc(Member::Field(S("tag")))] => {
2021-08-23 11:37:18 +02:00
group.sort_by_key(|d| d.sort_by_rank);
new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from));
}
2021-09-01 17:43:18 +02:00
Criterion::Sort if sort_by == [AscDesc::Desc(Member::Field(S("tag")))] => {
2021-08-23 11:37:18 +02:00
group.sort_by_key(|d| Reverse(d.sort_by_rank));
new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from));
}
2021-06-03 14:44:53 +02:00
Criterion::Typo => {
group.sort_by_key(|d| d.typo_rank);
new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from));
2021-06-16 18:33:33 +02:00
}
2021-06-03 14:44:53 +02:00
Criterion::Words => {
group.sort_by_key(|d| d.word_rank);
new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from));
2021-06-16 18:33:33 +02:00
}
2021-06-08 12:33:02 +02:00
Criterion::Asc(field_name) if field_name == "asc_desc_rank" => {
2021-06-03 14:44:53 +02:00
group.sort_by_key(|d| d.asc_desc_rank);
2021-06-16 18:33:33 +02:00
new_groups
.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
}
Criterion::Desc(field_name) if field_name == "asc_desc_rank" => {
2021-08-23 11:37:18 +02:00
group.sort_by_key(|d| Reverse(d.asc_desc_rank));
2021-06-16 18:33:33 +02:00
new_groups
.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from));
}
2021-08-23 11:37:18 +02:00
Criterion::Asc(_) | Criterion::Desc(_) | Criterion::Sort => {
new_groups.push(group.clone())
}
2021-06-03 14:44:53 +02:00
}
}
groups = std::mem::take(&mut new_groups);
}
2022-08-18 17:36:08 +02:00
if authorize_typo && optional_words == TermsMatchingStrategy::default() {
2021-06-03 14:44:53 +02:00
groups.into_iter().flatten().collect()
2022-08-18 17:36:08 +02:00
} else if optional_words == TermsMatchingStrategy::default() {
2021-06-03 14:44:53 +02:00
groups.into_iter().flatten().filter(|d| d.typo_rank == 0).collect()
2021-06-08 14:11:00 +02:00
} else if authorize_typo {
2021-06-03 14:44:53 +02:00
groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect()
} else {
groups.into_iter().flatten().filter(|d| d.word_rank == 0 && d.typo_rank == 0).collect()
}
}
2021-06-17 13:56:09 +02:00
/// Evaluates one textual `filter` against `document`, returning the document id when it
/// matches and `None` otherwise.
///
/// Only the filter shapes handled below are understood; any other filter matches
/// nothing. The branches are tried in order, so `!=` must be tested before `=`.
///
/// # Panics
///
/// Panics if the right-hand side of an `asc_desc_rank` comparison using `=`, `<` or `>`
/// is not a valid `u32`. With `!=`, an unparsable value simply counts as "different".
fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> {
    let is_match = if let Some((field, value)) = filter.split_once("!=") {
        (field == "tag" && document.tag != value)
            || (field == "asc_desc_rank"
                && Ok(&document.asc_desc_rank) != value.parse::<u32>().as_ref())
    } else if let Some((field, value)) = filter.split_once('=') {
        (field == "tag" && document.tag == value)
            || (field == "asc_desc_rank"
                && document.asc_desc_rank == value.parse::<u32>().unwrap())
    } else if let Some(("asc_desc_rank", bound)) = filter.split_once('<') {
        document.asc_desc_rank < bound.parse().unwrap()
    } else if let Some(("asc_desc_rank", bound)) = filter.split_once('>') {
        document.asc_desc_rank > bound.parse().unwrap()
    } else if filter.starts_with("_geoRadius") {
        // NOTE(review): the two geo thresholds differ by a factor of ten, so a document
        // whose `geo_rank` lies in 100000..=1000000 matches neither `_geoRadius` nor
        // `NOT _geoRadius` -- confirm this is intended for the test set.
        document.geo_rank < 100000
    } else if filter.starts_with("NOT _geoRadius") {
        document.geo_rank > 1000000
    } else if matches!(filter, "opt1 EXISTS" | "NOT opt1 NOT EXISTS") {
        document.opt1.is_some()
    } else if matches!(filter, "NOT opt1 EXISTS" | "opt1 NOT EXISTS") {
        document.opt1.is_none()
    } else if matches!(filter, "opt1.opt2 EXISTS") {
        // Either a flattened `opt1.opt2` field, or an `opt2` key nested inside `opt1`.
        document.opt1opt2.is_some()
            || document.opt1.as_ref().map_or(false, |opt1| contains_key_rec(opt1, "opt2"))
    } else if matches!(
        filter,
        "tag_in IN[1, 2, 3, four, five]" | "NOT tag_in NOT IN[1, 2, 3, four, five]"
    ) {
        matches!(document.id.as_str(), "A" | "B" | "C" | "D" | "E")
    } else if matches!(filter, "tag_in NOT IN[1, 2, 3, four, five]") {
        !matches!(document.id.as_str(), "A" | "B" | "C" | "D" | "E")
    } else {
        false
    };

    is_match.then(|| document.id.clone())
}
/// Returns `true` when `key` is the name of a field anywhere inside `v`.
///
/// Arrays and objects are searched recursively; any other kind of value contains no key.
pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool {
    match v {
        serde_json::Value::Array(items) => items.iter().any(|item| contains_key_rec(item, key)),
        serde_json::Value::Object(fields) => {
            fields.iter().any(|(name, value)| name == key || contains_key_rec(value, key))
        }
        _ => false,
    }
}
2021-06-17 13:56:09 +02:00
pub fn expected_filtered_ids(filters: Vec<Either<Vec<&str>, &str>>) -> HashSet<String> {
let dataset: Vec<TestDocument> =
2021-06-17 13:56:09 +02:00
serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect();
let mut filtered_ids: HashSet<_> = dataset.iter().map(|d| d.id.clone()).collect();
for either in filters {
let ids = match either {
Left(array) => array
.into_iter()
.map(|f| {
let ids: HashSet<String> =
dataset.iter().filter_map(|d| execute_filter(f, d)).collect();
ids
})
.reduce(|a, b| a.union(&b).cloned().collect())
.unwrap(),
Right(filter) => {
let ids: HashSet<String> =
dataset.iter().filter_map(|d| execute_filter(filter, d)).collect();
ids
}
};
filtered_ids = filtered_ids.intersection(&ids).cloned().collect();
}
filtered_ids
}
#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
2021-06-03 14:44:53 +02:00
pub struct TestDocument {
pub id: String,
pub word_rank: u32,
pub typo_rank: u32,
pub proximity_rank: u32,
pub attribute_rank: u32,
pub exact_rank: u32,
pub asc_desc_rank: u32,
2021-08-23 11:37:18 +02:00
pub sort_by_rank: u32,
2021-09-08 13:08:48 +02:00
pub geo_rank: u32,
2021-06-03 14:44:53 +02:00
pub title: String,
pub description: String,
pub tag: String,
#[serde(default, deserialize_with = "some_option")]
pub opt1: Option<serde_json::Value>,
#[serde(default, deserialize_with = "some_option", rename = "opt1.opt2")]
pub opt1opt2: Option<serde_json::Value>,
}
/// Deserializes whatever value is present and wraps it in `Some`.
///
/// Used together with `#[serde(default)]`: a missing field yields `None`, while a field
/// that is present always yields `Some(value)` -- presumably so that an explicit `null`
/// is kept as `Some(Value::Null)` instead of being collapsed into `None`.
fn some_option<'de, D>(deserializer: D) -> Result<Option<serde_json::Value>, D::Error>
where
    D: Deserializer<'de>,
{
    serde_json::Value::deserialize(deserializer).map(Some)
}