1
0
mirror of https://github.com/meilisearch/MeiliSearch synced 2025-01-25 20:57:35 +01:00

391 lines
18 KiB
Rust
Raw Normal View History

use std::cmp::Reverse;
2021-06-03 14:44:53 +02:00
use big_s::S;
2024-11-18 17:39:55 +01:00
use bumpalo::Bump;
use heed::EnvOpenOptions;
use itertools::Itertools;
use maplit::hashset;
2024-12-10 16:30:48 +01:00
use milli::progress::Progress;
2024-11-18 17:39:55 +01:00
use milli::update::new::indexer;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
2024-11-18 17:39:55 +01:00
use milli::vector::EmbeddingConfigs;
2022-08-18 17:36:08 +02:00
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy};
use rand::Rng;
2021-06-16 18:33:33 +02:00
use Criterion::*;
2021-06-03 14:44:53 +02:00
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
2021-06-08 12:33:02 +02:00
2022-08-18 17:36:08 +02:00
/// Terms matching strategy used by the tests that allow optional query words
/// (alias for `TermsMatchingStrategy::Last`).
// NOTE(review): presumably `Last` lets trailing query words be dropped — confirm against milli.
const ALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::Last;
/// Terms matching strategy used by the tests that do not allow optional query words
/// (alias for `TermsMatchingStrategy::All`).
const DISALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::All;
/// `criteria_ascdesc` indexes one more document than this value and uses that same
/// count as its search limit.
// NOTE(review): presumably mirrors a candidates threshold inside milli's asc/desc
// criterion — confirm against the engine.
const ASC_DESC_CANDIDATES_THRESHOLD: usize = 1000;
2021-06-08 12:33:02 +02:00
/// Generates a `#[test]` function named `$func`.
///
/// The generated test builds the shared search index with `$criteria`, runs
/// `search::TEST_QUERY` with the `$optional_word` terms matching strategy and the
/// `$sort_criteria` sort rules, and asserts that the returned documents (as external
/// ids) match `search::expected_order` for the same inputs.
///
/// `$sort_criteria` is expanded twice, so it should be a side-effect-free expression.
macro_rules! test_criterion {
    ($func:ident, $optional_word:ident, $criteria:expr, $sort_criteria:expr) => {
        #[test]
        fn $func() {
            let ranking_rules = $criteria;
            let test_index = search::setup_search_index_with_criteria(&ranking_rules);
            let read_txn = test_index.read_txn().unwrap();

            // Ask for as many hits as there are documents so the whole ordering is checked.
            let mut searcher = Search::new(&read_txn, &test_index);
            searcher.query(search::TEST_QUERY);
            searcher.limit(EXTERNAL_DOCUMENTS_IDS.len());
            searcher.terms_matching_strategy($optional_word);
            searcher.sort_criteria($sort_criteria);

            let internal_ids = searcher.execute().unwrap().documents_ids;

            let expected_order =
                search::expected_order(&ranking_rules, $optional_word, &$sort_criteria[..]);
            let expected: Vec<_> =
                expected_order.into_iter().map(|document| document.id).collect();

            let actual = search::internal_to_external_ids(&test_index, &internal_ids);
            assert_eq!(actual, expected);
        }
    };
}
test_criterion!(none, DISALLOW_OPTIONAL_WORDS, vec![], vec![]);
test_criterion!(words, ALLOW_OPTIONAL_WORDS, vec![Words], vec![]);
test_criterion!(attribute, DISALLOW_OPTIONAL_WORDS, vec![Attribute], vec![]);
test_criterion!(typo, DISALLOW_OPTIONAL_WORDS, vec![Typo], vec![]);
test_criterion!(exactness, DISALLOW_OPTIONAL_WORDS, vec![Exactness], vec![]);
test_criterion!(proximity, DISALLOW_OPTIONAL_WORDS, vec![Proximity], vec![]);
test_criterion!(asc, DISALLOW_OPTIONAL_WORDS, vec![Asc(S("asc_desc_rank"))], vec![]);
test_criterion!(desc, DISALLOW_OPTIONAL_WORDS, vec![Desc(S("asc_desc_rank"))], vec![]);
2021-08-23 11:37:18 +02:00
test_criterion!(
asc_unexisting_field,
2021-08-23 11:37:18 +02:00
DISALLOW_OPTIONAL_WORDS,
vec![Asc(S("unexisting_field"))],
vec![]
2021-06-17 15:19:03 +02:00
);
2021-06-17 15:19:03 +02:00
test_criterion!(
desc_unexisting_field,
2021-06-17 15:19:03 +02:00
DISALLOW_OPTIONAL_WORDS,
2021-08-23 11:37:18 +02:00
vec![Desc(S("unexisting_field"))],
vec![]
);
test_criterion!(empty_sort_by, DISALLOW_OPTIONAL_WORDS, vec![Sort], vec![]);
2021-08-23 11:37:18 +02:00
test_criterion!(
sort_by_asc,
2021-08-23 11:37:18 +02:00
DISALLOW_OPTIONAL_WORDS,
vec![Sort],
2021-09-01 17:43:18 +02:00
vec![AscDesc::Asc(Member::Field(S("tag")))]
2021-08-23 11:37:18 +02:00
);
test_criterion!(
sort_by_desc,
2021-08-23 11:37:18 +02:00
DISALLOW_OPTIONAL_WORDS,
vec![Sort],
2021-09-01 17:43:18 +02:00
vec![AscDesc::Desc(Member::Field(S("tag")))]
2021-06-17 15:19:03 +02:00
);
test_criterion!(
default_criteria_order,
ALLOW_OPTIONAL_WORDS,
2021-08-23 11:37:18 +02:00
vec![Words, Typo, Proximity, Attribute, Exactness],
vec![]
2021-06-17 15:19:03 +02:00
);
2021-06-03 14:44:53 +02:00
/// Checks the search result order for every ordering of the ranking rules `Attribute`,
/// `Desc(asc_desc_rank)`, `Exactness`, `Proximity` and `Typo` placed after a leading
/// `Words` rule.
///
/// For each ordering the index settings are updated, `search::TEST_QUERY` is executed
/// with optional words allowed, and the returned external ids are compared with
/// `search::expected_order`.
#[test]
fn criteria_mixup() {
    use Criterion::*;

    let index = search::setup_search_index_with_criteria(&[
        Words,
        Attribute,
        Desc(S("asc_desc_rank")),
        Exactness,
        Proximity,
        Typo,
    ]);

    // `Words` always comes first; the five remaining rules are tried in every possible
    // order. The orderings are generated rather than listed by hand so that none can be
    // duplicated or forgotten.
    let trailing_rules = vec![Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, Typo];
    let trailing_count = trailing_rules.len();
    let criteria_mix: Vec<Vec<Criterion>> = trailing_rules
        .into_iter()
        .permutations(trailing_count)
        .map(|tail| std::iter::once(Words).chain(tail).collect())
        .collect();
    // 5! orderings of the trailing rules.
    assert_eq!(criteria_mix.len(), 120);

    let config = IndexerConfig::default();
    for criteria in criteria_mix {
        eprintln!("Testing with criteria order: {:?}", &criteria);

        // Update the ranking rules stored in the index.
        let mut wtxn = index.write_txn().unwrap();
        let mut builder = Settings::new(&mut wtxn, &index, &config);
        builder.set_criteria(criteria.clone());
        builder.execute(|_| (), || false).unwrap();
        wtxn.commit().unwrap();

        let rtxn = index.read_txn().unwrap();

        let mut search = Search::new(&rtxn, &index);
        search.query(search::TEST_QUERY);
        search.limit(EXTERNAL_DOCUMENTS_IDS.len());
        search.terms_matching_strategy(ALLOW_OPTIONAL_WORDS);

        let SearchResult { documents_ids, .. } = search.execute().unwrap();

        let expected_external_ids: Vec<_> =
            search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, &[])
                .into_iter()
                .map(|d| d.id)
                .collect();
        let documents_ids = search::internal_to_external_ids(&index, &documents_ids);

        assert_eq!(documents_ids, expected_external_ids);
    }
}
/// Checks the `Asc`/`Desc` ranking rules on the `name` and `age` fields.
///
/// The test creates a fresh index with `name` and `age` declared sortable, indexes
/// `ASC_DESC_CANDIDATES_THRESHOLD + 1` documents with random values, then, for each of
/// the four criteria, sets it as the only ranking rule and compares the ids returned by
/// a placeholder search with the stored documents sorted on the raw value of the
/// matching field.
#[test]
fn criteria_ascdesc() {
    let path = tempfile::tempdir().unwrap();
    let mut options = EnvOpenOptions::new();
    options.map_size(12 * 1024 * 1024); // 12 MiB
    let index = Index::new(options, &path).unwrap();

    // Declare the fields the criteria will sort on.
    let mut wtxn = index.write_txn().unwrap();
    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut wtxn, &index, &config);
    builder.set_sortable_fields(hashset! {
        S("name"),
        S("age"),
    });
    builder.execute(|_| (), || false).unwrap();
    wtxn.commit().unwrap();

    let mut wtxn = index.write_txn().unwrap();
    let rtxn = index.read_txn().unwrap();

    // index documents
    let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
    let indexer_alloc = Bump::new();
    let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let mut new_fields_ids_map = db_fields_ids_map.clone();

    let embedders = EmbeddingConfigs::default();
    let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);

    // Write the documents back to back as JSON objects into a temporary file.
    let mut file = tempfile::tempfile().unwrap();
    for _ in 0..=ASC_DESC_CANDIDATES_THRESHOLD {
        let mut rng = rand::thread_rng();

        // `age` is stored as a string holding the decimal digits of a random `u32`.
        let age = rng.gen::<u32>().to_string();
        // `name` is made of ten random ASCII lowercase letters.
        let name = rng
            .sample_iter(&rand::distributions::Alphanumeric)
            .map(char::from)
            .filter(char::is_ascii_lowercase)
            .take(10)
            .collect::<String>();

        let json = serde_json::json!({
            "name": name,
            "age": age,
        });
        serde_json::to_writer(&mut file, &json).unwrap();
    }
    file.sync_all().unwrap();

    // SAFETY: `file` is an anonymous temporary file created by this test; nothing else
    // holds a handle to it and it is not written to again while the mapping is alive.
    let payload = unsafe { memmap2::Mmap::map(&file).unwrap() };
    indexer.add_documents(&payload).unwrap();

    let (document_changes, _operation_stats, primary_key) = indexer
        .into_changes(
            &indexer_alloc,
            &index,
            &rtxn,
            None,
            &mut new_fields_ids_map,
            &|| false,
            Progress::default(),
        )
        .unwrap();

    indexer::index(
        &mut wtxn,
        &index,
        &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
        config.grenad_parameters(),
        &db_fields_ids_map,
        new_fields_ids_map,
        primary_key,
        &document_changes,
        embedders,
        &|| false,
        &Progress::default(),
    )
    .unwrap();
    wtxn.commit().unwrap();

    let rtxn = index.read_txn().unwrap();
    let documents = index.all_documents(&rtxn).unwrap().map(|doc| doc.unwrap()).collect::<Vec<_>>();

    for criterion in [Asc(S("name")), Desc(S("name")), Asc(S("age")), Desc(S("age"))] {
        eprintln!("Testing with criterion: {:?}", &criterion);

        let mut wtxn = index.write_txn().unwrap();
        let mut builder = Settings::new(&mut wtxn, &index, &config);
        builder.set_criteria(vec![criterion.clone()]);
        builder.execute(|_| (), || false).unwrap();
        wtxn.commit().unwrap();

        let rtxn = index.read_txn().unwrap();

        // Placeholder search (no query) returning every document.
        let mut search = Search::new(&rtxn, &index);
        search.limit(ASC_DESC_CANDIDATES_THRESHOLD + 1);

        let SearchResult { documents_ids, .. } = search.execute().unwrap();

        // NOTE(review): assumes `name` has field id 0 and `age` has field id 1 in the
        // stored documents — confirm against the index's fields-ids map.
        let expected_document_ids = match criterion {
            Asc(field_name) if field_name == "name" => {
                documents.iter().sorted_by_key(|(_, obkv)| obkv.get(0).unwrap())
            }
            Desc(field_name) if field_name == "name" => {
                documents.iter().sorted_by_key(|(_, obkv)| Reverse(obkv.get(0).unwrap()))
            }
            Asc(field_name) if field_name == "age" => {
                documents.iter().sorted_by_key(|(_, obkv)| obkv.get(1).unwrap())
            }
            Desc(field_name) if field_name == "age" => {
                documents.iter().sorted_by_key(|(_, obkv)| Reverse(obkv.get(1).unwrap()))
            }
            // Every criterion listed above must be checked; never skip one silently.
            other => unreachable!("unexpected criterion in test: {:?}", other),
        }
        .map(|(id, _)| *id)
        .collect::<Vec<_>>();

        assert_eq!(documents_ids, expected_document_ids);
    }
}