2022-04-04 13:59:29 +02:00
|
|
|
use std::collections::BTreeSet;
|
|
|
|
|
2024-11-18 17:39:55 +01:00
|
|
|
use bumpalo::Bump;
|
2022-04-04 13:59:29 +02:00
|
|
|
use heed::EnvOpenOptions;
|
2024-11-18 17:39:55 +01:00
|
|
|
use milli::documents::mmap_from_objects;
|
2024-12-10 16:30:48 +01:00
|
|
|
use milli::progress::Progress;
|
2024-11-18 17:39:55 +01:00
|
|
|
use milli::update::new::indexer;
|
2024-11-19 11:24:36 +01:00
|
|
|
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
|
2024-11-18 17:39:55 +01:00
|
|
|
use milli::vector::EmbeddingConfigs;
|
|
|
|
use milli::{Criterion, Index, Object, Search, TermsMatchingStrategy};
|
|
|
|
use serde_json::from_value;
|
2022-04-04 13:59:29 +02:00
|
|
|
use tempfile::tempdir;
|
2024-11-18 17:39:55 +01:00
|
|
|
use ureq::json;
|
2022-04-01 10:50:01 +02:00
|
|
|
use Criterion::*;
|
|
|
|
|
|
|
|
/// Checks that the minimum word length for tolerating a single typo is
/// configurable through the index settings.
///
/// Before any settings change, the exact query `zeal` matches one document and
/// the misspelled query `zean` matches none. Once
/// `set_min_word_len_one_typo(4)` has been applied, `zean` matches one document.
#[test]
fn test_typo_tolerance_one_typo() {
    let index = super::setup_search_index_with_criteria(&[Typo]);

    // Searches against the untouched settings. The read transaction is scoped
    // so that it is released before the write transaction below is opened.
    {
        let rtxn = index.read_txn().unwrap();

        // The correctly spelled word is found.
        let mut exact_search = Search::new(&rtxn, &index);
        exact_search.query("zeal");
        exact_search.limit(10);
        exact_search.terms_matching_strategy(TermsMatchingStrategy::default());
        let exact_hits = exact_search.execute().unwrap().documents_ids.len();
        assert_eq!(exact_hits, 1);

        // The same word with one wrong letter is not found yet.
        let mut typo_search = Search::new(&rtxn, &index);
        typo_search.query("zean");
        typo_search.limit(10);
        typo_search.terms_matching_strategy(TermsMatchingStrategy::default());
        let typo_hits = typo_search.execute().unwrap().documents_ids.len();
        assert_eq!(typo_hits, 0);
    }

    let mut wtxn = index.write_txn().unwrap();

    let indexer_config = IndexerConfig::default();
    let mut settings = Settings::new(&mut wtxn, &index, &indexer_config);
    settings.set_min_word_len_one_typo(4);
    settings.execute(|_| (), || false).unwrap();

    // One typo is now accepted on 4-letter words. The search runs inside the
    // (uncommitted) write transaction so that it sees the new settings.
    let mut relaxed_search = Search::new(&wtxn, &index);
    relaxed_search.query("zean");
    relaxed_search.limit(10);
    relaxed_search.terms_matching_strategy(TermsMatchingStrategy::default());
    let relaxed_hits = relaxed_search.execute().unwrap().documents_ids.len();
    assert_eq!(relaxed_hits, 1);
}
|
|
|
|
|
|
|
|
/// Checks that the minimum word length for tolerating two typos is
/// configurable through the index settings.
///
/// Before any settings change, the exact query `zealand` matches one document
/// and the doubly misspelled query `zealemd` matches none. Once
/// `set_min_word_len_two_typos(7)` has been applied, `zealemd` matches one
/// document.
#[test]
fn test_typo_tolerance_two_typo() {
    let index = super::setup_search_index_with_criteria(&[Typo]);

    // Searches against the untouched settings. The read transaction is scoped
    // so that it is released before the write transaction below is opened.
    {
        let rtxn = index.read_txn().unwrap();

        // The correctly spelled word is found.
        let mut exact_search = Search::new(&rtxn, &index);
        exact_search.query("zealand");
        exact_search.limit(10);
        exact_search.terms_matching_strategy(TermsMatchingStrategy::default());
        let exact_hits = exact_search.execute().unwrap().documents_ids.len();
        assert_eq!(exact_hits, 1);

        // The same word with two wrong letters is not found yet.
        let mut typo_search = Search::new(&rtxn, &index);
        typo_search.query("zealemd");
        typo_search.limit(10);
        typo_search.terms_matching_strategy(TermsMatchingStrategy::default());
        let typo_hits = typo_search.execute().unwrap().documents_ids.len();
        assert_eq!(typo_hits, 0);
    }

    let mut wtxn = index.write_txn().unwrap();

    let indexer_config = IndexerConfig::default();
    let mut settings = Settings::new(&mut wtxn, &index, &indexer_config);
    settings.set_min_word_len_two_typos(7);
    settings.execute(|_| (), || false).unwrap();

    // Two typos are now accepted on 7-letter words. The search runs inside the
    // (uncommitted) write transaction so that it sees the new settings.
    let mut relaxed_search = Search::new(&wtxn, &index);
    relaxed_search.query("zealemd");
    relaxed_search.limit(10);
    relaxed_search.terms_matching_strategy(TermsMatchingStrategy::default());
    let relaxed_hits = relaxed_search.execute().unwrap().documents_ids.len();
    assert_eq!(relaxed_hits, 1);
}
|
2022-04-04 13:59:29 +02:00
|
|
|
|
|
|
|
/// Checks that a word registered as "exact" no longer matches its misspelled
/// neighbours.
///
/// A fresh index is filled with two documents whose `data` fields are
/// `zealand` and `zearand`. Searching `zealand` initially returns both
/// documents; after `zealand` is added to the exact-words setting, the same
/// query returns a single document.
#[test]
fn test_typo_disabled_on_word() {
    // Throw-away on-disk index.
    let index_dir = tempdir().unwrap();
    let mut env_options = EnvOpenOptions::new();
    env_options.map_size(4096 * 100);
    let index = Index::new(env_options, index_dir.path()).unwrap();

    // Two documents that differ by a single letter.
    let zealand_doc: Object = from_value(json!({ "id": 1usize, "data": "zealand" })).unwrap();
    let zearand_doc: Object = from_value(json!({ "id": 2usize, "data": "zearand" })).unwrap();
    let payload = mmap_from_objects(vec![zealand_doc, zearand_doc]);

    let mut wtxn = index.write_txn().unwrap();
    let rtxn = index.read_txn().unwrap();
    let indexing_config = IndexerConfig::default();

    let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let mut new_fields_ids_map = db_fields_ids_map.clone();
    let embedders = EmbeddingConfigs::default();
    let mut operation = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);

    operation.add_documents(&payload).unwrap();

    // Turn the pending operation into a set of document changes.
    let arena = Bump::new();
    let (document_changes, _operation_stats, primary_key) = operation
        .into_changes(
            &arena,
            &index,
            &rtxn,
            None,
            &mut new_fields_ids_map,
            &|| false,
            Progress::default(),
        )
        .unwrap();

    // Apply the changes inside the write transaction, then persist them.
    indexer::index(
        &mut wtxn,
        &index,
        &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
        indexing_config.grenad_parameters(),
        &db_fields_ids_map,
        new_fields_ids_map,
        primary_key,
        &document_changes,
        embedders,
        &|| false,
        &Progress::default(),
    )
    .unwrap();

    wtxn.commit().unwrap();

    // With the initial typo settings, `zealand` also matches `zearand`.
    {
        let search_txn = index.read_txn().unwrap();

        let mut tolerant_search = Search::new(&search_txn, &index);
        tolerant_search.query("zealand");
        tolerant_search.limit(10);
        tolerant_search.terms_matching_strategy(TermsMatchingStrategy::default());
        let tolerant_hits = tolerant_search.execute().unwrap().documents_ids.len();
        assert_eq!(tolerant_hits, 2);
    }

    let mut settings_txn = index.write_txn().unwrap();

    let settings_config = IndexerConfig::default();
    let mut settings = Settings::new(&mut settings_txn, &index, &settings_config);
    // From now on `zealand` must be matched exactly, without typos.
    let exact_words: BTreeSet<String> = std::iter::once(String::from("zealand")).collect();
    settings.set_exact_words(exact_words);
    settings.execute(|_| (), || false).unwrap();

    // The search runs inside the (uncommitted) write transaction so that it
    // sees the new settings.
    let mut exact_search = Search::new(&settings_txn, &index);
    exact_search.query("zealand");
    exact_search.limit(10);
    exact_search.terms_matching_strategy(TermsMatchingStrategy::default());
    let exact_hits = exact_search.execute().unwrap().documents_ids.len();
    assert_eq!(exact_hits, 1);
}
|
2022-04-04 14:47:07 +02:00
|
|
|
|
|
|
|
/// Checks that marking an attribute as "exact" disables typo matching on it.
///
/// The misspelled query `antebelum` (one `l` missing from `antebellum`)
/// initially returns one document; after `description` is registered as an
/// exact attribute, the same query returns nothing.
#[test]
fn test_disable_typo_on_attribute() {
    let index = super::setup_search_index_with_criteria(&[Typo]);

    // Search against the untouched settings. The read transaction is scoped
    // so that it is released before the write transaction below is opened.
    {
        let rtxn = index.read_txn().unwrap();

        let mut tolerant_search = Search::new(&rtxn, &index);
        // Deliberate typo: `antebel(l)um`.
        tolerant_search.query("antebelum");
        tolerant_search.limit(10);
        tolerant_search.terms_matching_strategy(TermsMatchingStrategy::default());
        let tolerant_hits = tolerant_search.execute().unwrap().documents_ids.len();
        assert_eq!(tolerant_hits, 1);
    }

    let mut wtxn = index.write_txn().unwrap();

    let indexer_config = IndexerConfig::default();
    let mut settings = Settings::new(&mut wtxn, &index, &indexer_config);
    // Typos are no longer allowed on the `description` attribute.
    settings.set_exact_attributes(std::iter::once(String::from("description")).collect());
    settings.execute(|_| (), || false).unwrap();

    // The search runs inside the (uncommitted) write transaction so that it
    // sees the new settings.
    let mut exact_search = Search::new(&wtxn, &index);
    exact_search.query("antebelum");
    exact_search.limit(10);
    exact_search.terms_matching_strategy(TermsMatchingStrategy::default());
    let exact_hits = exact_search.execute().unwrap().documents_ids.len();
    assert_eq!(exact_hits, 0);
}
|