2025-03-13 11:07:38 +01:00

112 lines
3.5 KiB
Rust

use std::io::Write;
use big_s::S;
use bumpalo::Bump;
use heed::EnvOpenOptions;
use maplit::{btreemap, hashset};
use crate::progress::Progress;
use crate::update::new::indexer;
use crate::update::{IndexerConfig, Settings};
use crate::vector::EmbeddingConfigs;
use crate::{db_snap, Criterion, FilterableAttributesRule, Index};
pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson");
use crate::constants::RESERVED_GEO_FIELD_NAME;
pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
let path = tempfile::tempdir().unwrap();
let options = EnvOpenOptions::new();
let mut options = options.read_txn_without_tls();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path, true).unwrap();
let mut wtxn = index.write_txn().unwrap();
let config = IndexerConfig::default();
let mut builder = Settings::new(&mut wtxn, &index, &config);
builder.set_criteria(criteria.to_vec());
builder.set_filterable_fields(vec![
FilterableAttributesRule::Field(S("tag")),
FilterableAttributesRule::Field(S("asc_desc_rank")),
FilterableAttributesRule::Field(S(RESERVED_GEO_FIELD_NAME)),
FilterableAttributesRule::Field(S("opt1")),
FilterableAttributesRule::Field(S("opt1.opt2")),
FilterableAttributesRule::Field(S("tag_in")),
]);
builder.set_sortable_fields(hashset! {
S("tag"),
S("asc_desc_rank"),
});
builder.set_synonyms(btreemap! {
S("hello") => vec![S("good morning")],
S("world") => vec![S("earth")],
S("america") => vec![S("the united states")],
});
builder.set_searchable_fields(vec![S("title"), S("description")]);
builder.execute(|_| (), || false).unwrap();
wtxn.commit().unwrap();
// index documents
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
let rtxn = index.read_txn().unwrap();
let mut wtxn = index.write_txn().unwrap();
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let mut new_fields_ids_map = db_fields_ids_map.clone();
let embedders = EmbeddingConfigs::default();
let mut indexer = indexer::DocumentOperation::new();
let mut file = tempfile::tempfile().unwrap();
file.write_all(CONTENT.as_bytes()).unwrap();
file.sync_all().unwrap();
let payload = unsafe { memmap2::Mmap::map(&file).unwrap() };
// index documents
indexer.replace_documents(&payload).unwrap();
let indexer_alloc = Bump::new();
let (document_changes, operation_stats, primary_key) = indexer
.into_changes(
&indexer_alloc,
&index,
&rtxn,
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
)
.unwrap();
if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
panic!("{error}");
}
indexer::index(
&mut wtxn,
&index,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
config.grenad_parameters(),
&db_fields_ids_map,
new_fields_ids_map,
primary_key,
&document_changes,
embedders,
&|| false,
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
drop(rtxn);
index
}
#[test]
fn snapshot_integration_dataset() {
let index = setup_search_index_with_criteria(&[Criterion::Attribute]);
db_snap!(index, word_position_docids, @"3c9347a767bceef3beb31465f1e5f3ae");
}