mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Fix some tests but not all of them
This commit is contained in:
parent
670aff5553
commit
aba8a0e9e0
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -2041,7 +2041,9 @@ name = "fuzzers"
|
|||||||
version = "1.11.0"
|
version = "1.11.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arbitrary",
|
"arbitrary",
|
||||||
|
"bumpalo",
|
||||||
"clap",
|
"clap",
|
||||||
|
"either",
|
||||||
"fastrand",
|
"fastrand",
|
||||||
"milli",
|
"milli",
|
||||||
"serde",
|
"serde",
|
||||||
|
@ -140,7 +140,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
|
pub fn documents_from(filename: &str, filetype: &str) -> Mmap {
|
||||||
let reader = File::open(filename)
|
let reader = File::open(filename)
|
||||||
.unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
|
.unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
|
||||||
let reader = BufReader::new(reader);
|
let reader = BufReader::new(reader);
|
||||||
|
@ -12,7 +12,9 @@ license.workspace = true
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
arbitrary = { version = "1.3.2", features = ["derive"] }
|
arbitrary = { version = "1.3.2", features = ["derive"] }
|
||||||
|
bumpalo = "3.16.0"
|
||||||
clap = { version = "4.5.9", features = ["derive"] }
|
clap = { version = "4.5.9", features = ["derive"] }
|
||||||
|
either = "1.13.0"
|
||||||
fastrand = "2.1.0"
|
fastrand = "2.1.0"
|
||||||
milli = { path = "../milli" }
|
milli = { path = "../milli" }
|
||||||
serde = { version = "1.0.204", features = ["derive"] }
|
serde = { version = "1.0.204", features = ["derive"] }
|
||||||
|
@ -4,11 +4,17 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
|||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use arbitrary::{Arbitrary, Unstructured};
|
use arbitrary::{Arbitrary, Unstructured};
|
||||||
|
use bumpalo::Bump;
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
|
use either::Either;
|
||||||
use fuzzers::Operation;
|
use fuzzers::Operation;
|
||||||
|
use milli::documents::mmap_from_objects;
|
||||||
use milli::heed::EnvOpenOptions;
|
use milli::heed::EnvOpenOptions;
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig};
|
use milli::update::new::indexer;
|
||||||
|
use milli::update::{IndexDocumentsMethod, IndexerConfig};
|
||||||
|
use milli::vector::EmbeddingConfigs;
|
||||||
use milli::Index;
|
use milli::Index;
|
||||||
|
use serde_json::Value;
|
||||||
use tempfile::TempDir;
|
use tempfile::TempDir;
|
||||||
|
|
||||||
#[derive(Debug, Arbitrary)]
|
#[derive(Debug, Arbitrary)]
|
||||||
@ -58,7 +64,6 @@ fn main() {
|
|||||||
};
|
};
|
||||||
let index = Index::new(options, tempdir.path()).unwrap();
|
let index = Index::new(options, tempdir.path()).unwrap();
|
||||||
let indexer_config = IndexerConfig::default();
|
let indexer_config = IndexerConfig::default();
|
||||||
let index_documents_config = IndexDocumentsConfig::default();
|
|
||||||
|
|
||||||
std::thread::scope(|s| {
|
std::thread::scope(|s| {
|
||||||
loop {
|
loop {
|
||||||
@ -75,38 +80,69 @@ fn main() {
|
|||||||
|
|
||||||
let handle = s.spawn(|| {
|
let handle = s.spawn(|| {
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
for batch in batches {
|
for batch in batches {
|
||||||
let mut builder = IndexDocuments::new(
|
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
&mut wtxn,
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
&index,
|
|
||||||
&indexer_config,
|
|
||||||
index_documents_config.clone(),
|
|
||||||
|_| (),
|
|
||||||
|| false,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
|
let indexer_alloc = Bump::new();
|
||||||
|
let embedders = EmbeddingConfigs::default();
|
||||||
|
let mut indexer = indexer::DocumentOperation::new(
|
||||||
|
IndexDocumentsMethod::ReplaceDocuments,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut operations = Vec::new();
|
||||||
for op in batch.0 {
|
for op in batch.0 {
|
||||||
match op {
|
match op {
|
||||||
Operation::AddDoc(doc) => {
|
Operation::AddDoc(doc) => {
|
||||||
let documents =
|
let object = match doc.to_d() {
|
||||||
milli::documents::objects_from_json_value(doc.to_d());
|
Value::Object(object) => object,
|
||||||
let documents =
|
_ => unreachable!(),
|
||||||
milli::documents::documents_batch_reader_from_objects(
|
};
|
||||||
documents,
|
let documents = mmap_from_objects(vec![object]);
|
||||||
);
|
operations.push(Either::Left(documents));
|
||||||
let (b, _added) = builder.add_documents(documents).unwrap();
|
|
||||||
builder = b;
|
|
||||||
}
|
}
|
||||||
Operation::DeleteDoc(id) => {
|
Operation::DeleteDoc(id) => {
|
||||||
let (b, _removed) =
|
let id = indexer_alloc.alloc_str(&id.to_s());
|
||||||
builder.remove_documents(vec![id.to_s()]).unwrap();
|
let ids = indexer_alloc.alloc_slice_copy(&[&*id]);
|
||||||
builder = b;
|
operations.push(Either::Right(ids));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
builder.execute().unwrap();
|
|
||||||
|
for op in &operations {
|
||||||
|
match op {
|
||||||
|
Either::Left(documents) => {
|
||||||
|
indexer.add_documents(documents).unwrap()
|
||||||
|
}
|
||||||
|
Either::Right(ids) => indexer.delete_documents(ids),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let (document_changes, _operation_stats, primary_key) = indexer
|
||||||
|
.into_changes(
|
||||||
|
&indexer_alloc,
|
||||||
|
&index,
|
||||||
|
&rtxn,
|
||||||
|
None,
|
||||||
|
&mut new_fields_ids_map,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
indexer::index(
|
||||||
|
&mut wtxn,
|
||||||
|
&index,
|
||||||
|
indexer_config.grenad_parameters(),
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
&document_changes,
|
||||||
|
embedders,
|
||||||
|
&|| false,
|
||||||
|
&|_| (),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
// after executing a batch we check if the database is corrupted
|
// after executing a batch we check if the database is corrupted
|
||||||
let res = index.search(&wtxn).execute().unwrap();
|
let res = index.search(&wtxn).execute().unwrap();
|
||||||
|
@ -448,11 +448,12 @@ fn import_dump(
|
|||||||
|
|
||||||
let builder = builder.with_embedders(embedders);
|
let builder = builder.with_embedders(embedders);
|
||||||
|
|
||||||
let (builder, user_result) = builder.add_documents(reader)?;
|
todo!("please plug the dump load of main");
|
||||||
let user_result = user_result?;
|
// let (builder, user_result) = builder.add_documents(reader)?;
|
||||||
tracing::info!(documents_found = user_result, "{} documents found.", user_result);
|
// let user_result = user_result?;
|
||||||
builder.execute()?;
|
// tracing::info!(documents_found = user_result, "{} documents found.", user_result);
|
||||||
wtxn.commit()?;
|
// builder.execute()?;
|
||||||
|
// wtxn.commit()?;
|
||||||
tracing::info!("All documents successfully imported.");
|
tracing::info!("All documents successfully imported.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,114 +0,0 @@
|
|||||||
use std::error::Error;
|
|
||||||
use std::fs::File;
|
|
||||||
use std::io::{BufRead, BufReader, Cursor, Seek};
|
|
||||||
use std::path::Path;
|
|
||||||
|
|
||||||
use heed::EnvOpenOptions;
|
|
||||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
|
||||||
use milli::{Index, Object};
|
|
||||||
|
|
||||||
fn usage(error: &str, program_name: &str) -> String {
|
|
||||||
format!(
|
|
||||||
"{}. Usage: {} <PATH-TO-INDEX> <PATH-TO-DATASET> [searchable_fields] [filterable_fields]",
|
|
||||||
error, program_name
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn main() -> Result<(), Box<dyn Error>> {
|
|
||||||
let mut args = std::env::args();
|
|
||||||
let program_name = args.next().expect("No program name");
|
|
||||||
let index_path =
|
|
||||||
args.next().unwrap_or_else(|| panic!("{}", usage("Missing path to index.", &program_name)));
|
|
||||||
let dataset_path = args
|
|
||||||
.next()
|
|
||||||
.unwrap_or_else(|| panic!("{}", usage("Missing path to source dataset.", &program_name)));
|
|
||||||
// let primary_key = args.next().unwrap_or_else(|| "id".into());
|
|
||||||
// "title overview"
|
|
||||||
let searchable_fields: Vec<String> = args
|
|
||||||
.next()
|
|
||||||
.map(|arg| arg.split_whitespace().map(ToString::to_string).collect())
|
|
||||||
.unwrap_or_default();
|
|
||||||
|
|
||||||
println!("{searchable_fields:?}");
|
|
||||||
// "release_date genres"
|
|
||||||
let filterable_fields: Vec<String> = args
|
|
||||||
.next()
|
|
||||||
.map(|arg| arg.split_whitespace().map(ToString::to_string).collect())
|
|
||||||
.unwrap_or_default();
|
|
||||||
|
|
||||||
let mut options = EnvOpenOptions::new();
|
|
||||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
|
||||||
|
|
||||||
std::fs::create_dir_all(&index_path).unwrap();
|
|
||||||
let index = Index::new(options, index_path).unwrap();
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
|
||||||
|
|
||||||
let config = IndexerConfig::default();
|
|
||||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
|
||||||
// builder.set_primary_key(primary_key);
|
|
||||||
let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
|
|
||||||
builder.set_searchable_fields(searchable_fields);
|
|
||||||
let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
|
|
||||||
builder.set_filterable_fields(filterable_fields);
|
|
||||||
|
|
||||||
builder.execute(|_| (), || false).unwrap();
|
|
||||||
|
|
||||||
let config = IndexerConfig::default();
|
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
|
||||||
|
|
||||||
let builder =
|
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
|
||||||
|
|
||||||
let documents = documents_from(
|
|
||||||
&dataset_path,
|
|
||||||
Path::new(&dataset_path).extension().unwrap_or_default().to_str().unwrap_or_default(),
|
|
||||||
);
|
|
||||||
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
|
||||||
user_error.unwrap();
|
|
||||||
builder.execute().unwrap();
|
|
||||||
wtxn.commit().unwrap();
|
|
||||||
|
|
||||||
index.prepare_for_closing().wait();
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
|
|
||||||
let reader = File::open(filename)
|
|
||||||
.unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
|
|
||||||
let reader = BufReader::new(reader);
|
|
||||||
let documents = match filetype {
|
|
||||||
"csv" => documents_from_csv(reader).unwrap(),
|
|
||||||
"json" => documents_from_json(reader).unwrap(),
|
|
||||||
"jsonl" => documents_from_jsonl(reader).unwrap(),
|
|
||||||
otherwise => panic!("invalid update format {:?}", otherwise),
|
|
||||||
};
|
|
||||||
DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn documents_from_jsonl(reader: impl BufRead) -> milli::Result<Vec<u8>> {
|
|
||||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
|
||||||
|
|
||||||
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
|
||||||
let object = result.unwrap();
|
|
||||||
documents.append_json_object(&object)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
documents.into_inner().map_err(Into::into)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn documents_from_json(reader: impl BufRead) -> milli::Result<Vec<u8>> {
|
|
||||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
|
||||||
|
|
||||||
documents.append_json_array(reader)?;
|
|
||||||
|
|
||||||
documents.into_inner().map_err(Into::into)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn documents_from_csv(reader: impl BufRead) -> milli::Result<Vec<u8>> {
|
|
||||||
let csv = csv::Reader::from_reader(reader);
|
|
||||||
|
|
||||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
|
||||||
documents.append_csv(csv)?;
|
|
||||||
|
|
||||||
documents.into_inner().map_err(Into::into)
|
|
||||||
}
|
|
@ -1,124 +0,0 @@
|
|||||||
use std::error::Error;
|
|
||||||
use std::io::stdin;
|
|
||||||
use std::path::Path;
|
|
||||||
use std::time::Instant;
|
|
||||||
|
|
||||||
use heed::EnvOpenOptions;
|
|
||||||
use milli::{
|
|
||||||
execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext,
|
|
||||||
SearchLogger, TermsMatchingStrategy, TimeBudget,
|
|
||||||
};
|
|
||||||
|
|
||||||
#[global_allocator]
|
|
||||||
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
|
||||||
|
|
||||||
fn main() -> Result<(), Box<dyn Error>> {
|
|
||||||
let mut args = std::env::args();
|
|
||||||
let program_name = args.next().expect("No program name");
|
|
||||||
let dataset = args.next().unwrap_or_else(|| {
|
|
||||||
panic!(
|
|
||||||
"Missing path to index. Usage: {} <PATH-TO-INDEX> [<logger-dir>] [print-documents]",
|
|
||||||
program_name
|
|
||||||
)
|
|
||||||
});
|
|
||||||
let detailed_logger_dir = args.next();
|
|
||||||
let print_documents: bool =
|
|
||||||
if let Some(arg) = args.next() { arg == "print-documents" } else { false };
|
|
||||||
|
|
||||||
let mut options = EnvOpenOptions::new();
|
|
||||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
|
||||||
|
|
||||||
let index = Index::new(options, dataset)?;
|
|
||||||
let txn = index.read_txn()?;
|
|
||||||
let mut query = String::new();
|
|
||||||
while stdin().read_line(&mut query)? > 0 {
|
|
||||||
for _ in 0..2 {
|
|
||||||
let mut default_logger = DefaultSearchLogger;
|
|
||||||
// FIXME: consider resetting the state of the logger between search executions as otherwise panics are possible.
|
|
||||||
// Workaround'd here by recreating the logger on each iteration of the loop
|
|
||||||
let mut detailed_logger = detailed_logger_dir
|
|
||||||
.as_ref()
|
|
||||||
.map(|logger_dir| (milli::VisualSearchLogger::default(), logger_dir));
|
|
||||||
let logger: &mut dyn SearchLogger<_> =
|
|
||||||
if let Some((detailed_logger, _)) = detailed_logger.as_mut() {
|
|
||||||
detailed_logger
|
|
||||||
} else {
|
|
||||||
&mut default_logger
|
|
||||||
};
|
|
||||||
|
|
||||||
let start = Instant::now();
|
|
||||||
|
|
||||||
let mut ctx = SearchContext::new(&index, &txn)?;
|
|
||||||
let universe = filtered_universe(ctx.index, ctx.txn, &None)?;
|
|
||||||
|
|
||||||
let docs = execute_search(
|
|
||||||
&mut ctx,
|
|
||||||
(!query.trim().is_empty()).then(|| query.trim()),
|
|
||||||
TermsMatchingStrategy::Last,
|
|
||||||
milli::score_details::ScoringStrategy::Skip,
|
|
||||||
false,
|
|
||||||
universe,
|
|
||||||
&None,
|
|
||||||
&None,
|
|
||||||
GeoSortStrategy::default(),
|
|
||||||
0,
|
|
||||||
20,
|
|
||||||
None,
|
|
||||||
&mut DefaultSearchLogger,
|
|
||||||
logger,
|
|
||||||
TimeBudget::max(),
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
)?;
|
|
||||||
if let Some((logger, dir)) = detailed_logger {
|
|
||||||
logger.finish(&mut ctx, Path::new(dir))?;
|
|
||||||
}
|
|
||||||
let elapsed = start.elapsed();
|
|
||||||
println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
|
|
||||||
if print_documents {
|
|
||||||
let documents = index
|
|
||||||
.documents(&txn, docs.documents_ids.iter().copied())
|
|
||||||
.unwrap()
|
|
||||||
.into_iter()
|
|
||||||
.map(|(id, obkv)| {
|
|
||||||
let mut object = serde_json::Map::default();
|
|
||||||
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
|
|
||||||
let value = obkv.get(fid).unwrap();
|
|
||||||
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
|
|
||||||
object.insert(fid_name.to_owned(), value);
|
|
||||||
}
|
|
||||||
(id, serde_json::to_string_pretty(&object).unwrap())
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
for (id, document) in documents {
|
|
||||||
println!("{id}:");
|
|
||||||
println!("{document}");
|
|
||||||
}
|
|
||||||
|
|
||||||
let documents = index
|
|
||||||
.documents(&txn, docs.documents_ids.iter().copied())
|
|
||||||
.unwrap()
|
|
||||||
.into_iter()
|
|
||||||
.map(|(id, obkv)| {
|
|
||||||
let mut object = serde_json::Map::default();
|
|
||||||
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
|
|
||||||
let value = obkv.get(fid).unwrap();
|
|
||||||
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
|
|
||||||
object.insert(fid_name.to_owned(), value);
|
|
||||||
}
|
|
||||||
(id, serde_json::to_string_pretty(&object).unwrap())
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
|
|
||||||
for (id, document) in documents {
|
|
||||||
println!("{id}:");
|
|
||||||
println!("{document}");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
query.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
@ -1,33 +0,0 @@
|
|||||||
// use big_s::S;
|
|
||||||
use heed::EnvOpenOptions;
|
|
||||||
// use maplit::hashset;
|
|
||||||
use milli::{
|
|
||||||
update::{IndexerConfig, Settings},
|
|
||||||
Criterion, Index,
|
|
||||||
};
|
|
||||||
|
|
||||||
fn main() {
|
|
||||||
let mut options = EnvOpenOptions::new();
|
|
||||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
|
||||||
|
|
||||||
let index = Index::new(options, "data_movies.ms").unwrap();
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
|
||||||
|
|
||||||
let config = IndexerConfig::default();
|
|
||||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
|
||||||
|
|
||||||
// builder.set_min_word_len_one_typo(5);
|
|
||||||
// builder.set_min_word_len_two_typos(7);
|
|
||||||
// builder.set_sortable_fields(hashset! { S("release_date") });
|
|
||||||
builder.set_criteria(vec![
|
|
||||||
Criterion::Words,
|
|
||||||
Criterion::Typo,
|
|
||||||
Criterion::Proximity,
|
|
||||||
Criterion::Attribute,
|
|
||||||
Criterion::Sort,
|
|
||||||
Criterion::Exactness,
|
|
||||||
]);
|
|
||||||
|
|
||||||
builder.execute(|_| (), || false).unwrap();
|
|
||||||
wtxn.commit().unwrap();
|
|
||||||
}
|
|
@ -150,11 +150,24 @@ pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
|
|||||||
macro_rules! documents {
|
macro_rules! documents {
|
||||||
($data:tt) => {{
|
($data:tt) => {{
|
||||||
let documents = serde_json::json!($data);
|
let documents = serde_json::json!($data);
|
||||||
let documents = $crate::documents::objects_from_json_value(documents);
|
let mut file = tempfile::tempfile().unwrap();
|
||||||
$crate::documents::documents_batch_reader_from_objects(documents)
|
for document in documents.as_array().unwrap() {
|
||||||
|
serde_json::to_writer(&mut file, &document).unwrap();
|
||||||
|
}
|
||||||
|
file.sync_all().unwrap();
|
||||||
|
unsafe { memmap2::Mmap::map(&file).unwrap() }
|
||||||
}};
|
}};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn mmap_from_objects(objects: impl IntoIterator<Item = Object>) -> memmap2::Mmap {
|
||||||
|
let mut writer = tempfile::tempfile().map(std::io::BufWriter::new).unwrap();
|
||||||
|
for object in objects {
|
||||||
|
serde_json::to_writer(&mut writer, &object).unwrap();
|
||||||
|
}
|
||||||
|
let file = writer.into_inner().unwrap();
|
||||||
|
unsafe { memmap2::Mmap::map(&file).unwrap() }
|
||||||
|
}
|
||||||
|
|
||||||
pub fn documents_batch_reader_from_objects(
|
pub fn documents_batch_reader_from_objects(
|
||||||
objects: impl IntoIterator<Item = Object>,
|
objects: impl IntoIterator<Item = Object>,
|
||||||
) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {
|
) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {
|
||||||
@ -224,20 +237,6 @@ mod test {
|
|||||||
assert!(documents.next_document().unwrap().is_none());
|
assert!(documents.next_document().unwrap().is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_nested() {
|
|
||||||
let docs_reader = documents!([{
|
|
||||||
"hello": {
|
|
||||||
"toto": ["hello"]
|
|
||||||
}
|
|
||||||
}]);
|
|
||||||
|
|
||||||
let (mut cursor, _) = docs_reader.into_cursor_and_fields_index();
|
|
||||||
let doc = cursor.next_document().unwrap().unwrap();
|
|
||||||
let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
|
|
||||||
assert_eq!(nested, json!({ "toto": ["hello"] }));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn out_of_order_json_fields() {
|
fn out_of_order_json_fields() {
|
||||||
let _documents = documents!([
|
let _documents = documents!([
|
||||||
|
@ -1680,19 +1680,23 @@ pub(crate) mod tests {
|
|||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
|
use bumpalo::Bump;
|
||||||
use heed::{EnvOpenOptions, RwTxn};
|
use heed::{EnvOpenOptions, RwTxn};
|
||||||
use maplit::{btreemap, hashset};
|
use maplit::{btreemap, hashset};
|
||||||
|
use memmap2::Mmap;
|
||||||
use tempfile::TempDir;
|
use tempfile::TempDir;
|
||||||
|
|
||||||
use crate::documents::DocumentsBatchReader;
|
|
||||||
use crate::error::{Error, InternalError};
|
use crate::error::{Error, InternalError};
|
||||||
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
||||||
|
use crate::update::new::indexer;
|
||||||
use crate::update::{
|
use crate::update::{
|
||||||
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
|
self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, Settings,
|
||||||
Settings,
|
|
||||||
};
|
};
|
||||||
use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
|
use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
|
||||||
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
|
use crate::vector::EmbeddingConfigs;
|
||||||
|
use crate::{
|
||||||
|
db_snap, obkv_to_json, Filter, Index, Search, SearchResult, ThreadPoolNoAbortBuilder,
|
||||||
|
};
|
||||||
|
|
||||||
pub(crate) struct TempIndex {
|
pub(crate) struct TempIndex {
|
||||||
pub inner: Index,
|
pub inner: Index,
|
||||||
@ -1725,35 +1729,60 @@ pub(crate) mod tests {
|
|||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self::new_with_map_size(4096 * 2000)
|
Self::new_with_map_size(4096 * 2000)
|
||||||
}
|
}
|
||||||
pub fn add_documents_using_wtxn<'t, R>(
|
|
||||||
|
pub fn add_documents_using_wtxn<'t>(
|
||||||
&'t self,
|
&'t self,
|
||||||
wtxn: &mut RwTxn<'t>,
|
wtxn: &mut RwTxn<'t>,
|
||||||
documents: DocumentsBatchReader<R>,
|
documents: Mmap,
|
||||||
) -> Result<(), crate::error::Error>
|
) -> Result<(), crate::error::Error> {
|
||||||
where
|
let local_pool;
|
||||||
R: std::io::Read + std::io::Seek,
|
let indexer_config = &self.indexer_config;
|
||||||
{
|
let pool = match &indexer_config.thread_pool {
|
||||||
let builder = IndexDocuments::new(
|
Some(pool) => pool,
|
||||||
wtxn,
|
None => {
|
||||||
self,
|
local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
|
||||||
&self.indexer_config,
|
&local_pool
|
||||||
self.index_documents_config.clone(),
|
}
|
||||||
|_| (),
|
};
|
||||||
|| false,
|
|
||||||
)
|
let rtxn = self.inner.read_txn()?;
|
||||||
.unwrap();
|
let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?;
|
||||||
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
user_error?;
|
|
||||||
builder.execute()?;
|
let embedders = EmbeddingConfigs::default();
|
||||||
|
let mut indexer =
|
||||||
|
indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
|
indexer.add_documents(&documents).unwrap();
|
||||||
|
|
||||||
|
let indexer_alloc = Bump::new();
|
||||||
|
let (document_changes, _operation_stats, primary_key) = indexer.into_changes(
|
||||||
|
&indexer_alloc,
|
||||||
|
&self.inner,
|
||||||
|
&rtxn,
|
||||||
|
None,
|
||||||
|
&mut new_fields_ids_map,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
pool.install(|| {
|
||||||
|
indexer::index(
|
||||||
|
wtxn,
|
||||||
|
&self.inner,
|
||||||
|
indexer_config.grenad_parameters(),
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
&document_changes,
|
||||||
|
embedders,
|
||||||
|
&|| false,
|
||||||
|
&|_| (),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.unwrap()?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
pub fn add_documents<R>(
|
|
||||||
&self,
|
pub fn add_documents(&self, documents: Mmap) -> Result<(), crate::error::Error> {
|
||||||
documents: DocumentsBatchReader<R>,
|
|
||||||
) -> Result<(), crate::error::Error>
|
|
||||||
where
|
|
||||||
R: std::io::Read + std::io::Seek,
|
|
||||||
{
|
|
||||||
let mut wtxn = self.write_txn().unwrap();
|
let mut wtxn = self.write_txn().unwrap();
|
||||||
self.add_documents_using_wtxn(&mut wtxn, documents)?;
|
self.add_documents_using_wtxn(&mut wtxn, documents)?;
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
@ -1769,6 +1798,7 @@ pub(crate) mod tests {
|
|||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn update_settings_using_wtxn<'t>(
|
pub fn update_settings_using_wtxn<'t>(
|
||||||
&'t self,
|
&'t self,
|
||||||
wtxn: &mut RwTxn<'t>,
|
wtxn: &mut RwTxn<'t>,
|
||||||
@ -1784,19 +1814,54 @@ pub(crate) mod tests {
|
|||||||
&'t self,
|
&'t self,
|
||||||
wtxn: &mut RwTxn<'t>,
|
wtxn: &mut RwTxn<'t>,
|
||||||
external_document_ids: Vec<String>,
|
external_document_ids: Vec<String>,
|
||||||
) {
|
) -> Result<(), crate::error::Error> {
|
||||||
let builder = IndexDocuments::new(
|
let local_pool;
|
||||||
wtxn,
|
let indexer_config = &self.indexer_config;
|
||||||
self,
|
let pool = match &indexer_config.thread_pool {
|
||||||
&self.indexer_config,
|
Some(pool) => pool,
|
||||||
self.index_documents_config.clone(),
|
None => {
|
||||||
|_| (),
|
local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
|
||||||
|| false,
|
&local_pool
|
||||||
)
|
}
|
||||||
.unwrap();
|
};
|
||||||
let (builder, user_error) = builder.remove_documents(external_document_ids).unwrap();
|
|
||||||
user_error.unwrap();
|
let rtxn = self.inner.read_txn()?;
|
||||||
builder.execute().unwrap();
|
let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?;
|
||||||
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
|
let embedders = EmbeddingConfigs::default();
|
||||||
|
let mut indexer =
|
||||||
|
indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
|
let external_document_ids: Vec<_> =
|
||||||
|
external_document_ids.iter().map(AsRef::as_ref).collect();
|
||||||
|
indexer.delete_documents(external_document_ids.as_slice());
|
||||||
|
|
||||||
|
let indexer_alloc = Bump::new();
|
||||||
|
let (document_changes, _operation_stats, primary_key) = indexer.into_changes(
|
||||||
|
&indexer_alloc,
|
||||||
|
&self.inner,
|
||||||
|
&rtxn,
|
||||||
|
None,
|
||||||
|
&mut new_fields_ids_map,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
pool.install(|| {
|
||||||
|
indexer::index(
|
||||||
|
wtxn,
|
||||||
|
&self.inner,
|
||||||
|
indexer_config.grenad_parameters(),
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
&document_changes,
|
||||||
|
embedders,
|
||||||
|
&|| false,
|
||||||
|
&|_| (),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.unwrap()?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn delete_documents(&self, external_document_ids: Vec<String>) {
|
pub fn delete_documents(&self, external_document_ids: Vec<String>) {
|
||||||
@ -1819,29 +1884,55 @@ pub(crate) mod tests {
|
|||||||
|
|
||||||
let index = TempIndex::new();
|
let index = TempIndex::new();
|
||||||
let mut wtxn = index.inner.write_txn().unwrap();
|
let mut wtxn = index.inner.write_txn().unwrap();
|
||||||
|
|
||||||
let should_abort = AtomicBool::new(false);
|
let should_abort = AtomicBool::new(false);
|
||||||
let builder = IndexDocuments::new(
|
|
||||||
&mut wtxn,
|
|
||||||
&index.inner,
|
|
||||||
&index.indexer_config,
|
|
||||||
index.index_documents_config.clone(),
|
|
||||||
|_| (),
|
|
||||||
|| should_abort.load(Relaxed),
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let (builder, user_error) = builder
|
let local_pool;
|
||||||
.add_documents(documents!([
|
let indexer_config = &index.indexer_config;
|
||||||
{ "id": 1, "name": "kevin" },
|
let pool = match &indexer_config.thread_pool {
|
||||||
{ "id": 2, "name": "bob", "age": 20 },
|
Some(pool) => pool,
|
||||||
{ "id": 2, "name": "bob", "age": 20 },
|
None => {
|
||||||
]))
|
local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
|
||||||
|
&local_pool
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let rtxn = index.inner.read_txn().unwrap();
|
||||||
|
let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
|
||||||
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
|
let embedders = EmbeddingConfigs::default();
|
||||||
|
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
|
let payload = documents!([
|
||||||
|
{ "id": 1, "name": "kevin" },
|
||||||
|
{ "id": 2, "name": "bob", "age": 20 },
|
||||||
|
{ "id": 2, "name": "bob", "age": 20 },
|
||||||
|
]);
|
||||||
|
indexer.add_documents(&payload);
|
||||||
|
|
||||||
|
let indexer_alloc = Bump::new();
|
||||||
|
let (document_changes, _operation_stats, primary_key) = indexer
|
||||||
|
.into_changes(&indexer_alloc, &index.inner, &rtxn, None, &mut new_fields_ids_map)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
user_error.unwrap();
|
|
||||||
|
|
||||||
should_abort.store(true, Relaxed);
|
should_abort.store(true, Relaxed);
|
||||||
let err = builder.execute().unwrap_err();
|
|
||||||
|
let err = pool
|
||||||
|
.install(|| {
|
||||||
|
indexer::index(
|
||||||
|
&mut wtxn,
|
||||||
|
&index.inner,
|
||||||
|
indexer_config.grenad_parameters(),
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
&document_changes,
|
||||||
|
embedders,
|
||||||
|
&|| should_abort.load(Relaxed),
|
||||||
|
&|_| (),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.unwrap()
|
||||||
|
.unwrap_err();
|
||||||
|
|
||||||
assert!(matches!(err, Error::InternalError(InternalError::AbortedIndexation)));
|
assert!(matches!(err, Error::InternalError(InternalError::AbortedIndexation)));
|
||||||
}
|
}
|
||||||
|
@ -407,7 +407,7 @@ mod tests {
|
|||||||
use big_s::S;
|
use big_s::S;
|
||||||
use maplit::hashset;
|
use maplit::hashset;
|
||||||
|
|
||||||
use crate::documents::documents_batch_reader_from_objects;
|
use crate::documents::mmap_from_objects;
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::{milli_snap, FacetDistribution, OrderBy};
|
use crate::{milli_snap, FacetDistribution, OrderBy};
|
||||||
|
|
||||||
@ -508,8 +508,7 @@ mod tests {
|
|||||||
documents.push(document);
|
documents.push(document);
|
||||||
}
|
}
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
let documents = mmap_from_objects(documents);
|
||||||
|
|
||||||
index.add_documents(documents).unwrap();
|
index.add_documents(documents).unwrap();
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
let txn = index.read_txn().unwrap();
|
||||||
@ -594,8 +593,7 @@ mod tests {
|
|||||||
documents.push(document);
|
documents.push(document);
|
||||||
}
|
}
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
let documents = mmap_from_objects(documents);
|
||||||
|
|
||||||
index.add_documents(documents).unwrap();
|
index.add_documents(documents).unwrap();
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
let txn = index.read_txn().unwrap();
|
||||||
@ -654,8 +652,7 @@ mod tests {
|
|||||||
documents.push(document);
|
documents.push(document);
|
||||||
}
|
}
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
let documents = mmap_from_objects(documents);
|
||||||
|
|
||||||
index.add_documents(documents).unwrap();
|
index.add_documents(documents).unwrap();
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
let txn = index.read_txn().unwrap();
|
||||||
@ -706,8 +703,7 @@ mod tests {
|
|||||||
documents.push(document);
|
documents.push(document);
|
||||||
}
|
}
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
let documents = mmap_from_objects(documents);
|
||||||
|
|
||||||
index.add_documents(documents).unwrap();
|
index.add_documents(documents).unwrap();
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
let txn = index.read_txn().unwrap();
|
||||||
@ -758,8 +754,7 @@ mod tests {
|
|||||||
documents.push(document);
|
documents.push(document);
|
||||||
}
|
}
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
let documents = mmap_from_objects(documents);
|
||||||
|
|
||||||
index.add_documents(documents).unwrap();
|
index.add_documents(documents).unwrap();
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
let txn = index.read_txn().unwrap();
|
||||||
@ -814,8 +809,7 @@ mod tests {
|
|||||||
documents.push(document);
|
documents.push(document);
|
||||||
}
|
}
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
let documents = mmap_from_objects(documents);
|
||||||
|
|
||||||
index.add_documents(documents).unwrap();
|
index.add_documents(documents).unwrap();
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
let txn = index.read_txn().unwrap();
|
||||||
|
@ -1,11 +1,16 @@
|
|||||||
use std::io::Cursor;
|
use std::io::Write;
|
||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
|
use bumpalo::Bump;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use maplit::{btreemap, hashset};
|
use maplit::{btreemap, hashset};
|
||||||
|
|
||||||
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use crate::update::new::indexer;
|
||||||
|
use crate::update::{
|
||||||
|
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
||||||
|
};
|
||||||
|
use crate::vector::EmbeddingConfigs;
|
||||||
use crate::{db_snap, Criterion, Index, Object};
|
use crate::{db_snap, Criterion, Index, Object};
|
||||||
pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson");
|
pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson");
|
||||||
|
|
||||||
@ -16,6 +21,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
|||||||
let index = Index::new(options, &path).unwrap();
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
|
|
||||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||||
@ -43,27 +49,41 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
|||||||
|
|
||||||
// index documents
|
// index documents
|
||||||
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
|
||||||
|
|
||||||
let builder =
|
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
|
|
||||||
let reader = Cursor::new(CONTENT.as_bytes());
|
|
||||||
|
|
||||||
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
let embedders = EmbeddingConfigs::default();
|
||||||
let object = result.unwrap();
|
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
documents_builder.append_json_object(&object).unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
let vector = documents_builder.into_inner().unwrap();
|
let mut file = tempfile::tempfile().unwrap();
|
||||||
|
file.write_all(CONTENT.as_bytes()).unwrap();
|
||||||
|
file.sync_all().unwrap();
|
||||||
|
let payload = unsafe { memmap2::Mmap::map(&file).unwrap() };
|
||||||
|
|
||||||
// index documents
|
// index documents
|
||||||
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
indexer.add_documents(&payload).unwrap();
|
||||||
let (builder, user_error) = builder.add_documents(content).unwrap();
|
|
||||||
user_error.unwrap();
|
let indexer_alloc = Bump::new();
|
||||||
builder.execute().unwrap();
|
let (document_changes, _operation_stats, primary_key) =
|
||||||
|
indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
|
||||||
|
|
||||||
|
indexer::index(
|
||||||
|
&mut wtxn,
|
||||||
|
&index,
|
||||||
|
config.grenad_parameters(),
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
&document_changes,
|
||||||
|
embedders,
|
||||||
|
&|| false,
|
||||||
|
&|_| (),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
drop(rtxn);
|
||||||
|
|
||||||
index
|
index
|
||||||
}
|
}
|
||||||
|
@ -369,7 +369,7 @@ mod tests {
|
|||||||
use maplit::hashset;
|
use maplit::hashset;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::documents::documents_batch_reader_from_objects;
|
use crate::documents::{documents_batch_reader_from_objects, mmap_from_objects};
|
||||||
use crate::heed_codec::facet::OrderedF64Codec;
|
use crate::heed_codec::facet::OrderedF64Codec;
|
||||||
use crate::heed_codec::StrRefCodec;
|
use crate::heed_codec::StrRefCodec;
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
@ -492,8 +492,8 @@ mod tests {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let documents = documents_batch_reader_from_objects(documents);
|
let documents = mmap_from_objects(documents);
|
||||||
index.add_documents(documents).unwrap();
|
index.add_documents(documents);
|
||||||
|
|
||||||
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
|
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
|
||||||
}
|
}
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,86 +0,0 @@
|
|||||||
use heed::types::Bytes;
|
|
||||||
use heed::{Database, RoTxn};
|
|
||||||
use obkv::KvReaderU16;
|
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
|
|
||||||
use crate::{all_obkv_to_json, DocumentId, FieldsIdsMap, Object, ObkvCodec, Result, BEU32};
|
|
||||||
|
|
||||||
pub struct ImmutableObkvs<'t> {
|
|
||||||
ids: RoaringBitmap,
|
|
||||||
fields_ids_map: FieldsIdsMap,
|
|
||||||
slices: Vec<&'t [u8]>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'t> ImmutableObkvs<'t> {
|
|
||||||
/// Creates the structure by fetching all the OBKVs
|
|
||||||
/// and keeping the transaction making the pointers valid.
|
|
||||||
pub fn new(
|
|
||||||
rtxn: &'t RoTxn,
|
|
||||||
documents_database: Database<BEU32, ObkvCodec>,
|
|
||||||
fields_ids_map: FieldsIdsMap,
|
|
||||||
subset: RoaringBitmap,
|
|
||||||
) -> heed::Result<Self> {
|
|
||||||
let mut slices = Vec::new();
|
|
||||||
let documents_database = documents_database.remap_data_type::<Bytes>();
|
|
||||||
for docid in &subset {
|
|
||||||
let slice = documents_database.get(rtxn, &docid)?.unwrap();
|
|
||||||
slices.push(slice);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(ImmutableObkvs { ids: subset, fields_ids_map, slices })
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the OBKVs identified by the given ID.
|
|
||||||
pub fn obkv(&self, docid: DocumentId) -> heed::Result<Option<&'t KvReaderU16>> {
|
|
||||||
match self
|
|
||||||
.ids
|
|
||||||
.rank(docid)
|
|
||||||
.checked_sub(1)
|
|
||||||
.and_then(|offset| self.slices.get(offset as usize))
|
|
||||||
{
|
|
||||||
Some(&bytes) => Ok(Some(bytes.into())),
|
|
||||||
None => Ok(None),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the owned rhai::Map identified by the given ID.
|
|
||||||
pub fn rhai_map(&self, docid: DocumentId) -> Result<Option<rhai::Map>> {
|
|
||||||
let obkv = match self.obkv(docid) {
|
|
||||||
Ok(Some(obkv)) => obkv,
|
|
||||||
Ok(None) => return Ok(None),
|
|
||||||
Err(e) => return Err(e.into()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
|
|
||||||
let map: Result<rhai::Map> = all_keys
|
|
||||||
.iter()
|
|
||||||
.copied()
|
|
||||||
.flat_map(|id| obkv.get(id).map(|value| (id, value)))
|
|
||||||
.map(|(id, value)| {
|
|
||||||
let name = self.fields_ids_map.name(id).ok_or(
|
|
||||||
crate::error::FieldIdMapMissingEntry::FieldId {
|
|
||||||
field_id: id,
|
|
||||||
process: "all_obkv_to_rhaimap",
|
|
||||||
},
|
|
||||||
)?;
|
|
||||||
let value = serde_json::from_slice(value)
|
|
||||||
.map_err(crate::error::InternalError::SerdeJson)?;
|
|
||||||
Ok((name.into(), value))
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
map.map(Some)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn json_map(&self, docid: DocumentId) -> Result<Option<Object>> {
|
|
||||||
let obkv = match self.obkv(docid) {
|
|
||||||
Ok(Some(obkv)) => obkv,
|
|
||||||
Ok(None) => return Ok(None),
|
|
||||||
Err(e) => return Err(e.into()),
|
|
||||||
};
|
|
||||||
|
|
||||||
all_obkv_to_json(obkv, &self.fields_ids_map).map(Some)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
unsafe impl Sync for ImmutableObkvs<'_> {}
|
|
@ -48,23 +48,8 @@ pub struct TransformOutput {
|
|||||||
/// containing all those documents.
|
/// containing all those documents.
|
||||||
pub struct Transform<'a, 'i> {
|
pub struct Transform<'a, 'i> {
|
||||||
pub index: &'i Index,
|
pub index: &'i Index,
|
||||||
fields_ids_map: FieldsIdsMap,
|
|
||||||
|
|
||||||
indexer_settings: &'a IndexerConfig,
|
indexer_settings: &'a IndexerConfig,
|
||||||
pub index_documents_method: IndexDocumentsMethod,
|
pub index_documents_method: IndexDocumentsMethod,
|
||||||
available_documents_ids: AvailableIds,
|
|
||||||
|
|
||||||
// Both grenad follows the same format:
|
|
||||||
// key | value
|
|
||||||
// u32 | 1 byte for the Operation byte, the rest is the obkv of the document stored
|
|
||||||
original_sorter: grenad::Sorter<EitherObkvMerge>,
|
|
||||||
flattened_sorter: grenad::Sorter<EitherObkvMerge>,
|
|
||||||
|
|
||||||
replaced_documents_ids: RoaringBitmap,
|
|
||||||
new_documents_ids: RoaringBitmap,
|
|
||||||
// To increase the cache locality and decrease the heap usage we use compact smartstring.
|
|
||||||
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
|
|
||||||
documents_count: usize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This enum is specific to the grenad sorter stored in the transform.
|
/// This enum is specific to the grenad sorter stored in the transform.
|
||||||
@ -75,29 +60,6 @@ pub enum Operation {
|
|||||||
Deletion,
|
Deletion,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create a mapping between the field ids found in the document batch and the one that were
|
|
||||||
/// already present in the index.
|
|
||||||
///
|
|
||||||
/// If new fields are present in the addition, they are added to the index field ids map.
|
|
||||||
fn create_fields_mapping(
|
|
||||||
index_field_map: &mut FieldsIdsMap,
|
|
||||||
batch_field_map: &DocumentsBatchIndex,
|
|
||||||
) -> Result<HashMap<FieldId, FieldId>> {
|
|
||||||
batch_field_map
|
|
||||||
.iter()
|
|
||||||
// we sort by id here to ensure a deterministic mapping of the fields, that preserves
|
|
||||||
// the original ordering.
|
|
||||||
.sorted_by_key(|(&id, _)| id)
|
|
||||||
.map(|(field, name)| match index_field_map.id(name) {
|
|
||||||
Some(id) => Ok((*field, id)),
|
|
||||||
None => index_field_map
|
|
||||||
.insert(name)
|
|
||||||
.ok_or(Error::UserError(UserError::AttributeLimitReached))
|
|
||||||
.map(|id| (*field, id)),
|
|
||||||
})
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, 'i> Transform<'a, 'i> {
|
impl<'a, 'i> Transform<'a, 'i> {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
wtxn: &mut heed::RwTxn<'_>,
|
wtxn: &mut heed::RwTxn<'_>,
|
||||||
@ -138,19 +100,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
);
|
);
|
||||||
let documents_ids = index.documents_ids(wtxn)?;
|
let documents_ids = index.documents_ids(wtxn)?;
|
||||||
|
|
||||||
Ok(Transform {
|
Ok(Transform { index, indexer_settings, index_documents_method })
|
||||||
index,
|
|
||||||
fields_ids_map: index.fields_ids_map(wtxn)?,
|
|
||||||
indexer_settings,
|
|
||||||
available_documents_ids: AvailableIds::new(&documents_ids),
|
|
||||||
original_sorter,
|
|
||||||
flattened_sorter,
|
|
||||||
index_documents_method,
|
|
||||||
replaced_documents_ids: RoaringBitmap::new(),
|
|
||||||
new_documents_ids: RoaringBitmap::new(),
|
|
||||||
new_external_documents_ids_builder: FxHashMap::default(),
|
|
||||||
documents_count: 0,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Flatten a document from the fields ids map contained in self and insert the new
|
// Flatten a document from the fields ids map contained in self and insert the new
|
||||||
|
@ -1,12 +1,17 @@
|
|||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
|
use bumpalo::Bump;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use maplit::hashset;
|
use maplit::hashset;
|
||||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
use milli::documents::{mmap_from_objects, DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use milli::update::new::indexer;
|
||||||
|
use milli::update::{
|
||||||
|
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
||||||
|
};
|
||||||
|
use milli::vector::EmbeddingConfigs;
|
||||||
use milli::{FacetDistribution, Index, Object, OrderBy};
|
use milli::{FacetDistribution, Index, Object, OrderBy};
|
||||||
use serde_json::Deserializer;
|
use serde_json::{from_value, json, Deserializer};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_facet_distribution_with_no_facet_values() {
|
fn test_facet_distribution_with_no_facet_values() {
|
||||||
@ -27,37 +32,41 @@ fn test_facet_distribution_with_no_facet_values() {
|
|||||||
|
|
||||||
// index documents
|
// index documents
|
||||||
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||||
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let builder =
|
let embedders = EmbeddingConfigs::default();
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
|
|
||||||
let reader = Cursor::new(
|
|
||||||
r#"{
|
|
||||||
"id": 123,
|
|
||||||
"title": "What a week, hu...",
|
|
||||||
"genres": [],
|
|
||||||
"tags": ["blue"]
|
|
||||||
}
|
|
||||||
{
|
|
||||||
"id": 345,
|
|
||||||
"title": "I am the pig!",
|
|
||||||
"tags": ["red"]
|
|
||||||
}"#,
|
|
||||||
);
|
|
||||||
|
|
||||||
for result in Deserializer::from_reader(reader).into_iter::<Object>() {
|
let doc1: Object = from_value(
|
||||||
let object = result.unwrap();
|
json!({ "id": 123, "title": "What a week, hu...", "genres": [], "tags": ["blue"] }),
|
||||||
documents_builder.append_json_object(&object).unwrap();
|
)
|
||||||
}
|
.unwrap();
|
||||||
|
let doc2: Object =
|
||||||
let vector = documents_builder.into_inner().unwrap();
|
from_value(json!({ "id": 345, "title": "I am the pig!", "tags": ["red"] })).unwrap();
|
||||||
|
let documents = mmap_from_objects(vec![doc1, doc2]);
|
||||||
|
|
||||||
// index documents
|
// index documents
|
||||||
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
indexer.add_documents(&documents).unwrap();
|
||||||
let (builder, user_error) = builder.add_documents(content).unwrap();
|
|
||||||
user_error.unwrap();
|
let indexer_alloc = Bump::new();
|
||||||
builder.execute().unwrap();
|
let (document_changes, _operation_stats, primary_key) =
|
||||||
|
indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
|
||||||
|
|
||||||
|
indexer::index(
|
||||||
|
&mut wtxn,
|
||||||
|
&index,
|
||||||
|
config.grenad_parameters(),
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
&document_changes,
|
||||||
|
embedders,
|
||||||
|
&|| false,
|
||||||
|
&|_| (),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
@ -1,14 +1,16 @@
|
|||||||
use std::cmp::Reverse;
|
use std::cmp::Reverse;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::io::Cursor;
|
use std::io::Write;
|
||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
|
use bumpalo::Bump;
|
||||||
use either::{Either, Left, Right};
|
use either::{Either, Left, Right};
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use maplit::{btreemap, hashset};
|
use maplit::{btreemap, hashset};
|
||||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
use milli::update::new::indexer;
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
|
||||||
use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
|
use milli::vector::EmbeddingConfigs;
|
||||||
|
use milli::{AscDesc, Criterion, DocumentId, Index, Member, TermsMatchingStrategy};
|
||||||
use serde::{Deserialize, Deserializer};
|
use serde::{Deserialize, Deserializer};
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
@ -34,6 +36,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
|||||||
let index = Index::new(options, &path).unwrap();
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
|
|
||||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||||
@ -61,27 +64,41 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
|||||||
|
|
||||||
// index documents
|
// index documents
|
||||||
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let builder =
|
let embedders = EmbeddingConfigs::default();
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
|
|
||||||
let reader = Cursor::new(CONTENT.as_bytes());
|
|
||||||
|
|
||||||
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
let mut file = tempfile::tempfile().unwrap();
|
||||||
let object = result.unwrap();
|
file.write_all(CONTENT.as_bytes()).unwrap();
|
||||||
documents_builder.append_json_object(&object).unwrap();
|
file.sync_all().unwrap();
|
||||||
}
|
|
||||||
|
|
||||||
let vector = documents_builder.into_inner().unwrap();
|
let payload = unsafe { memmap2::Mmap::map(&file).unwrap() };
|
||||||
|
|
||||||
// index documents
|
// index documents
|
||||||
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
indexer.add_documents(&payload).unwrap();
|
||||||
let (builder, user_error) = builder.add_documents(content).unwrap();
|
|
||||||
user_error.unwrap();
|
let indexer_alloc = Bump::new();
|
||||||
builder.execute().unwrap();
|
let (document_changes, _operation_stats, primary_key) =
|
||||||
|
indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
|
||||||
|
|
||||||
|
indexer::index(
|
||||||
|
&mut wtxn,
|
||||||
|
&index,
|
||||||
|
config.grenad_parameters(),
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
&document_changes,
|
||||||
|
embedders,
|
||||||
|
&|| false,
|
||||||
|
&|_| (),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
drop(rtxn);
|
||||||
|
|
||||||
index
|
index
|
||||||
}
|
}
|
||||||
|
@ -2,11 +2,16 @@ use std::cmp::Reverse;
|
|||||||
use std::io::Cursor;
|
use std::io::Cursor;
|
||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
|
use bumpalo::Bump;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use maplit::hashset;
|
use maplit::hashset;
|
||||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use milli::update::new::indexer;
|
||||||
|
use milli::update::{
|
||||||
|
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
||||||
|
};
|
||||||
|
use milli::vector::EmbeddingConfigs;
|
||||||
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy};
|
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy};
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use Criterion::*;
|
use Criterion::*;
|
||||||
@ -275,14 +280,20 @@ fn criteria_ascdesc() {
|
|||||||
});
|
});
|
||||||
builder.execute(|_| (), || false).unwrap();
|
builder.execute(|_| (), || false).unwrap();
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
// index documents
|
// index documents
|
||||||
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||||
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
|
let indexer_alloc = Bump::new();
|
||||||
let builder =
|
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
|
|
||||||
let mut batch_builder = DocumentsBatchBuilder::new(Vec::new());
|
let embedders = EmbeddingConfigs::default();
|
||||||
|
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
|
|
||||||
|
let mut file = tempfile::tempfile().unwrap();
|
||||||
(0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| {
|
(0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| {
|
||||||
let mut rng = rand::thread_rng();
|
let mut rng = rand::thread_rng();
|
||||||
|
|
||||||
@ -304,15 +315,29 @@ fn criteria_ascdesc() {
|
|||||||
_ => panic!(),
|
_ => panic!(),
|
||||||
};
|
};
|
||||||
|
|
||||||
batch_builder.append_json_object(&object).unwrap();
|
serde_json::to_writer(&mut file, &object).unwrap();
|
||||||
});
|
});
|
||||||
|
|
||||||
let vector = batch_builder.into_inner().unwrap();
|
file.sync_all().unwrap();
|
||||||
|
|
||||||
let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
let payload = unsafe { memmap2::Mmap::map(&file).unwrap() };
|
||||||
let (builder, user_error) = builder.add_documents(reader).unwrap();
|
indexer.add_documents(&payload).unwrap();
|
||||||
user_error.unwrap();
|
let (document_changes, _operation_stats, primary_key) =
|
||||||
builder.execute().unwrap();
|
indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
|
||||||
|
|
||||||
|
indexer::index(
|
||||||
|
&mut wtxn,
|
||||||
|
&index,
|
||||||
|
config.grenad_parameters(),
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
&document_changes,
|
||||||
|
embedders,
|
||||||
|
&|| false,
|
||||||
|
&|_| (),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
@ -1,10 +1,15 @@
|
|||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
|
use bumpalo::Bump;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
use milli::documents::mmap_from_objects;
|
||||||
use milli::{Criterion, Index, Search, TermsMatchingStrategy};
|
use milli::update::new::indexer;
|
||||||
use serde_json::json;
|
use milli::update::{IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings};
|
||||||
|
use milli::vector::EmbeddingConfigs;
|
||||||
|
use milli::{Criterion, Index, Object, Search, TermsMatchingStrategy};
|
||||||
|
use serde_json::from_value;
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
use ureq::json;
|
||||||
use Criterion::*;
|
use Criterion::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -106,34 +111,40 @@ fn test_typo_disabled_on_word() {
|
|||||||
options.map_size(4096 * 100);
|
options.map_size(4096 * 100);
|
||||||
let index = Index::new(options, tmp.path()).unwrap();
|
let index = Index::new(options, tmp.path()).unwrap();
|
||||||
|
|
||||||
let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new());
|
let doc1: Object = from_value(json!({ "id": 1usize, "data": "zealand" })).unwrap();
|
||||||
let doc1 = json!({
|
let doc2: Object = from_value(json!({ "id": 2usize, "data": "zearand" })).unwrap();
|
||||||
"id": 1usize,
|
let documents = mmap_from_objects(vec![doc1, doc2]);
|
||||||
"data": "zealand",
|
|
||||||
});
|
|
||||||
|
|
||||||
let doc2 = json!({
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
"id": 2usize,
|
let rtxn = index.read_txn().unwrap();
|
||||||
"data": "zearand",
|
|
||||||
});
|
|
||||||
|
|
||||||
builder.append_json_object(doc1.as_object().unwrap()).unwrap();
|
|
||||||
builder.append_json_object(doc2.as_object().unwrap()).unwrap();
|
|
||||||
let vector = builder.into_inner().unwrap();
|
|
||||||
|
|
||||||
let documents =
|
|
||||||
milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap();
|
|
||||||
|
|
||||||
let mut txn = index.write_txn().unwrap();
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let indexing_config = IndexDocumentsConfig::default();
|
|
||||||
let builder =
|
|
||||||
IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
|
||||||
|
|
||||||
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
user_error.unwrap();
|
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||||
builder.execute().unwrap();
|
let embedders = EmbeddingConfigs::default();
|
||||||
txn.commit().unwrap();
|
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||||
|
|
||||||
|
indexer.add_documents(&documents).unwrap();
|
||||||
|
|
||||||
|
let indexer_alloc = Bump::new();
|
||||||
|
let (document_changes, _operation_stats, primary_key) =
|
||||||
|
indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
|
||||||
|
|
||||||
|
indexer::index(
|
||||||
|
&mut wtxn,
|
||||||
|
&index,
|
||||||
|
config.grenad_parameters(),
|
||||||
|
&db_fields_ids_map,
|
||||||
|
new_fields_ids_map,
|
||||||
|
primary_key,
|
||||||
|
&document_changes,
|
||||||
|
embedders,
|
||||||
|
&|| false,
|
||||||
|
&|_| (),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
// basic typo search with default typo settings
|
// basic typo search with default typo settings
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user