mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 03:47:02 +02:00
Move crates under a sub folder to clean up the code
This commit is contained in:
parent
30f3c30389
commit
9c1e54a2c8
1062 changed files with 19 additions and 20 deletions
1347
crates/benchmarks/benches/indexing.rs
Normal file
1347
crates/benchmarks/benches/indexing.rs
Normal file
File diff suppressed because it is too large
Load diff
122
crates/benchmarks/benches/search_geo.rs
Normal file
122
crates/benchmarks/benches/search_geo.rs
Normal file
|
@ -0,0 +1,122 @@
|
|||
mod datasets_paths;
|
||||
mod utils;
|
||||
|
||||
use criterion::{criterion_group, criterion_main};
|
||||
use milli::update::Settings;
|
||||
use utils::Conf;
|
||||
|
||||
#[global_allocator]
|
||||
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||
|
||||
fn base_conf(builder: &mut Settings) {
|
||||
let displayed_fields =
|
||||
["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"]
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
builder.set_displayed_fields(displayed_fields);
|
||||
|
||||
let searchable_fields =
|
||||
["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
|
||||
let filterable_fields =
|
||||
["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_filterable_fields(filterable_fields);
|
||||
|
||||
let sortable_fields =
|
||||
["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_sortable_fields(sortable_fields);
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
const BASE_CONF: Conf = Conf {
|
||||
dataset: datasets_paths::SMOL_ALL_COUNTRIES,
|
||||
dataset_format: "jsonl",
|
||||
queries: &[
|
||||
"",
|
||||
],
|
||||
configure: base_conf,
|
||||
primary_key: Some("geonameid"),
|
||||
..Conf::BASE
|
||||
};
|
||||
|
||||
fn bench_geo(c: &mut criterion::Criterion) {
|
||||
#[rustfmt::skip]
|
||||
let confs = &[
|
||||
// A basic placeholder with no geo
|
||||
utils::Conf {
|
||||
group_name: "placeholder with no geo",
|
||||
..BASE_CONF
|
||||
},
|
||||
// Medium aglomeration: probably the most common usecase
|
||||
utils::Conf {
|
||||
group_name: "asc sort from Lille",
|
||||
sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):asc"]),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "desc sort from Lille",
|
||||
sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):desc"]),
|
||||
..BASE_CONF
|
||||
},
|
||||
// Big agglomeration: a lot of documents close to our point
|
||||
utils::Conf {
|
||||
group_name: "asc sort from Tokyo",
|
||||
sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):asc"]),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "desc sort from Tokyo",
|
||||
sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):desc"]),
|
||||
..BASE_CONF
|
||||
},
|
||||
// The furthest point from any civilization
|
||||
utils::Conf {
|
||||
group_name: "asc sort from Point Nemo",
|
||||
sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):asc"]),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "desc sort from Point Nemo",
|
||||
sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):desc"]),
|
||||
..BASE_CONF
|
||||
},
|
||||
// Filters
|
||||
utils::Conf {
|
||||
group_name: "filter of 100km from Lille",
|
||||
filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 100000)"),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "filter of 1km from Lille",
|
||||
filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 1000)"),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "filter of 100km from Tokyo",
|
||||
filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 100000)"),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "filter of 1km from Tokyo",
|
||||
filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 1000)"),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "filter of 100km from Point Nemo",
|
||||
filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 100000)"),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "filter of 1km from Point Nemo",
|
||||
filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 1000)"),
|
||||
..BASE_CONF
|
||||
},
|
||||
];
|
||||
|
||||
utils::run_benches(c, confs);
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_geo);
|
||||
criterion_main!(benches);
|
196
crates/benchmarks/benches/search_songs.rs
Normal file
196
crates/benchmarks/benches/search_songs.rs
Normal file
|
@ -0,0 +1,196 @@
|
|||
mod datasets_paths;
|
||||
mod utils;
|
||||
|
||||
use criterion::{criterion_group, criterion_main};
|
||||
use milli::update::Settings;
|
||||
use utils::Conf;
|
||||
|
||||
#[global_allocator]
|
||||
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||
|
||||
fn base_conf(builder: &mut Settings) {
|
||||
let displayed_fields =
|
||||
["id", "title", "album", "artist", "genre", "country", "released", "duration"]
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
builder.set_displayed_fields(displayed_fields);
|
||||
|
||||
let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
|
||||
let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
builder.set_filterable_fields(faceted_fields);
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
const BASE_CONF: Conf = Conf {
|
||||
dataset: datasets_paths::SMOL_SONGS,
|
||||
queries: &[
|
||||
"john ", // 9097
|
||||
"david ", // 4794
|
||||
"charles ", // 1957
|
||||
"david bowie ", // 1200
|
||||
"michael jackson ", // 600
|
||||
"thelonious monk ", // 303
|
||||
"charles mingus ", // 142
|
||||
"marcus miller ", // 60
|
||||
"tamo ", // 13
|
||||
"Notstandskomitee ", // 4
|
||||
],
|
||||
configure: base_conf,
|
||||
primary_key: Some("id"),
|
||||
..Conf::BASE
|
||||
};
|
||||
|
||||
fn bench_songs(c: &mut criterion::Criterion) {
|
||||
let default_criterion: Vec<String> =
|
||||
milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect();
|
||||
let default_criterion = default_criterion.iter().map(|s| s.as_str());
|
||||
let asc_default: Vec<&str> =
|
||||
std::iter::once("released-timestamp:asc").chain(default_criterion.clone()).collect();
|
||||
let desc_default: Vec<&str> =
|
||||
std::iter::once("released-timestamp:desc").chain(default_criterion.clone()).collect();
|
||||
|
||||
let basic_with_quote: Vec<String> = BASE_CONF
|
||||
.queries
|
||||
.iter()
|
||||
.map(|s| {
|
||||
s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ")
|
||||
})
|
||||
.collect();
|
||||
let basic_with_quote: &[&str] =
|
||||
&basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>();
|
||||
|
||||
#[rustfmt::skip]
|
||||
let confs = &[
|
||||
/* first we bench each criterion alone */
|
||||
utils::Conf {
|
||||
group_name: "proximity",
|
||||
queries: &[
|
||||
"black saint sinner lady ",
|
||||
"les dangeureuses 1960 ",
|
||||
"The Disneyland Sing-Along Chorus ",
|
||||
"Under Great Northern Lights ",
|
||||
"7000 Danses Un Jour Dans Notre Vie ",
|
||||
],
|
||||
criterion: Some(&["proximity"]),
|
||||
optional_words: false,
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "typo",
|
||||
queries: &[
|
||||
"mongus ",
|
||||
"thelonius monk ",
|
||||
"Disnaylande ",
|
||||
"the white striper ",
|
||||
"indochie ",
|
||||
"indochien ",
|
||||
"klub des loopers ",
|
||||
"fear of the duck ",
|
||||
"michel depech ",
|
||||
"stromal ",
|
||||
"dire straights ",
|
||||
"Arethla Franklin ",
|
||||
],
|
||||
criterion: Some(&["typo"]),
|
||||
optional_words: false,
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "words",
|
||||
queries: &[
|
||||
"the black saint and the sinner lady and the good doggo ", // four words to pop
|
||||
"les liaisons dangeureuses 1793 ", // one word to pop
|
||||
"The Disneyland Children's Sing-Alone song ", // two words to pop
|
||||
"seven nation mummy ", // one word to pop
|
||||
"7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop
|
||||
"Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop
|
||||
"whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13
|
||||
],
|
||||
criterion: Some(&["words"]),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "asc",
|
||||
criterion: Some(&["released-timestamp:desc"]),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "desc",
|
||||
criterion: Some(&["released-timestamp:desc"]),
|
||||
..BASE_CONF
|
||||
},
|
||||
|
||||
/* then we bench the asc and desc criterion on top of the default criterion */
|
||||
utils::Conf {
|
||||
group_name: "asc + default",
|
||||
criterion: Some(&asc_default[..]),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "desc + default",
|
||||
criterion: Some(&desc_default[..]),
|
||||
..BASE_CONF
|
||||
},
|
||||
|
||||
/* we bench the filters with the default request */
|
||||
utils::Conf {
|
||||
group_name: "basic filter: <=",
|
||||
filter: Some("released-timestamp <= 946728000"), // year 2000
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "basic filter: TO",
|
||||
filter: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "big filter",
|
||||
filter: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"),
|
||||
..BASE_CONF
|
||||
},
|
||||
|
||||
/* the we bench some global / normal search with all the default criterion in the default
|
||||
* order */
|
||||
utils::Conf {
|
||||
group_name: "basic placeholder",
|
||||
queries: &[""],
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "basic without quote",
|
||||
queries: &BASE_CONF
|
||||
.queries
|
||||
.iter()
|
||||
.map(|s| s.trim()) // we remove the space at the end of each request
|
||||
.collect::<Vec<&str>>(),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "basic with quote",
|
||||
queries: basic_with_quote,
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "prefix search",
|
||||
queries: &[
|
||||
"s", // 500k+ results
|
||||
"a", //
|
||||
"b", //
|
||||
"i", //
|
||||
"x", // only 7k results
|
||||
],
|
||||
..BASE_CONF
|
||||
},
|
||||
];
|
||||
|
||||
utils::run_benches(c, confs);
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_songs);
|
||||
criterion_main!(benches);
|
129
crates/benchmarks/benches/search_wiki.rs
Normal file
129
crates/benchmarks/benches/search_wiki.rs
Normal file
|
@ -0,0 +1,129 @@
|
|||
mod datasets_paths;
|
||||
mod utils;
|
||||
|
||||
use criterion::{criterion_group, criterion_main};
|
||||
use milli::update::Settings;
|
||||
use utils::Conf;
|
||||
|
||||
#[global_allocator]
|
||||
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||
|
||||
fn base_conf(builder: &mut Settings) {
|
||||
let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_displayed_fields(displayed_fields);
|
||||
|
||||
let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
const BASE_CONF: Conf = Conf {
|
||||
dataset: datasets_paths::SMOL_WIKI_ARTICLES,
|
||||
queries: &[
|
||||
"mingus ", // 46 candidates
|
||||
"miles davis ", // 159
|
||||
"rock and roll ", // 1007
|
||||
"machine ", // 3448
|
||||
"spain ", // 7002
|
||||
"japan ", // 10.593
|
||||
"france ", // 17.616
|
||||
"film ", // 24.959
|
||||
],
|
||||
configure: base_conf,
|
||||
..Conf::BASE
|
||||
};
|
||||
|
||||
fn bench_songs(c: &mut criterion::Criterion) {
|
||||
let basic_with_quote: Vec<String> = BASE_CONF
|
||||
.queries
|
||||
.iter()
|
||||
.map(|s| {
|
||||
s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ")
|
||||
})
|
||||
.collect();
|
||||
let basic_with_quote: &[&str] =
|
||||
&basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>();
|
||||
|
||||
#[rustfmt::skip]
|
||||
let confs = &[
|
||||
/* first we bench each criterion alone */
|
||||
utils::Conf {
|
||||
group_name: "proximity",
|
||||
queries: &[
|
||||
"herald sings ",
|
||||
"april paris ",
|
||||
"tea two ",
|
||||
"diesel engine ",
|
||||
],
|
||||
criterion: Some(&["proximity"]),
|
||||
optional_words: false,
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "typo",
|
||||
queries: &[
|
||||
"migrosoft ",
|
||||
"linax ",
|
||||
"Disnaylande ",
|
||||
"phytogropher ",
|
||||
"nympalidea ",
|
||||
"aritmetric ",
|
||||
"the fronce ",
|
||||
"sisan ",
|
||||
],
|
||||
criterion: Some(&["typo"]),
|
||||
optional_words: false,
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "words",
|
||||
queries: &[
|
||||
"the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results
|
||||
"Kameya Tokujirō mingus monk ", // two words to pop, 55
|
||||
"Ulrich Hensel meilisearch milli ", // two words to pop, 306
|
||||
"Idaho Bellevue pizza ", // one word to pop, 800
|
||||
"Abraham machin ", // one word to pop, 1141
|
||||
],
|
||||
criterion: Some(&["words"]),
|
||||
..BASE_CONF
|
||||
},
|
||||
/* the we bench some global / normal search with all the default criterion in the default
|
||||
* order */
|
||||
utils::Conf {
|
||||
group_name: "basic placeholder",
|
||||
queries: &[""],
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "basic without quote",
|
||||
queries: &BASE_CONF
|
||||
.queries
|
||||
.iter()
|
||||
.map(|s| s.trim()) // we remove the space at the end of each request
|
||||
.collect::<Vec<&str>>(),
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "basic with quote",
|
||||
queries: basic_with_quote,
|
||||
..BASE_CONF
|
||||
},
|
||||
utils::Conf {
|
||||
group_name: "prefix search",
|
||||
queries: &[
|
||||
"t", // 453k results
|
||||
"c", // 405k
|
||||
"g", // 318k
|
||||
"j", // 227k
|
||||
"q", // 71k
|
||||
"x", // 17k
|
||||
],
|
||||
..BASE_CONF
|
||||
},
|
||||
];
|
||||
|
||||
utils::run_benches(c, confs);
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_songs);
|
||||
criterion_main!(benches);
|
256
crates/benchmarks/benches/utils.rs
Normal file
256
crates/benchmarks/benches/utils.rs
Normal file
|
@ -0,0 +1,256 @@
|
|||
#![allow(dead_code)]
|
||||
|
||||
use std::fs::{create_dir_all, remove_dir_all, File};
|
||||
use std::io::{self, BufRead, BufReader, Cursor, Read, Seek};
|
||||
use std::num::ParseFloatError;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
use criterion::BenchmarkId;
|
||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use milli::heed::EnvOpenOptions;
|
||||
use milli::update::{
|
||||
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
||||
};
|
||||
use milli::{Criterion, Filter, Index, Object, TermsMatchingStrategy};
|
||||
use serde_json::Value;
|
||||
|
||||
pub struct Conf<'a> {
|
||||
/// where we are going to create our database.mmdb directory
|
||||
/// each benchmark will first try to delete it and then recreate it
|
||||
pub database_name: &'a str,
|
||||
/// the dataset to be used, it must be an uncompressed csv
|
||||
pub dataset: &'a str,
|
||||
/// The format of the dataset
|
||||
pub dataset_format: &'a str,
|
||||
pub group_name: &'a str,
|
||||
pub queries: &'a [&'a str],
|
||||
/// here you can change which criterion are used and in which order.
|
||||
/// - if you specify something all the base configuration will be thrown out
|
||||
/// - if you don't specify anything (None) the default configuration will be kept
|
||||
pub criterion: Option<&'a [&'a str]>,
|
||||
/// the last chance to configure your database as you want
|
||||
pub configure: fn(&mut Settings),
|
||||
pub filter: Option<&'a str>,
|
||||
pub sort: Option<Vec<&'a str>>,
|
||||
/// enable or disable the optional words on the query
|
||||
pub optional_words: bool,
|
||||
/// primary key, if there is None we'll auto-generate docids for every documents
|
||||
pub primary_key: Option<&'a str>,
|
||||
}
|
||||
|
||||
impl Conf<'_> {
|
||||
pub const BASE: Self = Conf {
|
||||
database_name: "benches.mmdb",
|
||||
dataset_format: "csv",
|
||||
dataset: "",
|
||||
group_name: "",
|
||||
queries: &[],
|
||||
criterion: None,
|
||||
configure: |_| (),
|
||||
filter: None,
|
||||
sort: None,
|
||||
optional_words: true,
|
||||
primary_key: None,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn base_setup(conf: &Conf) -> Index {
|
||||
match remove_dir_all(conf.database_name) {
|
||||
Ok(_) => (),
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
|
||||
Err(e) => panic!("{}", e),
|
||||
}
|
||||
create_dir_all(conf.database_name).unwrap();
|
||||
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
options.max_readers(10);
|
||||
let index = Index::new(options, conf.database_name).unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||
|
||||
if let Some(primary_key) = conf.primary_key {
|
||||
builder.set_primary_key(primary_key.to_string());
|
||||
}
|
||||
|
||||
if let Some(criterion) = conf.criterion {
|
||||
builder.reset_filterable_fields();
|
||||
builder.reset_criteria();
|
||||
builder.reset_stop_words();
|
||||
|
||||
let criterion = criterion.iter().map(|s| Criterion::from_str(s).unwrap()).collect();
|
||||
builder.set_criteria(criterion);
|
||||
}
|
||||
|
||||
(conf.configure)(&mut builder);
|
||||
|
||||
builder.execute(|_| (), || false).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let indexing_config = IndexDocumentsConfig {
|
||||
autogenerate_docids: conf.primary_key.is_none(),
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
..Default::default()
|
||||
};
|
||||
let builder =
|
||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
||||
let documents = documents_from(conf.dataset, conf.dataset_format);
|
||||
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||
user_error.unwrap();
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index
|
||||
}
|
||||
|
||||
pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
|
||||
for conf in confs {
|
||||
let index = base_setup(conf);
|
||||
|
||||
let file_name = Path::new(conf.dataset).file_name().and_then(|f| f.to_str()).unwrap();
|
||||
let name = format!("{}: {}", file_name, conf.group_name);
|
||||
let mut group = c.benchmark_group(&name);
|
||||
|
||||
for &query in conf.queries {
|
||||
group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
|
||||
b.iter(|| {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let mut search = index.search(&rtxn);
|
||||
search.query(query).terms_matching_strategy(TermsMatchingStrategy::default());
|
||||
if let Some(filter) = conf.filter {
|
||||
let filter = Filter::from_str(filter).unwrap().unwrap();
|
||||
search.filter(filter);
|
||||
}
|
||||
if let Some(sort) = &conf.sort {
|
||||
let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect();
|
||||
search.sort_criteria(sort);
|
||||
}
|
||||
let _ids = search.execute().unwrap();
|
||||
});
|
||||
});
|
||||
}
|
||||
group.finish();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
|
||||
let reader = File::open(filename)
|
||||
.unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
|
||||
let reader = BufReader::new(reader);
|
||||
let documents = match filetype {
|
||||
"csv" => documents_from_csv(reader).unwrap(),
|
||||
"json" => documents_from_json(reader).unwrap(),
|
||||
"jsonl" => documents_from_jsonl(reader).unwrap(),
|
||||
otherwise => panic!("invalid update format {:?}", otherwise),
|
||||
};
|
||||
DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
|
||||
}
|
||||
|
||||
fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||
|
||||
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||
let object = result?;
|
||||
documents.append_json_object(&object)?;
|
||||
}
|
||||
|
||||
documents.into_inner().map_err(Into::into)
|
||||
}
|
||||
|
||||
fn documents_from_json(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||
|
||||
documents.append_json_array(reader)?;
|
||||
|
||||
documents.into_inner().map_err(Into::into)
|
||||
}
|
||||
|
||||
fn documents_from_csv(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
|
||||
let csv = csv::Reader::from_reader(reader);
|
||||
|
||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||
documents.append_csv(csv)?;
|
||||
|
||||
documents.into_inner().map_err(Into::into)
|
||||
}
|
||||
|
||||
enum AllowedType {
|
||||
String,
|
||||
Number,
|
||||
}
|
||||
|
||||
fn parse_csv_header(header: &str) -> (String, AllowedType) {
|
||||
// if there are several separators we only split on the last one.
|
||||
match header.rsplit_once(':') {
|
||||
Some((field_name, field_type)) => match field_type {
|
||||
"string" => (field_name.to_string(), AllowedType::String),
|
||||
"number" => (field_name.to_string(), AllowedType::Number),
|
||||
// we may return an error in this case.
|
||||
_otherwise => (header.to_string(), AllowedType::String),
|
||||
},
|
||||
None => (header.to_string(), AllowedType::String),
|
||||
}
|
||||
}
|
||||
|
||||
struct CSVDocumentDeserializer<R>
|
||||
where
|
||||
R: Read,
|
||||
{
|
||||
documents: csv::StringRecordsIntoIter<R>,
|
||||
headers: Vec<(String, AllowedType)>,
|
||||
}
|
||||
|
||||
impl<R: Read> CSVDocumentDeserializer<R> {
|
||||
fn from_reader(reader: R) -> io::Result<Self> {
|
||||
let mut records = csv::Reader::from_reader(reader);
|
||||
|
||||
let headers = records.headers()?.into_iter().map(parse_csv_header).collect();
|
||||
|
||||
Ok(Self { documents: records.into_records(), headers })
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Iterator for CSVDocumentDeserializer<R> {
|
||||
type Item = anyhow::Result<Object>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let csv_document = self.documents.next()?;
|
||||
|
||||
match csv_document {
|
||||
Ok(csv_document) => {
|
||||
let mut document = Object::new();
|
||||
|
||||
for ((field_name, field_type), value) in
|
||||
self.headers.iter().zip(csv_document.into_iter())
|
||||
{
|
||||
let parsed_value: Result<Value, ParseFloatError> = match field_type {
|
||||
AllowedType::Number => {
|
||||
value.parse::<f64>().map(Value::from).map_err(Into::into)
|
||||
}
|
||||
AllowedType::String => Ok(Value::String(value.to_string())),
|
||||
};
|
||||
|
||||
match parsed_value {
|
||||
Ok(value) => drop(document.insert(field_name.to_string(), value)),
|
||||
Err(_e) => {
|
||||
return Some(Err(anyhow::anyhow!(
|
||||
"Value '{}' is not a valid number",
|
||||
value
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some(Ok(document))
|
||||
}
|
||||
Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))),
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue