mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Add example targets to the milli crate
This commit is contained in:
parent
a86aeba411
commit
01c7d2de8f
119
milli/examples/index.rs
Normal file
119
milli/examples/index.rs
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
use std::{
|
||||||
|
error::Error,
|
||||||
|
fs::File,
|
||||||
|
io::{BufRead, BufReader, Cursor, Seek},
|
||||||
|
time::Duration,
|
||||||
|
};
|
||||||
|
|
||||||
|
use heed::EnvOpenOptions;
|
||||||
|
use milli::{
|
||||||
|
documents::{DocumentsBatchBuilder, DocumentsBatchReader},
|
||||||
|
update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings},
|
||||||
|
Criterion, Index, Object,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn main() -> Result<(), Box<dyn Error>> {
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||||
|
|
||||||
|
let index = Index::new(options, "data_organizations").unwrap();
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
let primary_key = "uuid";
|
||||||
|
// let searchable_fields = vec!["body", "title", "url"];
|
||||||
|
// let searchable_fields = vec!["title", "overview"];
|
||||||
|
let searchable_fields =
|
||||||
|
vec!["name", "primary_role", "city", "region", "country_code", "short_description"];
|
||||||
|
// let filterable_fields = vec!["release_date", "genres"];
|
||||||
|
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||||
|
builder.set_primary_key(primary_key.to_owned());
|
||||||
|
let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
|
||||||
|
builder.set_searchable_fields(searchable_fields);
|
||||||
|
// let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
|
||||||
|
// builder.set_filterable_fields(filterable_fields);
|
||||||
|
|
||||||
|
// builder.set_min_word_len_one_typo(5);
|
||||||
|
// builder.set_min_word_len_two_typos(100);
|
||||||
|
builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
|
||||||
|
builder.execute(|_| (), || false).unwrap();
|
||||||
|
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
let builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
||||||
|
|
||||||
|
let documents = documents_from(
|
||||||
|
// "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
|
||||||
|
"/Users/meilisearch/Documents/datasets/organizations.csv",
|
||||||
|
// "json"
|
||||||
|
"csv",
|
||||||
|
);
|
||||||
|
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
|
builder.execute().unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
// let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
// let mut wtxn = index.write_txn().unwrap();
|
||||||
|
// let config = IndexerConfig::default();
|
||||||
|
// let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
// let builder =
|
||||||
|
// IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
||||||
|
|
||||||
|
// let documents = documents_from("test_doc.json", "json");
|
||||||
|
// let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||||
|
// user_error.unwrap();
|
||||||
|
// builder.execute().unwrap();
|
||||||
|
// wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
// let _ = index.all_documents(&rtxn)?;
|
||||||
|
|
||||||
|
// println!("done!");
|
||||||
|
// std::thread::sleep(Duration::from_secs(100));
|
||||||
|
|
||||||
|
index.prepare_for_closing().wait();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
|
||||||
|
let reader = File::open(filename)
|
||||||
|
.unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
|
||||||
|
let reader = BufReader::new(reader);
|
||||||
|
let documents = match filetype {
|
||||||
|
"csv" => documents_from_csv(reader).unwrap(),
|
||||||
|
"json" => documents_from_json(reader).unwrap(),
|
||||||
|
"jsonl" => documents_from_jsonl(reader).unwrap(),
|
||||||
|
otherwise => panic!("invalid update format {:?}", otherwise),
|
||||||
|
};
|
||||||
|
DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn documents_from_jsonl(reader: impl BufRead) -> milli::Result<Vec<u8>> {
|
||||||
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
|
||||||
|
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||||
|
let object = result.unwrap();
|
||||||
|
documents.append_json_object(&object)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
documents.into_inner().map_err(Into::into)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn documents_from_json(reader: impl BufRead) -> milli::Result<Vec<u8>> {
|
||||||
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
|
||||||
|
documents.append_json_array(reader)?;
|
||||||
|
|
||||||
|
documents.into_inner().map_err(Into::into)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn documents_from_csv(reader: impl BufRead) -> milli::Result<Vec<u8>> {
|
||||||
|
let csv = csv::Reader::from_reader(reader);
|
||||||
|
|
||||||
|
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
documents.append_csv(csv)?;
|
||||||
|
|
||||||
|
documents.into_inner().map_err(Into::into)
|
||||||
|
}
|
124
milli/examples/search.rs
Normal file
124
milli/examples/search.rs
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
// use crate::allocator::ALLOC;
|
||||||
|
use std::error::Error;
|
||||||
|
use std::io::stdin;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
use heed::EnvOpenOptions;
|
||||||
|
use milli::{
|
||||||
|
execute_search, DefaultSearchLogger, Index, Search, SearchContext, TermsMatchingStrategy,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Registers mimalloc as the global allocator of this example binary.
#[global_allocator]
|
||||||
|
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||||
|
|
||||||
|
fn main() -> Result<(), Box<dyn Error>> {
|
||||||
|
// TODO: command line
|
||||||
|
let mut args = std::env::args();
|
||||||
|
let _ = args.next().unwrap();
|
||||||
|
let dataset = args.next().unwrap();
|
||||||
|
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||||
|
|
||||||
|
// Query:
|
||||||
|
// disp: 20
|
||||||
|
//
|
||||||
|
// dasp: 70 words
|
||||||
|
// dosp: 80
|
||||||
|
// dasc: 80
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// daspouyerf
|
||||||
|
// daspojewkfb
|
||||||
|
|
||||||
|
let index = Index::new(options, dataset)?;
|
||||||
|
let txn = index.read_txn()?;
|
||||||
|
let mut query = String::new();
|
||||||
|
while stdin().read_line(&mut query)? > 0 {
|
||||||
|
for _ in 0..10 {
|
||||||
|
let start = Instant::now();
|
||||||
|
// let mut logger = milli::DetailedSearchLogger::new("log");
|
||||||
|
let mut ctx = SearchContext::new(&index, &txn);
|
||||||
|
let docs = execute_search(
|
||||||
|
&mut ctx,
|
||||||
|
query.trim(),
|
||||||
|
// what a the from which when there is
|
||||||
|
TermsMatchingStrategy::Last,
|
||||||
|
None,
|
||||||
|
0,
|
||||||
|
20,
|
||||||
|
&mut DefaultSearchLogger,
|
||||||
|
&mut DefaultSearchLogger,
|
||||||
|
// &mut logger,
|
||||||
|
)?;
|
||||||
|
// logger.write_d2_description(&mut ctx);
|
||||||
|
let elapsed = start.elapsed();
|
||||||
|
println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
|
||||||
|
|
||||||
|
// let documents = index
|
||||||
|
// .documents(&txn, docs.documents_ids.iter().copied())
|
||||||
|
// .unwrap()
|
||||||
|
// .into_iter()
|
||||||
|
// .map(|(id, obkv)| {
|
||||||
|
// let mut object = serde_json::Map::default();
|
||||||
|
// for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
|
||||||
|
// let value = obkv.get(fid).unwrap();
|
||||||
|
// let value: serde_json::Value = serde_json::from_slice(value).unwrap();
|
||||||
|
// object.insert(fid_name.to_owned(), value);
|
||||||
|
// }
|
||||||
|
// (id, serde_json::to_string_pretty(&object).unwrap())
|
||||||
|
// })
|
||||||
|
// .collect::<Vec<_>>();
|
||||||
|
|
||||||
|
// println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
|
||||||
|
// for (id, document) in documents {
|
||||||
|
// println!("{id}:");
|
||||||
|
// println!("{document}");
|
||||||
|
// }
|
||||||
|
|
||||||
|
let start = Instant::now();
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query(
|
||||||
|
// "which a the releases from poison by the government",
|
||||||
|
// "sun flower s are the best",
|
||||||
|
query.trim(),
|
||||||
|
);
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
// s.limit(1);
|
||||||
|
// s.criterion_implementation_strategy(
|
||||||
|
// milli::CriterionImplementationStrategy::OnlySetBased,
|
||||||
|
// );
|
||||||
|
|
||||||
|
let docs = s.execute().unwrap();
|
||||||
|
let elapsed = start.elapsed();
|
||||||
|
println!("old: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
|
||||||
|
|
||||||
|
// let documents = index
|
||||||
|
// .documents(&txn, docs.documents_ids.iter().copied())
|
||||||
|
// .unwrap()
|
||||||
|
// .into_iter()
|
||||||
|
// .map(|(id, obkv)| {
|
||||||
|
// let mut object = serde_json::Map::default();
|
||||||
|
// for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
|
||||||
|
// let value = obkv.get(fid).unwrap();
|
||||||
|
// let value: serde_json::Value = serde_json::from_slice(value).unwrap();
|
||||||
|
// object.insert(fid_name.to_owned(), value);
|
||||||
|
// }
|
||||||
|
// (id, serde_json::to_string_pretty(&object).unwrap())
|
||||||
|
// })
|
||||||
|
// .collect::<Vec<_>>();
|
||||||
|
// println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
|
||||||
|
// for (id, document) in documents {
|
||||||
|
// println!("{id}:");
|
||||||
|
// println!("{document}");
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
query.clear();
|
||||||
|
}
|
||||||
|
// for (id, document) in documents {
|
||||||
|
// println!("{id}:");
|
||||||
|
// // println!("{document}");
|
||||||
|
// }
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
31
milli/examples/settings.rs
Normal file
31
milli/examples/settings.rs
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
// use big_s::S;
|
||||||
|
use heed::EnvOpenOptions;
|
||||||
|
// use maplit::hashset;
|
||||||
|
use milli::{
|
||||||
|
update::{IndexerConfig, Settings},
|
||||||
|
Criterion, Index,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||||
|
|
||||||
|
let index = Index::new(options, "data_wiki").unwrap();
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||||
|
|
||||||
|
// builder.set_min_word_len_one_typo(5);
|
||||||
|
// builder.set_min_word_len_two_typos(7);
|
||||||
|
// builder.set_sortable_fields(hashset! { S("release_date") });
|
||||||
|
builder.set_criteria(vec![
|
||||||
|
Criterion::Words,
|
||||||
|
Criterion::Typo,
|
||||||
|
Criterion::Proximity,
|
||||||
|
// Criterion::Asc("release_date".to_owned()),
|
||||||
|
]);
|
||||||
|
|
||||||
|
builder.execute(|_| (), || false).unwrap();
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user