Add example targets to the milli crate

This commit is contained in:
Loïc Lecrenier 2023-03-22 14:50:41 +01:00
parent a86aeba411
commit 01c7d2de8f
3 changed files with 274 additions and 0 deletions

119
milli/examples/index.rs Normal file
View File

@@ -0,0 +1,119 @@
use std::{
error::Error,
fs::File,
io::{BufRead, BufReader, Cursor, Seek},
time::Duration,
};
use heed::EnvOpenOptions;
use milli::{
documents::{DocumentsBatchBuilder, DocumentsBatchReader},
update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings},
Criterion, Index, Object,
};
/// Indexes a hard-coded local dataset into a milli index, then closes it.
///
/// This is a development example: the dataset path, primary key and
/// searchable fields target a Crunchbase-style `organizations.csv` file.
/// Errors are propagated with `?` instead of `unwrap()` so the
/// `Result<(), Box<dyn Error>>` signature is actually honored.
fn main() -> Result<(), Box<dyn Error>> {
    // Reserve 100 GB of virtual address space for the LMDB environment;
    // pages are only materialized as data is written.
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

    // Open (or create) the index directory.
    let index = Index::new(options, "data_organizations")?;
    let mut wtxn = index.write_txn()?;

    let primary_key = "uuid";
    // Alternative field sets used with other datasets:
    // let searchable_fields = vec!["body", "title", "url"];
    // let searchable_fields = vec!["title", "overview"];
    let searchable_fields =
        vec!["name", "primary_role", "city", "region", "country_code", "short_description"];

    // Configure the index settings before adding any document.
    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut wtxn, &index, &config);
    builder.set_primary_key(primary_key.to_owned());
    let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);
    builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
    builder.execute(|_| (), || false)?;

    // Index the documents themselves, in the same write transaction.
    let config = IndexerConfig::default();
    let indexing_config = IndexDocumentsConfig::default();
    let builder =
        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)?;
    let documents = documents_from(
        // "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
        "/Users/meilisearch/Documents/datasets/organizations.csv",
        // "json"
        "csv",
    );
    let (builder, user_error) = builder.add_documents(documents)?;
    // `add_documents` reports document-level errors separately from
    // transactional ones; surface both instead of panicking.
    user_error?;
    builder.execute()?;
    wtxn.commit()?;

    index.prepare_for_closing().wait();
    Ok(())
}
/// Reads the dataset at `filename` and returns it as a seekable batch of
/// documents ready to be indexed.
///
/// `filetype` selects the parser: `"csv"`, `"json"` or `"jsonl"`; any other
/// value aborts the program, as does a missing file.
fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
    let file = File::open(filename)
        .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
    let buffered = BufReader::new(file);
    // Convert the raw payload into the documents-batch binary format.
    let payload = if filetype == "csv" {
        documents_from_csv(buffered).unwrap()
    } else if filetype == "json" {
        documents_from_json(buffered).unwrap()
    } else if filetype == "jsonl" {
        documents_from_jsonl(buffered).unwrap()
    } else {
        panic!("invalid update format {:?}", filetype)
    };
    DocumentsBatchReader::from_reader(Cursor::new(payload)).unwrap()
}
/// Converts a newline-delimited JSON stream into the documents-batch format.
fn documents_from_jsonl(reader: impl BufRead) -> milli::Result<Vec<u8>> {
    let mut batch = DocumentsBatchBuilder::new(Vec::new());
    // Deserialize one JSON object at a time and append it to the batch;
    // a malformed object is a fatal error for this example (unwrap).
    let objects = serde_json::Deserializer::from_reader(reader).into_iter::<Object>();
    for object in objects {
        batch.append_json_object(&object.unwrap())?;
    }
    batch.into_inner().map_err(Into::into)
}
/// Converts a single JSON array of objects into the documents-batch format.
fn documents_from_json(reader: impl BufRead) -> milli::Result<Vec<u8>> {
    let mut batch = DocumentsBatchBuilder::new(Vec::new());
    batch.append_json_array(reader)?;
    batch.into_inner().map_err(Into::into)
}
/// Converts CSV data into the documents-batch format.
fn documents_from_csv(reader: impl BufRead) -> milli::Result<Vec<u8>> {
    let mut batch = DocumentsBatchBuilder::new(Vec::new());
    batch.append_csv(csv::Reader::from_reader(reader))?;
    batch.into_inner().map_err(Into::into)
}

124
milli/examples/search.rs Normal file
View File

@@ -0,0 +1,124 @@
// use crate::allocator::ALLOC;
use std::error::Error;
use std::io::stdin;
use std::time::Instant;
use heed::EnvOpenOptions;
use milli::{
execute_search, DefaultSearchLogger, Index, Search, SearchContext, TermsMatchingStrategy,
};
// Use mimalloc as the global allocator for this example binary.
#[global_allocator]
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
/// Interactive REPL comparing the new and the old search implementations.
///
/// Takes the index directory as the first CLI argument, then reads one
/// query per line from stdin. Each query is run ten times through
/// `execute_search` (new pipeline) and through `Search` (old pipeline),
/// printing microsecond timings and the returned document ids for both.
fn main() -> Result<(), Box<dyn Error>> {
    // TODO: command line
    let mut args = std::env::args();
    let _ = args.next().unwrap(); // skip the program name
    let dataset = args.next().unwrap(); // index directory, e.g. "data_wiki"

    // 100 GB virtual map for the LMDB environment.
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

    // Query:
    // disp: 20
    //
    // dasp: 70 words
    // dosp: 80
    // dasc: 80
    //
    //
    // daspouyerf
    // daspojewkfb

    let index = Index::new(options, dataset)?;
    let txn = index.read_txn()?;
    let mut query = String::new();
    // `read_line` returns 0 at EOF, which terminates the REPL.
    while stdin().read_line(&mut query)? > 0 {
        // Run each query 10 times so the timings stabilize.
        for _ in 0..10 {
            // --- new search pipeline ---
            let start = Instant::now();
            // let mut logger = milli::DetailedSearchLogger::new("log");
            let mut ctx = SearchContext::new(&index, &txn);
            let docs = execute_search(
                &mut ctx,
                query.trim(),
                // what a the from which when there is
                TermsMatchingStrategy::Last,
                None,
                0,
                20,
                &mut DefaultSearchLogger,
                &mut DefaultSearchLogger,
                // &mut logger,
            )?;
            // logger.write_d2_description(&mut ctx);
            let elapsed = start.elapsed();
            println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
            // let documents = index
            //     .documents(&txn, docs.documents_ids.iter().copied())
            //     .unwrap()
            //     .into_iter()
            //     .map(|(id, obkv)| {
            //         let mut object = serde_json::Map::default();
            //         for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
            //             let value = obkv.get(fid).unwrap();
            //             let value: serde_json::Value = serde_json::from_slice(value).unwrap();
            //             object.insert(fid_name.to_owned(), value);
            //         }
            //         (id, serde_json::to_string_pretty(&object).unwrap())
            //     })
            //     .collect::<Vec<_>>();
            // println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
            // for (id, document) in documents {
            //     println!("{id}:");
            //     println!("{document}");
            // }

            // --- old search pipeline, same query for comparison ---
            let start = Instant::now();
            let mut s = Search::new(&txn, &index);
            s.query(
                // "which a the releases from poison by the government",
                // "sun flower s are the best",
                query.trim(),
            );
            s.terms_matching_strategy(TermsMatchingStrategy::Last);
            // s.limit(1);
            // s.criterion_implementation_strategy(
            //     milli::CriterionImplementationStrategy::OnlySetBased,
            // );
            let docs = s.execute().unwrap();
            let elapsed = start.elapsed();
            println!("old: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
            // let documents = index
            //     .documents(&txn, docs.documents_ids.iter().copied())
            //     .unwrap()
            //     .into_iter()
            //     .map(|(id, obkv)| {
            //         let mut object = serde_json::Map::default();
            //         for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
            //             let value = obkv.get(fid).unwrap();
            //             let value: serde_json::Value = serde_json::from_slice(value).unwrap();
            //             object.insert(fid_name.to_owned(), value);
            //         }
            //         (id, serde_json::to_string_pretty(&object).unwrap())
            //     })
            //     .collect::<Vec<_>>();
            // println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
            // for (id, document) in documents {
            //     println!("{id}:");
            //     println!("{document}");
            // }
        }
        // `read_line` appends, so clear the buffer before the next query.
        query.clear();
    }
    // for (id, document) in documents {
    //     println!("{id}:");
    //     // println!("{document}");
    // }
    Ok(())
}

31
milli/examples/settings.rs Normal file
View File

@@ -0,0 +1,31 @@
// use big_s::S;
use heed::EnvOpenOptions;
// use maplit::hashset;
use milli::{
update::{IndexerConfig, Settings},
Criterion, Index,
};
/// One-shot settings update for the `data_wiki` index: set the ranking
/// rules to words > typo > proximity.
fn main() {
    // 100 GB virtual map for the LMDB environment.
    let mut env_options = EnvOpenOptions::new();
    env_options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

    let index = Index::new(env_options, "data_wiki").unwrap();
    let mut wtxn = index.write_txn().unwrap();

    // Apply the new ranking rules inside a single write transaction.
    let indexer_config = IndexerConfig::default();
    let mut settings = Settings::new(&mut wtxn, &index, &indexer_config);
    let ranking_rules = vec![Criterion::Words, Criterion::Typo, Criterion::Proximity];
    settings.set_criteria(ranking_rules);
    settings.execute(|_| (), || false).unwrap();

    wtxn.commit().unwrap();
}