Introduce an example file to test indexing and searching csv documents

2025-07-03 20:07:09 +02:00 · 2019-10-08 14:47:38 +02:00 · 2019-10-08 14:47:38 +02:00 · 2236ebbd42
commit 2236ebbd42
parent 0bfba3e4ba
3 changed files with 354 additions and 78 deletions
--- a/meilidb-core/Cargo.toml
+++ b/meilidb-core/Cargo.toml
@ -11,6 +11,7 @@ crossbeam-channel = "0.3.9"
 deunicode = "1.0.0"
 env_logger = "0.7.0"
 hashbrown = { version = "0.6.0", features = ["serde"] }
+lmdb-rkv = "0.12.3"
 log = "0.4.8"
 meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
 meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
@ -41,3 +42,10 @@ features = ["fst_automaton"]
 [dependencies.fst]
 git = "https://github.com/Kerollmops/fst.git"
 branch = "arc-byte-slice"
+
+[dev-dependencies]
+csv = "1.0.7"
+rustyline = { version = "5.0.0", default-features = false }
+structopt = "0.3.2"
+termcolor = "1.0.4"
+toml = "0.5.3"
--- a/meilidb-core/examples/from_file.rs
+++ b/meilidb-core/examples/from_file.rs
@ -0,0 +1,346 @@
+use std::collections::btree_map::{BTreeMap, Entry};
+use std::collections::{HashMap, HashSet};
+use std::error::Error;
+use std::io::Write;
+use std::iter::FromIterator;
+use std::path::{Path, PathBuf};
+use std::time::{Instant, Duration};
+use std::{fs, io};
+
+use rustyline::{Editor, Config};
+use serde::{Serialize, Deserialize};
+use structopt::StructOpt;
+use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
+
+use meilidb_core::{Highlight, Database};
+use meilidb_schema::SchemaAttr;
+
+const INDEX_NAME: &str = "default";
+
+#[derive(Debug, StructOpt)]
+struct IndexCommand {
+    /// The destination where the database must be created.
+    #[structopt(parse(from_os_str))]
+    database_path: PathBuf,
+
+    /// The csv file to index.
+    #[structopt(parse(from_os_str))]
+    csv_data_path: PathBuf,
+
+    /// The path to the schema.
+    #[structopt(long, parse(from_os_str))]
+    schema: PathBuf,
+
+    #[structopt(long)]
+    update_group_size: Option<usize>,
+}
+
+#[derive(Debug, StructOpt)]
+struct SearchCommand {
+    /// The destination where the database must be created.
+    #[structopt(parse(from_os_str))]
+    database_path: PathBuf,
+
+    /// Timeout after which the search will return results.
+    #[structopt(long)]
+    fetch_timeout_ms: Option<u64>,
+
+    /// The number of returned results
+    #[structopt(short, long, default_value = "10")]
+    number_results: usize,
+
+    /// The number of characters before and after the first match
+    #[structopt(short = "C", long, default_value = "35")]
+    char_context: usize,
+
+    /// Fields that must be displayed.
+    displayed_fields: Vec<String>,
+}
+
+#[derive(Debug, StructOpt)]
+enum Command {
+    Index(IndexCommand),
+    Search(SearchCommand),
+}
+
+impl Command {
+    fn path(&self) -> &Path {
+        match self {
+            Command::Index(command) => &command.database_path,
+            Command::Search(command) => &command.database_path,
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+#[serde(transparent)]
+struct Document(HashMap<String, String>);
+
+fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dyn Error>> {
+    let start = Instant::now();
+
+    let rkv = database.rkv.read().unwrap();
+    let index = database.open_index(INDEX_NAME)?;
+
+    let schema = {
+        let string = fs::read_to_string(&command.schema)?;
+        toml::from_str(&string).unwrap()
+    };
+
+    let writer = rkv.write().unwrap();
+    match index.main.schema(&writer)? {
+        Some(current_schema) => {
+            if current_schema != schema {
+                return Err(meilidb_core::Error::SchemaDiffer.into())
+            }
+            writer.abort();
+        },
+        None => index.schema_update(writer, schema)?,
+    }
+
+    let mut rdr = csv::Reader::from_path(command.csv_data_path)?;
+    let mut raw_record = csv::StringRecord::new();
+    let headers = rdr.headers()?.clone();
+
+    let mut max_update_id = 0;
+    let mut i = 0;
+    let mut end_of_file = false;
+
+    while !end_of_file {
+        let mut additions = index.documents_addition();
+
+        loop {
+            end_of_file = !rdr.read_record(&mut raw_record)?;
+            if end_of_file { break }
+
+            let document: Document = match raw_record.deserialize(Some(&headers)) {
+                Ok(document) => document,
+                Err(e) => {
+                    eprintln!("{:?}", e);
+                    continue;
+                }
+            };
+
+            additions.update_document(document);
+
+            print!("\rindexing document {}", i);
+            i += 1;
+
+            if let Some(group_size) = command.update_group_size {
+                if i % group_size == 0 { break }
+            }
+        }
+
+        println!();
+
+        let writer = rkv.write().unwrap();
+        println!("committing update...");
+        let update_id = additions.finalize(writer)?;
+        max_update_id = max_update_id.max(update_id);
+    }
+
+    loop {
+        println!("Waiting for update {}", max_update_id);
+
+        let reader = rkv.read().unwrap();
+        if let Some(_) = index.updates_results.update_result(&reader, max_update_id)? {
+            break
+        }
+        std::thread::sleep(std::time::Duration::from_secs(1));
+    }
+
+    println!("database created in {:.2?} at: {:?}", start.elapsed(), command.database_path);
+
+    Ok(())
+}
+
+fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
+    let mut stdout = StandardStream::stdout(ColorChoice::Always);
+    let mut highlighted = false;
+
+    for range in ranges.windows(2) {
+        let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
+        if highlighted {
+            stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
+        }
+        write!(&mut stdout, "{}", &text[start..end])?;
+        stdout.reset()?;
+        highlighted = !highlighted;
+    }
+
+    Ok(())
+}
+
+fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
+    let mut byte_index = 0;
+    let mut byte_length = 0;
+
+    for (n, (i, c)) in text.char_indices().enumerate() {
+        if n == index {
+            byte_index = i;
+        }
+
+        if n + 1 == index + length {
+            byte_length = i - byte_index + c.len_utf8();
+            break;
+        }
+    }
+
+    (byte_index, byte_length)
+}
+
+fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
+    let mut byte_indexes = BTreeMap::new();
+
+    for highlight in highlights {
+        let char_index = highlight.char_index as usize;
+        let char_length = highlight.char_length as usize;
+        let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
+
+        match byte_indexes.entry(byte_index) {
+            Entry::Vacant(entry) => { entry.insert(byte_length); },
+            Entry::Occupied(mut entry) => {
+                if *entry.get() < byte_length {
+                    entry.insert(byte_length);
+                }
+            },
+        }
+    }
+
+    let mut title_areas = Vec::new();
+    title_areas.push(0);
+    for (byte_index, length) in byte_indexes {
+        title_areas.push(byte_index);
+        title_areas.push(byte_index + length);
+    }
+    title_areas.push(text.len());
+    title_areas.sort_unstable();
+    title_areas
+}
+
+/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
+///
+/// ```no_run
+/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
+///
+/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
+///
+/// let (text, matches) = crop_text(&text, matches, 35);
+/// ```
+fn crop_text(
+    text: &str,
+    highlights: impl IntoIterator<Item=Highlight>,
+    context: usize,
+) -> (String, Vec<Highlight>)
+{
+    let mut highlights = highlights.into_iter().peekable();
+
+    let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0);
+    let start = char_index.saturating_sub(context);
+    let text = text.chars().skip(start).take(context * 2).collect();
+
+    let highlights = highlights
+        .take_while(|m| {
+            (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
+        })
+        .map(|highlight| {
+            Highlight { char_index: highlight.char_index - start as u16, ..highlight }
+        })
+        .collect();
+
+    (text, highlights)
+}
+
+fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> {
+    let rkv = database.rkv.read().unwrap();
+    let index = database.open_index(INDEX_NAME)?;
+    let reader = rkv.read().unwrap();
+
+    let schema = index.main.schema(&reader)?;
+    let schema = schema.ok_or(meilidb_core::Error::SchemaMissing)?;
+
+    let fields = command.displayed_fields.iter().map(String::as_str);
+    let fields = HashSet::from_iter(fields);
+
+    let config = Config::builder().auto_add_history(true).build();
+    let mut readline = Editor::<()>::with_config(config);
+    let _ = readline.load_history("query-history.txt");
+
+    for result in readline.iter("Searching for: ") {
+        match result {
+            Ok(query) => {
+                let start_total = Instant::now();
+
+                let builder = index.query_builder();
+                let documents = builder.query(&reader, &query, 0..command.number_results)?;
+
+                let mut retrieve_duration = Duration::default();
+
+                let number_of_documents = documents.len();
+                for mut doc in documents {
+
+                    doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
+
+                    let start_retrieve = Instant::now();
+                    let result = index.document::<Document, _>(&reader, Some(&fields), doc.id);
+                    retrieve_duration += start_retrieve.elapsed();
+
+                    match result {
+                        Ok(Some(document)) => {
+                            println!("raw-id: {:?}", doc.id);
+                            for (name, text) in document.0 {
+                                print!("{}: ", name);
+
+                                let attr = schema.attribute(&name).unwrap();
+                                let highlights = doc.highlights.iter()
+                                                .filter(|m| SchemaAttr::new(m.attribute) == attr)
+                                                .cloned();
+                                let (text, highlights) = crop_text(&text, highlights, command.char_context);
+                                let areas = create_highlight_areas(&text, &highlights);
+                                display_highlights(&text, &areas)?;
+                                println!();
+                            }
+                        },
+                        Ok(None) => eprintln!("missing document"),
+                        Err(e) => eprintln!("{}", e),
+                    }
+
+                    let mut matching_attributes = HashSet::new();
+                    for highlight in doc.highlights {
+                        let attr = SchemaAttr::new(highlight.attribute);
+                        let name = schema.attribute_name(attr);
+                        matching_attributes.insert(name);
+                    }
+
+                    let matching_attributes = Vec::from_iter(matching_attributes);
+                    println!("matching in: {:?}", matching_attributes);
+
+                    println!();
+                }
+
+                eprintln!("document field retrieve took {:.2?}", retrieve_duration);
+                eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
+            },
+            Err(err) => {
+                println!("Error: {:?}", err);
+                break
+            }
+        }
+    }
+
+    readline.save_history("query-history.txt").unwrap();
+
+    Ok(())
+}
+
+fn main() -> Result<(), Box<dyn Error>> {
+    env_logger::init();
+
+    let opt = Command::from_args();
+    let database = Database::open_or_create(opt.path())?;
+
+    match opt {
+        Command::Index(command) => index_command(command, database),
+        Command::Search(command) => search_command(command, database),
+    }
+}
--- a/meilidb-core/src/main.rs
+++ b/meilidb-core/src/main.rs
@ -1,78 +0,0 @@
-use std::{fs, path::Path};
-
-use serde_json::json;
-use rkv::{Manager, Rkv, SingleStore, Value, StoreOptions};
-
-use meilidb_core::{Database, MResult};
-use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED};
-
-fn main() -> MResult<()> {
-    env_logger::init();
-
-    let path = Path::new("test.rkv");
-    let database = Database::open_or_create(path)?;
-    let rkv = database.rkv.read().unwrap();
-    println!("{:?}", database.indexes_names());
-
-    let hello = database.open_index("hello")?;
-    let hello1 = database.open_index("hello1")?;
-    let hello2 = database.open_index("hello2")?;
-
-    let mut builder = SchemaBuilder::with_identifier("id");
-    builder.new_attribute("alpha", DISPLAYED);
-    builder.new_attribute("beta", DISPLAYED | INDEXED);
-    builder.new_attribute("gamma", INDEXED);
-    let schema = builder.build();
-
-    let writer = rkv.write()?;
-
-    hello.schema_update(writer, schema)?;
-
-    let object = json!({
-        "id": 23,
-        "beta": "hello",
-    });
-
-    let mut additions = hello.documents_addition();
-    additions.extend(vec![object]);
-
-    let writer = rkv.write()?;
-
-    additions.finalize(writer)?;
-
-    // {
-    //     let mut writer = env.write().unwrap();
-    //     let mut raw_indexer = RawIndexer::new();
-
-    //     let docid = DocumentId(0);
-    //     let attr = SchemaAttr(0);
-    //     let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
-    //     raw_indexer.index_text(docid, attr, text);
-
-    //     let Indexed { words_doc_indexes, .. } = raw_indexer.build();
-
-    //     let mut fst_builder = fst::SetBuilder::memory();
-    //     fst_builder.extend_iter(words_doc_indexes.keys()).unwrap();
-    //     let bytes = fst_builder.into_inner().unwrap();
-    //     let fst = fst::raw::Fst::from_bytes(bytes).unwrap();
-    //     let fst = fst::Set::from(fst);
-
-    //     words.put_words_fst(&mut writer, &fst).unwrap();
-
-    //     for (word, indexes) in words_doc_indexes {
-    //         words.put_words_indexes(&mut writer, &word, &indexes).unwrap();
-    //     }
-
-    //     writer.commit().unwrap();
-    // }
-
-    let builder = hello.query_builder();
-    let reader = rkv.read().unwrap();
-    let documents = builder.query(&reader, "hello", 0..20).unwrap();
-
-    println!("{:?}", documents);
-
-    std::thread::sleep(std::time::Duration::from_secs(2));
-
-    Ok(())
-}