diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml index 775536524..9536df027 100644 --- a/meilidb-core/Cargo.toml +++ b/meilidb-core/Cargo.toml @@ -11,6 +11,7 @@ crossbeam-channel = "0.3.9" deunicode = "1.0.0" env_logger = "0.7.0" hashbrown = { version = "0.6.0", features = ["serde"] } +lmdb-rkv = "0.12.3" log = "0.4.8" meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" } meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } @@ -41,3 +42,10 @@ features = ["fst_automaton"] [dependencies.fst] git = "https://github.com/Kerollmops/fst.git" branch = "arc-byte-slice" + +[dev-dependencies] +csv = "1.0.7" +rustyline = { version = "5.0.0", default-features = false } +structopt = "0.3.2" +termcolor = "1.0.4" +toml = "0.5.3" diff --git a/meilidb-core/examples/from_file.rs b/meilidb-core/examples/from_file.rs new file mode 100644 index 000000000..ad8382f93 --- /dev/null +++ b/meilidb-core/examples/from_file.rs @@ -0,0 +1,346 @@ +use std::collections::btree_map::{BTreeMap, Entry}; +use std::collections::{HashMap, HashSet}; +use std::error::Error; +use std::io::Write; +use std::iter::FromIterator; +use std::path::{Path, PathBuf}; +use std::time::{Instant, Duration}; +use std::{fs, io}; + +use rustyline::{Editor, Config}; +use serde::{Serialize, Deserialize}; +use structopt::StructOpt; +use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; + +use meilidb_core::{Highlight, Database}; +use meilidb_schema::SchemaAttr; + +const INDEX_NAME: &str = "default"; + +#[derive(Debug, StructOpt)] +struct IndexCommand { + /// The destination where the database must be created. + #[structopt(parse(from_os_str))] + database_path: PathBuf, + + /// The csv file to index. + #[structopt(parse(from_os_str))] + csv_data_path: PathBuf, + + /// The path to the schema. + #[structopt(long, parse(from_os_str))] + schema: PathBuf, + + #[structopt(long)] + update_group_size: Option, +} + +#[derive(Debug, StructOpt)] +struct SearchCommand { + /// The destination where the database must be created. + #[structopt(parse(from_os_str))] + database_path: PathBuf, + + /// Timeout after which the search will return results. + #[structopt(long)] + fetch_timeout_ms: Option, + + /// The number of returned results + #[structopt(short, long, default_value = "10")] + number_results: usize, + + /// The number of characters before and after the first match + #[structopt(short = "C", long, default_value = "35")] + char_context: usize, + + /// Fields that must be displayed. + displayed_fields: Vec, +} + +#[derive(Debug, StructOpt)] +enum Command { + Index(IndexCommand), + Search(SearchCommand), +} + +impl Command { + fn path(&self) -> &Path { + match self { + Command::Index(command) => &command.database_path, + Command::Search(command) => &command.database_path, + } + } +} + +#[derive(Serialize, Deserialize)] +#[serde(transparent)] +struct Document(HashMap); + +fn index_command(command: IndexCommand, database: Database) -> Result<(), Box> { + let start = Instant::now(); + + let rkv = database.rkv.read().unwrap(); + let index = database.open_index(INDEX_NAME)?; + + let schema = { + let string = fs::read_to_string(&command.schema)?; + toml::from_str(&string).unwrap() + }; + + let writer = rkv.write().unwrap(); + match index.main.schema(&writer)? { + Some(current_schema) => { + if current_schema != schema { + return Err(meilidb_core::Error::SchemaDiffer.into()) + } + writer.abort(); + }, + None => index.schema_update(writer, schema)?, + } + + let mut rdr = csv::Reader::from_path(command.csv_data_path)?; + let mut raw_record = csv::StringRecord::new(); + let headers = rdr.headers()?.clone(); + + let mut max_update_id = 0; + let mut i = 0; + let mut end_of_file = false; + + while !end_of_file { + let mut additions = index.documents_addition(); + + loop { + end_of_file = !rdr.read_record(&mut raw_record)?; + if end_of_file { break } + + let document: Document = match raw_record.deserialize(Some(&headers)) { + Ok(document) => document, + Err(e) => { + eprintln!("{:?}", e); + continue; + } + }; + + additions.update_document(document); + + print!("\rindexing document {}", i); + i += 1; + + if let Some(group_size) = command.update_group_size { + if i % group_size == 0 { break } + } + } + + println!(); + + let writer = rkv.write().unwrap(); + println!("committing update..."); + let update_id = additions.finalize(writer)?; + max_update_id = max_update_id.max(update_id); + } + + loop { + println!("Waiting for update {}", max_update_id); + + let reader = rkv.read().unwrap(); + if let Some(_) = index.updates_results.update_result(&reader, max_update_id)? { + break + } + std::thread::sleep(std::time::Duration::from_secs(1)); + } + + println!("database created in {:.2?} at: {:?}", start.elapsed(), command.database_path); + + Ok(()) +} + +fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> { + let mut stdout = StandardStream::stdout(ColorChoice::Always); + let mut highlighted = false; + + for range in ranges.windows(2) { + let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() }; + if highlighted { + stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?; + } + write!(&mut stdout, "{}", &text[start..end])?; + stdout.reset()?; + highlighted = !highlighted; + } + + Ok(()) +} + +fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) { + let mut byte_index = 0; + let mut byte_length = 0; + + for (n, (i, c)) in text.char_indices().enumerate() { + if n == index { + byte_index = i; + } + + if n + 1 == index + length { + byte_length = i - byte_index + c.len_utf8(); + break; + } + } + + (byte_index, byte_length) +} + +fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec { + let mut byte_indexes = BTreeMap::new(); + + for highlight in highlights { + let char_index = highlight.char_index as usize; + let char_length = highlight.char_length as usize; + let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text); + + match byte_indexes.entry(byte_index) { + Entry::Vacant(entry) => { entry.insert(byte_length); }, + Entry::Occupied(mut entry) => { + if *entry.get() < byte_length { + entry.insert(byte_length); + } + }, + } + } + + let mut title_areas = Vec::new(); + title_areas.push(0); + for (byte_index, length) in byte_indexes { + title_areas.push(byte_index); + title_areas.push(byte_index + length); + } + title_areas.push(text.len()); + title_areas.sort_unstable(); + title_areas +} + +/// note: matches must have been sorted by `char_index` and `char_length` before being passed. +/// +/// ```no_run +/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length)); +/// +/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned(); +/// +/// let (text, matches) = crop_text(&text, matches, 35); +/// ``` +fn crop_text( + text: &str, + highlights: impl IntoIterator, + context: usize, +) -> (String, Vec) +{ + let mut highlights = highlights.into_iter().peekable(); + + let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0); + let start = char_index.saturating_sub(context); + let text = text.chars().skip(start).take(context * 2).collect(); + + let highlights = highlights + .take_while(|m| { + (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2) + }) + .map(|highlight| { + Highlight { char_index: highlight.char_index - start as u16, ..highlight } + }) + .collect(); + + (text, highlights) +} + +fn search_command(command: SearchCommand, database: Database) -> Result<(), Box> { + let rkv = database.rkv.read().unwrap(); + let index = database.open_index(INDEX_NAME)?; + let reader = rkv.read().unwrap(); + + let schema = index.main.schema(&reader)?; + let schema = schema.ok_or(meilidb_core::Error::SchemaMissing)?; + + let fields = command.displayed_fields.iter().map(String::as_str); + let fields = HashSet::from_iter(fields); + + let config = Config::builder().auto_add_history(true).build(); + let mut readline = Editor::<()>::with_config(config); + let _ = readline.load_history("query-history.txt"); + + for result in readline.iter("Searching for: ") { + match result { + Ok(query) => { + let start_total = Instant::now(); + + let builder = index.query_builder(); + let documents = builder.query(&reader, &query, 0..command.number_results)?; + + let mut retrieve_duration = Duration::default(); + + let number_of_documents = documents.len(); + for mut doc in documents { + + doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length)); + + let start_retrieve = Instant::now(); + let result = index.document::(&reader, Some(&fields), doc.id); + retrieve_duration += start_retrieve.elapsed(); + + match result { + Ok(Some(document)) => { + println!("raw-id: {:?}", doc.id); + for (name, text) in document.0 { + print!("{}: ", name); + + let attr = schema.attribute(&name).unwrap(); + let highlights = doc.highlights.iter() + .filter(|m| SchemaAttr::new(m.attribute) == attr) + .cloned(); + let (text, highlights) = crop_text(&text, highlights, command.char_context); + let areas = create_highlight_areas(&text, &highlights); + display_highlights(&text, &areas)?; + println!(); + } + }, + Ok(None) => eprintln!("missing document"), + Err(e) => eprintln!("{}", e), + } + + let mut matching_attributes = HashSet::new(); + for highlight in doc.highlights { + let attr = SchemaAttr::new(highlight.attribute); + let name = schema.attribute_name(attr); + matching_attributes.insert(name); + } + + let matching_attributes = Vec::from_iter(matching_attributes); + println!("matching in: {:?}", matching_attributes); + + println!(); + } + + eprintln!("document field retrieve took {:.2?}", retrieve_duration); + eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed()); + }, + Err(err) => { + println!("Error: {:?}", err); + break + } + } + } + + readline.save_history("query-history.txt").unwrap(); + + Ok(()) +} + +fn main() -> Result<(), Box> { + env_logger::init(); + + let opt = Command::from_args(); + let database = Database::open_or_create(opt.path())?; + + match opt { + Command::Index(command) => index_command(command, database), + Command::Search(command) => search_command(command, database), + } +} diff --git a/meilidb-core/src/main.rs b/meilidb-core/src/main.rs deleted file mode 100644 index 18c570a83..000000000 --- a/meilidb-core/src/main.rs +++ /dev/null @@ -1,78 +0,0 @@ -use std::{fs, path::Path}; - -use serde_json::json; -use rkv::{Manager, Rkv, SingleStore, Value, StoreOptions}; - -use meilidb_core::{Database, MResult}; -use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED}; - -fn main() -> MResult<()> { - env_logger::init(); - - let path = Path::new("test.rkv"); - let database = Database::open_or_create(path)?; - let rkv = database.rkv.read().unwrap(); - println!("{:?}", database.indexes_names()); - - let hello = database.open_index("hello")?; - let hello1 = database.open_index("hello1")?; - let hello2 = database.open_index("hello2")?; - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("alpha", DISPLAYED); - builder.new_attribute("beta", DISPLAYED | INDEXED); - builder.new_attribute("gamma", INDEXED); - let schema = builder.build(); - - let writer = rkv.write()?; - - hello.schema_update(writer, schema)?; - - let object = json!({ - "id": 23, - "beta": "hello", - }); - - let mut additions = hello.documents_addition(); - additions.extend(vec![object]); - - let writer = rkv.write()?; - - additions.finalize(writer)?; - - // { - // let mut writer = env.write().unwrap(); - // let mut raw_indexer = RawIndexer::new(); - - // let docid = DocumentId(0); - // let attr = SchemaAttr(0); - // let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !"; - // raw_indexer.index_text(docid, attr, text); - - // let Indexed { words_doc_indexes, .. } = raw_indexer.build(); - - // let mut fst_builder = fst::SetBuilder::memory(); - // fst_builder.extend_iter(words_doc_indexes.keys()).unwrap(); - // let bytes = fst_builder.into_inner().unwrap(); - // let fst = fst::raw::Fst::from_bytes(bytes).unwrap(); - // let fst = fst::Set::from(fst); - - // words.put_words_fst(&mut writer, &fst).unwrap(); - - // for (word, indexes) in words_doc_indexes { - // words.put_words_indexes(&mut writer, &word, &indexes).unwrap(); - // } - - // writer.commit().unwrap(); - // } - - let builder = hello.query_builder(); - let reader = rkv.read().unwrap(); - let documents = builder.query(&reader, "hello", 0..20).unwrap(); - - println!("{:?}", documents); - - std::thread::sleep(std::time::Duration::from_secs(2)); - - Ok(()) -}