diff --git a/Cargo.toml b/Cargo.toml index ae85ae7c5..e15fbb6cf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ bincode = "1.0" byteorder = "1.2" crossbeam = "0.6" fst = "0.3" -hashbrown = "0.1" +hashbrown = { version = "0.1", features = ["serde"] } lazy_static = "1.1" levenshtein_automata = { version = "0.1", features = ["fst_automaton"] } linked-hash-map = { version = "0.5", features = ["serde_impl"] } diff --git a/examples/create-database.rs b/examples/create-database.rs index 07ffeb931..9a2784586 100644 --- a/examples/create-database.rs +++ b/examples/create-database.rs @@ -1,41 +1,35 @@ use std::path::{Path, PathBuf}; use std::error::Error; +use std::borrow::Cow; +use std::fs::File; +use hashbrown::HashMap; use serde_derive::{Serialize, Deserialize}; use structopt::StructOpt; -use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED}; -use meilidb::database::UpdateBuilder; +use meilidb::database::{Database, Schema, UpdateBuilder}; use meilidb::tokenizer::DefaultBuilder; -use meilidb::database::Database; #[derive(Debug, StructOpt)] pub struct Opt { - /// The destination where the database must be created + /// The destination where the database must be created. #[structopt(parse(from_os_str))] pub database_path: PathBuf, /// The csv file to index. #[structopt(parse(from_os_str))] pub csv_data_path: PathBuf, + + /// The path to the schema. 
+ #[structopt(long = "schema", parse(from_os_str))] + pub schema_path: PathBuf, } -#[derive(Debug, Serialize, Deserialize)] -struct Document<'a> { - id: &'a str, - title: &'a str, - description: &'a str, - image: &'a str, -} - -fn create_schema() -> Schema { - let mut schema = SchemaBuilder::with_identifier("id"); - schema.new_attribute("id", STORED); - schema.new_attribute("title", STORED | INDEXED); - schema.new_attribute("description", STORED | INDEXED); - schema.new_attribute("image", STORED); - schema.build() -} +#[derive(Serialize, Deserialize)] +struct Document<'a> ( + #[serde(borrow)] + HashMap, Cow<'a, str>> +); fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result> { let database = Database::create(database_path, schema.clone())?; @@ -71,7 +65,10 @@ fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result Result<(), Box> { let opt = Opt::from_args(); - let schema = create_schema(); + let schema = { + let file = File::open(&opt.schema_path)?; + Schema::from_toml(file)? 
+ }; let (elapsed, result) = elapsed::measure_time(|| { index(schema, &opt.database_path, &opt.csv_data_path) @@ -82,6 +79,5 @@ fn main() -> Result<(), Box> { } println!("database created in {} at: {:?}", elapsed, opt.database_path); - Ok(()) } diff --git a/examples/query-database.rs b/examples/query-database.rs index e6fb6ee93..4571d242e 100644 --- a/examples/query-database.rs +++ b/examples/query-database.rs @@ -1,11 +1,14 @@ +use std::collections::btree_map::{BTreeMap, Entry}; +use std::iter::FromIterator; use std::io::{self, Write}; use std::path::PathBuf; use std::error::Error; +use hashbrown::{HashMap, HashSet}; use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; -use serde_derive::{Serialize, Deserialize}; use structopt::StructOpt; +use meilidb::database::schema::SchemaAttr; use meilidb::database::Database; use meilidb::Match; @@ -15,18 +18,15 @@ pub struct Opt { #[structopt(parse(from_os_str))] pub database_path: PathBuf, + /// Fields that must be displayed. 
+ pub displayed_fields: Vec, + /// The number of returned results #[structopt(short = "n", long = "number-results", default_value = "10")] pub number_results: usize, } -#[derive(Debug, Serialize, Deserialize)] -struct Document { - id: String, - title: String, - description: String, - image: String, -} +type Document = HashMap; fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> { let mut stdout = StandardStream::stdout(ColorChoice::Always); @@ -45,20 +45,30 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> { Ok(()) } -fn create_highlight_areas(text: &str, matches: &[Match], attribute: u16) -> Vec { - let mut title_areas = Vec::new(); +fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec { + let mut byte_indexes = BTreeMap::new(); - title_areas.push(0); for match_ in matches { - if match_.attribute.attribute() == attribute { + let match_attribute = match_.attribute.attribute(); + if SchemaAttr::new(match_attribute) == attribute { let word_area = match_.word_area; let byte_index = word_area.byte_index() as usize; let length = word_area.length() as usize; - title_areas.push(byte_index); - title_areas.push(byte_index + length); + match byte_indexes.entry(byte_index) { + Entry::Vacant(entry) => { entry.insert(length); }, + Entry::Occupied(mut entry) => if *entry.get() < length { entry.insert(length); }, + } } } + + let mut title_areas = Vec::new(); + title_areas.push(0); + for (byte_index, length) in byte_indexes { + title_areas.push(byte_index); + title_areas.push(byte_index + length); + } title_areas.push(text.len()); + title_areas.sort_unstable(); title_areas } @@ -80,6 +90,7 @@ fn main() -> Result<(), Box> { let query = buffer.trim_end_matches('\n'); let view = database.view(); + let schema = view.schema(); let (elapsed, documents) = elapsed::measure_time(|| { let builder = view.query_builder().unwrap(); @@ -90,22 +101,39 @@ fn main() -> Result<(), Box> { for doc in documents { match 
view.document_by_id::(doc.id) { Ok(document) => { + for name in &opt.displayed_fields { + let attr = match schema.attribute(name) { + Some(attr) => attr, + None => continue, + }; + let text = match document.get(name) { + Some(text) => text, + None => continue, + }; - print!("title: "); - let title_areas = create_highlight_areas(&document.title, &doc.matches, 1); - display_highlights(&document.title, &title_areas)?; - println!(); - - print!("description: "); - let description_areas = create_highlight_areas(&document.description, &doc.matches, 2); - display_highlights(&document.description, &description_areas)?; - println!(); + print!("{}: ", name); + let areas = create_highlight_areas(&text, &doc.matches, attr); + display_highlights(&text, &areas)?; + println!(); + } }, Err(e) => eprintln!("{}", e), } + + let mut matching_attributes = HashSet::new(); + for _match in doc.matches { + let attr = SchemaAttr::new(_match.attribute.attribute()); + let name = schema.attribute_name(attr); + matching_attributes.insert(name); + } + + let matching_attributes = Vec::from_iter(matching_attributes); + println!("matching in: {:?}", matching_attributes); + + println!(); } - println!("Found {} results in {}", number_of_documents, elapsed); + println!("===== Found {} results in {} =====", number_of_documents, elapsed); buffer.clear(); } diff --git a/examples/schema-example.toml b/examples/schema-example.toml new file mode 100644 index 000000000..fcf2685e9 --- /dev/null +++ b/examples/schema-example.toml @@ -0,0 +1,19 @@ +# This schema has been generated ... +# The order in which the attributes are declared is important, +# it specifies the attribute xxx... 
+ +identifier = "id" + +[attributes.id] +stored = true + +[attributes.title] +stored = true +indexed = true + +[attributes.description] +stored = true +indexed = true + +[attributes.image] +stored = true diff --git a/src/lib.rs b/src/lib.rs index b43d8d506..ab291afa5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,10 +63,12 @@ impl Attribute { } } + #[inline] pub fn attribute(&self) -> u16 { (self.0 >> 22) as u16 } + #[inline] pub fn word_index(&self) -> u32 { self.0 & 0b0000_0000_0011_1111_1111_1111_1111 } @@ -129,10 +131,12 @@ impl WordArea { } } + #[inline] pub fn byte_index(&self) -> u32 { self.0 >> 10 } + #[inline] pub fn length(&self) -> u16 { (self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16 }