doc: Add examples for runtime defined data and Schema

This commit is contained in:
Clément Renault 2018-12-29 12:26:33 +01:00
parent a842e647f7
commit 20b5a6a06e
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
5 changed files with 94 additions and 47 deletions

View File

@ -9,7 +9,7 @@ bincode = "1.0"
byteorder = "1.2" byteorder = "1.2"
crossbeam = "0.6" crossbeam = "0.6"
fst = "0.3" fst = "0.3"
hashbrown = "0.1" hashbrown = { version = "0.1", features = ["serde"] }
lazy_static = "1.1" lazy_static = "1.1"
levenshtein_automata = { version = "0.1", features = ["fst_automaton"] } levenshtein_automata = { version = "0.1", features = ["fst_automaton"] }
linked-hash-map = { version = "0.5", features = ["serde_impl"] } linked-hash-map = { version = "0.5", features = ["serde_impl"] }

View File

@ -1,41 +1,35 @@
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::error::Error; use std::error::Error;
use std::borrow::Cow;
use std::fs::File;
use hashbrown::HashMap;
use serde_derive::{Serialize, Deserialize}; use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt; use structopt::StructOpt;
use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED}; use meilidb::database::{Database, Schema, UpdateBuilder};
use meilidb::database::UpdateBuilder;
use meilidb::tokenizer::DefaultBuilder; use meilidb::tokenizer::DefaultBuilder;
use meilidb::database::Database;
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
pub struct Opt { pub struct Opt {
/// The destination where the database must be created /// The destination where the database must be created.
#[structopt(parse(from_os_str))] #[structopt(parse(from_os_str))]
pub database_path: PathBuf, pub database_path: PathBuf,
/// The csv file to index. /// The csv file to index.
#[structopt(parse(from_os_str))] #[structopt(parse(from_os_str))]
pub csv_data_path: PathBuf, pub csv_data_path: PathBuf,
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
} }
#[derive(Debug, Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
struct Document<'a> { struct Document<'a> (
id: &'a str, #[serde(borrow)]
title: &'a str, HashMap<Cow<'a, str>, Cow<'a, str>>
description: &'a str, );
image: &'a str,
}
fn create_schema() -> Schema {
let mut schema = SchemaBuilder::with_identifier("id");
schema.new_attribute("id", STORED);
schema.new_attribute("title", STORED | INDEXED);
schema.new_attribute("description", STORED | INDEXED);
schema.new_attribute("image", STORED);
schema.build()
}
fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> { fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
let database = Database::create(database_path, schema.clone())?; let database = Database::create(database_path, schema.clone())?;
@ -71,7 +65,10 @@ fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<D
fn main() -> Result<(), Box<Error>> { fn main() -> Result<(), Box<Error>> {
let opt = Opt::from_args(); let opt = Opt::from_args();
let schema = create_schema(); let schema = {
let file = File::open(&opt.schema_path)?;
Schema::from_toml(file)?
};
let (elapsed, result) = elapsed::measure_time(|| { let (elapsed, result) = elapsed::measure_time(|| {
index(schema, &opt.database_path, &opt.csv_data_path) index(schema, &opt.database_path, &opt.csv_data_path)
@ -82,6 +79,5 @@ fn main() -> Result<(), Box<Error>> {
} }
println!("database created in {} at: {:?}", elapsed, opt.database_path); println!("database created in {} at: {:?}", elapsed, opt.database_path);
Ok(()) Ok(())
} }

View File

@ -1,11 +1,14 @@
use std::collections::btree_map::{BTreeMap, Entry};
use std::iter::FromIterator;
use std::io::{self, Write}; use std::io::{self, Write};
use std::path::PathBuf; use std::path::PathBuf;
use std::error::Error; use std::error::Error;
use hashbrown::{HashMap, HashSet};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt; use structopt::StructOpt;
use meilidb::database::schema::SchemaAttr;
use meilidb::database::Database; use meilidb::database::Database;
use meilidb::Match; use meilidb::Match;
@ -15,18 +18,15 @@ pub struct Opt {
#[structopt(parse(from_os_str))] #[structopt(parse(from_os_str))]
pub database_path: PathBuf, pub database_path: PathBuf,
/// Fields that must be displayed.
pub displayed_fields: Vec<String>,
/// The number of returned results /// The number of returned results
#[structopt(short = "n", long = "number-results", default_value = "10")] #[structopt(short = "n", long = "number-results", default_value = "10")]
pub number_results: usize, pub number_results: usize,
} }
#[derive(Debug, Serialize, Deserialize)] type Document = HashMap<String, String>;
struct Document {
id: String,
title: String,
description: String,
image: String,
}
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> { fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
let mut stdout = StandardStream::stdout(ColorChoice::Always); let mut stdout = StandardStream::stdout(ColorChoice::Always);
@ -45,20 +45,30 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
Ok(()) Ok(())
} }
fn create_highlight_areas(text: &str, matches: &[Match], attribute: u16) -> Vec<usize> { fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
let mut title_areas = Vec::new(); let mut byte_indexes = BTreeMap::new();
title_areas.push(0);
for match_ in matches { for match_ in matches {
if match_.attribute.attribute() == attribute { let match_attribute = match_.attribute.attribute();
if SchemaAttr::new(match_attribute) == attribute {
let word_area = match_.word_area; let word_area = match_.word_area;
let byte_index = word_area.byte_index() as usize; let byte_index = word_area.byte_index() as usize;
let length = word_area.length() as usize; let length = word_area.length() as usize;
title_areas.push(byte_index); match byte_indexes.entry(byte_index) {
title_areas.push(byte_index + length); Entry::Vacant(entry) => { entry.insert(length); },
Entry::Occupied(mut entry) => if *entry.get() < length { entry.insert(length); },
}
} }
} }
let mut title_areas = Vec::new();
title_areas.push(0);
for (byte_index, length) in byte_indexes {
title_areas.push(byte_index);
title_areas.push(byte_index + length);
}
title_areas.push(text.len()); title_areas.push(text.len());
title_areas.sort_unstable();
title_areas title_areas
} }
@ -80,6 +90,7 @@ fn main() -> Result<(), Box<Error>> {
let query = buffer.trim_end_matches('\n'); let query = buffer.trim_end_matches('\n');
let view = database.view(); let view = database.view();
let schema = view.schema();
let (elapsed, documents) = elapsed::measure_time(|| { let (elapsed, documents) = elapsed::measure_time(|| {
let builder = view.query_builder().unwrap(); let builder = view.query_builder().unwrap();
@ -90,22 +101,39 @@ fn main() -> Result<(), Box<Error>> {
for doc in documents { for doc in documents {
match view.document_by_id::<Document>(doc.id) { match view.document_by_id::<Document>(doc.id) {
Ok(document) => { Ok(document) => {
for name in &opt.displayed_fields {
let attr = match schema.attribute(name) {
Some(attr) => attr,
None => continue,
};
let text = match document.get(name) {
Some(text) => text,
None => continue,
};
print!("title: "); print!("{}: ", name);
let title_areas = create_highlight_areas(&document.title, &doc.matches, 1); let areas = create_highlight_areas(&text, &doc.matches, attr);
display_highlights(&document.title, &title_areas)?; display_highlights(&text, &areas)?;
println!(); println!();
}
print!("description: ");
let description_areas = create_highlight_areas(&document.description, &doc.matches, 2);
display_highlights(&document.description, &description_areas)?;
println!();
}, },
Err(e) => eprintln!("{}", e), Err(e) => eprintln!("{}", e),
} }
let mut matching_attributes = HashSet::new();
for _match in doc.matches {
let attr = SchemaAttr::new(_match.attribute.attribute());
let name = schema.attribute_name(attr);
matching_attributes.insert(name);
}
let matching_attributes = Vec::from_iter(matching_attributes);
println!("matching in: {:?}", matching_attributes);
println!();
} }
println!("Found {} results in {}", number_of_documents, elapsed); println!("===== Found {} results in {} =====", number_of_documents, elapsed);
buffer.clear(); buffer.clear();
} }

View File

@ -0,0 +1,19 @@
# This schema has been generated ...
# The order in which the attributes are declared is important:
# it specifies the attribute xxx...
identifier = "id"
[attributes.id]
stored = true
[attributes.title]
stored = true
indexed = true
[attributes.description]
stored = true
indexed = true
[attributes.image]
stored = true

View File

@ -63,10 +63,12 @@ impl Attribute {
} }
} }
#[inline]
pub fn attribute(&self) -> u16 { pub fn attribute(&self) -> u16 {
(self.0 >> 22) as u16 (self.0 >> 22) as u16
} }
#[inline]
pub fn word_index(&self) -> u32 { pub fn word_index(&self) -> u32 {
self.0 & 0b0000_0000_0011_1111_1111_1111_1111 self.0 & 0b0000_0000_0011_1111_1111_1111_1111
} }
@ -129,10 +131,12 @@ impl WordArea {
} }
} }
#[inline]
pub fn byte_index(&self) -> u32 { pub fn byte_index(&self) -> u32 {
self.0 >> 10 self.0 >> 10
} }
#[inline]
pub fn length(&self) -> u16 { pub fn length(&self) -> u16 {
(self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16 (self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16
} }