diff --git a/Cargo.toml b/Cargo.toml
index 4741cb040..79d404b8c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,11 +5,15 @@ version = "0.1.0"
 authors = ["Kerollmops <renault.cle@gmail.com>"]
 
 [dependencies]
+bincode = "1.0"
 byteorder = "1.2"
 fnv = "1.0"
 fs2 = "0.4"
 lazy_static = "1.1"
+linked-hash-map = { version = "0.5", features = ["serde_impl"] }
 sdset = "0.2"
+serde = "1.0"
+serde_derive = "1.0"
 unidecode = "0.3"
 
 [dependencies.fst]
@@ -31,8 +35,6 @@ git = "https://github.com/Kerollmops/group-by.git"
 csv = "1.0"
 elapsed = "0.1"
 moby-name-gen = "0.1"
-serde = "1.0"
-serde_derive = "1.0"
 serde_json = "1.0"
 structopt = "0.2"
 tempfile = "3.0"
diff --git a/examples/csv-indexer.rs b/examples/csv-indexer.rs
deleted file mode 100644
index b231b7932..000000000
--- a/examples/csv-indexer.rs
+++ /dev/null
@@ -1,146 +0,0 @@
-#[macro_use] extern crate serde_derive;
-
-use std::collections::BTreeMap;
-use std::path::PathBuf;
-use std::fs::File;
-use std::io;
-
-use csv::ReaderBuilder;
-use pentium::{MetadataBuilder, DocIndex, Tokenizer, CommonWords};
-use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions};
-use structopt::StructOpt;
-
-#[derive(Debug, StructOpt)]
-pub struct CommandCsv {
-    /// The stop word file, each word must be separated by a newline.
-    #[structopt(long = "stop-words", parse(from_os_str))]
-    pub stop_words: PathBuf,
-
-    /// The csv file to index.
-    #[structopt(parse(from_os_str))]
-    pub products: PathBuf,
-}
-
-#[derive(Debug, Deserialize)]
-struct Product {
-    id: u64,
-    title: String,
-    description: String,
-    image: String,
-}
-
-#[derive(Debug)]
-pub struct CsvIndexer {
-    common_words: CommonWords,
-    products: PathBuf,
-}
-
-impl CsvIndexer {
-    pub fn from_command(command: CommandCsv) -> io::Result<CsvIndexer> {
-        let common_words = CommonWords::from_file(command.stop_words)?;
-        let products = command.products;
-
-        Ok(CsvIndexer { common_words, products })
-    }
-
-    pub fn index(self) {
-        let random_name = PathBuf::from(moby_name_gen::random_name());
-        let map_file = random_name.with_extension("map");
-        let idx_file = random_name.with_extension("idx");
-        let sst_file = random_name.with_extension("sst");
-
-        let env_options = EnvOptions::new();
-        let cf_options = ColumnFamilyOptions::new();
-        let mut sst_file_writer = SstFileWriter::new(env_options, cf_options);
-        let sst_file = sst_file.to_str().unwrap();
-        sst_file_writer.open(&sst_file).expect("open the sst file");
-
-        let map = File::create(&map_file).unwrap();
-        let indexes = File::create(&idx_file).unwrap();
-        let mut builder = MetadataBuilder::new(map, indexes);
-        let mut fields = BTreeMap::new();
-
-        let mut rdr = ReaderBuilder::new().from_path(&self.products).expect("reading product file");
-        let mut errors = 0;
-
-        for result in rdr.deserialize() {
-            let product: Product = match result {
-                Ok(product) => product,
-                Err(e) => { eprintln!("{:?}", e); errors += 1; continue },
-            };
-
-            {
-                let string_id = product.id.to_string();
-                insert_document_words(&mut builder, product.id, 0, Some((0, string_id.as_str())));
-
-                let key = format!("{}-id", product.id);
-                let value = string_id;
-                fields.insert(key, value);
-            }
-
-            {
-                let title = Tokenizer::new(&product.title).filter(|&(_, w)| !self.common_words.contains(w));
-                insert_document_words(&mut builder, product.id, 1, title);
-
-                let key = format!("{}-title", product.id);
-                let value = product.title;
-                fields.insert(key, value);
-            }
-
-            {
-                let description = Tokenizer::new(&product.description).filter(|&(_, w)| !self.common_words.contains(w));
-                insert_document_words(&mut builder, product.id, 2, description);
-
-                let key = format!("{}-description", product.id);
-                let value = product.description;
-                fields.insert(key, value);
-            }
-
-            {
-                let key = format!("{}-image", product.id);
-                let value = product.image;
-                fields.insert(key, value);
-            }
-        }
-
-        for (key, value) in fields {
-            sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap();
-        }
-        let _sst_file_info = sst_file_writer.finish().unwrap();
-
-        builder.finish().unwrap();
-
-        println!("Found {} errorneous lines", errors);
-        println!("Succesfully created {:?} dump.", random_name);
-    }
-}
-
-fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_id: u64, attr: u8, words: I)
-where A: io::Write,
-      B: io::Write,
-      I: IntoIterator<Item=(usize, &'a str)>,
-{
-    for (index, word) in words {
-        let doc_index = DocIndex {
-            document_id: doc_id,
-            attribute: attr,
-            attribute_index: index as u32,
-        };
-        // insert the exact representation
-        let word_lower = word.to_lowercase();
-
-        // and the unidecoded lowercased version
-        let word_unidecoded = unidecode::unidecode(word).to_lowercase();
-        if word_lower != word_unidecoded {
-            builder.insert(word_unidecoded, doc_index);
-        }
-
-        builder.insert(word_lower, doc_index);
-    }
-}
-
-fn main() {
-    let command = CommandCsv::from_args();
-    let indexer = CsvIndexer::from_command(command).unwrap();
-    indexer.index();
-}
diff --git a/examples/json-lines-indexer.rs b/examples/json-lines-indexer.rs
deleted file mode 100644
index 093c8189b..000000000
--- a/examples/json-lines-indexer.rs
+++ /dev/null
@@ -1,151 +0,0 @@
-#[macro_use] extern crate serde_derive;
-
-use std::collections::BTreeMap;
-use std::io::{self, BufReader, BufRead};
-use std::fs::File;
-use std::path::PathBuf;
-
-use serde_json::from_str;
-use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions};
-use pentium::{MetadataBuilder, DocIndex, Tokenizer, CommonWords};
-use structopt::StructOpt;
-
-#[derive(Debug, StructOpt)]
-pub struct CommandJsonLines {
-    /// The stop word file, each word must be separated by a newline.
-    #[structopt(long = "stop-words", parse(from_os_str))]
-    pub stop_words: PathBuf,
-
-    /// The csv file to index.
-    #[structopt(parse(from_os_str))]
-    pub products: PathBuf,
-}
-
-#[derive(Debug, Deserialize)]
-struct Product {
-    id: u64,
-    title: String,
-    description: String,
-    image: String,
-}
-
-#[derive(Debug)]
-pub struct JsonLinesIndexer {
-    common_words: CommonWords,
-    products: PathBuf,
-}
-
-impl JsonLinesIndexer {
-    pub fn from_command(command: CommandJsonLines) -> io::Result<JsonLinesIndexer> {
-        let common_words = CommonWords::from_file(command.stop_words)?;
-        let products = command.products;
-
-        Ok(JsonLinesIndexer { common_words, products })
-    }
-
-    pub fn index(self) {
-        let data = File::open(&self.products).unwrap();
-        let data = BufReader::new(data);
-
-        // TODO add a subcommand to pack these files in a tar.xxx archive
-        let random_name = PathBuf::from(moby_name_gen::random_name());
-        let map_file = random_name.with_extension("map");
-        let idx_file = random_name.with_extension("idx");
-        let sst_file = random_name.with_extension("sst");
-
-        let env_options = EnvOptions::new();
-        let cf_options = ColumnFamilyOptions::new();
-        let mut sst_file_writer = SstFileWriter::new(env_options, cf_options);
-        let sst_file = sst_file.to_str().unwrap();
-        sst_file_writer.open(&sst_file).expect("open the sst file");
-
-        let map = File::create(&map_file).unwrap();
-        let indexes = File::create(&idx_file).unwrap();
-        let mut builder = MetadataBuilder::new(map, indexes);
-        let mut fields = BTreeMap::new();
-        let mut errors = 0;
-
-        for result in data.lines() {
-            let product: Product = match result {
-                Ok(product) => match from_str(&product) {
-                    Ok(product) => product,
-                    Err(e) => { eprintln!("{:?}", e); errors += 1; continue },
-                },
-                Err(e) => { eprintln!("{:?}", e); errors += 1; continue },
-            };
-
-            {
-                let string_id = product.id.to_string();
-                insert_document_words(&mut builder, product.id, 0, Some((0, string_id.as_str())));
-
-                let key = format!("{}-id", product.id);
-                let value = string_id;
-                fields.insert(key, value);
-            }
-
-            {
-                let title = Tokenizer::new(&product.title).filter(|&(_, w)| !self.common_words.contains(w));
-                insert_document_words(&mut builder, product.id, 1, title);
-
-                let key = format!("{}-title", product.id);
-                let value = product.title;
-                fields.insert(key, value);
-            }
-
-            {
-                let description = Tokenizer::new(&product.description).filter(|&(_, w)| !self.common_words.contains(w));
-                insert_document_words(&mut builder, product.id, 2, description);
-
-                let key = format!("{}-description", product.id);
-                let value = product.description;
-                fields.insert(key, value);
-            }
-
-            {
-                let key = format!("{}-image", product.id);
-                let value = product.image;
-                fields.insert(key, value);
-            }
-        }
-
-        for (key, value) in fields {
-            sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap();
-        }
-        let _sst_file_info = sst_file_writer.finish().unwrap();
-
-        builder.finish().unwrap();
-
-        println!("Found {} errorneous lines", errors);
-        println!("Succesfully created {:?} dump.", random_name);
-    }
-}
-
-fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_id: u64, attr: u8, words: I)
-where A: io::Write,
-      B: io::Write,
-      I: IntoIterator<Item=(usize, &'a str)>,
-{
-    for (index, word) in words {
-        let doc_index = DocIndex {
-            document_id: doc_id,
-            attribute: attr,
-            attribute_index: index as u32,
-        };
-        // insert the exact representation
-        let word_lower = word.to_lowercase();
-
-        // and the unidecoded lowercased version
-        let word_unidecoded = unidecode::unidecode(word).to_lowercase();
-        if word_lower != word_unidecoded {
-            builder.insert(word_unidecoded, doc_index);
-        }
-
-        builder.insert(word_lower, doc_index);
-    }
-}
-
-fn main() {
-    let command = CommandJsonLines::from_args();
-    let indexer = JsonLinesIndexer::from_command(command).unwrap();
-    indexer.index();
-}
diff --git a/examples/serve-console.rs b/examples/serve-console.rs
deleted file mode 100644
index e908e0ec8..000000000
--- a/examples/serve-console.rs
+++ /dev/null
@@ -1,98 +0,0 @@
-use std::error::Error;
-use std::str::from_utf8_unchecked;
-use std::io::{self, Write};
-use structopt::StructOpt;
-use std::path::PathBuf;
-
-use elapsed::measure_time;
-use rocksdb::{DB, DBOptions, IngestExternalFileOptions};
-use pentium::index::Index;
-use pentium::rank::{criterion, Config, RankedStream};
-use pentium::{automaton, DocumentId};
-
-#[derive(Debug, StructOpt)]
-pub struct CommandConsole {
-    /// Meta file name (e.g. relaxed-colden).
-    #[structopt(parse(from_os_str))]
-    pub index_path: PathBuf,
-}
-
-pub struct ConsoleSearch {
-    index: Index,
-}
-
-impl ConsoleSearch {
-    pub fn from_command(command: CommandConsole) -> Result<ConsoleSearch, Box<Error>> {
-        let index = Index::open(command.index_path)?;
-        Ok(ConsoleSearch { index })
-    }
-
-    pub fn serve(self) {
-        loop {
-            print!("Searching for: ");
-            io::stdout().flush().unwrap();
-
-            let mut query = String::new();
-            io::stdin().read_line(&mut query).unwrap();
-
-            if query.is_empty() { break }
-
-            let (elapsed, _) = measure_time(|| search(&self.index, &query));
-            println!("Finished in {}", elapsed);
-        }
-    }
-}
-
-fn search(index: &Index, query: &str) {
-    let mut automatons = Vec::new();
-    for query in query.split_whitespace().map(str::to_lowercase) {
-        let lev = automaton::build_prefix_dfa(&query);
-        automatons.push(lev);
-    }
-
-    let distinct_by_title_first_four_chars = |id: &DocumentId| {
-        let title_key = format!("{}-title", id);
-        match database.get(title_key.as_bytes()) {
-            Ok(Some(value)) => {
-                value.to_utf8().map(|s| s.chars().take(4).collect::<String>())
-            },
-            Ok(None) => None,
-            Err(err) => {
-                eprintln!("{:?}", err);
-                None
-            }
-        }
-    };
-
-    let index: Index = unimplemented!();
-
-    // "Sony" "PlayStation 4 500GB"
-    let config = Config {
-        blobs: &index.blobs().unwrap(),
-        automatons: automatons,
-        criteria: criterion::default(),
-        distinct: (distinct_by_title_first_four_chars, 1),
-    };
-    let stream = RankedStream::new(config);
-
-    let documents = stream.retrieve_distinct_documents(0..20);
-    // let documents = stream.retrieve_documents(0..20);
-
-    for document in documents {
-        let id_key = format!("{}-id", document.id);
-        let id = database.get(id_key.as_bytes()).unwrap().unwrap();
-        let id = unsafe { from_utf8_unchecked(&id) };
-        print!("{} ", id);
-
-        let title_key = format!("{}-title", document.id);
-        let title = database.get(title_key.as_bytes()).unwrap().unwrap();
-        let title = unsafe { from_utf8_unchecked(&title) };
-        println!("{:?}", title);
-    }
-}
-
-fn main() {
-    let command = CommandConsole::from_args();
-    let console = ConsoleSearch::from_command(command).unwrap();
-    console.serve()
-}
diff --git a/examples/serve-http.rs b/examples/serve-http.rs
deleted file mode 100644
index 53f5dd456..000000000
--- a/examples/serve-http.rs
+++ /dev/null
@@ -1,140 +0,0 @@
-#[macro_use] extern crate serde_derive;
-
-use std::str::from_utf8_unchecked;
-use std::io::{self, Write};
-use std::net::SocketAddr;
-use std::path::PathBuf;
-use std::error::Error;
-use std::sync::Arc;
-
-use pentium::rank::{criterion, Config, RankedStream};
-use pentium::{automaton, Metadata};
-use rocksdb::{DB, DBOptions, IngestExternalFileOptions};
-use warp::Filter;
-use structopt::StructOpt;
-
-#[derive(Debug, StructOpt)]
-pub struct CommandHttp {
-    /// The address and port to bind the server to.
-    #[structopt(short = "l", default_value = "127.0.0.1:3030")]
-    pub listen_addr: SocketAddr,
-
-    /// Meta file name (e.g. relaxed-colden).
-    #[structopt(parse(from_os_str))]
-    pub meta_name: PathBuf,
-}
-
-#[derive(Debug, Serialize)]
-struct Document<'a> {
-    id: u64,
-    title: &'a str,
-    description: &'a str,
-    image: &'a str,
-}
-
-#[derive(Debug, Deserialize)]
-struct SearchQuery { q: String }
-
-pub struct HttpServer {
-    listen_addr: SocketAddr,
-    metadata: Arc<Metadata>,
-    db: Arc<DB>,
-}
-
-impl HttpServer {
-    pub fn from_command(command: CommandHttp) -> io::Result<HttpServer> {
-        let map_file = command.meta_name.with_extension("map");
-        let idx_file = command.meta_name.with_extension("idx");
-        let sst_file = command.meta_name.with_extension("sst");
-        let metadata = unsafe { Metadata::from_paths(map_file, idx_file).unwrap() };
-
-        let rocksdb = "rocksdb/storage";
-        let db = DB::open_default(rocksdb).unwrap();
-        let sst_file = sst_file.to_str().unwrap();
-        db.ingest_external_file(&IngestExternalFileOptions::new(), &[&sst_file]).unwrap();
-        drop(db);
-        let db = DB::open_for_read_only(DBOptions::default(), rocksdb, false).unwrap();
-
-        Ok(HttpServer {
-            listen_addr: command.listen_addr,
-            metadata: Arc::new(metadata),
-            db: Arc::new(db),
-        })
-    }
-
-    pub fn serve(self) {
-        let HttpServer { listen_addr, metadata, db } = self;
-
-        let routes = warp::path("search")
-            .and(warp::query())
-            .map(move |query: SearchQuery| {
-                let body = search(metadata.clone(), db.clone(), &query.q).unwrap();
-                body
-            })
-            .with(warp::reply::with::header("Content-Type", "application/json"))
-            .with(warp::reply::with::header("Access-Control-Allow-Origin", "*"));
-
-        warp::serve(routes).run(listen_addr)
-    }
-}
-
-fn search<M, D>(metadata: M, database: D, query: &str) -> Result<String, Box<Error>>
-where M: AsRef<Metadata>,
-      D: AsRef<DB>,
-{
-    let mut automatons = Vec::new();
-    for query in query.split_whitespace().map(str::to_lowercase) {
-        let lev = automaton::build_prefix_dfa(&query);
-        automatons.push(lev);
-    }
-
-    let config = Config {
-        index: unimplemented!(),
-        automatons: automatons,
-        criteria: criterion::default(),
-        distinct: ((), 1),
-    };
-    let stream = RankedStream::new(config);
-
-    let documents = stream.retrieve_documents(0..20);
-
-    let mut body = Vec::new();
-    write!(&mut body, "[")?;
-
-    let mut first = true;
-    for document in documents {
-        let title_key = format!("{}-title", document.id);
-        let title = database.as_ref().get(title_key.as_bytes()).unwrap().unwrap();
-        let title = unsafe { from_utf8_unchecked(&title) };
-
-        let description_key = format!("{}-description", document.id);
-        let description = database.as_ref().get(description_key.as_bytes()).unwrap().unwrap();
-        let description = unsafe { from_utf8_unchecked(&description) };
-
-        let image_key = format!("{}-image", document.id);
-        let image = database.as_ref().get(image_key.as_bytes()).unwrap().unwrap();
-        let image = unsafe { from_utf8_unchecked(&image) };
-
-        let document = Document {
-            id: document.id,
-            title: title,
-            description: description,
-            image: image,
-        };
-
-        if !first { write!(&mut body, ",")? }
-        serde_json::to_writer(&mut body, &document)?;
-
-        first = false;
-    }
-
-    write!(&mut body, "]")?;
-
-    Ok(String::from_utf8(body)?)
-}
-
-fn main() {
-    let command = CommandHttp::from_args();
-    let server = HttpServer::from_command(command).unwrap();
-    server.serve();
-}
diff --git a/src/index/mod.rs b/src/index/mod.rs
index f5875c2ca..9a53b49ab 100644
--- a/src/index/mod.rs
+++ b/src/index/mod.rs
@@ -136,12 +136,12 @@ mod tests {
         let mut builder = PositiveUpdateBuilder::new("update-positive-0001.sst", schema.clone(), tokenizer_builder);
 
         // you can insert documents in any order, it is sorted internally
-        let title_field = schema.field("title").unwrap();
+        let title_field = schema.attribute("title").unwrap();
         builder.update_field(1, title_field, "hallo!".to_owned());
         builder.update_field(5, title_field, "hello!".to_owned());
         builder.update_field(2, title_field, "hi!".to_owned());
 
-        let name_field = schema.field("name").unwrap();
+        let name_field = schema.attribute("name").unwrap();
         builder.remove_field(4, name_field);
 
         let update = builder.build()?;
diff --git a/src/index/schema.rs b/src/index/schema.rs
index f4ab19279..f1df50b07 100644
--- a/src/index/schema.rs
+++ b/src/index/schema.rs
@@ -1,14 +1,16 @@
+use std::collections::{HashMap, BTreeMap};
 use std::io::{Read, Write};
-use std::error::Error;
 use std::path::Path;
 use std::ops::BitOr;
 use std::fs::File;
 use std::fmt;
 
+use linked_hash_map::LinkedHashMap;
+
 pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
 pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
 
-#[derive(Copy, Clone)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub struct SchemaProps {
     stored: bool,
     indexed: bool,
@@ -36,66 +38,110 @@ impl BitOr for SchemaProps {
 }
 
 pub struct SchemaBuilder {
-    fields: Vec<(String, SchemaProps)>,
+    attrs: LinkedHashMap<String, SchemaProps>,
 }
 
 impl SchemaBuilder {
     pub fn new() -> SchemaBuilder {
-        SchemaBuilder { fields: Vec::new() }
+        SchemaBuilder { attrs: LinkedHashMap::new() }
     }
 
-    pub fn field<N>(&mut self, name: N, props: SchemaProps) -> SchemaField
-    where N: Into<String>,
-    {
-        let len = self.fields.len();
-        let name = name.into();
-        self.fields.push((name, props));
-
-        SchemaField(len as u32)
+    pub fn new_field<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
+        let len = self.attrs.len();
+        self.attrs.insert(name.into(), props);
+        SchemaAttr(len as u32)
     }
 
     pub fn build(self) -> Schema {
-        unimplemented!()
+        let mut attrs = HashMap::new();
+        let mut props = Vec::new();
+
+        for (i, (name, prop)) in self.attrs.into_iter().enumerate() {
+            attrs.insert(name, SchemaAttr(i as u32));
+            props.push(prop);
+        }
+
+        Schema { attrs, props }
     }
 }
 
-#[derive(Clone)]
-pub struct Schema;
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Schema {
+    attrs: HashMap<String, SchemaAttr>,
+    props: Vec<SchemaProps>,
+}
 
 impl Schema {
-    pub fn open<P: AsRef<Path>>(path: P) -> Result<Schema, Box<Error>> {
+    pub fn open<P: AsRef<Path>>(path: P) -> bincode::Result<Schema> {
         let file = File::open(path)?;
         Schema::read_from(file)
     }
 
-    pub fn read_from<R: Read>(reader: R) -> Result<Schema, Box<Error>> {
-        unimplemented!()
+    pub fn read_from<R: Read>(reader: R) -> bincode::Result<Schema> {
+        let attrs = bincode::deserialize_from(reader)?;
+        let builder = SchemaBuilder { attrs };
+        Ok(builder.build())
     }
 
-    pub fn write_to<W: Write>(writer: W) -> Result<(), Box<Error>> {
-        unimplemented!()
+    pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> {
+        let mut ordered = BTreeMap::new();
+        for (name, field) in &self.attrs {
+            let index = field.as_u32();
+            let props = self.props[index as usize];
+            ordered.insert(index, (name, props));
+        }
+
+        let mut attrs = LinkedHashMap::with_capacity(ordered.len());
+        for (_, (name, props)) in ordered {
+            attrs.insert(name, props);
+        }
+
+        bincode::serialize_into(writer, &attrs)
     }
 
-    pub fn props(&self, field: SchemaField) -> SchemaProps {
-        unimplemented!()
+    pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
+        self.props[attr.as_u32() as usize]
     }
 
-    pub fn field(&self, name: &str) -> Option<SchemaField> {
-        unimplemented!()
+    pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
+        self.attrs.get(name.as_ref()).cloned()
     }
 }
 
-#[derive(Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
-pub struct SchemaField(u32);
+#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
+pub struct SchemaAttr(u32);
 
-impl SchemaField {
+impl SchemaAttr {
     pub fn as_u32(&self) -> u32 {
         self.0
     }
 }
 
-impl fmt::Display for SchemaField {
+impl fmt::Display for SchemaAttr {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(f, "{}", self.0)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn serialize_deserialize() -> bincode::Result<()> {
+        let mut builder = SchemaBuilder::new();
+        builder.new_field("alphabet", STORED);
+        builder.new_field("beta", STORED | INDEXED);
+        builder.new_field("gamma", INDEXED);
+        let schema = builder.build();
+
+        let mut buffer = Vec::new();
+
+        schema.write_to(&mut buffer)?;
+        let schema2 = Schema::read_from(buffer.as_slice())?;
+
+        assert_eq!(schema, schema2);
+
+        Ok(())
+    }
+}
diff --git a/src/index/update/positive_update.rs b/src/index/update/positive_update.rs
index 77d24b8a7..a6eb3f5d2 100644
--- a/src/index/update/positive_update.rs
+++ b/src/index/update/positive_update.rs
@@ -5,7 +5,7 @@ use std::fmt::Write;
 
 use ::rocksdb::rocksdb_options;
 
-use crate::index::schema::{SchemaProps, Schema, SchemaField};
+use crate::index::schema::{SchemaProps, Schema, SchemaAttr};
 use crate::index::update::{FIELD_BLOBS_ORDER, Update};
 use crate::tokenizer::TokenizerBuilder;
 use crate::index::blob_name::BlobName;
@@ -24,7 +24,7 @@ pub struct PositiveUpdateBuilder<B> {
     path: PathBuf,
     schema: Schema,
     tokenizer_builder: B,
-    new_states: BTreeMap<(DocumentId, SchemaField), NewState>,
+    new_states: BTreeMap<(DocumentId, SchemaAttr), NewState>,
 }
 
 impl<B> PositiveUpdateBuilder<B> {
@@ -38,12 +38,12 @@ impl<B> PositiveUpdateBuilder<B> {
     }
 
     // TODO value must be a field that can be indexed
-    pub fn update_field(&mut self, id: DocumentId, field: SchemaField, value: String) {
+    pub fn update_field(&mut self, id: DocumentId, field: SchemaAttr, value: String) {
         let state = NewState::Updated { value, props: self.schema.props(field) };
         self.new_states.insert((id, field), state);
     }
 
-    pub fn remove_field(&mut self, id: DocumentId, field: SchemaField) {
+    pub fn remove_field(&mut self, id: DocumentId, field: SchemaAttr) {
         self.new_states.insert((id, field), NewState::Removed);
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index fe1e19dfa..61edcc7e7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,6 +1,7 @@
 #![feature(range_contains)]
 
 #[macro_use] extern crate lazy_static;
+#[macro_use] extern crate serde_derive;
 
 pub mod index;
 pub mod blob;
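Below is a minimal usage sketch (not part of the patch) of the `Schema` API this diff introduces: the builder, attribute lookup, and the bincode round-trip, mirroring the `serialize_deserialize` test above. The `pentium::index::schema` import path and the field names are assumptions for illustration; only signatures shown in the diff are relied upon.

// Hypothetical example; assumes the `schema` module is publicly
// reachable as `pentium::index::schema`.
use pentium::index::schema::{Schema, SchemaBuilder, INDEXED, STORED};

fn schema_roundtrip() -> bincode::Result<()> {
    // Attribute numbers follow declaration order, which the
    // LinkedHashMap preserves: `title` is attr 0, `image` is attr 1.
    let mut builder = SchemaBuilder::new();
    let title = builder.new_field("title", STORED | INDEXED);
    let image = builder.new_field("image", STORED);
    let schema = builder.build();

    // `attribute` resolves a name back to its SchemaAttr;
    // `props` is a plain Vec lookup by that index.
    assert_eq!(schema.attribute("title"), Some(title));
    assert_eq!(schema.props(image), STORED);

    // `write_to` serializes the ordered name -> props map with bincode,
    // and `read_from` rebuilds the schema through SchemaBuilder, so the
    // attribute numbering survives the round-trip.
    let mut buffer = Vec::new();
    schema.write_to(&mut buffer)?;
    let schema2 = Schema::read_from(buffer.as_slice())?;
    assert_eq!(schema, schema2);

    Ok(())
}

Serializing through the order-preserving `LinkedHashMap` rather than the internal `HashMap` is what keeps `SchemaAttr` indices stable across save and load, since `SchemaBuilder::build` re-derives each index from iteration order.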