diff --git a/examples/create-index.rs b/examples/create-index.rs deleted file mode 100644 index 9c121accf..000000000 --- a/examples/create-index.rs +++ /dev/null @@ -1,87 +0,0 @@ -use std::fs; -use std::path::Path; -use std::error::Error; -use std::path::PathBuf; - -use elapsed::measure_time; -use moby_name_gen::random_name; -use structopt::StructOpt; - -use pentium::index::schema::{Schema, SchemaBuilder, STORED, INDEXED}; -use pentium::index::update::{Update, PositiveUpdateBuilder}; -use pentium::tokenizer::DefaultBuilder; -use pentium::index::Index; - -#[derive(Debug, StructOpt)] -pub struct Cmd { - /// csv file to index - #[structopt(parse(from_os_str))] - pub csv_file: PathBuf, -} - -fn generate_update_from_csv(path: &Path) -> Result<(Schema, Update), Box> { - let mut csv = csv::Reader::from_path(path)?; - - let mut attributes = Vec::new(); - let (schema, id_attr_index) = { - let mut id_attr_index = None; - let mut builder = SchemaBuilder::new(); - - for (i, header_name) in csv.headers()?.iter().enumerate() { - // FIXME this does not disallow multiple "id" fields - if header_name == "id" { id_attr_index = Some(i) }; - - let field = builder.new_attribute(header_name, STORED | INDEXED); - attributes.push(field); - } - - let id = match id_attr_index { - Some(index) => index, - None => return Err(String::from("No \"id\" field found which is mandatory").into()), - }; - - (builder.build(), id) - }; - - let update_path = PathBuf::from("./positive-update-xxx.sst"); - let tokenizer_builder = DefaultBuilder::new(); - let mut builder = PositiveUpdateBuilder::new(&update_path, schema.clone(), tokenizer_builder); - - for record in csv.records() { - let record = match record { - Ok(x) => x, - Err(e) => { eprintln!("{:?}", e); continue } - }; - - let id = record.into_iter().nth(id_attr_index).unwrap().parse()?; - for (value, attr) in record.into_iter().zip(&attributes) { - builder.update_field(id, *attr, value.to_string()); - } - } - - builder.build().map(|update| (schema, update)) -} - -fn main() -> Result<(), Box> { - let command = Cmd::from_args(); - - let path = random_name() + ".rdb"; - - println!("generating the update..."); - let (schema, update) = generate_update_from_csv(&command.csv_file)?; - - println!("creating the index"); - let index = Index::create(&path, schema)?; - - println!("ingesting the changes in the index"); - index.ingest_update(update)?; - - // FIXME this is really ugly !!!! - // the index does not support moving update files - // so we must remove it by hand - fs::remove_file("./positive-update-xxx.sst")?; - - println!("the index {:?} has been created!", path); - - Ok(()) -} diff --git a/examples/search-index.rs b/examples/search-index.rs deleted file mode 100644 index c8d55c74f..000000000 --- a/examples/search-index.rs +++ /dev/null @@ -1,40 +0,0 @@ -use std::error::Error; -use std::path::PathBuf; -use std::io::{self, Write}; - -use elapsed::measure_time; -use structopt::StructOpt; -use pentium::index::Index; - -#[derive(Debug, StructOpt)] -pub struct Cmd { - /// Index path (e.g. relaxed-colden). - #[structopt(parse(from_os_str))] - pub index_path: PathBuf, -} - -fn main() -> Result<(), Box> { - let command = Cmd::from_args(); - let index = Index::open(command.index_path)?; - - loop { - print!("Searching for: "); - io::stdout().flush()?; - - let mut query = String::new(); - io::stdin().read_line(&mut query)?; - - if query.is_empty() { break } - - let (elapsed, result) = measure_time(|| index.search(&query)); - match result { - Ok(documents) => { - println!("{:?}", documents); - println!("Finished in {}", elapsed) - }, - Err(e) => panic!("{}", e), - } - } - - Ok(()) -} diff --git a/src/blob/mod.rs b/src/database/blob/mod.rs similarity index 100% rename from src/blob/mod.rs rename to src/database/blob/mod.rs diff --git a/src/blob/negative/blob.rs b/src/database/blob/negative/blob.rs similarity index 100% rename from src/blob/negative/blob.rs rename to src/database/blob/negative/blob.rs diff --git a/src/blob/negative/mod.rs b/src/database/blob/negative/mod.rs similarity index 100% rename from src/blob/negative/mod.rs rename to src/database/blob/negative/mod.rs diff --git a/src/blob/negative/ops.rs b/src/database/blob/negative/ops.rs similarity index 97% rename from src/blob/negative/ops.rs rename to src/database/blob/negative/ops.rs index 136f23533..bb3b783b8 100644 --- a/src/blob/negative/ops.rs +++ b/src/database/blob/negative/ops.rs @@ -1,7 +1,7 @@ use sdset::multi::OpBuilder as SdOpBuilder; use sdset::Set; -use crate::blob::NegativeBlob; +use crate::database::blob::NegativeBlob; use crate::data::DocIds; use crate::DocumentId; diff --git a/src/blob/ops.rs b/src/database/blob/ops.rs similarity index 96% rename from src/blob/ops.rs rename to src/database/blob/ops.rs index b345b8eab..b752739f7 100644 --- a/src/blob/ops.rs +++ b/src/database/blob/ops.rs @@ -5,8 +5,8 @@ use group_by::GroupBy; use sdset::duo::DifferenceByKey; use sdset::{Set, SetOperation}; -use crate::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob}; -use crate::blob::{positive, negative}; +use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob}; +use crate::database::blob::{positive, negative}; fn blob_same_sign(a: &Blob, b: &Blob) -> bool { a.sign() == b.sign() diff --git a/src/blob/positive/blob.rs b/src/database/blob/positive/blob.rs similarity index 100% rename from src/blob/positive/blob.rs rename to src/database/blob/positive/blob.rs diff --git a/src/blob/positive/mod.rs b/src/database/blob/positive/mod.rs similarity index 100% rename from src/blob/positive/mod.rs rename to src/database/blob/positive/mod.rs diff --git a/src/blob/positive/ops.rs b/src/database/blob/positive/ops.rs similarity index 98% rename from src/blob/positive/ops.rs rename to src/database/blob/positive/ops.rs index aed81aa9a..e94ebdc0d 100644 --- a/src/blob/positive/ops.rs +++ b/src/database/blob/positive/ops.rs @@ -1,7 +1,7 @@ use sdset::multi::OpBuilder as SdOpBuilder; use sdset::{SetOperation, Set}; -use crate::blob::PositiveBlob; +use crate::database::blob::PositiveBlob; use crate::data::DocIndexes; use crate::DocIndex; diff --git a/src/database/database_view.rs b/src/database/database_view.rs index 20f3340e5..1db5047b0 100644 --- a/src/database/database_view.rs +++ b/src/database/database_view.rs @@ -5,11 +5,9 @@ use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey}; use rocksdb::rocksdb_options::ReadOptions; use serde::de::DeserializeOwned; -use crate::database::deserializer::{Deserializer, DeserializerError}; -use crate::database::{DATA_INDEX, DATA_SCHEMA}; -use crate::blob::positive::PositiveBlob; -use crate::index::schema::Schema; use crate::database::{retrieve_data_schema, DocumentKey, DocumentKeyAttr}; +use crate::database::deserializer::Deserializer; +use crate::database::schema::Schema; use crate::DocumentId; pub struct DatabaseView<'a> { diff --git a/src/database/deserializer.rs b/src/database/deserializer.rs index 2591b92d2..f8c4fd567 100644 --- a/src/database/deserializer.rs +++ b/src/database/deserializer.rs @@ -8,7 +8,7 @@ use serde::de::value::MapDeserializer; use serde::de::{self, Visitor, IntoDeserializer}; use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; -use crate::index::schema::Schema; +use crate::database::schema::Schema; use crate::DocumentId; pub struct Deserializer<'a> { diff --git a/src/database/document_key.rs b/src/database/document_key.rs index 62010f008..815953679 100644 --- a/src/database/document_key.rs +++ b/src/database/document_key.rs @@ -4,7 +4,7 @@ use std::fmt; use byteorder::{NativeEndian, WriteBytesExt, ReadBytesExt}; -use crate::index::schema::SchemaAttr; +use crate::database::schema::SchemaAttr; use crate::DocumentId; const DOC_KEY_LEN: usize = 4 + size_of::(); diff --git a/src/database/mod.rs b/src/database/mod.rs index a990fffae..db864fa37 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -7,13 +7,16 @@ use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamil use rocksdb::{DB, DBVector, MergeOperands, SeekKey}; use rocksdb::rocksdb::{Writable, Snapshot}; -pub use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; -pub use crate::database::database_view::DatabaseView; -use crate::index::update::Update; -use crate::index::schema::Schema; -use crate::blob::positive::PositiveBlob; -use crate::blob::{self, Blob}; +pub use self::document_key::{DocumentKey, DocumentKeyAttr}; +pub use self::database_view::DatabaseView; +use self::blob::positive::PositiveBlob; +use self::update::Update; +use self::schema::Schema; +use self::blob::Blob; +pub mod blob; +pub mod schema; +pub mod update; mod document_key; mod database_view; mod deserializer; @@ -163,14 +166,13 @@ fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut Merge mod tests { use super::*; use std::error::Error; - use std::path::PathBuf; use serde_derive::{Serialize, Deserialize}; use tempfile::tempdir; use crate::tokenizer::DefaultBuilder; - use crate::index::update::PositiveUpdateBuilder; - use crate::index::schema::{Schema, SchemaBuilder, STORED, INDEXED}; + use crate::database::update::PositiveUpdateBuilder; + use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; #[test] fn ingest_update_file() -> Result<(), Box> { diff --git a/src/index/schema.rs b/src/database/schema.rs similarity index 100% rename from src/index/schema.rs rename to src/database/schema.rs diff --git a/src/index/update/mod.rs b/src/database/update/mod.rs similarity index 85% rename from src/index/update/mod.rs rename to src/database/update/mod.rs index 6d99ab8cb..d298a656f 100644 --- a/src/index/update/mod.rs +++ b/src/database/update/mod.rs @@ -1,12 +1,6 @@ -use std::io::{Cursor, Write}; use std::path::PathBuf; use std::error::Error; -use byteorder::{NetworkEndian, WriteBytesExt}; - -use crate::index::schema::SchemaAttr; -use crate::DocumentId; - mod negative; mod positive; diff --git a/src/index/update/negative/mod.rs b/src/database/update/negative/mod.rs similarity index 100% rename from src/index/update/negative/mod.rs rename to src/database/update/negative/mod.rs diff --git a/src/index/update/negative/unordered_builder.rs b/src/database/update/negative/unordered_builder.rs similarity index 100% rename from src/index/update/negative/unordered_builder.rs rename to src/database/update/negative/unordered_builder.rs diff --git a/src/index/update/negative/update.rs b/src/database/update/negative/update.rs similarity index 87% rename from src/index/update/negative/update.rs rename to src/database/update/negative/update.rs index ddf2fe768..222a29a2c 100644 --- a/src/index/update/negative/update.rs +++ b/src/database/update/negative/update.rs @@ -3,11 +3,11 @@ use std::error::Error; use ::rocksdb::rocksdb_options; -use crate::index::update::negative::unordered_builder::UnorderedNegativeBlobBuilder; -use crate::index::update::Update; -use crate::database::{DocumentKey, DocumentKeyAttr}; -use crate::blob::{Blob, NegativeBlob}; -use crate::index::DATA_INDEX; +use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder; +use crate::database::blob::{Blob, NegativeBlob}; +use crate::database::update::Update; +use crate::database::DocumentKey; +use crate::database::DATA_INDEX; use crate::DocumentId; pub struct NegativeUpdateBuilder { diff --git a/src/index/update/positive/mod.rs b/src/database/update/positive/mod.rs similarity index 100% rename from src/index/update/positive/mod.rs rename to src/database/update/positive/mod.rs diff --git a/src/index/update/positive/unordered_builder.rs b/src/database/update/positive/unordered_builder.rs similarity index 95% rename from src/index/update/positive/unordered_builder.rs rename to src/database/update/positive/unordered_builder.rs index 5b75fe28d..b44455360 100644 --- a/src/index/update/positive/unordered_builder.rs +++ b/src/database/update/positive/unordered_builder.rs @@ -2,7 +2,7 @@ use std::collections::BTreeMap; use std::error::Error; use std::io::Write; -use crate::blob::positive::PositiveBlobBuilder; +use crate::database::blob::positive::PositiveBlobBuilder; use crate::DocIndex; pub struct UnorderedPositiveBlobBuilder { diff --git a/src/index/update/positive/update.rs b/src/database/update/positive/update.rs similarity index 97% rename from src/index/update/positive/update.rs rename to src/database/update/positive/update.rs index da25ffb41..8924073aa 100644 --- a/src/index/update/positive/update.rs +++ b/src/database/update/positive/update.rs @@ -6,15 +6,15 @@ use std::fmt; use ::rocksdb::rocksdb_options; use serde::ser::{self, Serialize}; -use crate::index::update::positive::unordered_builder::UnorderedPositiveBlobBuilder; -use crate::index::schema::{SchemaProps, Schema, SchemaAttr}; -use crate::index::update::Update; -use crate::database::{DocumentKey, DocumentKeyAttr}; -use crate::blob::positive::PositiveBlob; +use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder; +use crate::database::blob::positive::PositiveBlob; +use crate::database::schema::{Schema, SchemaAttr}; use crate::tokenizer::TokenizerBuilder; +use crate::database::DocumentKeyAttr; +use crate::database::update::Update; use crate::{DocumentId, DocIndex}; -use crate::index::DATA_INDEX; -use crate::blob::Blob; +use crate::database::DATA_INDEX; +use crate::database::blob::Blob; pub enum NewState { Updated { value: Vec }, diff --git a/src/index/mod.rs b/src/index/mod.rs deleted file mode 100644 index 0329477b1..000000000 --- a/src/index/mod.rs +++ /dev/null @@ -1,128 +0,0 @@ -pub mod schema; -pub mod update; - -use std::error::Error; -use std::path::Path; - -use ::rocksdb::rocksdb::Writable; -use ::rocksdb::{rocksdb, rocksdb_options}; -use ::rocksdb::merge_operator::MergeOperands; - -use crate::rank::Document; -use crate::index::schema::Schema; -use crate::index::update::Update; -use crate::rank::QueryBuilder; -use crate::blob::{self, Blob}; - -const DATA_INDEX: &[u8] = b"data-index"; -const DATA_SCHEMA: &[u8] = b"data-schema"; - -fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - if key != DATA_INDEX { panic!("The merge operator only supports \"data-index\" merging") } - - let capacity = { - let remaining = operands.size_hint().0; - let already_exist = usize::from(existing_value.is_some()); - remaining + already_exist - }; - - let mut op = blob::OpBuilder::with_capacity(capacity); - if let Some(existing_value) = existing_value { - let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index"); - op.push(Blob::Positive(blob)); - } - - for bytes in operands { - let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob"); - op.push(blob); - } - - let blob = op.merge().expect("BUG: could no merge blobs"); - bincode::serialize(&blob).expect("BUG: could not serialize merged blob") -} - -pub struct Index { - database: rocksdb::DB, -} - -impl Index { - pub fn create>(path: P, schema: Schema) -> Result> { - // Self::open must not take a parameter for create_if_missing - // or we must create an OpenOptions with many parameters - // https://doc.rust-lang.org/std/fs/struct.OpenOptions.html - - let path = path.as_ref(); - if path.exists() { - return Err(format!("File already exists at path: {}, cannot create database.", - path.display()).into()) - } - - let path = path.to_string_lossy(); - let mut opts = rocksdb_options::DBOptions::new(); - opts.create_if_missing(true); - - let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new(); - cf_opts.add_merge_operator("data-index merge operator", merge_indexes); - - let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?; - - let mut schema_bytes = Vec::new(); - schema.write_to(&mut schema_bytes)?; - database.put(DATA_SCHEMA, &schema_bytes)?; - - Ok(Self { database }) - } - - pub fn open>(path: P) -> Result> { - let path = path.as_ref().to_string_lossy(); - - let mut opts = rocksdb_options::DBOptions::new(); - opts.create_if_missing(false); - - let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new(); - cf_opts.add_merge_operator("data-index merge operator", merge_indexes); - - let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?; - - // compacting to avoid calling the merge operator - database.compact_range(Some(DATA_INDEX), Some(DATA_INDEX)); - - let _schema = match database.get(DATA_SCHEMA)? { - Some(value) => Schema::read_from(&*value)?, - None => return Err(String::from("Database does not contain a schema").into()), - }; - - Ok(Self { database }) - } - - pub fn ingest_update(&self, update: Update) -> Result<(), Box> { - let path = update.into_path_buf(); - let path = path.to_string_lossy(); - - let mut options = rocksdb_options::IngestExternalFileOptions::new(); - // options.move_files(true); - - let cf_handle = self.database.cf_handle("default").unwrap(); - self.database.ingest_external_file_optimized(&cf_handle, &options, &[&path])?; - - // compacting to avoid calling the merge operator - self.database.compact_range(Some(DATA_INDEX), Some(DATA_INDEX)); - - Ok(()) - } - - pub fn schema(&self) -> Result> { - let bytes = self.database.get(DATA_SCHEMA)?.expect("data-schema entry not found"); - Ok(Schema::read_from(&*bytes).expect("Invalid schema")) - } - - pub fn search(&self, query: &str) -> Result, Box> { - // this snapshot will allow consistent reads for the whole search operation - let snapshot = self.database.snapshot(); - - let builder = QueryBuilder::new(snapshot)?; - let documents = builder.query(query, 20); - - Ok(documents) - } -} diff --git a/src/index/update/.DS_Store b/src/index/update/.DS_Store deleted file mode 100644 index 445e2e61a..000000000 Binary files a/src/index/update/.DS_Store and /dev/null differ diff --git a/src/lib.rs b/src/lib.rs index f83e59c37..f6fc382de 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,6 @@ pub mod automaton; -pub mod blob; pub mod database; pub mod data; -pub mod index; pub mod rank; pub mod tokenizer; pub mod vec_read_only; diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs index 2f3642b37..34ca4f212 100644 --- a/src/rank/query_builder.rs +++ b/src/rank/query_builder.rs @@ -12,7 +12,7 @@ use crate::automaton::{self, DfaExt, AutomatonExt}; use crate::rank::criterion::{self, Criterion}; use crate::rank::distinct_map::DistinctMap; use crate::database::retrieve_data_index; -use crate::blob::PositiveBlob; +use crate::database::blob::PositiveBlob; use crate::{Match, DocumentId}; use crate::rank::Document;