chore: Update the module hierarchy

This commit is contained in:
Clément Renault 2018-12-07 12:22:51 +01:00
parent 2c3d71dd8f
commit 8bee31078d
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
26 changed files with 33 additions and 296 deletions

View File

@ -1,87 +0,0 @@
use std::fs;
use std::path::Path;
use std::error::Error;
use std::path::PathBuf;
use elapsed::measure_time;
use moby_name_gen::random_name;
use structopt::StructOpt;
use pentium::index::schema::{Schema, SchemaBuilder, STORED, INDEXED};
use pentium::index::update::{Update, PositiveUpdateBuilder};
use pentium::tokenizer::DefaultBuilder;
use pentium::index::Index;
/// Command-line arguments for the index-creation binary.
#[derive(Debug, StructOpt)]
pub struct Cmd {
/// csv file to index
#[structopt(parse(from_os_str))]
pub csv_file: PathBuf,
}
/// Builds a positive update (and its schema) from the CSV file at `path`.
///
/// Every CSV column becomes a `STORED | INDEXED` attribute. Exactly one
/// column must be named "id"; it supplies the document identifiers.
///
/// Returns an error when the "id" column is missing or duplicated, when a
/// record lacks its id field, or when the CSV file cannot be read.
fn generate_update_from_csv(path: &Path) -> Result<(Schema, Update), Box<Error>> {
    let mut csv = csv::Reader::from_path(path)?;

    let mut attributes = Vec::new();
    let (schema, id_attr_index) = {
        let mut id_attr_index = None;
        let mut builder = SchemaBuilder::new();

        for (i, header_name) in csv.headers()?.iter().enumerate() {
            if header_name == "id" {
                // reject ambiguous inputs instead of silently keeping the last "id"
                if id_attr_index.is_some() {
                    return Err(String::from("Multiple \"id\" fields found but only one is allowed").into())
                }
                id_attr_index = Some(i);
            }
            let field = builder.new_attribute(header_name, STORED | INDEXED);
            attributes.push(field);
        }

        let id = match id_attr_index {
            Some(index) => index,
            None => return Err(String::from("No \"id\" field found which is mandatory").into()),
        };

        (builder.build(), id)
    };

    let update_path = PathBuf::from("./positive-update-xxx.sst");
    let tokenizer_builder = DefaultBuilder::new();
    let mut builder = PositiveUpdateBuilder::new(&update_path, schema.clone(), tokenizer_builder);

    for record in csv.records() {
        let record = match record {
            Ok(x) => x,
            Err(e) => { eprintln!("{:?}", e); continue }
        };

        // a record shorter than the header row would not contain the id field;
        // `get` avoids the panic the previous `nth(...).unwrap()` could cause
        let id = match record.get(id_attr_index) {
            Some(value) => value.parse()?,
            None => return Err(String::from("A record is missing its \"id\" field").into()),
        };

        for (value, attr) in record.into_iter().zip(&attributes) {
            builder.update_field(id, *attr, value.to_string());
        }
    }

    builder.build().map(|update| (schema, update))
}
fn main() -> Result<(), Box<Error>> {
let command = Cmd::from_args();
let path = random_name() + ".rdb";
println!("generating the update...");
let (schema, update) = generate_update_from_csv(&command.csv_file)?;
println!("creating the index");
let index = Index::create(&path, schema)?;
println!("ingesting the changes in the index");
index.ingest_update(update)?;
// FIXME this is really ugly !!!!
// the index does not support moving update files
// so we must remove it by hand
fs::remove_file("./positive-update-xxx.sst")?;
println!("the index {:?} has been created!", path);
Ok(())
}

View File

@ -1,40 +0,0 @@
use std::error::Error;
use std::path::PathBuf;
use std::io::{self, Write};
use elapsed::measure_time;
use structopt::StructOpt;
use pentium::index::Index;
/// Command-line arguments for the interactive search binary.
#[derive(Debug, StructOpt)]
pub struct Cmd {
/// Index path (e.g. relaxed-colden).
#[structopt(parse(from_os_str))]
pub index_path: PathBuf,
}
/// Entry point: opens the index given on the command line and runs an
/// interactive search prompt until EOF (e.g. Ctrl-D) is reached.
fn main() -> Result<(), Box<Error>> {
    let command = Cmd::from_args();
    let index = Index::open(command.index_path)?;

    loop {
        print!("Searching for: ");
        io::stdout().flush()?;

        let mut query = String::new();
        // read_line returns Ok(0) at EOF; the previous `query.is_empty()`
        // check could only trigger at EOF anyway, but it also left the
        // trailing newline inside the query that was then searched for
        if io::stdin().read_line(&mut query)? == 0 { break }

        // search the trimmed input so the trailing '\n' (and surrounding
        // whitespace) is not part of the query; skip blank lines entirely
        let query = query.trim();
        if query.is_empty() { continue }

        let (elapsed, result) = measure_time(|| index.search(query));
        match result {
            Ok(documents) => {
                println!("{:?}", documents);
                println!("Finished in {}", elapsed)
            },
            Err(e) => panic!("{}", e),
        }
    }
    Ok(())
}

View File

@ -1,7 +1,7 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::Set;
use crate::blob::NegativeBlob;
use crate::database::blob::NegativeBlob;
use crate::data::DocIds;
use crate::DocumentId;

View File

@ -5,8 +5,8 @@ use group_by::GroupBy;
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use crate::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
use crate::blob::{positive, negative};
use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
use crate::database::blob::{positive, negative};
fn blob_same_sign(a: &Blob, b: &Blob) -> bool {
a.sign() == b.sign()

View File

@ -1,7 +1,7 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::{SetOperation, Set};
use crate::blob::PositiveBlob;
use crate::database::blob::PositiveBlob;
use crate::data::DocIndexes;
use crate::DocIndex;

View File

@ -5,11 +5,9 @@ use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey};
use rocksdb::rocksdb_options::ReadOptions;
use serde::de::DeserializeOwned;
use crate::database::deserializer::{Deserializer, DeserializerError};
use crate::database::{DATA_INDEX, DATA_SCHEMA};
use crate::blob::positive::PositiveBlob;
use crate::index::schema::Schema;
use crate::database::{retrieve_data_schema, DocumentKey, DocumentKeyAttr};
use crate::database::deserializer::Deserializer;
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct DatabaseView<'a> {

View File

@ -8,7 +8,7 @@ use serde::de::value::MapDeserializer;
use serde::de::{self, Visitor, IntoDeserializer};
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::index::schema::Schema;
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Deserializer<'a> {

View File

@ -4,7 +4,7 @@ use std::fmt;
use byteorder::{NativeEndian, WriteBytesExt, ReadBytesExt};
use crate::index::schema::SchemaAttr;
use crate::database::schema::SchemaAttr;
use crate::DocumentId;
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();

View File

@ -7,13 +7,16 @@ use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamil
use rocksdb::{DB, DBVector, MergeOperands, SeekKey};
use rocksdb::rocksdb::{Writable, Snapshot};
pub use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
pub use crate::database::database_view::DatabaseView;
use crate::index::update::Update;
use crate::index::schema::Schema;
use crate::blob::positive::PositiveBlob;
use crate::blob::{self, Blob};
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
pub use self::database_view::DatabaseView;
use self::blob::positive::PositiveBlob;
use self::update::Update;
use self::schema::Schema;
use self::blob::Blob;
pub mod blob;
pub mod schema;
pub mod update;
mod document_key;
mod database_view;
mod deserializer;
@ -163,14 +166,13 @@ fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut Merge
mod tests {
use super::*;
use std::error::Error;
use std::path::PathBuf;
use serde_derive::{Serialize, Deserialize};
use tempfile::tempdir;
use crate::tokenizer::DefaultBuilder;
use crate::index::update::PositiveUpdateBuilder;
use crate::index::schema::{Schema, SchemaBuilder, STORED, INDEXED};
use crate::database::update::PositiveUpdateBuilder;
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
#[test]
fn ingest_update_file() -> Result<(), Box<Error>> {

View File

@ -1,12 +1,6 @@
use std::io::{Cursor, Write};
use std::path::PathBuf;
use std::error::Error;
use byteorder::{NetworkEndian, WriteBytesExt};
use crate::index::schema::SchemaAttr;
use crate::DocumentId;
mod negative;
mod positive;

View File

@ -3,11 +3,11 @@ use std::error::Error;
use ::rocksdb::rocksdb_options;
use crate::index::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
use crate::index::update::Update;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::blob::{Blob, NegativeBlob};
use crate::index::DATA_INDEX;
use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
use crate::database::blob::{Blob, NegativeBlob};
use crate::database::update::Update;
use crate::database::DocumentKey;
use crate::database::DATA_INDEX;
use crate::DocumentId;
pub struct NegativeUpdateBuilder {

View File

@ -2,7 +2,7 @@ use std::collections::BTreeMap;
use std::error::Error;
use std::io::Write;
use crate::blob::positive::PositiveBlobBuilder;
use crate::database::blob::positive::PositiveBlobBuilder;
use crate::DocIndex;
pub struct UnorderedPositiveBlobBuilder<W, X> {

View File

@ -6,15 +6,15 @@ use std::fmt;
use ::rocksdb::rocksdb_options;
use serde::ser::{self, Serialize};
use crate::index::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
use crate::index::schema::{SchemaProps, Schema, SchemaAttr};
use crate::index::update::Update;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::blob::positive::PositiveBlob;
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
use crate::database::blob::positive::PositiveBlob;
use crate::database::schema::{Schema, SchemaAttr};
use crate::tokenizer::TokenizerBuilder;
use crate::database::DocumentKeyAttr;
use crate::database::update::Update;
use crate::{DocumentId, DocIndex};
use crate::index::DATA_INDEX;
use crate::blob::Blob;
use crate::database::DATA_INDEX;
use crate::database::blob::Blob;
pub enum NewState {
Updated { value: Vec<u8> },

View File

@ -1,128 +0,0 @@
pub mod schema;
pub mod update;
use std::error::Error;
use std::path::Path;
use ::rocksdb::rocksdb::Writable;
use ::rocksdb::{rocksdb, rocksdb_options};
use ::rocksdb::merge_operator::MergeOperands;
use crate::rank::Document;
use crate::index::schema::Schema;
use crate::index::update::Update;
use crate::rank::QueryBuilder;
use crate::blob::{self, Blob};
const DATA_INDEX: &[u8] = b"data-index";
const DATA_SCHEMA: &[u8] = b"data-schema";
/// RocksDB merge operator for the reserved "data-index" entry.
///
/// Deserializes the existing value (if any) plus every pending operand as
/// blobs, merges them all and serializes the result back. Panics on any
/// other key or on (de)serialization failure, since those indicate bugs
/// rather than recoverable conditions.
fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
    if key != DATA_INDEX { panic!("The merge operator only supports \"data-index\" merging") }

    // reserve room for every pending operand plus the current value
    let capacity = {
        let remaining = operands.size_hint().0;
        let already_exist = usize::from(existing_value.is_some());
        remaining + already_exist
    };

    let mut op = blob::OpBuilder::with_capacity(capacity);
    if let Some(existing_value) = existing_value {
        let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
        op.push(Blob::Positive(blob));
    }

    for bytes in operands {
        let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob");
        op.push(blob);
    }

    // fixed typo in the panic message ("could no merge" -> "could not merge")
    let blob = op.merge().expect("BUG: could not merge blobs");
    bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
}
/// A searchable document index backed by a RocksDB database.
pub struct Index {
// underlying RocksDB handle; the schema and the data-index blobs live
// under the reserved DATA_SCHEMA / DATA_INDEX keys
database: rocksdb::DB,
}
impl Index {
/// Creates a new index at `path`, storing `schema` under the reserved
/// "data-schema" key. Fails if something already exists at `path`.
pub fn create<P: AsRef<Path>>(path: P, schema: Schema) -> Result<Index, Box<Error>> {
// Self::open must not take a parameter for create_if_missing
// or we must create an OpenOptions with many parameters
// https://doc.rust-lang.org/std/fs/struct.OpenOptions.html
let path = path.as_ref();
if path.exists() {
return Err(format!("File already exists at path: {}, cannot create database.",
path.display()).into())
}
let path = path.to_string_lossy();
let mut opts = rocksdb_options::DBOptions::new();
opts.create_if_missing(true);
let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new();
// merge_indexes combines the blobs accumulated under DATA_INDEX
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
// persist the schema so `open` can validate the database later
let mut schema_bytes = Vec::new();
schema.write_to(&mut schema_bytes)?;
database.put(DATA_SCHEMA, &schema_bytes)?;
Ok(Self { database })
}
/// Opens an existing index, failing if it does not contain a schema.
pub fn open<P: AsRef<Path>>(path: P) -> Result<Index, Box<Error>> {
let path = path.as_ref().to_string_lossy();
let mut opts = rocksdb_options::DBOptions::new();
opts.create_if_missing(false);
let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
// compacting to avoid calling the merge operator
database.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
// the schema is only read here to validate the database; the parsed
// value is intentionally discarded
let _schema = match database.get(DATA_SCHEMA)? {
Some(value) => Schema::read_from(&*value)?,
None => return Err(String::from("Database does not contain a schema").into()),
};
Ok(Self { database })
}
/// Ingests an SST update file produced by an update builder.
/// NOTE(review): the file appears to be copied rather than moved into the
/// database (see the commented-out `move_files`) — confirm before relying
/// on the original file being gone afterwards.
pub fn ingest_update(&self, update: Update) -> Result<(), Box<Error>> {
let path = update.into_path_buf();
let path = path.to_string_lossy();
let mut options = rocksdb_options::IngestExternalFileOptions::new();
// options.move_files(true);
let cf_handle = self.database.cf_handle("default").unwrap();
self.database.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
// compacting to avoid calling the merge operator
self.database.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
Ok(())
}
/// Reads back the schema stored under "data-schema".
/// Panics if the entry is missing or invalid (a broken database).
pub fn schema(&self) -> Result<Schema, Box<Error>> {
let bytes = self.database.get(DATA_SCHEMA)?.expect("data-schema entry not found");
Ok(Schema::read_from(&*bytes).expect("Invalid schema"))
}
/// Runs `query` against the index and returns at most 20 documents.
pub fn search(&self, query: &str) -> Result<Vec<Document>, Box<Error>> {
// this snapshot will allow consistent reads for the whole search operation
let snapshot = self.database.snapshot();
let builder = QueryBuilder::new(snapshot)?;
let documents = builder.query(query, 20);
Ok(documents)
}
}

Binary file not shown.

View File

@ -1,8 +1,6 @@
pub mod automaton;
pub mod blob;
pub mod database;
pub mod data;
pub mod index;
pub mod rank;
pub mod tokenizer;
pub mod vec_read_only;

View File

@ -12,7 +12,7 @@ use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::criterion::{self, Criterion};
use crate::rank::distinct_map::DistinctMap;
use crate::database::retrieve_data_index;
use crate::blob::PositiveBlob;
use crate::database::blob::PositiveBlob;
use crate::{Match, DocumentId};
use crate::rank::Document;