mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-22 19:27:27 +01:00
chore: Update the module hierarchy
This commit is contained in:
parent
2c3d71dd8f
commit
8bee31078d
@ -1,87 +0,0 @@
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::error::Error;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use elapsed::measure_time;
|
||||
use moby_name_gen::random_name;
|
||||
use structopt::StructOpt;
|
||||
|
||||
use pentium::index::schema::{Schema, SchemaBuilder, STORED, INDEXED};
|
||||
use pentium::index::update::{Update, PositiveUpdateBuilder};
|
||||
use pentium::tokenizer::DefaultBuilder;
|
||||
use pentium::index::Index;
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
pub struct Cmd {
|
||||
/// csv file to index
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub csv_file: PathBuf,
|
||||
}
|
||||
|
||||
fn generate_update_from_csv(path: &Path) -> Result<(Schema, Update), Box<Error>> {
|
||||
let mut csv = csv::Reader::from_path(path)?;
|
||||
|
||||
let mut attributes = Vec::new();
|
||||
let (schema, id_attr_index) = {
|
||||
let mut id_attr_index = None;
|
||||
let mut builder = SchemaBuilder::new();
|
||||
|
||||
for (i, header_name) in csv.headers()?.iter().enumerate() {
|
||||
// FIXME this does not disallow multiple "id" fields
|
||||
if header_name == "id" { id_attr_index = Some(i) };
|
||||
|
||||
let field = builder.new_attribute(header_name, STORED | INDEXED);
|
||||
attributes.push(field);
|
||||
}
|
||||
|
||||
let id = match id_attr_index {
|
||||
Some(index) => index,
|
||||
None => return Err(String::from("No \"id\" field found which is mandatory").into()),
|
||||
};
|
||||
|
||||
(builder.build(), id)
|
||||
};
|
||||
|
||||
let update_path = PathBuf::from("./positive-update-xxx.sst");
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let mut builder = PositiveUpdateBuilder::new(&update_path, schema.clone(), tokenizer_builder);
|
||||
|
||||
for record in csv.records() {
|
||||
let record = match record {
|
||||
Ok(x) => x,
|
||||
Err(e) => { eprintln!("{:?}", e); continue }
|
||||
};
|
||||
|
||||
let id = record.into_iter().nth(id_attr_index).unwrap().parse()?;
|
||||
for (value, attr) in record.into_iter().zip(&attributes) {
|
||||
builder.update_field(id, *attr, value.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
builder.build().map(|update| (schema, update))
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<Error>> {
|
||||
let command = Cmd::from_args();
|
||||
|
||||
let path = random_name() + ".rdb";
|
||||
|
||||
println!("generating the update...");
|
||||
let (schema, update) = generate_update_from_csv(&command.csv_file)?;
|
||||
|
||||
println!("creating the index");
|
||||
let index = Index::create(&path, schema)?;
|
||||
|
||||
println!("ingesting the changes in the index");
|
||||
index.ingest_update(update)?;
|
||||
|
||||
// FIXME this is really ugly !!!!
|
||||
// the index does not support moving update files
|
||||
// so we must remove it by hand
|
||||
fs::remove_file("./positive-update-xxx.sst")?;
|
||||
|
||||
println!("the index {:?} has been created!", path);
|
||||
|
||||
Ok(())
|
||||
}
|
@ -1,40 +0,0 @@
|
||||
use std::error::Error;
|
||||
use std::path::PathBuf;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use elapsed::measure_time;
|
||||
use structopt::StructOpt;
|
||||
use pentium::index::Index;
|
||||
|
||||
#[derive(Debug, StructOpt)]
|
||||
pub struct Cmd {
|
||||
/// Index path (e.g. relaxed-colden).
|
||||
#[structopt(parse(from_os_str))]
|
||||
pub index_path: PathBuf,
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<Error>> {
|
||||
let command = Cmd::from_args();
|
||||
let index = Index::open(command.index_path)?;
|
||||
|
||||
loop {
|
||||
print!("Searching for: ");
|
||||
io::stdout().flush()?;
|
||||
|
||||
let mut query = String::new();
|
||||
io::stdin().read_line(&mut query)?;
|
||||
|
||||
if query.is_empty() { break }
|
||||
|
||||
let (elapsed, result) = measure_time(|| index.search(&query));
|
||||
match result {
|
||||
Ok(documents) => {
|
||||
println!("{:?}", documents);
|
||||
println!("Finished in {}", elapsed)
|
||||
},
|
||||
Err(e) => panic!("{}", e),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
@ -1,7 +1,7 @@
|
||||
use sdset::multi::OpBuilder as SdOpBuilder;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::blob::NegativeBlob;
|
||||
use crate::database::blob::NegativeBlob;
|
||||
use crate::data::DocIds;
|
||||
use crate::DocumentId;
|
||||
|
@ -5,8 +5,8 @@ use group_by::GroupBy;
|
||||
use sdset::duo::DifferenceByKey;
|
||||
use sdset::{Set, SetOperation};
|
||||
|
||||
use crate::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
|
||||
use crate::blob::{positive, negative};
|
||||
use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
|
||||
use crate::database::blob::{positive, negative};
|
||||
|
||||
fn blob_same_sign(a: &Blob, b: &Blob) -> bool {
|
||||
a.sign() == b.sign()
|
@ -1,7 +1,7 @@
|
||||
use sdset::multi::OpBuilder as SdOpBuilder;
|
||||
use sdset::{SetOperation, Set};
|
||||
|
||||
use crate::blob::PositiveBlob;
|
||||
use crate::database::blob::PositiveBlob;
|
||||
use crate::data::DocIndexes;
|
||||
use crate::DocIndex;
|
||||
|
@ -5,11 +5,9 @@ use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey};
|
||||
use rocksdb::rocksdb_options::ReadOptions;
|
||||
use serde::de::DeserializeOwned;
|
||||
|
||||
use crate::database::deserializer::{Deserializer, DeserializerError};
|
||||
use crate::database::{DATA_INDEX, DATA_SCHEMA};
|
||||
use crate::blob::positive::PositiveBlob;
|
||||
use crate::index::schema::Schema;
|
||||
use crate::database::{retrieve_data_schema, DocumentKey, DocumentKeyAttr};
|
||||
use crate::database::deserializer::Deserializer;
|
||||
use crate::database::schema::Schema;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct DatabaseView<'a> {
|
||||
|
@ -8,7 +8,7 @@ use serde::de::value::MapDeserializer;
|
||||
use serde::de::{self, Visitor, IntoDeserializer};
|
||||
|
||||
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
||||
use crate::index::schema::Schema;
|
||||
use crate::database::schema::Schema;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct Deserializer<'a> {
|
||||
|
@ -4,7 +4,7 @@ use std::fmt;
|
||||
|
||||
use byteorder::{NativeEndian, WriteBytesExt, ReadBytesExt};
|
||||
|
||||
use crate::index::schema::SchemaAttr;
|
||||
use crate::database::schema::SchemaAttr;
|
||||
use crate::DocumentId;
|
||||
|
||||
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
|
||||
|
@ -7,13 +7,16 @@ use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamil
|
||||
use rocksdb::{DB, DBVector, MergeOperands, SeekKey};
|
||||
use rocksdb::rocksdb::{Writable, Snapshot};
|
||||
|
||||
pub use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
||||
pub use crate::database::database_view::DatabaseView;
|
||||
use crate::index::update::Update;
|
||||
use crate::index::schema::Schema;
|
||||
use crate::blob::positive::PositiveBlob;
|
||||
use crate::blob::{self, Blob};
|
||||
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
|
||||
pub use self::database_view::DatabaseView;
|
||||
use self::blob::positive::PositiveBlob;
|
||||
use self::update::Update;
|
||||
use self::schema::Schema;
|
||||
use self::blob::Blob;
|
||||
|
||||
pub mod blob;
|
||||
pub mod schema;
|
||||
pub mod update;
|
||||
mod document_key;
|
||||
mod database_view;
|
||||
mod deserializer;
|
||||
@ -163,14 +166,13 @@ fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut Merge
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::error::Error;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::tokenizer::DefaultBuilder;
|
||||
use crate::index::update::PositiveUpdateBuilder;
|
||||
use crate::index::schema::{Schema, SchemaBuilder, STORED, INDEXED};
|
||||
use crate::database::update::PositiveUpdateBuilder;
|
||||
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
||||
|
||||
#[test]
|
||||
fn ingest_update_file() -> Result<(), Box<Error>> {
|
||||
|
@ -1,12 +1,6 @@
|
||||
use std::io::{Cursor, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use byteorder::{NetworkEndian, WriteBytesExt};
|
||||
|
||||
use crate::index::schema::SchemaAttr;
|
||||
use crate::DocumentId;
|
||||
|
||||
mod negative;
|
||||
mod positive;
|
||||
|
@ -3,11 +3,11 @@ use std::error::Error;
|
||||
|
||||
use ::rocksdb::rocksdb_options;
|
||||
|
||||
use crate::index::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
|
||||
use crate::index::update::Update;
|
||||
use crate::database::{DocumentKey, DocumentKeyAttr};
|
||||
use crate::blob::{Blob, NegativeBlob};
|
||||
use crate::index::DATA_INDEX;
|
||||
use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
|
||||
use crate::database::blob::{Blob, NegativeBlob};
|
||||
use crate::database::update::Update;
|
||||
use crate::database::DocumentKey;
|
||||
use crate::database::DATA_INDEX;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct NegativeUpdateBuilder {
|
@ -2,7 +2,7 @@ use std::collections::BTreeMap;
|
||||
use std::error::Error;
|
||||
use std::io::Write;
|
||||
|
||||
use crate::blob::positive::PositiveBlobBuilder;
|
||||
use crate::database::blob::positive::PositiveBlobBuilder;
|
||||
use crate::DocIndex;
|
||||
|
||||
pub struct UnorderedPositiveBlobBuilder<W, X> {
|
@ -6,15 +6,15 @@ use std::fmt;
|
||||
use ::rocksdb::rocksdb_options;
|
||||
use serde::ser::{self, Serialize};
|
||||
|
||||
use crate::index::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
|
||||
use crate::index::schema::{SchemaProps, Schema, SchemaAttr};
|
||||
use crate::index::update::Update;
|
||||
use crate::database::{DocumentKey, DocumentKeyAttr};
|
||||
use crate::blob::positive::PositiveBlob;
|
||||
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
|
||||
use crate::database::blob::positive::PositiveBlob;
|
||||
use crate::database::schema::{Schema, SchemaAttr};
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::database::DocumentKeyAttr;
|
||||
use crate::database::update::Update;
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use crate::index::DATA_INDEX;
|
||||
use crate::blob::Blob;
|
||||
use crate::database::DATA_INDEX;
|
||||
use crate::database::blob::Blob;
|
||||
|
||||
pub enum NewState {
|
||||
Updated { value: Vec<u8> },
|
128
src/index/mod.rs
128
src/index/mod.rs
@ -1,128 +0,0 @@
|
||||
pub mod schema;
|
||||
pub mod update;
|
||||
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
|
||||
use ::rocksdb::rocksdb::Writable;
|
||||
use ::rocksdb::{rocksdb, rocksdb_options};
|
||||
use ::rocksdb::merge_operator::MergeOperands;
|
||||
|
||||
use crate::rank::Document;
|
||||
use crate::index::schema::Schema;
|
||||
use crate::index::update::Update;
|
||||
use crate::rank::QueryBuilder;
|
||||
use crate::blob::{self, Blob};
|
||||
|
||||
const DATA_INDEX: &[u8] = b"data-index";
|
||||
const DATA_SCHEMA: &[u8] = b"data-schema";
|
||||
|
||||
fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
|
||||
if key != DATA_INDEX { panic!("The merge operator only supports \"data-index\" merging") }
|
||||
|
||||
let capacity = {
|
||||
let remaining = operands.size_hint().0;
|
||||
let already_exist = usize::from(existing_value.is_some());
|
||||
remaining + already_exist
|
||||
};
|
||||
|
||||
let mut op = blob::OpBuilder::with_capacity(capacity);
|
||||
if let Some(existing_value) = existing_value {
|
||||
let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
|
||||
op.push(Blob::Positive(blob));
|
||||
}
|
||||
|
||||
for bytes in operands {
|
||||
let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob");
|
||||
op.push(blob);
|
||||
}
|
||||
|
||||
let blob = op.merge().expect("BUG: could no merge blobs");
|
||||
bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
|
||||
}
|
||||
|
||||
pub struct Index {
|
||||
database: rocksdb::DB,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
pub fn create<P: AsRef<Path>>(path: P, schema: Schema) -> Result<Index, Box<Error>> {
|
||||
// Self::open must not take a parameter for create_if_missing
|
||||
// or we must create an OpenOptions with many parameters
|
||||
// https://doc.rust-lang.org/std/fs/struct.OpenOptions.html
|
||||
|
||||
let path = path.as_ref();
|
||||
if path.exists() {
|
||||
return Err(format!("File already exists at path: {}, cannot create database.",
|
||||
path.display()).into())
|
||||
}
|
||||
|
||||
let path = path.to_string_lossy();
|
||||
let mut opts = rocksdb_options::DBOptions::new();
|
||||
opts.create_if_missing(true);
|
||||
|
||||
let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new();
|
||||
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
|
||||
|
||||
let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
|
||||
|
||||
let mut schema_bytes = Vec::new();
|
||||
schema.write_to(&mut schema_bytes)?;
|
||||
database.put(DATA_SCHEMA, &schema_bytes)?;
|
||||
|
||||
Ok(Self { database })
|
||||
}
|
||||
|
||||
pub fn open<P: AsRef<Path>>(path: P) -> Result<Index, Box<Error>> {
|
||||
let path = path.as_ref().to_string_lossy();
|
||||
|
||||
let mut opts = rocksdb_options::DBOptions::new();
|
||||
opts.create_if_missing(false);
|
||||
|
||||
let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new();
|
||||
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
|
||||
|
||||
let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
|
||||
|
||||
// compacting to avoid calling the merge operator
|
||||
database.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
|
||||
|
||||
let _schema = match database.get(DATA_SCHEMA)? {
|
||||
Some(value) => Schema::read_from(&*value)?,
|
||||
None => return Err(String::from("Database does not contain a schema").into()),
|
||||
};
|
||||
|
||||
Ok(Self { database })
|
||||
}
|
||||
|
||||
pub fn ingest_update(&self, update: Update) -> Result<(), Box<Error>> {
|
||||
let path = update.into_path_buf();
|
||||
let path = path.to_string_lossy();
|
||||
|
||||
let mut options = rocksdb_options::IngestExternalFileOptions::new();
|
||||
// options.move_files(true);
|
||||
|
||||
let cf_handle = self.database.cf_handle("default").unwrap();
|
||||
self.database.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
|
||||
|
||||
// compacting to avoid calling the merge operator
|
||||
self.database.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn schema(&self) -> Result<Schema, Box<Error>> {
|
||||
let bytes = self.database.get(DATA_SCHEMA)?.expect("data-schema entry not found");
|
||||
Ok(Schema::read_from(&*bytes).expect("Invalid schema"))
|
||||
}
|
||||
|
||||
pub fn search(&self, query: &str) -> Result<Vec<Document>, Box<Error>> {
|
||||
// this snapshot will allow consistent reads for the whole search operation
|
||||
let snapshot = self.database.snapshot();
|
||||
|
||||
let builder = QueryBuilder::new(snapshot)?;
|
||||
let documents = builder.query(query, 20);
|
||||
|
||||
Ok(documents)
|
||||
}
|
||||
}
|
BIN
src/index/update/.DS_Store
vendored
BIN
src/index/update/.DS_Store
vendored
Binary file not shown.
@ -1,8 +1,6 @@
|
||||
pub mod automaton;
|
||||
pub mod blob;
|
||||
pub mod database;
|
||||
pub mod data;
|
||||
pub mod index;
|
||||
pub mod rank;
|
||||
pub mod tokenizer;
|
||||
pub mod vec_read_only;
|
||||
|
@ -12,7 +12,7 @@ use crate::automaton::{self, DfaExt, AutomatonExt};
|
||||
use crate::rank::criterion::{self, Criterion};
|
||||
use crate::rank::distinct_map::DistinctMap;
|
||||
use crate::database::retrieve_data_index;
|
||||
use crate::blob::PositiveBlob;
|
||||
use crate::database::blob::PositiveBlob;
|
||||
use crate::{Match, DocumentId};
|
||||
use crate::rank::Document;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user