2019-01-01 16:37:15 +01:00
|
|
|
// Registers jemalloc (from the `jemallocator` crate) as the allocator used for
// every heap allocation made by this binary, via the `#[global_allocator]`
// attribute.
#[global_allocator]
|
|
|
|
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
|
|
|
|
2019-02-03 12:22:50 +01:00
|
|
|
use std::collections::{HashMap, HashSet};
|
2019-06-18 13:40:46 +02:00
|
|
|
use std::io::{self, BufRead, BufReader};
|
2018-12-10 15:13:25 +01:00
|
|
|
use std::path::{Path, PathBuf};
|
2019-02-16 20:44:16 +01:00
|
|
|
use std::time::Instant;
|
2018-12-10 15:13:25 +01:00
|
|
|
use std::error::Error;
|
2018-12-29 12:26:33 +01:00
|
|
|
use std::fs::File;
|
2018-12-10 15:13:25 +01:00
|
|
|
|
2019-05-21 13:27:55 +02:00
|
|
|
use diskus::Walk;
|
|
|
|
use sysinfo::{SystemExt, ProcessExt};
|
2019-04-22 15:26:43 +02:00
|
|
|
use serde::{Serialize, Deserialize};
|
2018-12-10 15:13:25 +01:00
|
|
|
use structopt::StructOpt;
|
|
|
|
|
2019-05-29 15:26:18 +02:00
|
|
|
use meilidb_data::Database;
|
|
|
|
use meilidb_schema::Schema;
|
2018-12-10 15:13:25 +01:00
|
|
|
|
|
|
|
#[derive(Debug, StructOpt)]
|
|
|
|
pub struct Opt {
|
2018-12-29 12:26:33 +01:00
|
|
|
/// The destination where the database must be created.
|
2018-12-10 15:13:25 +01:00
|
|
|
#[structopt(parse(from_os_str))]
|
|
|
|
pub database_path: PathBuf,
|
|
|
|
|
|
|
|
/// The csv file to index.
|
|
|
|
#[structopt(parse(from_os_str))]
|
|
|
|
pub csv_data_path: PathBuf,
|
|
|
|
|
2018-12-29 12:26:33 +01:00
|
|
|
/// The path to the schema.
|
|
|
|
#[structopt(long = "schema", parse(from_os_str))]
|
|
|
|
pub schema_path: PathBuf,
|
2019-01-06 18:03:47 +01:00
|
|
|
|
2019-08-02 12:07:23 +02:00
|
|
|
/// The file with the synonyms.
|
|
|
|
#[structopt(long = "synonyms", parse(from_os_str))]
|
|
|
|
pub synonyms: Option<PathBuf>,
|
|
|
|
|
2019-01-06 18:03:47 +01:00
|
|
|
/// The path to the list of stop words (one by line).
|
|
|
|
#[structopt(long = "stop-words", parse(from_os_str))]
|
2019-08-02 12:07:23 +02:00
|
|
|
pub stop_words: Option<PathBuf>,
|
2019-01-06 18:03:47 +01:00
|
|
|
|
|
|
|
#[structopt(long = "update-group-size")]
|
|
|
|
pub update_group_size: Option<usize>,
|
2018-12-10 15:13:25 +01:00
|
|
|
}
|
|
|
|
|
2018-12-29 12:26:33 +01:00
|
|
|
#[derive(Serialize, Deserialize)]
|
2019-08-21 17:12:52 +02:00
|
|
|
struct Document (
|
|
|
|
HashMap<String, String>
|
2018-12-29 12:26:33 +01:00
|
|
|
);
|
2018-12-10 15:13:25 +01:00
|
|
|
|
2019-08-02 12:07:23 +02:00
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
#[serde(untagged)]
|
|
|
|
pub enum Synonym {
|
|
|
|
OneWay(SynonymOneWay),
|
|
|
|
MultiWay { synonyms: Vec<String> },
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
#[serde(rename_all = "camelCase")]
|
|
|
|
pub struct SynonymOneWay {
|
|
|
|
pub search_terms: String,
|
|
|
|
pub synonyms: Synonyms,
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
#[serde(untagged)]
|
|
|
|
pub enum Synonyms {
|
|
|
|
Multiple(Vec<String>),
|
|
|
|
Single(String),
|
|
|
|
}
|
|
|
|
|
|
|
|
fn read_synomys(path: &Path) -> Result<Vec<Synonym>, Box<dyn Error>> {
|
|
|
|
let file = File::open(path)?;
|
|
|
|
let synonyms = serde_json::from_reader(file)?;
|
|
|
|
Ok(synonyms)
|
|
|
|
}
|
|
|
|
|
2019-01-06 18:03:47 +01:00
|
|
|
fn index(
|
|
|
|
schema: Schema,
|
|
|
|
database_path: &Path,
|
|
|
|
csv_data_path: &Path,
|
|
|
|
update_group_size: Option<usize>,
|
|
|
|
stop_words: &HashSet<String>,
|
2019-08-02 12:07:23 +02:00
|
|
|
synonyms: Vec<Synonym>,
|
2019-06-18 13:40:46 +02:00
|
|
|
) -> Result<Database, Box<dyn Error>>
|
2019-01-06 18:03:47 +01:00
|
|
|
{
|
2019-08-19 18:09:02 +02:00
|
|
|
let database = Database::open(database_path)?;
|
2019-02-07 13:05:55 +01:00
|
|
|
|
2019-05-21 13:27:55 +02:00
|
|
|
let mut wtr = csv::Writer::from_path("./stats.csv").unwrap();
|
|
|
|
wtr.write_record(&["NumberOfDocuments", "DiskUsed", "MemoryUsed"])?;
|
|
|
|
|
|
|
|
let mut system = sysinfo::System::new();
|
|
|
|
|
2019-05-23 14:47:10 +02:00
|
|
|
let index = database.create_index("test", schema.clone())?;
|
2018-12-10 15:13:25 +01:00
|
|
|
|
2019-08-02 12:07:23 +02:00
|
|
|
let mut synonyms_adder = index.synonyms_addition();
|
|
|
|
for synonym in synonyms {
|
|
|
|
match synonym {
|
|
|
|
Synonym::OneWay(SynonymOneWay { search_terms, synonyms }) => {
|
|
|
|
let alternatives = match synonyms {
|
|
|
|
Synonyms::Multiple(alternatives) => alternatives,
|
|
|
|
Synonyms::Single(alternative) => vec![alternative],
|
|
|
|
};
|
|
|
|
synonyms_adder.add_synonym(search_terms, alternatives);
|
|
|
|
},
|
|
|
|
Synonym::MultiWay { mut synonyms } => {
|
|
|
|
for _ in 0..synonyms.len() {
|
|
|
|
if let Some((synonym, alternatives)) = synonyms.split_first() {
|
|
|
|
synonyms_adder.add_synonym(synonym, alternatives);
|
|
|
|
}
|
|
|
|
synonyms.rotate_left(1);
|
|
|
|
}
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
synonyms_adder.finalize()?;
|
|
|
|
|
2018-12-10 15:13:25 +01:00
|
|
|
let mut rdr = csv::Reader::from_path(csv_data_path)?;
|
|
|
|
let mut raw_record = csv::StringRecord::new();
|
|
|
|
let headers = rdr.headers()?.clone();
|
|
|
|
|
2019-01-06 18:03:47 +01:00
|
|
|
let mut i = 0;
|
|
|
|
let mut end_of_file = false;
|
|
|
|
|
|
|
|
while !end_of_file {
|
2019-04-22 15:26:43 +02:00
|
|
|
let mut update = index.documents_addition();
|
2019-01-06 18:03:47 +01:00
|
|
|
|
|
|
|
loop {
|
|
|
|
end_of_file = !rdr.read_record(&mut raw_record)?;
|
|
|
|
if end_of_file { break }
|
|
|
|
|
|
|
|
let document: Document = match raw_record.deserialize(Some(&headers)) {
|
|
|
|
Ok(document) => document,
|
|
|
|
Err(e) => {
|
|
|
|
eprintln!("{:?}", e);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2019-08-21 17:12:52 +02:00
|
|
|
update.update_document(document);
|
2019-01-06 18:03:47 +01:00
|
|
|
|
|
|
|
print!("\rindexing document {}", i);
|
|
|
|
i += 1;
|
|
|
|
|
|
|
|
if let Some(group_size) = update_group_size {
|
|
|
|
if i % group_size == 0 { break }
|
2018-12-10 15:13:25 +01:00
|
|
|
}
|
2019-01-06 18:03:47 +01:00
|
|
|
}
|
2018-12-10 15:13:25 +01:00
|
|
|
|
2019-01-06 18:03:47 +01:00
|
|
|
println!();
|
2018-12-10 15:13:25 +01:00
|
|
|
|
2019-02-05 14:48:55 +01:00
|
|
|
println!("committing update...");
|
2019-04-22 15:26:43 +02:00
|
|
|
update.finalize()?;
|
2019-05-21 13:27:55 +02:00
|
|
|
|
|
|
|
// write stats
|
|
|
|
let directory_size = Walk::new(&[database_path.to_owned()], 4).run();
|
|
|
|
system.refresh_all();
|
|
|
|
let memory = system.get_process(sysinfo::get_current_pid()).unwrap().memory(); // in kb
|
|
|
|
wtr.write_record(&[i.to_string(), directory_size.to_string(), memory.to_string()])?;
|
|
|
|
wtr.flush()?;
|
2019-01-06 18:03:47 +01:00
|
|
|
}
|
2018-12-10 15:13:25 +01:00
|
|
|
|
|
|
|
Ok(database)
|
|
|
|
}
|
|
|
|
|
2019-01-06 18:03:47 +01:00
|
|
|
/// Reads the stop words listed one per line in the file at `path`.
///
/// Every line is trimmed of surrounding whitespace. Lines that are empty
/// after trimming are ignored, so blank lines never produce an empty stop
/// word. Duplicated words are stored once.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or when a
/// line cannot be read (for instance because it is not valid UTF-8).
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
    let reader = BufReader::new(File::open(path)?);
    let mut words = HashSet::new();

    for line in reader.lines() {
        let line = line?;
        let word = line.trim();
        if !word.is_empty() {
            words.insert(word.to_string());
        }
    }

    Ok(words)
}
|
|
|
|
|
2019-06-18 13:40:46 +02:00
|
|
|
fn main() -> Result<(), Box<dyn Error>> {
|
2019-01-06 15:01:09 +01:00
|
|
|
let _ = env_logger::init();
|
2018-12-10 15:13:25 +01:00
|
|
|
let opt = Opt::from_args();
|
|
|
|
|
2018-12-29 12:26:33 +01:00
|
|
|
let schema = {
|
|
|
|
let file = File::open(&opt.schema_path)?;
|
|
|
|
Schema::from_toml(file)?
|
|
|
|
};
|
2018-12-10 15:13:25 +01:00
|
|
|
|
2019-08-02 12:07:23 +02:00
|
|
|
let stop_words = match opt.stop_words {
|
2019-01-06 18:03:47 +01:00
|
|
|
Some(ref path) => retrieve_stop_words(path)?,
|
|
|
|
None => HashSet::new(),
|
|
|
|
};
|
|
|
|
|
2019-08-02 12:07:23 +02:00
|
|
|
let synonyms = match opt.synonyms {
|
|
|
|
Some(ref path) => read_synomys(path)?,
|
|
|
|
None => Vec::new(),
|
|
|
|
};
|
|
|
|
|
2019-02-16 20:44:16 +01:00
|
|
|
let start = Instant::now();
|
2019-08-02 12:07:23 +02:00
|
|
|
let result = index(
|
|
|
|
schema,
|
|
|
|
&opt.database_path,
|
|
|
|
&opt.csv_data_path,
|
|
|
|
opt.update_group_size,
|
|
|
|
&stop_words,
|
|
|
|
synonyms,
|
|
|
|
);
|
2018-12-10 15:13:25 +01:00
|
|
|
|
2018-12-11 12:06:02 +01:00
|
|
|
if let Err(e) = result {
|
|
|
|
return Err(e.into())
|
|
|
|
}
|
2018-12-10 15:13:25 +01:00
|
|
|
|
2019-02-16 20:44:16 +01:00
|
|
|
println!("database created in {:.2?} at: {:?}", start.elapsed(), opt.database_path);
|
2018-12-10 15:13:25 +01:00
|
|
|
Ok(())
|
|
|
|
}
|