Introduce the Transform type into the indexing system

Clément Renault 2020-10-24 16:23:08 +02:00
parent b44b04d25b
commit a7a4984175
8 changed files with 173 additions and 251 deletions
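
The change replaces the one-shot indexer pipeline with a Transform type that prepares documents before they reach the indexing system: it assigns internal document ids, matches user-provided ids against the existing users_ids_documents_ids map so that replaced documents can be detected, and writes the transformed documents to a temporary file that indexing::run can mmap and consume. Below is a minimal sketch of the type's shape as it can be inferred from the call sites in this diff; the concrete field types, the crate path of FieldsIdsMap, and the from_csv signature are assumptions, not the committed code.

use std::borrow::Cow;
use std::fs::File;
use std::io;

use grenad::CompressionType;
use roaring::RoaringBitmap;

use crate::{AvailableDocumentsIds, FieldsIdsMap}; // the FieldsIdsMap path is assumed

// Shape inferred from the construction site in serve.rs below;
// the exact field types are assumptions.
pub struct Transform<'a> {
    pub fields_ids_map: FieldsIdsMap,
    pub available_documents_ids: AvailableDocumentsIds,
    pub users_ids_documents_ids: fst::Map<Cow<'a, [u8]>>,
    pub chunk_compression_type: CompressionType,
    pub chunk_compression_level: Option<u32>,
    pub chunk_fusing_shrink_size: Option<u64>,
    pub max_nb_chunks: Option<usize>,
    pub max_memory: Option<usize>,
}

// Shape inferred from the destructuring site in serve.rs below.
pub struct TransformOutput {
    pub fields_ids_map: FieldsIdsMap,
    pub users_ids_documents_ids: fst::Map<Vec<u8>>,
    pub new_documents_ids: RoaringBitmap,
    pub replaced_documents_ids: RoaringBitmap,
    pub documents_count: usize,
    pub documents_file: File,
}

impl Transform<'_> {
    // Reads CSV documents, assigns each one an internal id and writes them
    // to a temporary documents file, ready to be mmapped and indexed.
    pub fn from_csv<R: io::Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
        let _ = reader;
        todo!("sketch only")
    }
}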

@@ -1,67 +0,0 @@
-use std::fs::File;
-use std::path::PathBuf;
-
-use anyhow::bail;
-use heed::EnvOpenOptions;
-use structopt::StructOpt;
-
-use crate::indexing::{self, IndexerOpt};
-use crate::Index;
-
-#[derive(Debug, StructOpt)]
-#[structopt(name = "milli-indexer")]
-/// The indexer binary of the milli project.
-pub struct Opt {
-    /// The database path where the database is located.
-    /// It is created if it doesn't already exist.
-    #[structopt(long = "db", parse(from_os_str))]
-    database: PathBuf,
-
-    /// The maximum size the database can take on disk. It is recommended to specify
-    /// the whole disk space (value must be a multiple of a page size).
-    #[structopt(long = "db-size", default_value = "107374182400")] // 100 GB
-    database_size: usize,
-
-    #[structopt(flatten)]
-    indexer: IndexerOpt,
-
-    /// Verbose mode (-v, -vv, -vvv, etc.)
-    #[structopt(short, long, parse(from_occurrences))]
-    verbose: usize,
-
-    /// CSV file to index, if unspecified the CSV is read from standard input.
-    ///
-    /// You can also provide a ".gz" or ".gzip" CSV file, the indexer will figure out
-    /// how to decode and read it.
-    ///
-    /// Note that it is much faster to index from a file as when the indexer reads from stdin
-    /// it will dedicate a thread for that and context switches could slow down the indexing jobs.
-    csv_file: Option<PathBuf>,
-}
-
-pub fn run(opt: Opt) -> anyhow::Result<()> {
-    stderrlog::new()
-        .verbosity(opt.verbose)
-        .show_level(false)
-        .timestamp(stderrlog::Timestamp::Off)
-        .init()?;
-
-    if opt.database.exists() {
-        bail!("Database ({}) already exists, delete it to continue.", opt.database.display());
-    }
-
-    std::fs::create_dir_all(&opt.database)?;
-    let env = EnvOpenOptions::new()
-        .map_size(opt.database_size)
-        .max_dbs(10)
-        .open(&opt.database)?;
-
-    let index = Index::new(&env)?;
-
-    let file_path = opt.csv_file.unwrap();
-    let gzipped = file_path.extension().map_or(false, |e| e == "gz" || e == "gzip");
-    let file = File::open(file_path)?;
-    let content = unsafe { memmap::Mmap::map(&file)? };
-
-    indexing::run(&env, &index, &opt.indexer, &content, gzipped, |_docid| { })
-}

@@ -1,4 +1,3 @@
-pub mod indexer;
 pub mod infos;
 pub mod search;
 pub mod serve;

@@ -1,6 +1,7 @@
+use std::borrow::Cow;
 use std::collections::HashSet;
 use std::fs::{File, create_dir_all};
-use std::mem;
+use std::{mem, io};
 use std::net::SocketAddr;
 use std::path::PathBuf;
 use std::str::FromStr;
@@ -8,6 +9,7 @@ use std::sync::Arc;
 use std::time::Instant;

 use askama_warp::Template;
+use flate2::read::GzDecoder;
 use futures::stream;
 use futures::{FutureExt, StreamExt};
 use heed::EnvOpenOptions;
@@ -20,9 +22,9 @@ use tokio::sync::broadcast;
 use warp::filters::ws::Message;
 use warp::{Filter, http::Response};

-use crate::indexing::{self, IndexerOpt};
+use crate::indexing::{self, IndexerOpt, Transform, TransformOutput};
 use crate::tokenizer::{simple_tokenizer, TokenType};
-use crate::{Index, UpdateStore, SearchResult};
+use crate::{Index, UpdateStore, SearchResult, AvailableDocumentsIds};

 #[derive(Debug, StructOpt)]
 /// The HTTP main server of the milli project.
@@ -103,9 +105,7 @@ enum UpdateStatus<M, P, N> {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "type")]
 enum UpdateMeta {
-    DocumentsAddition {
-        total_number_of_documents: Option<usize>,
-    },
+    DocumentsAddition,
     DocumentsAdditionFromPath {
         path: PathBuf,
     },
@@ -153,19 +153,63 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
         update_store_path,
         move |update_id, meta, content| {
             let result = match meta {
-                UpdateMeta::DocumentsAddition { total_number_of_documents } => {
+                UpdateMeta::DocumentsAddition => {
+                    // We must use the write transaction of the update here.
+                    let rtxn = env_cloned.read_txn()?;
+                    let fields_ids_map = index_cloned.fields_ids_map(&rtxn)?.unwrap_or_default();
+                    let documents_ids = index_cloned.documents_ids(&rtxn)?.unwrap_or_default();
+                    let available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
+                    let users_ids_documents_ids = match index_cloned.users_ids_documents_ids(&rtxn).unwrap() {
+                        Some(map) => map.map_data(Cow::Borrowed).unwrap(),
+                        None => fst::Map::default().map_data(Cow::Owned).unwrap(),
+                    };
+
+                    let transform = Transform {
+                        fields_ids_map,
+                        available_documents_ids,
+                        users_ids_documents_ids,
+                        chunk_compression_type: indexer_opt_cloned.chunk_compression_type,
+                        chunk_compression_level: indexer_opt_cloned.chunk_compression_level,
+                        chunk_fusing_shrink_size: Some(indexer_opt_cloned.chunk_fusing_shrink_size),
+                        max_nb_chunks: indexer_opt_cloned.max_nb_chunks,
+                        max_memory: Some(indexer_opt_cloned.max_memory),
+                    };
+
+                    let gzipped = false;
+                    let reader = if gzipped {
+                        Box::new(GzDecoder::new(content))
+                    } else {
+                        Box::new(content) as Box<dyn io::Read>
+                    };
+
+                    let TransformOutput {
+                        fields_ids_map,
+                        users_ids_documents_ids,
+                        new_documents_ids,
+                        replaced_documents_ids,
+                        documents_count,
+                        documents_file,
+                    } = transform.from_csv(reader).unwrap();
+
+                    drop(rtxn);
+
+                    let mmap = unsafe { memmap::Mmap::map(&documents_file)? };
+                    let documents = grenad::Reader::new(mmap.as_ref()).unwrap();
+
                     indexing::run(
                         &env_cloned,
                         &index_cloned,
                         &indexer_opt_cloned,
-                        content,
-                        gzipped,
-                        |count| {
+                        fields_ids_map,
+                        users_ids_documents_ids,
+                        new_documents_ids,
+                        documents,
+                        documents_count as u32,
+                        |count, total| {
                             // We send progress status...
                             let meta = UpdateMetaProgress::DocumentsAddition {
                                 processed_number_of_documents: count as usize,
-                                total_number_of_documents,
+                                total_number_of_documents: Some(total as usize),
                             };
                             let progress = UpdateStatus::Progressing { update_id, meta };
                             let _ = update_status_sender_cloned.send(progress);
@@ -173,38 +217,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
                     )
                 },
                 UpdateMeta::DocumentsAdditionFromPath { path } => {
-                    let file = match File::open(&path) {
-                        Ok(file) => file,
-                        Err(e) => {
-                            let meta = format!("documents addition file ({}) error: {}", path.display(), e);
-                            return Ok(meta);
-                        }
-                    };
-
-                    let content = match unsafe { memmap::Mmap::map(&file) } {
-                        Ok(mmap) => mmap,
-                        Err(e) => {
-                            let meta = format!("documents addition file ({}) mmap error: {}", path.display(), e);
-                            return Ok(meta);
-                        },
-                    };
-
-                    let gzipped = path.extension().map_or(false, |e| e == "gz" || e == "gzip");
-                    indexing::run(
-                        &env_cloned,
-                        &index_cloned,
-                        &indexer_opt_cloned,
-                        &content,
-                        gzipped,
-                        |count| {
-                            // We send progress status...
-                            let meta = UpdateMetaProgress::DocumentsAddition {
-                                processed_number_of_documents: count as usize,
-                                total_number_of_documents: None,
-                            };
-                            let progress = UpdateStatus::Progressing { update_id, meta };
-                            let _ = update_status_sender_cloned.send(progress);
-                        },
-                    )
+                    todo!()
                 }
             };
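
The DocumentsAdditionFromPath arm is stubbed out with todo!() for now. A plausible way to fill it later is to reuse the extension-based gzip detection from the deleted code and feed the resulting reader through the same Transform pipeline as the in-memory arm; a sketch under that assumption, where open_documents_reader is a hypothetical helper and not part of this commit:

use std::fs::File;
use std::io;
use std::path::Path;

use flate2::read::GzDecoder;

// Open a documents file as a plain io::Read, transparently decoding ".gz"
// and ".gzip" files; the extension check mirrors the deleted code above.
fn open_documents_reader(path: &Path) -> anyhow::Result<Box<dyn io::Read>> {
    let gzipped = path.extension().map_or(false, |e| e == "gz" || e == "gzip");
    let file = File::open(path)?;
    if gzipped {
        Ok(Box::new(GzDecoder::new(file)))
    } else {
        Ok(Box::new(file))
    }
}
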
@@ -388,7 +401,8 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
             let mut record = record.iter()
                 .map(|(key_id, value)| {
                     let key = fields_ids_map.name(key_id).unwrap().to_owned();
-                    let value = std::str::from_utf8(value).unwrap().to_owned();
+                    // TODO we must deserialize a Json Value and highlight it.
+                    let value = serde_json::from_slice(value).unwrap();
                     (key, value)
                 })
                 .collect();
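
The TODO in this hunk records that stored values are now arbitrary JSON rather than plain UTF-8 strings, so the highlighter has to walk a deserialized serde_json::Value. A minimal sketch of that recursion; highlight_str is a hypothetical stand-in for the token-based highlighting the server applies to query words:

use serde_json::Value;

// Recursively apply a string highlighter to every string nested in a JSON
// value; numbers, booleans and nulls contain no text and are returned as-is.
fn highlight_value(value: Value, highlight_str: &impl Fn(&str) -> String) -> Value {
    match value {
        Value::String(s) => Value::String(highlight_str(&s)),
        Value::Array(values) => {
            Value::Array(values.into_iter().map(|v| highlight_value(v, highlight_str)).collect())
        },
        Value::Object(object) => {
            Value::Object(object.into_iter().map(|(k, v)| (k, highlight_value(v, highlight_str))).collect())
        },
        otherwise => otherwise,
    }
}
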
@@ -423,7 +437,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
             let file = file.into_std().await;
             let mmap = unsafe { memmap::Mmap::map(&file).unwrap() };

-            let meta = UpdateMeta::DocumentsAddition { total_number_of_documents: None };
+            let meta = UpdateMeta::DocumentsAddition;
             let update_id = update_store.register_update(&meta, &mmap[..]).unwrap();
             let _ = update_status_sender.send(UpdateStatus::Pending { update_id, meta });
             eprintln!("update {} registered", update_id);
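
For context on the DocumentsAddition arm above, AvailableDocumentsIds::from_documents_ids is what lets the Transform hand out fresh internal ids to new documents. The idea, sketched as a plain function over a roaring bitmap; this is an illustration of the concept, not the committed implementation:

use roaring::RoaringBitmap;

// Yield every internal id that is not already taken by an existing document.
fn available_documents_ids(documents_ids: &RoaringBitmap) -> impl Iterator<Item = u32> + '_ {
    (0..=u32::MAX).filter(move |id| !documents_ids.contains(*id))
}

fn main() {
    let mut documents_ids = RoaringBitmap::new();
    documents_ids.insert(0);
    documents_ids.insert(2);

    let mut fresh = available_documents_ids(&documents_ids);
    assert_eq!(fresh.next(), Some(1)); // 0 and 2 are taken, 1 is free
    assert_eq!(fresh.next(), Some(3));
}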