first mostly working version

Tamo 2022-10-16 01:39:01 +02:00 committed by Clément Renault
parent c051166bcc
commit d976e680c5
No known key found for this signature in database
GPG key ID: 92ADA4E935E71FA4
18 changed files with 403 additions and 57 deletions

View file

@@ -13,14 +13,28 @@ pub mod metrics;
#[cfg(feature = "metrics")]
pub mod route_metrics;

use std::sync::{atomic::AtomicBool, Arc};
use std::{
    fs::File,
    io::{BufReader, BufWriter, Seek, SeekFrom},
    path::Path,
    sync::{atomic::AtomicBool, Arc},
};

use crate::error::MeilisearchHttpError;
use actix_web::error::JsonPayloadError;
use actix_web::web::Data;
use analytics::Analytics;
use anyhow::bail;
use error::PayloadError;
use http::header::CONTENT_TYPE;
use meilisearch_types::{
    milli::{
        self,
        documents::{DocumentsBatchBuilder, DocumentsBatchReader},
        update::{IndexDocumentsConfig, IndexDocumentsMethod},
    },
    settings::apply_settings_to_builder,
};
pub use option::Opt;

use actix_web::{web, HttpRequest};
@@ -31,19 +45,83 @@ use meilisearch_auth::AuthController;
pub static AUTOBATCHING_ENABLED: AtomicBool = AtomicBool::new(false);

/// Check if a db is empty. It does not provide any information on the
/// validity of the data in it.
/// We consider a database non-empty when it's a non-empty directory.
fn is_empty_db(db_path: impl AsRef<Path>) -> bool {
    let db_path = db_path.as_ref();
    if !db_path.exists() {
        true
    // if we encounter an error or if the db is a file we consider the db non-empty
    } else if let Ok(dir) = db_path.read_dir() {
        dir.count() == 0
    } else {
        false
    }
}
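
To pin down the semantics described in the doc-comment, a hypothetical test sketch (assuming `tempfile` is available as a dev-dependency and that `is_empty_db` is visible to the test module) could look like this:

#[cfg(test)]
mod is_empty_db_tests {
    use super::is_empty_db;

    #[test]
    fn empty_db_detection() {
        let dir = tempfile::tempdir().unwrap();
        // a path that doesn't exist counts as empty
        assert!(is_empty_db(dir.path().join("does-not-exist")));
        // a freshly created, empty directory counts as empty
        assert!(is_empty_db(dir.path()));
        // a directory containing any entry counts as non-empty
        std::fs::write(dir.path().join("data.mdb"), b"").unwrap();
        assert!(!is_empty_db(dir.path()));
    }
}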
// TODO: TAMO: Finish setting up things
pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<IndexScheduler> {
    let meilisearch = IndexScheduler::new(
        opt.db_path.join("tasks"),
        opt.db_path.join("update_files"),
        opt.db_path.join("indexes"),
        opt.dumps_dir.clone(),
        opt.max_index_size.get_bytes() as usize,
        (&opt.indexer_options).try_into()?,
        true,
        #[cfg(test)]
        todo!("We'll see later"),
    )?;
pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(IndexScheduler, AuthController)> {
    // we don't want to create anything in the data.ms yet, thus we
    // wrap our two builders in a closure that'll be executed later.
    let auth_controller_builder = || AuthController::new(&opt.db_path, &opt.master_key);
    let index_scheduler_builder = || {
        IndexScheduler::new(
            opt.db_path.join("tasks"),
            opt.db_path.join("update_files"),
            opt.db_path.join("indexes"),
            opt.dumps_dir.clone(),
            opt.max_index_size.get_bytes() as usize,
            (&opt.indexer_options).try_into()?,
            true,
            #[cfg(test)]
            todo!("We'll see later"),
        )
    };

    let (index_scheduler, auth_controller) = if let Some(ref _path) = opt.import_snapshot {
        // handle the snapshot with something akin to the dumps
        // + the snapshot interval / spawning a thread
        todo!();
    } else if let Some(ref path) = opt.import_dump {
        let empty_db = is_empty_db(&opt.db_path);
        let src_path_exists = path.exists();

        if empty_db && src_path_exists {
            let mut index_scheduler = index_scheduler_builder()?;
            let mut auth_controller = auth_controller_builder()?;
            import_dump(
                &opt.db_path,
                path,
                &mut index_scheduler,
                &mut auth_controller,
            )?;
            (index_scheduler, auth_controller)
        } else if !empty_db && !opt.ignore_dump_if_db_exists {
            bail!(
                "database already exists at {:?}, try to delete it or rename it",
                opt.db_path
                    .canonicalize()
                    .unwrap_or_else(|_| opt.db_path.to_owned())
            )
        } else if !src_path_exists && !opt.ignore_missing_dump {
            bail!("dump doesn't exist at {:?}", path)
        } else {
            let mut index_scheduler = index_scheduler_builder()?;
            let mut auth_controller = auth_controller_builder()?;
            import_dump(
                &opt.db_path,
                path,
                &mut index_scheduler,
                &mut auth_controller,
            )?;
            (index_scheduler, auth_controller)
        }
    } else {
        (index_scheduler_builder()?, auth_controller_builder()?)
    };
    /*
    TODO: We should start a thread to handle the snapshots.
@@ -53,25 +131,125 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<IndexScheduler> {
        .set_ignore_snapshot_if_db_exists(opt.ignore_snapshot_if_db_exists)
        .set_snapshot_interval(Duration::from_secs(opt.snapshot_interval_sec))
        .set_snapshot_dir(opt.snapshot_dir.clone())
        // dump
        .set_ignore_missing_dump(opt.ignore_missing_dump)
        .set_ignore_dump_if_db_exists(opt.ignore_dump_if_db_exists)
        .set_dump_dst(opt.dumps_dir.clone());

    if let Some(ref path) = opt.import_snapshot {
        meilisearch.set_import_snapshot(path.clone());
    }

    if let Some(ref path) = opt.import_dump {
        meilisearch.set_dump_src(path.clone());
    }

    if opt.schedule_snapshot {
        meilisearch.set_schedule_snapshot();
    }
    */

    Ok(meilisearch)
    Ok((index_scheduler, auth_controller))
}
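
The closure-wrapping above is the key design point of this function: per the comment, nothing should be created inside the data.ms until the chosen startup path actually needs it, so a failed precondition check (missing dump, non-empty db) bails out before anything touches disk. A minimal standalone sketch of that pattern, with hypothetical names:

use anyhow::{bail, Result};
use std::fs::File;
use std::path::Path;

// hypothetical stand-in for a constructor with on-disk side effects
fn open_store(db_path: &Path, dump: Option<&Path>) -> Result<File> {
    // nothing is created until this closure is actually called
    let store_builder = || -> Result<File> { Ok(File::create(db_path)?) };

    if let Some(dump) = dump {
        if !dump.exists() {
            // bail *before* the builder runs: the db path stays untouched
            bail!("dump doesn't exist at {:?}", dump);
        }
    }
    store_builder()
}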
fn import_dump(
    db_path: &Path,
    dump_path: &Path,
    index_scheduler: &mut IndexScheduler,
    auth: &mut AuthController,
) -> Result<(), anyhow::Error> {
    let reader = File::open(dump_path)?;
    let mut dump_reader = dump::DumpReader::open(reader)?;

    if let Some(date) = dump_reader.date() {
        log::info!(
            "Importing a dump of meilisearch `{:?}` from the {}",
            dump_reader.version(), // TODO: get the meilisearch version instead of the dump version
            date
        );
    } else {
        log::info!(
            "Importing a dump of meilisearch `{:?}`",
            dump_reader.version(), // TODO: get the meilisearch version instead of the dump version
        );
    }

    let instance_uid = dump_reader.instance_uid()?;

    // 1. Import the instance-uid.
    if let Some(ref instance_uid) = instance_uid {
        // we don't want to panic if there is an error with the instance-uid.
        let _ = std::fs::write(
            db_path.join("instance-uid"),
            instance_uid.to_string().as_bytes(),
        );
    };

    // 2. Import the `Key`s.
    let mut keys = Vec::new();
    auth.raw_delete_all_keys()?;
    for key in dump_reader.keys() {
        let key = key?;
        auth.raw_insert_key(key.clone())?;
        keys.push(key);
    }

    // 3. Import the tasks.
    for ret in dump_reader.tasks() {
        let (task, file) = ret?;
        index_scheduler.register_dumpped_task(task, file, &keys, instance_uid)?;
    }

    let indexer_config = index_scheduler.indexer_config();

    // 4. Import the indexes.
    for index_reader in dump_reader.indexes()? {
        let mut index_reader = index_reader?;
        let metadata = index_reader.metadata();
        log::info!("Importing index `{}`.", metadata.uid);
        let index = index_scheduler.create_raw_index(&metadata.uid)?;
        let mut wtxn = index.write_txn()?;

        let mut builder = milli::update::Settings::new(&mut wtxn, &index, indexer_config);

        // 4.1 Import the primary key if there is one.
        if let Some(ref primary_key) = metadata.primary_key {
            builder.set_primary_key(primary_key.to_string());
        }

        // 4.2 Import the settings.
        log::info!("Importing the settings.");
        let settings = index_reader.settings()?;
        apply_settings_to_builder(&settings, &mut builder);
        builder.execute(|indexing_step| {
            log::debug!("update: {:?}", indexing_step);
        })?;

        // 4.3 Import the documents.
        // 4.3.1 We need to recreate the grenad+obkv format accepted by the index.
        log::info!("Importing the documents.");
        let mut file = tempfile::tempfile()?;
        let mut builder = DocumentsBatchBuilder::new(BufWriter::new(&mut file));
        for document in index_reader.documents()? {
            builder.append_json_object(&document?)?;
        }
        builder.into_inner()?; // this actually flushes the content of the batch builder.

        // 4.3.2 We feed it to the milli index.
        file.seek(SeekFrom::Start(0))?;
        let reader = BufReader::new(file);
        let reader = DocumentsBatchReader::from_reader(reader)?;

        let builder = milli::update::IndexDocuments::new(
            &mut wtxn,
            &index,
            indexer_config,
            IndexDocumentsConfig {
                update_method: IndexDocumentsMethod::ReplaceDocuments,
                ..Default::default()
            },
            |indexing_step| log::debug!("update: {:?}", indexing_step),
        )?;
        let (builder, user_result) = builder.add_documents(reader)?;
        log::info!("{} documents found.", user_result?);
        builder.execute()?;
        wtxn.commit()?;
        log::info!("All documents successfully imported.");
    }
    Ok(())
}
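
One subtle step above is `builder.into_inner()?`: the `DocumentsBatchBuilder` writes through a `BufWriter`, so its buffer must be flushed before the temp file is rewound and re-read. A minimal sketch of that write-then-rewind pattern, using only `std` and the `tempfile` crate:

use std::io::{BufWriter, Read, Seek, SeekFrom, Write};

fn write_then_rewind() -> std::io::Result<()> {
    let mut file = tempfile::tempfile()?;
    {
        let mut writer = BufWriter::new(&mut file);
        writer.write_all(b"some batch content")?;
    } // the writer is dropped here, flushing its buffer into `file`
      // (the real code calls `into_inner()` instead, to surface flush errors)
    file.seek(SeekFrom::Start(0))?; // rewind before re-reading
    let mut contents = String::new();
    file.read_to_string(&mut contents)?;
    assert_eq!(contents, "some batch content");
    Ok(())
}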
pub fn configure_data(

View file

@@ -48,9 +48,13 @@ async fn main() -> anyhow::Result<()> {
        _ => unreachable!(),
    }

    let index_scheduler = setup_meilisearch(&opt)?;
    let auth_controller = AuthController::new(&opt.db_path, &opt.master_key)?;
    let (index_scheduler, auth_controller) = match setup_meilisearch(&opt) {
        Ok(ret) => ret,
        Err(e) => {
            std::fs::remove_dir_all(opt.db_path)?;
            return Err(e);
        }
    };

    #[cfg(all(not(debug_assertions), feature = "analytics"))]
    let analytics = if !opt.no_analytics {

View file

@@ -242,7 +242,9 @@ async fn document_addition(
    let (uuid, mut update_file) = index_scheduler.create_update_file()?;

    // TODO: this can be slow, maybe we should spawn a thread? But the payload isn't Send+Sync :weary:
    // push the entire stream into a `Vec`.
    // If someone sends us a never-ending stream we're going to block the thread.
    // TODO: Maybe we should write it to a file to reduce the RAM consumption
    // and then reread it to convert it to obkv?
    let mut buffer = Vec::new();
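
A sketch of the TODO above, assuming the payload can be drained as an iterator of chunks (the `chunks` parameter is hypothetical): each chunk is appended to an anonymous temp file as it arrives, so memory stays bounded by the chunk size rather than the payload size, and the file can then be rewound and re-read for the obkv conversion:

use std::fs::File;
use std::io::{BufWriter, Seek, SeekFrom, Write};

// hypothetical: spool payload chunks to disk instead of a growing Vec
fn spool_payload(chunks: impl Iterator<Item = std::io::Result<Vec<u8>>>) -> std::io::Result<File> {
    let mut file = tempfile::tempfile()?;
    let mut writer = BufWriter::new(&mut file);
    for chunk in chunks {
        writer.write_all(&chunk?)?; // only one chunk lives in RAM at a time
    }
    writer.flush()?;
    drop(writer); // release the borrow on `file`
    file.seek(SeekFrom::Start(0))?; // rewind so the caller can re-read it
    Ok(file)
}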