first mostly working version

Tamo 2022-10-16 01:39:01 +02:00 committed by Clément Renault
parent c051166bcc
commit d976e680c5
No known key found for this signature in database
GPG key ID: 92ADA4E935E71FA4
18 changed files with 403 additions and 57 deletions

View file

@@ -13,14 +13,28 @@ pub mod metrics;
#[cfg(feature = "metrics")]
pub mod route_metrics;

use std::sync::{atomic::AtomicBool, Arc};
use std::{
    fs::File,
    io::{BufReader, BufWriter, Seek, SeekFrom},
    path::Path,
    sync::{atomic::AtomicBool, Arc},
};

use crate::error::MeilisearchHttpError;
use actix_web::error::JsonPayloadError;
use actix_web::web::Data;
use analytics::Analytics;
use anyhow::bail;
use error::PayloadError;
use http::header::CONTENT_TYPE;
use meilisearch_types::{
    milli::{
        self,
        documents::{DocumentsBatchBuilder, DocumentsBatchReader},
        update::{IndexDocumentsConfig, IndexDocumentsMethod},
    },
    settings::apply_settings_to_builder,
};
pub use option::Opt;

use actix_web::{web, HttpRequest};
@@ -31,19 +45,83 @@ use meilisearch_auth::AuthController;
pub static AUTOBATCHING_ENABLED: AtomicBool = AtomicBool::new(false);

/// Check if a db is empty. It does not provide any information on the
/// validity of the data in it.
/// We consider a database non-empty when it's a non-empty directory.
fn is_empty_db(db_path: impl AsRef<Path>) -> bool {
    let db_path = db_path.as_ref();
    if !db_path.exists() {
        true
    // if we encounter an error or if the db is a file we consider the db non-empty
    } else if let Ok(dir) = db_path.read_dir() {
        dir.count() == 0
    } else {
        false
    }
}
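
To pin down the semantics described in the doc-comment, a hypothetical test sketch (assuming `tempfile` is available as a dev-dependency and that `is_empty_db` is visible to the test module) could look like this:

#[cfg(test)]
mod is_empty_db_tests {
    use super::is_empty_db;

    #[test]
    fn empty_db_detection() {
        let dir = tempfile::tempdir().unwrap();
        // a path that doesn't exist counts as empty
        assert!(is_empty_db(dir.path().join("does-not-exist")));
        // a freshly created, empty directory counts as empty
        assert!(is_empty_db(dir.path()));
        // a directory containing any entry counts as non-empty
        std::fs::write(dir.path().join("data.mdb"), b"").unwrap();
        assert!(!is_empty_db(dir.path()));
    }
}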
// TODO: TAMO: Finish setting up things
pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<IndexScheduler> {
    let meilisearch = IndexScheduler::new(
        opt.db_path.join("tasks"),
        opt.db_path.join("update_files"),
        opt.db_path.join("indexes"),
        opt.dumps_dir.clone(),
        opt.max_index_size.get_bytes() as usize,
        (&opt.indexer_options).try_into()?,
        true,
        #[cfg(test)]
        todo!("We'll see later"),
    )?;
pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(IndexScheduler, AuthController)> {
    // we don't want to create anything in the data.ms yet, thus we
    // wrap our two builders in a closure that'll be executed later.
    let auth_controller_builder = || AuthController::new(&opt.db_path, &opt.master_key);
    let index_scheduler_builder = || {
        IndexScheduler::new(
            opt.db_path.join("tasks"),
            opt.db_path.join("update_files"),
            opt.db_path.join("indexes"),
            opt.dumps_dir.clone(),
            opt.max_index_size.get_bytes() as usize,
            (&opt.indexer_options).try_into()?,
            true,
            #[cfg(test)]
            todo!("We'll see later"),
        )
    };

    let (index_scheduler, auth_controller) = if let Some(ref _path) = opt.import_snapshot {
        // handle the snapshot with something akin to the dumps
        // + the snapshot interval / spawning a thread
        todo!();
    } else if let Some(ref path) = opt.import_dump {
        let empty_db = is_empty_db(&opt.db_path);
        let src_path_exists = path.exists();

        if empty_db && src_path_exists {
            let mut index_scheduler = index_scheduler_builder()?;
            let mut auth_controller = auth_controller_builder()?;
            import_dump(
                &opt.db_path,
                path,
                &mut index_scheduler,
                &mut auth_controller,
            )?;
            (index_scheduler, auth_controller)
        } else if !empty_db && !opt.ignore_dump_if_db_exists {
            bail!(
                "database already exists at {:?}, try to delete it or rename it",
                opt.db_path
                    .canonicalize()
                    .unwrap_or_else(|_| opt.db_path.to_owned())
            )
        } else if !src_path_exists && !opt.ignore_missing_dump {
            bail!("dump doesn't exist at {:?}", path)
        } else {
            let mut index_scheduler = index_scheduler_builder()?;
            let mut auth_controller = auth_controller_builder()?;
            import_dump(
                &opt.db_path,
                path,
                &mut index_scheduler,
                &mut auth_controller,
            )?;
            (index_scheduler, auth_controller)
        }
    } else {
        (index_scheduler_builder()?, auth_controller_builder()?)
    };
    /*
    TODO: We should start a thread to handle the snapshots.
@@ -53,25 +131,125 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<IndexScheduler> {
        .set_ignore_snapshot_if_db_exists(opt.ignore_snapshot_if_db_exists)
        .set_snapshot_interval(Duration::from_secs(opt.snapshot_interval_sec))
        .set_snapshot_dir(opt.snapshot_dir.clone())
        // dump
        .set_ignore_missing_dump(opt.ignore_missing_dump)
        .set_ignore_dump_if_db_exists(opt.ignore_dump_if_db_exists)
        .set_dump_dst(opt.dumps_dir.clone());

    if let Some(ref path) = opt.import_snapshot {
        meilisearch.set_import_snapshot(path.clone());
    }

    if let Some(ref path) = opt.import_dump {
        meilisearch.set_dump_src(path.clone());
    }

    if opt.schedule_snapshot {
        meilisearch.set_schedule_snapshot();
    }
    */

    Ok(meilisearch)
    Ok((index_scheduler, auth_controller))
}
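
The closure-wrapping above is the key design point of this function: per the comment, nothing should be created inside the data.ms until the chosen startup path actually needs it, so a failed precondition check (missing dump, non-empty db) bails out before anything touches disk. A minimal standalone sketch of that pattern, with hypothetical names:

use anyhow::{bail, Result};
use std::fs::File;
use std::path::Path;

// hypothetical stand-in for a constructor with on-disk side effects
fn open_store(db_path: &Path, dump: Option<&Path>) -> Result<File> {
    // nothing is created until this closure is actually called
    let store_builder = || -> Result<File> { Ok(File::create(db_path)?) };

    if let Some(dump) = dump {
        if !dump.exists() {
            // bail *before* the builder runs: the db path stays untouched
            bail!("dump doesn't exist at {:?}", dump);
        }
    }
    store_builder()
}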
fn import_dump(
    db_path: &Path,
    dump_path: &Path,
    index_scheduler: &mut IndexScheduler,
    auth: &mut AuthController,
) -> Result<(), anyhow::Error> {
    let reader = File::open(dump_path)?;
    let mut dump_reader = dump::DumpReader::open(reader)?;

    if let Some(date) = dump_reader.date() {
        log::info!(
            "Importing a dump of meilisearch `{:?}` from the {}",
            dump_reader.version(), // TODO: get the meilisearch version instead of the dump version
            date
        );
    } else {
        log::info!(
            "Importing a dump of meilisearch `{:?}`",
            dump_reader.version(), // TODO: get the meilisearch version instead of the dump version
        );
    }

    let instance_uid = dump_reader.instance_uid()?;

    // 1. Import the instance-uid.
    if let Some(ref instance_uid) = instance_uid {
        // we don't want to panic if there is an error with the instance-uid.
        let _ = std::fs::write(
            db_path.join("instance-uid"),
            instance_uid.to_string().as_bytes(),
        );
    };

    // 2. Import the `Key`s.
    let mut keys = Vec::new();
    auth.raw_delete_all_keys()?;
    for key in dump_reader.keys() {
        let key = key?;
        auth.raw_insert_key(key.clone())?;
        keys.push(key);
    }

    // 3. Import the tasks.
    for ret in dump_reader.tasks() {
        let (task, file) = ret?;
        index_scheduler.register_dumpped_task(task, file, &keys, instance_uid)?;
    }

    let indexer_config = index_scheduler.indexer_config();

    // 4. Import the indexes.
    for index_reader in dump_reader.indexes()? {
        let mut index_reader = index_reader?;
        let metadata = index_reader.metadata();
        log::info!("Importing index `{}`.", metadata.uid);
        let index = index_scheduler.create_raw_index(&metadata.uid)?;
        let mut wtxn = index.write_txn()?;

        let mut builder = milli::update::Settings::new(&mut wtxn, &index, indexer_config);

        // 4.1 Import the primary key if there is one.
        if let Some(ref primary_key) = metadata.primary_key {
            builder.set_primary_key(primary_key.to_string());
        }

        // 4.2 Import the settings.
        log::info!("Importing the settings.");
        let settings = index_reader.settings()?;
        apply_settings_to_builder(&settings, &mut builder);
        builder.execute(|indexing_step| {
            log::debug!("update: {:?}", indexing_step);
        })?;

        // 4.3 Import the documents.
        // 4.3.1 We need to recreate the grenad+obkv format accepted by the index.
        log::info!("Importing the documents.");
        let mut file = tempfile::tempfile()?;
        let mut builder = DocumentsBatchBuilder::new(BufWriter::new(&mut file));
        for document in index_reader.documents()? {
            builder.append_json_object(&document?)?;
        }
        builder.into_inner()?; // this actually flushes the content of the batch builder.

        // 4.3.2 We feed it to the milli index.
        file.seek(SeekFrom::Start(0))?;
        let reader = BufReader::new(file);
        let reader = DocumentsBatchReader::from_reader(reader)?;

        let builder = milli::update::IndexDocuments::new(
            &mut wtxn,
            &index,
            indexer_config,
            IndexDocumentsConfig {
                update_method: IndexDocumentsMethod::ReplaceDocuments,
                ..Default::default()
            },
            |indexing_step| log::debug!("update: {:?}", indexing_step),
        )?;
        let (builder, user_result) = builder.add_documents(reader)?;
        log::info!("{} documents found.", user_result?);
        builder.execute()?;
        wtxn.commit()?;
        log::info!("All documents successfully imported.");
    }
    Ok(())
}
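
One subtle step above is `builder.into_inner()?`: the `DocumentsBatchBuilder` writes through a `BufWriter`, so its buffer must be flushed before the temp file is rewound and re-read. A minimal sketch of that write-then-rewind pattern, using only `std` and the `tempfile` crate:

use std::io::{BufWriter, Read, Seek, SeekFrom, Write};

fn write_then_rewind() -> std::io::Result<()> {
    let mut file = tempfile::tempfile()?;
    {
        let mut writer = BufWriter::new(&mut file);
        writer.write_all(b"some batch content")?;
    } // the writer is dropped here, flushing its buffer into `file`
      // (the real code calls `into_inner()` instead, to surface flush errors)
    file.seek(SeekFrom::Start(0))?; // rewind before re-reading
    let mut contents = String::new();
    file.read_to_string(&mut contents)?;
    assert_eq!(contents, "some batch content");
    Ok(())
}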
pub fn configure_data(

View file

@@ -48,9 +48,13 @@ async fn main() -> anyhow::Result<()> {
        _ => unreachable!(),
    }

    let index_scheduler = setup_meilisearch(&opt)?;
    let auth_controller = AuthController::new(&opt.db_path, &opt.master_key)?;
    let (index_scheduler, auth_controller) = match setup_meilisearch(&opt) {
        Ok(ret) => ret,
        Err(e) => {
            std::fs::remove_dir_all(opt.db_path)?;
            return Err(e);
        }
    };

    #[cfg(all(not(debug_assertions), feature = "analytics"))]
    let analytics = if !opt.no_analytics {

View file

@@ -242,7 +242,9 @@ async fn document_addition(
    let (uuid, mut update_file) = index_scheduler.create_update_file()?;

    // TODO: this can be slow, maybe we should spawn a thread? But the payload isn't Send+Sync :weary:
    // push the entire stream into a `Vec`.
    // If someone sends us a never-ending stream we're going to block the thread.
    // TODO: Maybe we should write it to a file to reduce the RAM consumption
    // and then reread it to convert it to obkv?
    let mut buffer = Vec::new();
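
A sketch of the TODO above, assuming the payload can be drained as an iterator of chunks (the `chunks` parameter is hypothetical): each chunk is appended to an anonymous temp file as it arrives, so memory stays bounded by the chunk size rather than the payload size, and the file can then be rewound and re-read for the obkv conversion:

use std::fs::File;
use std::io::{BufWriter, Seek, SeekFrom, Write};

// hypothetical: spool payload chunks to disk instead of a growing Vec
fn spool_payload(chunks: impl Iterator<Item = std::io::Result<Vec<u8>>>) -> std::io::Result<File> {
    let mut file = tempfile::tempfile()?;
    let mut writer = BufWriter::new(&mut file);
    for chunk in chunks {
        writer.write_all(&chunk?)?; // only one chunk lives in RAM at a time
    }
    writer.flush()?;
    drop(writer); // release the borrow on `file`
    file.seek(SeekFrom::Start(0))?; // rewind so the caller can re-read it
    Ok(file)
}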