update queue refactor, first iteration

mpostma 2020-12-22 17:13:50 +01:00
parent 7c9eaaeadb
commit 55e1552957
5 changed files with 357 additions and 11 deletions

Cargo.lock (generated)

@@ -1142,6 +1142,7 @@ dependencies = [
  "lmdb-rkv-sys",
  "once_cell",
  "page_size",
+ "serde",
  "synchronoise",
  "url",
  "zerocopy",
@@ -1527,6 +1528,7 @@ dependencies = [
  "actix-rt",
  "actix-service",
  "actix-web",
+ "anyhow",
  "assert-json-diff",
  "byte-unit",
  "bytes 0.6.0",
@@ -1535,6 +1537,8 @@ dependencies = [
  "env_logger 0.8.2",
  "flate2",
  "futures",
+ "grenad",
+ "heed",
  "http",
  "indexmap",
  "jemallocator",
@@ -1545,6 +1549,7 @@ dependencies = [
  "mime",
  "once_cell",
  "rand 0.7.3",
+ "rayon",
  "regex",
  "rustls 0.18.1",
  "sentry",

Cargo.toml

@@ -18,21 +18,26 @@ actix-http = "2"
 actix-rt = "1"
 actix-service = "1.0.6"
 actix-web = { version = "3.3.2", features = ["rustls"] }
+anyhow = "1.0.36"
 byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
 bytes = "0.6.0"
 chrono = { version = "0.4.19", features = ["serde"] }
 crossbeam-channel = "0.5.0"
 env_logger = "0.8.2"
-flate2 = "1.0.18"
+flate2 = "1.0.19"
 futures = "0.3.7"
+grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" }
+heed = "0.10.6"
 http = "0.2.1"
 indexmap = { version = "1.3.2", features = ["serde-1"] }
 log = "0.4.8"
 main_error = "0.1.0"
+meilisearch-error = { path = "../MeiliSearch/meilisearch-error" }
 milli = { path = "../milli" }
 mime = "0.3.16"
 once_cell = "1.5.2"
 rand = "0.7.3"
+rayon = "1.5.0"
 regex = "1.4.2"
 rustls = "0.18"
 serde = { version = "1.0", features = ["derive"] }
@@ -48,7 +53,6 @@ tokio = "*"
 ureq = { version = "1.5.1", default-features = false, features = ["tls"] }
 walkdir = "2.3.1"
 whoami = "1.0.0"
-meilisearch-error = { path = "../MeiliSearch/meilisearch-error" }

 [dependencies.sentry]
 default-features = false

src/option.rs

@@ -11,13 +11,15 @@ use rustls::{
 };
 use structopt::StructOpt;

+use crate::updates::IndexerOpts;
+
 const POSSIBLE_ENV: [&str; 2] = ["development", "production"];

 #[derive(Debug, Clone, StructOpt)]
 pub struct Opt {
     /// The destination where the database must be created.
     #[structopt(long, env = "MEILI_DB_PATH", default_value = "./data.ms")]
-    pub db_path: String,
+    pub db_path: PathBuf,

     /// The address on which the http server will listen.
     #[structopt(long, env = "MEILI_HTTP_ADDR", default_value = "127.0.0.1:7700")]
@@ -132,6 +134,9 @@ pub struct Opt {
     /// The batch size used in the importation process, the bigger it is the faster the dump is created.
     #[structopt(long, env = "MEILI_DUMP_BATCH_SIZE", default_value = "1024")]
     pub dump_batch_size: usize,
+
+    #[structopt(flatten)]
+    pub indexer_options: IndexerOpts,
 }

 impl Opt {
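Because `IndexerOpts` (defined in src/updates/mod.rs below) is flattened into `Opt`, its fields surface as top-level command-line flags. A possible invocation, assuming structopt's default kebab-case flag naming; the flag names are derived from the field names and not verified against the built binary:

    cargo run --release -- --db-path ./data.ms --max-memory "4 GiB" --indexing-jobs 8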

src/updates/mod.rs

@@ -2,7 +2,22 @@ mod settings;

 pub use settings::{Settings, Facets};

+use std::io;
+use std::path::Path;
+use std::sync::Arc;
+
+use anyhow::Result;
+use flate2::read::GzDecoder;
+use grenad::CompressionType;
+use byte_unit::Byte;
+use milli::update::{UpdateBuilder, UpdateFormat, IndexDocumentsMethod, UpdateIndexingStep::*};
+use milli::{UpdateStore, UpdateHandler as Handler, Index};
+use rayon::ThreadPool;
 use serde::{Serialize, Deserialize};
+use tokio::sync::broadcast;
+use structopt::StructOpt;
+
+use crate::option::Opt;

 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "type")]
@@ -13,5 +28,322 @@ enum UpdateMeta {
     Facets(Facets),
 }

-#[derive(Clone, Debug)]
-pub struct UpdateQueue;
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type")]
+enum UpdateMetaProgress {
+    DocumentsAddition {
+        step: usize,
+        total_steps: usize,
+        current: usize,
+        total: Option<usize>,
+    },
+}
+
+#[derive(Debug, Clone, Serialize)]
+#[serde(tag = "type")]
+enum UpdateStatus<M, P, N> {
+    Pending { update_id: u64, meta: M },
+    Progressing { update_id: u64, meta: P },
+    Processed { update_id: u64, meta: N },
+    Aborted { update_id: u64, meta: M },
+}
+
+#[derive(Clone)]
+pub struct UpdateQueue {
+    inner: Arc<UpdateStore<UpdateMeta, String>>,
+}
+
+#[derive(Debug, Clone, StructOpt)]
+pub struct IndexerOpts {
+    /// The number of documents to skip before printing
+    /// a log regarding the indexing advancement.
+    #[structopt(long, default_value = "100000")] // 100k
+    pub log_every_n: usize,
+
+    /// MTBL max number of chunks in bytes.
+    #[structopt(long)]
+    pub max_nb_chunks: Option<usize>,
+
+    /// The maximum amount of memory to use for the MTBL buffer. It is recommended
+    /// to use something like 80%-90% of the available memory.
+    ///
+    /// It is automatically split by the number of jobs, e.g. if you use 7 jobs
+    /// and 7 GB of max memory, each thread will use a maximum of 1 GB.
+    #[structopt(long, default_value = "7 GiB")]
+    pub max_memory: Byte,
+
+    /// Size of the linked hash map cache when indexing.
+    /// The bigger it is, the faster the indexing is, but the more memory it takes.
+    #[structopt(long, default_value = "500")]
+    pub linked_hash_map_size: usize,
+
+    /// The name of the compression algorithm to use when compressing intermediate
+    /// chunks during indexing documents.
+    ///
+    /// Choosing a fast algorithm will make the indexing faster but may consume more memory.
+    #[structopt(long, default_value = "snappy", possible_values = &["snappy", "zlib", "lz4", "lz4hc", "zstd"])]
+    pub chunk_compression_type: CompressionType,
+
+    /// The level of compression of the chosen algorithm.
+    #[structopt(long, requires = "chunk-compression-type")]
+    pub chunk_compression_level: Option<u32>,
+
+    /// The number of bytes to remove from the beginning of the chunks while reading/sorting
+    /// or merging them.
+    ///
+    /// File fusing must only be enabled on file systems that support the `FALLOC_FL_COLLAPSE_RANGE`
+    /// fallocate mode (i.e. ext4 and XFS), and will only work if `enable-chunk-fusing` is set.
+    #[structopt(long, default_value = "4 GiB")]
+    pub chunk_fusing_shrink_size: Byte,
+
+    /// Enable or disable chunk fusing; fusing reduces the amount of disk space used by a factor of 2.
+    #[structopt(long)]
+    pub enable_chunk_fusing: bool,
+
+    /// Number of parallel jobs for indexing; defaults to the number of CPUs.
+    #[structopt(long)]
+    pub indexing_jobs: Option<usize>,
+}
+
+type UpdateSender = broadcast::Sender<UpdateStatus<UpdateMeta, UpdateMetaProgress, String>>;
+
+struct UpdateHandler {
+    indexes: Arc<Index>,
+    max_nb_chunks: Option<usize>,
+    chunk_compression_level: Option<u32>,
+    thread_pool: ThreadPool,
+    log_frequency: usize,
+    max_memory: usize,
+    linked_hash_map_size: usize,
+    chunk_compression_type: CompressionType,
+    chunk_fusing_shrink_size: u64,
+    update_status_sender: UpdateSender,
+}
+
+impl UpdateHandler {
+    fn new(
+        opt: &IndexerOpts,
+        indexes: Arc<Index>,
+        update_status_sender: UpdateSender,
+    ) -> Result<Self> {
+        let thread_pool = rayon::ThreadPoolBuilder::new()
+            .num_threads(opt.indexing_jobs.unwrap_or(0))
+            .build()?;
+        Ok(Self {
+            indexes,
+            max_nb_chunks: opt.max_nb_chunks,
+            chunk_compression_level: opt.chunk_compression_level,
+            thread_pool,
+            log_frequency: opt.log_every_n,
+            max_memory: opt.max_memory.get_bytes() as usize,
+            linked_hash_map_size: opt.linked_hash_map_size,
+            chunk_compression_type: opt.chunk_compression_type,
+            chunk_fusing_shrink_size: opt.chunk_fusing_shrink_size.get_bytes(),
+            update_status_sender,
+        })
+    }
+
+    fn update_builder(&self, update_id: u64) -> UpdateBuilder {
+        // We prepare the update by using the update builder.
+        let mut update_builder = UpdateBuilder::new(update_id);
+        if let Some(max_nb_chunks) = self.max_nb_chunks {
+            update_builder.max_nb_chunks(max_nb_chunks);
+        }
+        if let Some(chunk_compression_level) = self.chunk_compression_level {
+            update_builder.chunk_compression_level(chunk_compression_level);
+        }
+        update_builder.thread_pool(&self.thread_pool);
+        update_builder.log_every_n(self.log_frequency);
+        update_builder.max_memory(self.max_memory);
+        update_builder.linked_hash_map_size(self.linked_hash_map_size);
+        update_builder.chunk_compression_type(self.chunk_compression_type);
+        update_builder.chunk_fusing_shrink_size(self.chunk_fusing_shrink_size);
+        update_builder
+    }
+
+    fn update_documents(
+        &self,
+        format: String,
+        method: String,
+        content: &[u8],
+        update_builder: UpdateBuilder,
+    ) -> Result<()> {
+        // We must use the write transaction of the update here.
+        let mut wtxn = self.indexes.write_txn()?;
+        let mut builder = update_builder.index_documents(&mut wtxn, &self.indexes);
+
+        match format.as_str() {
+            "csv" => builder.update_format(UpdateFormat::Csv),
+            "json" => builder.update_format(UpdateFormat::Json),
+            "json-stream" => builder.update_format(UpdateFormat::JsonStream),
+            otherwise => panic!("invalid update format {:?}", otherwise),
+        };
+
+        match method.as_str() {
+            "replace" => builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments),
+            "update" => builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments),
+            otherwise => panic!("invalid indexing method {:?}", otherwise),
+        };
+
+        let gzipped = true;
+        let reader = if gzipped {
+            Box::new(GzDecoder::new(content))
+        } else {
+            Box::new(content) as Box<dyn io::Read>
+        };
+
+        let result = builder.execute(reader, |indexing_step, update_id| {
+            let (current, total) = match indexing_step {
+                TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
+                ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+                IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+                MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
+            };
+            let _ = self.update_status_sender.send(UpdateStatus::Progressing {
+                update_id,
+                meta: UpdateMetaProgress::DocumentsAddition {
+                    step: indexing_step.step(),
+                    total_steps: indexing_step.number_of_steps(),
+                    current,
+                    total,
+                }
+            });
+        });
+
+        match result {
+            Ok(()) => wtxn.commit().map_err(Into::into),
+            Err(e) => Err(e.into()),
+        }
+    }
+
+    fn clear_documents(&self, update_builder: UpdateBuilder) -> Result<()> {
+        // We must use the write transaction of the update here.
+        let mut wtxn = self.indexes.write_txn()?;
+        let builder = update_builder.clear_documents(&mut wtxn, &self.indexes);
+
+        match builder.execute() {
+            Ok(_count) => wtxn.commit().map_err(Into::into),
+            Err(e) => Err(e.into()),
+        }
+    }
+
+    fn update_settings(&self, settings: Settings, update_builder: UpdateBuilder) -> Result<()> {
+        // We must use the write transaction of the update here.
+        let mut wtxn = self.indexes.write_txn()?;
+        let mut builder = update_builder.settings(&mut wtxn, &self.indexes);
+
+        // We transpose the settings JSON struct into a real setting update.
+        if let Some(names) = settings.searchable_attributes {
+            match names {
+                Some(names) => builder.set_searchable_fields(names),
+                None => builder.reset_searchable_fields(),
+            }
+        }
+
+        // We transpose the settings JSON struct into a real setting update.
+        if let Some(names) = settings.displayed_attributes {
+            match names {
+                Some(names) => builder.set_displayed_fields(names),
+                None => builder.reset_displayed_fields(),
+            }
+        }
+
+        // We transpose the settings JSON struct into a real setting update.
+        if let Some(facet_types) = settings.faceted_attributes {
+            builder.set_faceted_fields(facet_types);
+        }
+
+        // We transpose the settings JSON struct into a real setting update.
+        if let Some(criteria) = settings.criteria {
+            match criteria {
+                Some(criteria) => builder.set_criteria(criteria),
+                None => builder.reset_criteria(),
+            }
+        }
+
+        let result = builder.execute(|indexing_step, update_id| {
+            let (current, total) = match indexing_step {
+                TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
+                ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+                IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+                MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
+            };
+            let _ = self.update_status_sender.send(UpdateStatus::Progressing {
+                update_id,
+                meta: UpdateMetaProgress::DocumentsAddition {
+                    step: indexing_step.step(),
+                    total_steps: indexing_step.number_of_steps(),
+                    current,
+                    total,
+                }
+            });
+        });
+
+        match result {
+            Ok(_count) => wtxn.commit().map_err(Into::into),
+            Err(e) => Err(e.into()),
+        }
+    }
+
+    fn update_facets(&self, levels: Facets, update_builder: UpdateBuilder) -> Result<()> {
+        // We must use the write transaction of the update here.
+        let mut wtxn = self.indexes.write_txn()?;
+        let mut builder = update_builder.facets(&mut wtxn, &self.indexes);
+
+        if let Some(value) = levels.level_group_size {
+            builder.level_group_size(value);
+        }
+
+        if let Some(value) = levels.min_level_size {
+            builder.min_level_size(value);
+        }
+
+        match builder.execute() {
+            Ok(()) => wtxn.commit().map_err(Into::into),
+            Err(e) => Err(e.into()),
+        }
+    }
+}
+
+impl Handler<UpdateMeta, String> for UpdateHandler {
+    fn handle_update(&mut self, update_id: u64, meta: UpdateMeta, content: &[u8]) -> heed::Result<String> {
+        use UpdateMeta::*;
+
+        let update_builder = self.update_builder(update_id);
+
+        let result: anyhow::Result<()> = match meta {
+            DocumentsAddition { method, format } => {
+                self.update_documents(format, method, content, update_builder)
+            },
+            ClearDocuments => self.clear_documents(update_builder),
+            Settings(settings) => self.update_settings(settings, update_builder),
+            Facets(levels) => self.update_facets(levels, update_builder),
+        };
+
+        let meta = match result {
+            Ok(()) => format!("valid update content"),
+            Err(e) => format!("error while processing update content: {:?}", e),
+        };
+
+        let processed = UpdateStatus::Processed { update_id, meta: meta.clone() };
+        let _ = self.update_status_sender.send(processed);
+
+        Ok(meta)
+    }
+}
+
+impl UpdateQueue {
+    pub fn new(
+        opt: Opt,
+        indexes: Arc<Index>,
+    ) -> Result<Self> {
+        let (sender, _) = broadcast::channel(100);
+        let handler = UpdateHandler::new(&opt.indexer_options, indexes, sender)?;
+        let size = opt.max_udb_size.get_bytes() as usize;
+        let path = opt.db_path.join("updates.mdb");
+        let inner = UpdateStore::open(
+            Some(size),
+            path,
+            handler,
+        )?;
+        Ok(Self { inner })
+    }
+}
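
A minimal sketch of how the refactored queue might be constructed at startup. `open_index` is a hypothetical helper (this commit does not show how the milli `Index` is opened), and the surrounding `main` is an assumption, not part of the diff:

    use std::sync::Arc;
    use structopt::StructOpt;

    fn main() -> anyhow::Result<()> {
        let opt = Opt::from_args();
        // Hypothetical helper: open the milli Index stored under db_path.
        let index = Arc::new(open_index(&opt.db_path)?);
        // Builds the rayon thread pool, the broadcast channel used for
        // progress notifications, and the heed-backed UpdateStore that
        // lives under db_path/updates.mdb.
        let _queue = UpdateQueue::new(opt, index)?;
        Ok(())
    }

Note that `UpdateQueue::new` consumes `Opt` by value, so anything else that needs the options must clone them out first; the receiver side of the broadcast channel is also not exposed yet in this first iteration.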

src/updates/settings.rs

@@ -20,24 +20,24 @@ pub struct Settings {
         deserialize_with = "deserialize_some",
         skip_serializing_if = "Option::is_none",
     )]
-    displayed_attributes: Option<Option<Vec<String>>>,
+    pub displayed_attributes: Option<Option<Vec<String>>>,

     #[serde(
         default,
         deserialize_with = "deserialize_some",
         skip_serializing_if = "Option::is_none",
     )]
-    searchable_attributes: Option<Option<Vec<String>>>,
+    pub searchable_attributes: Option<Option<Vec<String>>>,

     #[serde(default)]
-    faceted_attributes: Option<HashMap<String, String>>,
+    pub faceted_attributes: Option<HashMap<String, String>>,

     #[serde(
         default,
         deserialize_with = "deserialize_some",
         skip_serializing_if = "Option::is_none",
     )]
-    criteria: Option<Option<Vec<String>>>,
+    pub criteria: Option<Option<Vec<String>>>,
 }
@@ -45,7 +45,7 @@ pub struct Settings {
 #[serde(deny_unknown_fields)]
 #[serde(rename_all = "camelCase")]
 pub struct Facets {
-    level_group_size: Option<NonZeroUsize>,
-    min_level_size: Option<NonZeroUsize>,
+    pub level_group_size: Option<NonZeroUsize>,
+    pub min_level_size: Option<NonZeroUsize>,
 }
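
The double `Option` on these fields, combined with `deserialize_some`, is what lets `update_settings` above distinguish three cases per field: absent (leave unchanged), explicit `null` (reset to default), and an explicit value (replace). A sketch of the three cases, assuming `Settings` derives `Deserialize`, is `#[serde(rename_all = "camelCase")]` like `Facets`, and that `serde_json` is available (none of which is shown in this diff):

    let s: Settings = serde_json::from_str("{}").unwrap();
    assert!(s.displayed_attributes.is_none()); // absent: no change requested

    let s: Settings = serde_json::from_str(r#"{ "displayedAttributes": null }"#).unwrap();
    assert!(matches!(s.displayed_attributes, Some(None))); // null: reset to default

    let s: Settings = serde_json::from_str(r#"{ "displayedAttributes": ["title"] }"#).unwrap();
    assert!(matches!(s.displayed_attributes, Some(Some(_)))); // value: replace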