WIP: refactor IndexController

change the architecture of the index controller to allow it to own an index store.
2025-07-04 20:37:15 +02:00 · 2021-01-16 15:09:48 +01:00 · 2021-01-16 15:09:48 +01:00 · 6a3f625e11
commit 6a3f625e11
parent 686f987180
15 changed files with 1197 additions and 287 deletions
--- a/src/option.rs
+++ b/src/option.rs
@ -9,10 +9,60 @@ use rustls::{
    AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, NoClientAuth,
    RootCertStore,
 };
+use grenad::CompressionType;
 use structopt::StructOpt;

-use crate::updates::IndexerOpts;
+#[derive(Debug, Clone, StructOpt)]
+pub struct IndexerOpts {
+    /// The number of documents to skip before printing
+    /// a log message about the indexing progress.
+    #[structopt(long, default_value = "100000")] // 100k
+    pub log_every_n: usize,

+    /// MTBL max number of chunks in bytes.
+    #[structopt(long)]
+    pub max_nb_chunks: Option<usize>,
+
+    /// The maximum amount of memory to use for the MTBL buffer. It is recommended
+    /// to use something like 80%-90% of the available memory.
+    ///
+    /// It is automatically split by the number of jobs, e.g. if you use 7 jobs
+    /// and 7 GB of max memory, each thread will use a maximum of 1 GB.
+    #[structopt(long, default_value = "7 GiB")]
+    pub max_memory: Byte,
+
+    /// Size of the linked hash map cache when indexing.
+    /// The bigger it is, the faster the indexing is, but the more memory it takes.
+    #[structopt(long, default_value = "500")]
+    pub linked_hash_map_size: usize,
+
+    /// The name of the compression algorithm to use when compressing intermediate
+    /// chunks while indexing documents.
+    ///
+    /// Choosing a fast algorithm will make the indexing faster but may consume more memory.
+    #[structopt(long, default_value = "snappy", possible_values = &["snappy", "zlib", "lz4", "lz4hc", "zstd"])]
+    pub chunk_compression_type: CompressionType,
+
+    /// The compression level of the chosen algorithm.
+    #[structopt(long, requires = "chunk-compression-type")]
+    pub chunk_compression_level: Option<u32>,
+
+    /// The number of bytes to remove from the beginning of the chunks while reading/sorting
+    /// or merging them.
+    ///
+    /// File fusing must only be enabled on file systems that support `FALLOC_FL_COLLAPSE_RANGE`
+    /// (i.e. ext4 and XFS). File fusing will only work if the `enable-chunk-fusing` flag is set.
+    #[structopt(long, default_value = "4 GiB")]
+    pub chunk_fusing_shrink_size: Byte,
+
+    /// Whether to enable chunk fusing; this reduces the amount of disk used by a factor of 2.
+    #[structopt(long)]
+    pub enable_chunk_fusing: bool,
+
+    /// Number of parallel jobs for indexing, defaults to the number of CPUs.
+    #[structopt(long)]
+    pub indexing_jobs: Option<usize>,
+}
 const POSSIBLE_ENV: [&str; 2] = ["development", "production"]; // names of the two possible environments

 #[derive(Debug, Clone, StructOpt)]