Introduce the Transform type into the indexing system

Clément Renault 2020-10-24 16:23:08 +02:00
parent b44b04d25b
commit a7a4984175
8 changed files with 173 additions and 251 deletions
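
The change replaces the one-shot indexer pipeline with a Transform type that prepares documents before they reach the indexing system: it assigns internal document ids, matches user-provided ids against the existing users_ids_documents_ids map so that replaced documents can be detected, and writes the transformed documents to a temporary file that indexing::run can mmap and consume. Below is a minimal sketch of the type's shape as it can be inferred from the call sites in this diff; the concrete field types, the crate path of FieldsIdsMap, and the from_csv signature are assumptions, not the committed code.

use std::borrow::Cow;
use std::fs::File;
use std::io;

use grenad::CompressionType;
use roaring::RoaringBitmap;

use crate::{AvailableDocumentsIds, FieldsIdsMap}; // the FieldsIdsMap path is assumed

// Shape inferred from the construction site in serve.rs below;
// the exact field types are assumptions.
pub struct Transform<'a> {
    pub fields_ids_map: FieldsIdsMap,
    pub available_documents_ids: AvailableDocumentsIds,
    pub users_ids_documents_ids: fst::Map<Cow<'a, [u8]>>,
    pub chunk_compression_type: CompressionType,
    pub chunk_compression_level: Option<u32>,
    pub chunk_fusing_shrink_size: Option<u64>,
    pub max_nb_chunks: Option<usize>,
    pub max_memory: Option<usize>,
}

// Shape inferred from the destructuring site in serve.rs below.
pub struct TransformOutput {
    pub fields_ids_map: FieldsIdsMap,
    pub users_ids_documents_ids: fst::Map<Vec<u8>>,
    pub new_documents_ids: RoaringBitmap,
    pub replaced_documents_ids: RoaringBitmap,
    pub documents_count: usize,
    pub documents_file: File,
}

impl Transform<'_> {
    // Reads CSV documents, assigns each one an internal id and writes them
    // to a temporary documents file, ready to be mmapped and indexed.
    pub fn from_csv<R: io::Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
        let _ = reader;
        todo!("sketch only")
    }
}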

@@ -1,67 +0,0 @@
-use std::fs::File;
-use std::path::PathBuf;
-
-use anyhow::bail;
-use heed::EnvOpenOptions;
-use structopt::StructOpt;
-
-use crate::indexing::{self, IndexerOpt};
-use crate::Index;
-
-#[derive(Debug, StructOpt)]
-#[structopt(name = "milli-indexer")]
-/// The indexer binary of the milli project.
-pub struct Opt {
-    /// The database path where the database is located.
-    /// It is created if it doesn't already exist.
-    #[structopt(long = "db", parse(from_os_str))]
-    database: PathBuf,
-
-    /// The maximum size the database can take on disk. It is recommended to specify
-    /// the whole disk space (value must be a multiple of a page size).
-    #[structopt(long = "db-size", default_value = "107374182400")] // 100 GB
-    database_size: usize,
-
-    #[structopt(flatten)]
-    indexer: IndexerOpt,
-
-    /// Verbose mode (-v, -vv, -vvv, etc.)
-    #[structopt(short, long, parse(from_occurrences))]
-    verbose: usize,
-
-    /// CSV file to index, if unspecified the CSV is read from standard input.
-    ///
-    /// You can also provide a ".gz" or ".gzip" CSV file, the indexer will figure out
-    /// how to decode and read it.
-    ///
-    /// Note that it is much faster to index from a file as when the indexer reads from stdin
-    /// it will dedicate a thread for that and context switches could slow down the indexing jobs.
-    csv_file: Option<PathBuf>,
-}
-
-pub fn run(opt: Opt) -> anyhow::Result<()> {
-    stderrlog::new()
-        .verbosity(opt.verbose)
-        .show_level(false)
-        .timestamp(stderrlog::Timestamp::Off)
-        .init()?;
-
-    if opt.database.exists() {
-        bail!("Database ({}) already exists, delete it to continue.", opt.database.display());
-    }
-
-    std::fs::create_dir_all(&opt.database)?;
-    let env = EnvOpenOptions::new()
-        .map_size(opt.database_size)
-        .max_dbs(10)
-        .open(&opt.database)?;
-
-    let index = Index::new(&env)?;
-
-    let file_path = opt.csv_file.unwrap();
-    let gzipped = file_path.extension().map_or(false, |e| e == "gz" || e == "gzip");
-    let file = File::open(file_path)?;
-    let content = unsafe { memmap::Mmap::map(&file)? };
-
-    indexing::run(&env, &index, &opt.indexer, &content, gzipped, |_docid| { })
-}

@@ -1,4 +1,3 @@
-pub mod indexer;
 pub mod infos;
 pub mod search;
 pub mod serve;

@@ -1,6 +1,7 @@
+use std::borrow::Cow;
 use std::collections::HashSet;
 use std::fs::{File, create_dir_all};
-use std::mem;
+use std::{mem, io};
 use std::net::SocketAddr;
 use std::path::PathBuf;
 use std::str::FromStr;
@@ -8,6 +9,7 @@ use std::sync::Arc;
 use std::time::Instant;

 use askama_warp::Template;
+use flate2::read::GzDecoder;
 use futures::stream;
 use futures::{FutureExt, StreamExt};
 use heed::EnvOpenOptions;
@@ -20,9 +22,9 @@ use tokio::sync::broadcast;
 use warp::filters::ws::Message;
 use warp::{Filter, http::Response};

-use crate::indexing::{self, IndexerOpt};
+use crate::indexing::{self, IndexerOpt, Transform, TransformOutput};
 use crate::tokenizer::{simple_tokenizer, TokenType};
-use crate::{Index, UpdateStore, SearchResult};
+use crate::{Index, UpdateStore, SearchResult, AvailableDocumentsIds};

 #[derive(Debug, StructOpt)]
 /// The HTTP main server of the milli project.
@@ -103,9 +105,7 @@ enum UpdateStatus<M, P, N> {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "type")]
 enum UpdateMeta {
-    DocumentsAddition {
-        total_number_of_documents: Option<usize>,
-    },
+    DocumentsAddition,
     DocumentsAdditionFromPath {
         path: PathBuf,
     },
@@ -153,19 +153,63 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
         update_store_path,
         move |update_id, meta, content| {
             let result = match meta {
-                UpdateMeta::DocumentsAddition { total_number_of_documents } => {
+                UpdateMeta::DocumentsAddition => {
+                    // We must use the write transaction of the update here.
+                    let rtxn = env_cloned.read_txn()?;
+                    let fields_ids_map = index_cloned.fields_ids_map(&rtxn)?.unwrap_or_default();
+                    let documents_ids = index_cloned.documents_ids(&rtxn)?.unwrap_or_default();
+                    let available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
+                    let users_ids_documents_ids = match index_cloned.users_ids_documents_ids(&rtxn).unwrap() {
+                        Some(map) => map.map_data(Cow::Borrowed).unwrap(),
+                        None => fst::Map::default().map_data(Cow::Owned).unwrap(),
+                    };
+
+                    let transform = Transform {
+                        fields_ids_map,
+                        available_documents_ids,
+                        users_ids_documents_ids,
+                        chunk_compression_type: indexer_opt_cloned.chunk_compression_type,
+                        chunk_compression_level: indexer_opt_cloned.chunk_compression_level,
+                        chunk_fusing_shrink_size: Some(indexer_opt_cloned.chunk_fusing_shrink_size),
+                        max_nb_chunks: indexer_opt_cloned.max_nb_chunks,
+                        max_memory: Some(indexer_opt_cloned.max_memory),
+                    };
+
+                    let gzipped = false;
+                    let reader = if gzipped {
+                        Box::new(GzDecoder::new(content))
+                    } else {
+                        Box::new(content) as Box<dyn io::Read>
+                    };
+
+                    let TransformOutput {
+                        fields_ids_map,
+                        users_ids_documents_ids,
+                        new_documents_ids,
+                        replaced_documents_ids,
+                        documents_count,
+                        documents_file,
+                    } = transform.from_csv(reader).unwrap();
+
+                    drop(rtxn);
+
+                    let mmap = unsafe { memmap::Mmap::map(&documents_file)? };
+                    let documents = grenad::Reader::new(mmap.as_ref()).unwrap();
+
                     indexing::run(
                         &env_cloned,
                         &index_cloned,
                         &indexer_opt_cloned,
-                        content,
-                        gzipped,
-                        |count| {
+                        fields_ids_map,
+                        users_ids_documents_ids,
+                        new_documents_ids,
+                        documents,
+                        documents_count as u32,
+                        |count, total| {
                             // We send progress status...
                             let meta = UpdateMetaProgress::DocumentsAddition {
                                 processed_number_of_documents: count as usize,
-                                total_number_of_documents,
+                                total_number_of_documents: Some(total as usize),
                             };
                             let progress = UpdateStatus::Progressing { update_id, meta };
                             let _ = update_status_sender_cloned.send(progress);
@@ -173,38 +217,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
                     )
                 },
                 UpdateMeta::DocumentsAdditionFromPath { path } => {
-                    let file = match File::open(&path) {
-                        Ok(file) => file,
-                        Err(e) => {
-                            let meta = format!("documents addition file ({}) error: {}", path.display(), e);
-                            return Ok(meta);
-                        }
-                    };
-
-                    let content = match unsafe { memmap::Mmap::map(&file) } {
-                        Ok(mmap) => mmap,
-                        Err(e) => {
-                            let meta = format!("documents addition file ({}) mmap error: {}", path.display(), e);
-                            return Ok(meta);
-                        },
-                    };
-
-                    let gzipped = path.extension().map_or(false, |e| e == "gz" || e == "gzip");
-                    indexing::run(
-                        &env_cloned,
-                        &index_cloned,
-                        &indexer_opt_cloned,
-                        &content,
-                        gzipped,
-                        |count| {
-                            // We send progress status...
-                            let meta = UpdateMetaProgress::DocumentsAddition {
-                                processed_number_of_documents: count as usize,
-                                total_number_of_documents: None,
-                            };
-                            let progress = UpdateStatus::Progressing { update_id, meta };
-                            let _ = update_status_sender_cloned.send(progress);
-                        },
-                    )
+                    todo!()
                 }
             };
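
The DocumentsAdditionFromPath arm is stubbed out with todo!() for now. A plausible way to fill it later is to reuse the extension-based gzip detection from the deleted code and feed the resulting reader through the same Transform pipeline as the in-memory arm; a sketch under that assumption, where open_documents_reader is a hypothetical helper and not part of this commit:

use std::fs::File;
use std::io;
use std::path::Path;

use flate2::read::GzDecoder;

// Open a documents file as a plain io::Read, transparently decoding ".gz"
// and ".gzip" files; the extension check mirrors the deleted code above.
fn open_documents_reader(path: &Path) -> anyhow::Result<Box<dyn io::Read>> {
    let gzipped = path.extension().map_or(false, |e| e == "gz" || e == "gzip");
    let file = File::open(path)?;
    if gzipped {
        Ok(Box::new(GzDecoder::new(file)))
    } else {
        Ok(Box::new(file))
    }
}
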
@@ -388,7 +401,8 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
             let mut record = record.iter()
                 .map(|(key_id, value)| {
                     let key = fields_ids_map.name(key_id).unwrap().to_owned();
-                    let value = std::str::from_utf8(value).unwrap().to_owned();
+                    // TODO we must deserialize a Json Value and highlight it.
+                    let value = serde_json::from_slice(value).unwrap();
                     (key, value)
                 })
                 .collect();
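
The TODO in this hunk records that stored values are now arbitrary JSON rather than plain UTF-8 strings, so the highlighter has to walk a deserialized serde_json::Value. A minimal sketch of that recursion; highlight_str is a hypothetical stand-in for the token-based highlighting the server applies to query words:

use serde_json::Value;

// Recursively apply a string highlighter to every string nested in a JSON
// value; numbers, booleans and nulls contain no text and are returned as-is.
fn highlight_value(value: Value, highlight_str: &impl Fn(&str) -> String) -> Value {
    match value {
        Value::String(s) => Value::String(highlight_str(&s)),
        Value::Array(values) => {
            Value::Array(values.into_iter().map(|v| highlight_value(v, highlight_str)).collect())
        },
        Value::Object(object) => {
            Value::Object(object.into_iter().map(|(k, v)| (k, highlight_value(v, highlight_str))).collect())
        },
        otherwise => otherwise,
    }
}
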
@@ -423,7 +437,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
             let file = file.into_std().await;
             let mmap = unsafe { memmap::Mmap::map(&file).unwrap() };

-            let meta = UpdateMeta::DocumentsAddition { total_number_of_documents: None };
+            let meta = UpdateMeta::DocumentsAddition;
             let update_id = update_store.register_update(&meta, &mmap[..]).unwrap();
             let _ = update_status_sender.send(UpdateStatus::Pending { update_id, meta });
             eprintln!("update {} registered", update_id);
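
For context on the DocumentsAddition arm above, AvailableDocumentsIds::from_documents_ids is what lets the Transform hand out fresh internal ids to new documents. The idea, sketched as a plain function over a roaring bitmap; this is an illustration of the concept, not the committed implementation:

use roaring::RoaringBitmap;

// Yield every internal id that is not already taken by an existing document.
fn available_documents_ids(documents_ids: &RoaringBitmap) -> impl Iterator<Item = u32> + '_ {
    (0..=u32::MAX).filter(move |id| !documents_ids.contains(*id))
}

fn main() {
    let mut documents_ids = RoaringBitmap::new();
    documents_ids.insert(0);
    documents_ids.insert(2);

    let mut fresh = available_documents_ids(&documents_ids);
    assert_eq!(fresh.next(), Some(1)); // 0 and 2 are taken, 1 is free
    assert_eq!(fresh.next(), Some(3));
}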