diff --git a/meilisearch-http/src/index_controller/dump_actor/loaders/v1.rs b/meilisearch-http/src/index_controller/dump_actor/loaders/v1.rs
index 76207ff7b..471e66d17 100644
--- a/meilisearch-http/src/index_controller/dump_actor/loaders/v1.rs
+++ b/meilisearch-http/src/index_controller/dump_actor/loaders/v1.rs
@@ -1,137 +1,178 @@
-use std::path::Path;
+use std::{
+    collections::{BTreeMap, BTreeSet},
+    fs::File,
+    marker::PhantomData,
+    path::Path,
+    sync::Arc,
+};
 
+use heed::EnvOpenOptions;
+use log::{error, info, warn};
+use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
 use serde::{Deserialize, Serialize};
+use uuid::Uuid;
 
-use crate::index_controller::IndexMetadata;
+use crate::{index::deserialize_some, index_controller::uuid_resolver::HeedUuidStore};
+use crate::{
+    index::{Index, Unchecked},
+    index_controller::{self, IndexMetadata},
+};
 
 #[derive(Serialize, Deserialize, Debug)]
+#[serde(rename_all = "camelCase")]
 pub struct MetadataV1 {
     db_version: String,
     indexes: Vec<IndexMetadata>,
 }
 
 impl MetadataV1 {
-    pub fn load_dump(self, _src: impl AsRef<Path>, _dst: impl AsRef<Path>) -> anyhow::Result<()> {
-        todo!("implement load v1")
-    }
+    pub fn load_dump(
+        self,
+        src: impl AsRef<Path>,
+        dst: impl AsRef<Path>,
+        size: usize,
+    ) -> anyhow::Result<()> {
+        info!(
+            "Loading dump, dump database version: {}, dump version: V1",
+            self.db_version
+        );
+
+        let uuid_store = HeedUuidStore::new(&dst)?;
+        for index in self.indexes {
+            let uuid = Uuid::new_v4();
+            uuid_store.insert(index.uid.clone(), uuid)?;
+            let src = src.as_ref().join(index.uid);
+            load_index(&src, &dst, uuid, index.meta.primary_key.as_deref(), size)?;
+        }
+
+        Ok(())
+    }
 }
 
-// This is the settings used in the last version of meilisearch exporting dump in V1
-//#[derive(Default, Clone, Serialize, Deserialize, Debug)]
-//#[serde(rename_all = "camelCase", deny_unknown_fields)]
-//struct Settings {
-    //#[serde(default, deserialize_with = "deserialize_some")]
-    //pub ranking_rules: Option<Option<Vec<String>>>,
-    //#[serde(default, deserialize_with = "deserialize_some")]
-    //pub distinct_attribute: Option<Option<String>>,
-    //#[serde(default, deserialize_with = "deserialize_some")]
-    //pub searchable_attributes: Option<Option<Vec<String>>>,
-    //#[serde(default, deserialize_with = "deserialize_some")]
-    //pub displayed_attributes: Option<Option<Vec<String>>>,
-    //#[serde(default, deserialize_with = "deserialize_some")]
-    //pub stop_words: Option<Option<BTreeSet<String>>>,
-    //#[serde(default, deserialize_with = "deserialize_some")]
-    //pub synonyms: Option<Option<BTreeMap<String, Vec<String>>>>,
-    //#[serde(default, deserialize_with = "deserialize_some")]
-    //pub attributes_for_faceting: Option<Option<Vec<String>>>,
-//}
+// These are the settings used in the last version of meilisearch exporting dumps in V1
+#[derive(Default, Clone, Serialize, Deserialize, Debug)]
+#[serde(rename_all = "camelCase", deny_unknown_fields)]
+struct Settings {
+    #[serde(default, deserialize_with = "deserialize_some")]
+    pub ranking_rules: Option<Option<Vec<String>>>,
+    #[serde(default, deserialize_with = "deserialize_some")]
+    pub distinct_attribute: Option<Option<String>>,
+    #[serde(default, deserialize_with = "deserialize_some")]
+    pub searchable_attributes: Option<Option<Vec<String>>>,
+    #[serde(default, deserialize_with = "deserialize_some")]
+    pub displayed_attributes: Option<Option<Vec<String>>>,
+    #[serde(default, deserialize_with = "deserialize_some")]
+    pub stop_words: Option<Option<BTreeSet<String>>>,
+    #[serde(default, deserialize_with = "deserialize_some")]
+    pub synonyms: Option<Option<BTreeMap<String, Vec<String>>>>,
+    #[serde(default, deserialize_with = "deserialize_some")]
+    pub attributes_for_faceting: Option<Option<Vec<String>>>,
+}
 
-///// we need to **always** be able to convert the old settings to the settings currently being used
-//impl From<Settings> for index_controller::Settings<Unchecked> {
-    //fn from(settings: Settings) -> Self {
-        //if settings.synonyms.flatten().is_some() {
-            //error!("`synonyms` are not yet implemented and thus will be ignored");
-        //}
-        //Self {
-            //distinct_attribute: settings.distinct_attribute,
-            //// we need to convert the old `Vec<String>` into a `BTreeSet<String>`
-            //displayed_attributes: settings.displayed_attributes.map(|o| o.map(|vec| vec.into_iter().collect())),
-            //searchable_attributes: settings.searchable_attributes,
-            //// we previously had a `Vec<String>` but now we have a `HashMap<String, String>`
-            //// representing the name of the faceted field + the type of the field. Since the type
-            //// was not known in the V1 of the dump we are just going to assume everything is a
-            //// String
-            //attributes_for_faceting: settings.attributes_for_faceting.map(|o| o.map(|vec| vec.into_iter().map(|key| (key, String::from("string"))).collect())),
-            //// we need to convert the old `Vec<String>` into a `BTreeSet<String>`
-            //ranking_rules: settings.ranking_rules.map(|o| o.map(|vec| vec.into_iter().filter_map(|criterion| {
-                //match criterion.as_str() {
-                    //"words" | "typo" | "proximity" | "attribute" => Some(criterion),
-                    //s if s.starts_with("asc") || s.starts_with("desc") => Some(criterion),
-                    //"wordsPosition" => {
-                        //warn!("The criteria `words` and `wordsPosition` have been merged into a single criterion `words` so `wordsPositon` will be ignored");
-                        //Some(String::from("words"))
-                    //}
-                    //"exactness" => {
-                        //error!("The criterion `{}` is not implemented currently and thus will be ignored", criterion);
-                        //None
-                    //}
-                    //s => {
-                        //error!("Unknown criterion found in the dump: `{}`, it will be ignored", s);
-                        //None
-                    //}
-                //}
-            //}).collect())),
-            //// we need to convert the old `Vec<String>` into a `BTreeSet<String>`
-            //stop_words: settings.stop_words.map(|o| o.map(|vec| vec.into_iter().collect())),
-            //_kind: PhantomData,
-        //}
-    //}
-//}
+impl std::ops::Deref for Settings {
+    type Target = Option<Option<BTreeSet<String>>>;
 
-///// Extract Settings from `settings.json` file present at provided `dir_path`
-//fn import_settings(dir_path: &Path) -> anyhow::Result<Settings> {
-    //let path = dir_path.join("settings.json");
-    //let file = File::open(path)?;
-    //let reader = std::io::BufReader::new(file);
-    //let metadata = serde_json::from_reader(reader)?;
+    fn deref(&self) -> &Self::Target {
+        &self.stop_words
+    }
+}
 
-    //Ok(metadata)
-//}
+fn load_index(
+    src: impl AsRef<Path>,
+    dst: impl AsRef<Path>,
+    uuid: Uuid,
+    primary_key: Option<&str>,
+    size: usize,
+) -> anyhow::Result<()> {
+    let index_path = dst.as_ref().join(&format!("indexes/index-{}", uuid));
 
-//pub fn import_dump(
-    //size: usize,
-    //uuid: Uuid,
-    //dump_path: &Path,
-    //db_path: &Path,
-    //primary_key: Option<&str>,
-//) -> anyhow::Result<()> {
-    //let index_path = db_path.join(&format!("indexes/index-{}", uuid));
-    //info!("Importing a dump from an old version of meilisearch with dump version 1");
+    std::fs::create_dir_all(&index_path)?;
+    let mut options = EnvOpenOptions::new();
+    options.map_size(size);
+    let index = milli::Index::new(options, index_path)?;
+    let index = Index(Arc::new(index));
 
-    //std::fs::create_dir_all(&index_path)?;
-    //let mut options = EnvOpenOptions::new();
-    //options.map_size(size);
-    //let index = milli::Index::new(options, index_path)?;
-    //let index = Index(Arc::new(index));
+    // extract `settings.json` file and import content
+    let settings = import_settings(&src)?;
+    let settings: index_controller::Settings<Unchecked> = settings.into();
+    let update_builder = UpdateBuilder::new(0);
+    index.update_settings(&settings.check(), update_builder)?;
 
-    //// extract `settings.json` file and import content
-    //let settings = import_settings(&dump_path)?;
-    //let settings: index_controller::Settings<Unchecked> = settings.into();
-    //let update_builder = UpdateBuilder::new(0);
-    //index.update_settings(&settings.check(), update_builder)?;
+    let update_builder = UpdateBuilder::new(0);
+    let file = File::open(&src.as_ref().join("documents.jsonl"))?;
+    let reader = std::io::BufReader::new(file);
 
-    //let update_builder = UpdateBuilder::new(1);
-    //let file = File::open(&dump_path.join("documents.jsonl"))?;
-    //let reader = std::io::BufReader::new(file);
+    index.update_documents(
+        UpdateFormat::JsonStream,
+        IndexDocumentsMethod::ReplaceDocuments,
+        Some(reader),
+        update_builder,
+        primary_key,
+    )?;
 
-    //// TODO: TAMO: waiting for milli. We should use the result
-    //let _ = index.update_documents(
        //UpdateFormat::JsonStream,
        //IndexDocumentsMethod::ReplaceDocuments,
        //Some(reader),
        //update_builder,
        //primary_key,
    //);
+    // the last step: we extract the original milli::Index and close it
+    Arc::try_unwrap(index.0)
+        .map_err(|_e| "[dumps] At this point no one is supposed to have a reference on the index")
+        .unwrap()
+        .prepare_for_closing()
+        .wait();
 
-    //// the last step: we extract the original milli::Index and close it
-    //Arc::try_unwrap(index.0)
        //.map_err(|_e| "[dumps] At this point no one is supposed to have a reference on the index")
        //.unwrap()
        //.prepare_for_closing()
        //.wait();
+    // Ignore updates in v1.
 
-    //// at this point we should handle the import of the updates, but since the update logic is not handled in
-    //// meilisearch we are just going to ignore this part
+    Ok(())
+}
 
-    //Ok(())
-//}
+/// We need to **always** be able to convert the old settings to the settings currently being used
+impl From<Settings> for index_controller::Settings<Unchecked> {
+    fn from(settings: Settings) -> Self {
+        if settings.synonyms.flatten().is_some() {
+            error!("`synonyms` are not yet implemented and thus will be ignored");
+        }
+        Self {
+            distinct_attribute: settings.distinct_attribute,
+            // we need to convert the old `Vec<String>` into a `BTreeSet<String>`
+            displayed_attributes: settings.displayed_attributes.map(|o| o.map(|vec| vec.into_iter().collect())),
+            searchable_attributes: settings.searchable_attributes,
+            // we previously had a `Vec<String>` but now we have a `HashMap<String, String>`
+            // representing the name of the faceted field + the type of the field. Since the type
+            // was not known in the V1 of the dump we are just going to assume everything is a
+            // String
+            attributes_for_faceting: settings.attributes_for_faceting.map(|o| o.map(|vec| vec.into_iter().map(|key| (key, String::from("string"))).collect())),
+            // we need to convert the old `Vec<String>` into a `BTreeSet<String>`
+            ranking_rules: settings.ranking_rules.map(|o| o.map(|vec| vec.into_iter().filter_map(|criterion| {
+                match criterion.as_str() {
+                    "words" | "typo" | "proximity" | "attribute" => Some(criterion),
+                    s if s.starts_with("asc") || s.starts_with("desc") => Some(criterion),
+                    "wordsPosition" => {
+                        warn!("The criteria `words` and `wordsPosition` have been merged into a single criterion `words` so `wordsPosition` will be ignored");
+                        Some(String::from("words"))
+                    }
+                    "exactness" => {
+                        error!("The criterion `{}` is not implemented currently and thus will be ignored", criterion);
+                        None
+                    }
+                    s => {
+                        error!("Unknown criterion found in the dump: `{}`, it will be ignored", s);
+                        None
+                    }
+                }
+            }).collect())),
+            // we need to convert the old `Vec<String>` into a `BTreeSet<String>`
+            stop_words: settings.stop_words.map(|o| o.map(|vec| vec.into_iter().collect())),
+            _kind: PhantomData,
+        }
+    }
+}
+
+/// Extract Settings from the `settings.json` file present at the provided `dir_path`
+fn import_settings(dir_path: impl AsRef<Path>) -> anyhow::Result<Settings> {
+    let path = dir_path.as_ref().join("settings.json");
+    let file = File::open(path)?;
+    let reader = std::io::BufReader::new(file);
+    let metadata = serde_json::from_reader(reader)?;
+
+    Ok(metadata)
+}
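The heart of the v1 migration is the ranking-rule mapping inside the `From<Settings>` impl above. Below is a condensed, standalone sketch of that mapping for illustration: `convert_ranking_rule` is a hypothetical helper name, `eprintln!` stands in for the `log` macros, and the separate `exactness`/unknown arms of the real code are merged into one.

```rust
// Sketch of the v1 -> current ranking-rule conversion (not part of the patch).
fn convert_ranking_rule(criterion: String) -> Option<String> {
    match criterion.as_str() {
        // these rules exist in both versions and carry over unchanged
        "words" | "typo" | "proximity" | "attribute" => Some(criterion),
        // custom sort rules like `asc(price)` / `desc(rank)` also carry over
        s if s.starts_with("asc") || s.starts_with("desc") => Some(criterion),
        // `wordsPosition` was folded into `words`, so it is rewritten
        "wordsPosition" => Some(String::from("words")),
        // `exactness` and anything unrecognized is dropped
        other => {
            eprintln!("ignoring unsupported criterion `{}`", other);
            None
        }
    }
}

fn main() {
    let v1_rules = ["typo", "wordsPosition", "exactness", "asc(price)"];
    let converted: Vec<String> = v1_rules
        .into_iter()
        .map(String::from)
        .filter_map(convert_ranking_rule)
        .collect();
    // prints ["typo", "words", "asc(price)"]
    println!("{:?}", converted);
}
```

`wordsPosition` is the only rule that is rewritten rather than kept or dropped, which is why the original loop uses `filter_map` instead of `filter`.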
diff --git a/meilisearch-http/src/index_controller/dump_actor/loaders/v2.rs b/meilisearch-http/src/index_controller/dump_actor/loaders/v2.rs
index def47fecb..c0fe0abe6 100644
--- a/meilisearch-http/src/index_controller/dump_actor/loaders/v2.rs
+++ b/meilisearch-http/src/index_controller/dump_actor/loaders/v2.rs
@@ -1,13 +1,13 @@
 use std::path::Path;
 
-use anyhow::Context;
 use chrono::{DateTime, Utc};
-use log::{info, warn};
+use log::info;
 use serde::{Deserialize, Serialize};
 
 use crate::{index::Index, index_controller::{update_actor::UpdateStore, uuid_resolver::HeedUuidStore}, option::IndexerOpts};
 
 #[derive(Serialize, Deserialize, Debug)]
+#[serde(rename_all = "camelCase")]
 pub struct MetadataV2 {
     db_version: String,
     index_db_size: u64,
@@ -29,6 +29,7 @@ impl MetadataV2 {
         self,
         src: impl AsRef<Path>,
         dst: impl AsRef<Path>,
+        // TODO: use these variables to test if loading the index is possible.
         _index_db_size: u64,
         _update_db_size: u64,
         indexing_options: &IndexerOpts,
@@ -37,37 +38,21 @@ impl MetadataV2 {
             "Loading dump from {}, dump database version: {}, dump version: V2",
             self.dump_date, self.db_version
         );
-        // get dir in which to load the db:
-        let dst_dir = dst
-            .as_ref()
-            .parent()
-            .with_context(|| format!("Invalid db path: {}", dst.as_ref().display()))?;
-
-        let tmp_dst = tempfile::tempdir_in(dst_dir)?;
 
         info!("Loading index database.");
-        HeedUuidStore::load_dump(src.as_ref(), &tmp_dst)?;
+        HeedUuidStore::load_dump(src.as_ref(), &dst)?;
 
         info!("Loading updates.");
-        UpdateStore::load_dump(&src, &tmp_dst, self.update_db_size)?;
+        UpdateStore::load_dump(&src, &dst, self.update_db_size)?;
 
         info!("Loading indexes");
         let indexes_path = src.as_ref().join("indexes");
         let indexes = indexes_path.read_dir()?;
         for index in indexes {
             let index = index?;
-            Index::load_dump(&index.path(), &tmp_dst, self.index_db_size, indexing_options)?;
+            Index::load_dump(&index.path(), &dst, self.index_db_size, indexing_options)?;
         }
 
-        // Persist and atomically rename the db
-        let persisted_dump = tmp_dst.into_path();
-        if dst.as_ref().exists() {
-            warn!("Overwriting database at {}", dst.as_ref().display());
-            std::fs::remove_dir_all(&dst)?;
-        }
-
-        std::fs::rename(&persisted_dump, &dst)?;
-
         Ok(())
     }
 }
diff --git a/meilisearch-http/src/index_controller/dump_actor/mod.rs b/meilisearch-http/src/index_controller/dump_actor/mod.rs
index b236122bd..e1998f876 100644
--- a/meilisearch-http/src/index_controller/dump_actor/mod.rs
+++ b/meilisearch-http/src/index_controller/dump_actor/mod.rs
@@ -2,11 +2,12 @@ use std::fs::File;
 use std::path::{Path, PathBuf};
 
 use chrono::{DateTime, Utc};
-use log::{error, info};
+use log::{error, info, warn};
 #[cfg(test)]
 use mockall::automock;
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
+use anyhow::Context;
 
 use loaders::v1::MetadataV1;
 use loaders::v2::MetadataV2;
@@ -53,22 +54,16 @@ pub trait DumpActorHandle {
 }
 
 #[derive(Debug, Serialize, Deserialize)]
-#[serde(rename_all = "camelCase", tag = "dump_version")]
+#[serde(tag = "dumpVersion")]
 pub enum Metadata {
-    V1 {
-        #[serde(flatten)]
-        meta: MetadataV1,
-    },
-    V2 {
-        #[serde(flatten)]
-        meta: MetadataV2,
-    },
+    V1(MetadataV1),
+    V2(MetadataV2),
 }
 
 impl Metadata {
     pub fn new_v2(index_db_size: u64, update_db_size: u64) -> Self {
         let meta = MetadataV2::new(index_db_size, update_db_size);
-        Self::V2 { meta }
+        Self::V2(meta)
     }
 }
@@ -135,16 +130,31 @@ pub fn load_dump(
     let mut meta_file = File::open(&meta_path)?;
     let meta: Metadata = serde_json::from_reader(&mut meta_file)?;
 
+    let dst_dir = dst_path
+        .as_ref()
+        .parent()
+        .with_context(|| format!("Invalid db path: {}", dst_path.as_ref().display()))?;
+
+    let tmp_dst = tempfile::tempdir_in(dst_dir)?;
+
     match meta {
-        Metadata::V1 { meta } => meta.load_dump(&tmp_src_path, dst_path)?,
-        Metadata::V2 { meta } => meta.load_dump(
+        Metadata::V1(meta) => meta.load_dump(&tmp_src_path, tmp_dst.path(), index_db_size as usize)?,
+        Metadata::V2(meta) => meta.load_dump(
            &tmp_src_path,
-            dst_path.as_ref(),
+            tmp_dst.path(),
            index_db_size,
            update_db_size,
            indexer_opts,
        )?,
     }
 
+    // Persist and atomically rename the db
+    let persisted_dump = tmp_dst.into_path();
+    if dst_path.as_ref().exists() {
+        warn!("Overwriting database at {}", dst_path.as_ref().display());
+        std::fs::remove_dir_all(&dst_path)?;
+    }
+
+    std::fs::rename(&persisted_dump, &dst_path)?;
+
     Ok(())
 }
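The tempdir-then-rename dance that used to live in the V2 loader now happens once in `load_dump`, so V1 and V2 dumps share the same crash-safety story. A minimal sketch of the pattern follows; `load_into`, `populate`, and the paths are hypothetical names standing in for `load_dump` and the version-specific loaders.

```rust
use std::{fs, path::Path};

use anyhow::Context;

// Build the new database in a temp directory next to the destination,
// then swap it into place with a single rename.
fn load_into(dst: &Path, populate: impl Fn(&Path) -> anyhow::Result<()>) -> anyhow::Result<()> {
    let dst_dir = dst
        .parent()
        .with_context(|| format!("Invalid db path: {}", dst.display()))?;

    // Creating the tempdir in the destination's parent keeps it on the same
    // filesystem as `dst`, so the final rename is a cheap metadata operation.
    let tmp_dst = tempfile::tempdir_in(dst_dir)?;
    populate(tmp_dst.path())?;

    // Persist the tempdir (opt out of auto-deletion), then move it into place.
    let persisted = tmp_dst.into_path();
    if dst.exists() {
        fs::remove_dir_all(dst)?;
    }
    fs::rename(&persisted, dst)?;
    Ok(())
}

fn main() -> anyhow::Result<()> {
    // hypothetical destination path
    load_into(Path::new("/tmp/data.ms"), |tmp| {
        // a real loader would build the uuid store, updates, and indexes here
        fs::write(tmp.join("VERSION"), b"2")?;
        Ok(())
    })
}
```

A failed load now leaves any existing database untouched, because the tempdir is dropped (and deleted) on the error path before `into_path` is ever reached; the remaining non-atomic window is only the gap between `remove_dir_all` and `rename`.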
diff --git a/meilisearch-http/src/index_controller/uuid_resolver/store.rs b/meilisearch-http/src/index_controller/uuid_resolver/store.rs
index 2fd9ff301..e666a536e 100644
--- a/meilisearch-http/src/index_controller/uuid_resolver/store.rs
+++ b/meilisearch-http/src/index_controller/uuid_resolver/store.rs
@@ -1,13 +1,16 @@
-use std::{collections::HashSet, io::{BufReader, BufRead, Write}};
 use std::fs::{create_dir_all, File};
 use std::path::{Path, PathBuf};
+use std::{
+    collections::HashSet,
+    io::{BufRead, BufReader, Write},
+};
 
 use heed::{
     types::{ByteSlice, Str},
     CompactionOption, Database, Env, EnvOpenOptions,
 };
+use serde::{Deserialize, Serialize};
 use uuid::Uuid;
-use serde::{Serialize, Deserialize};
 
 use super::{Result, UuidResolverError, UUID_STORE_SIZE};
 use crate::helpers::EnvSizer;
@@ -45,7 +48,14 @@ impl HeedUuidStore {
         let mut options = EnvOpenOptions::new();
         options.map_size(UUID_STORE_SIZE); // 1GB
         let env = options.open(path)?;
-        let db = env.create_database(None)?; Ok(Self { env, db }) } pub fn create_uuid(&self, name: String, err: bool) -> Result<Uuid> { let env = self.env.clone(); let db = self.db; let mut txn = env.write_txn()?;
+        let db = env.create_database(None)?;
+        Ok(Self { env, db })
+    }
+
+    pub fn create_uuid(&self, name: String, err: bool) -> Result<Uuid> {
+        let env = self.env.clone();
+        let db = self.db;
+        let mut txn = env.write_txn()?;
         match db.get(&txn, &name)? {
             Some(uuid) => {
                 if err {
@@ -62,7 +72,10 @@ impl HeedUuidStore {
                 Ok(uuid)
             }
         }
-    } pub fn get_uuid(&self, name: String) -> Result<Option<Uuid>> { let env = self.env.clone(); let db = self.db;
+    }
+
+    pub fn get_uuid(&self, name: String) -> Result<Option<Uuid>> {
+        let env = self.env.clone();
+        let db = self.db;
         let txn = env.read_txn()?;
         match db.get(&txn, &name)? {
             Some(uuid) => {
@@ -149,9 +162,7 @@ impl HeedUuidStore {
         let uid = uid.to_string();
         let uuid = Uuid::from_slice(uuid)?;
 
-        let entry = DumpEntry {
-            uuid, uid
-        };
+        let entry = DumpEntry { uuid, uid };
         serde_json::to_writer(&mut dump_file, &entry)?;
         dump_file.write(b"\n").unwrap();
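For reference, the `DumpEntry` lines written above form a JSON-lines file: one JSON document per line, each terminated by `\n`. A self-contained sketch of that writer side (this assumes the `uuid` crate with its `v4` and `serde` features enabled, and uses `anyhow` only to shorten error handling; the sample uids are made up):

```rust
use std::io::Write;

use serde::{Deserialize, Serialize};
use uuid::Uuid;

// Same shape as the DumpEntry written by the uuid store above.
#[derive(Serialize, Deserialize, Debug)]
struct DumpEntry {
    uuid: Uuid,
    uid: String,
}

fn main() -> anyhow::Result<()> {
    let mut out = Vec::new();
    for uid in ["movies", "products"] {
        let entry = DumpEntry {
            uuid: Uuid::new_v4(),
            uid: uid.to_string(),
        };
        // one JSON document per line, mirroring the to_writer + "\n" pair above
        serde_json::to_writer(&mut out, &entry)?;
        out.write_all(b"\n")?;
    }
    // e.g. {"uuid":"4ee2a416-...","uid":"movies"}
    print!("{}", String::from_utf8(out)?);
    Ok(())
}
```

The reader side can stream the entries back with `BufRead::lines`, which is consistent with the `BufRead`/`BufReader` imports reorganized at the top of this file.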