2022-10-02 13:24:59 +02:00
|
|
|
use std::{
|
|
|
|
fs::{self, File},
|
2022-10-10 20:00:29 +02:00
|
|
|
io::{BufWriter, Write},
|
2022-10-02 13:24:59 +02:00
|
|
|
path::PathBuf,
|
|
|
|
};
|
|
|
|
|
|
|
|
use flate2::{write::GzEncoder, Compression};
|
2022-10-03 18:50:06 +02:00
|
|
|
use meilisearch_auth::Key;
|
2022-10-11 18:26:49 +02:00
|
|
|
use meilisearch_types::settings::{Checked, Settings};
|
2022-10-12 00:43:24 +02:00
|
|
|
use meilisearch_types::tasks::TaskDump;
|
2022-10-03 18:50:06 +02:00
|
|
|
use serde_json::{Map, Value};
|
2022-10-02 13:24:59 +02:00
|
|
|
use tempfile::TempDir;
|
|
|
|
use time::OffsetDateTime;
|
|
|
|
use uuid::Uuid;
|
|
|
|
|
2022-10-10 19:57:47 +02:00
|
|
|
use crate::{reader::Document, IndexMetadata, Metadata, Result, CURRENT_DUMP_VERSION};
|
2022-10-02 13:24:59 +02:00
|
|
|
|
|
|
|
/// Builds a complete Meilisearch dump inside a temporary directory.
///
/// The individual pieces (indexes, keys, task queue) are written through the
/// dedicated writers returned by its `create_*` methods, and the final dump is
/// compressed into a gzipped tarball by `persist_to`.
pub struct DumpWriter {
    // Temporary directory holding the dump layout; removed on drop.
    dir: TempDir,
}
|
|
|
|
|
|
|
|
impl DumpWriter {
|
|
|
|
pub fn new(instance_uuid: Uuid) -> Result<DumpWriter> {
|
|
|
|
let dir = TempDir::new()?;
|
2022-10-04 19:13:30 +02:00
|
|
|
|
2022-10-02 13:24:59 +02:00
|
|
|
fs::write(
|
2022-10-04 19:13:30 +02:00
|
|
|
dir.path().join("instance_uid.uuid"),
|
2022-10-02 13:24:59 +02:00
|
|
|
&instance_uuid.as_hyphenated().to_string(),
|
|
|
|
)?;
|
|
|
|
|
|
|
|
let metadata = Metadata {
|
2022-10-03 16:12:01 +02:00
|
|
|
dump_version: CURRENT_DUMP_VERSION,
|
2022-10-02 13:24:59 +02:00
|
|
|
db_version: env!("CARGO_PKG_VERSION").to_string(),
|
|
|
|
dump_date: OffsetDateTime::now_utc(),
|
|
|
|
};
|
|
|
|
fs::write(
|
|
|
|
dir.path().join("metadata.json"),
|
|
|
|
serde_json::to_string(&metadata)?,
|
|
|
|
)?;
|
|
|
|
|
|
|
|
std::fs::create_dir(&dir.path().join("indexes"))?;
|
|
|
|
|
|
|
|
Ok(DumpWriter { dir })
|
|
|
|
}
|
|
|
|
|
2022-10-04 19:13:30 +02:00
|
|
|
pub fn create_index(&self, index_name: &str, metadata: &IndexMetadata) -> Result<IndexWriter> {
|
|
|
|
IndexWriter::new(self.dir.path().join("indexes").join(index_name), metadata)
|
2022-10-02 13:24:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn create_keys(&self) -> Result<KeyWriter> {
|
|
|
|
KeyWriter::new(self.dir.path().to_path_buf())
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn create_tasks_queue(&self) -> Result<TaskWriter> {
|
|
|
|
TaskWriter::new(self.dir.path().join("tasks"))
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn persist_to(self, mut writer: impl Write) -> Result<()> {
|
|
|
|
let gz_encoder = GzEncoder::new(&mut writer, Compression::default());
|
|
|
|
let mut tar_encoder = tar::Builder::new(gz_encoder);
|
|
|
|
tar_encoder.append_dir_all(".", self.dir.path())?;
|
|
|
|
let gz_encoder = tar_encoder.into_inner()?;
|
|
|
|
gz_encoder.finish()?;
|
|
|
|
writer.flush()?;
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Writes the API keys of the dump: one JSON object per line in `keys.jsonl`
/// at the dump root.
pub struct KeyWriter {
    // Open handle on `keys.jsonl`.
    file: File,
}
|
|
|
|
|
|
|
|
impl KeyWriter {
|
|
|
|
pub(crate) fn new(path: PathBuf) -> Result<Self> {
|
|
|
|
let file = File::create(path.join("keys.jsonl"))?;
|
|
|
|
Ok(KeyWriter { file })
|
|
|
|
}
|
|
|
|
|
2022-10-03 18:50:06 +02:00
|
|
|
pub fn push_key(&mut self, key: &Key) -> Result<()> {
|
|
|
|
self.file.write_all(&serde_json::to_vec(key)?)?;
|
2022-10-02 13:24:59 +02:00
|
|
|
self.file.write_all(b"\n")?;
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Writes the task queue of the dump and hands out `UpdateFile` handles for
/// each task's associated documents.
pub struct TaskWriter {
    // Open handle on `tasks/queue.jsonl`: one task per line.
    queue: File,
    // Path of the `tasks/update_files/` directory where per-task document
    // files are created.
    update_files: PathBuf,
}
|
|
|
|
|
|
|
|
impl TaskWriter {
|
|
|
|
pub(crate) fn new(path: PathBuf) -> Result<Self> {
|
|
|
|
std::fs::create_dir(&path)?;
|
|
|
|
|
|
|
|
let queue = File::create(path.join("queue.jsonl"))?;
|
|
|
|
let update_files = path.join("update_files");
|
|
|
|
std::fs::create_dir(&update_files)?;
|
|
|
|
|
|
|
|
Ok(TaskWriter {
|
|
|
|
queue,
|
|
|
|
update_files,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Pushes tasks in the dump.
|
|
|
|
/// If the tasks has an associated `update_file` it'll use the `task_id` as its name.
|
2022-10-12 00:43:24 +02:00
|
|
|
pub fn push_task(&mut self, task: &TaskDump) -> Result<UpdateFile> {
|
|
|
|
self.queue.write_all(&serde_json::to_vec(task)?)?;
|
2022-10-02 13:24:59 +02:00
|
|
|
self.queue.write_all(b"\n")?;
|
2022-10-10 19:57:47 +02:00
|
|
|
|
|
|
|
Ok(UpdateFile::new(
|
2022-10-12 00:43:24 +02:00
|
|
|
self.update_files.join(format!("{}.jsonl", task.uid)),
|
2022-10-10 19:57:47 +02:00
|
|
|
))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Lazily-created file holding the documents associated with one task,
/// one JSON document per line.
pub struct UpdateFile {
    // Destination path (`update_files/<task_uid>.jsonl`).
    path: PathBuf,
    // `None` until the first document is pushed; the file is only created on
    // first use so tasks without documents leave no file behind.
    writer: Option<BufWriter<File>>,
}
|
|
|
|
|
|
|
|
impl UpdateFile {
|
|
|
|
pub(crate) fn new(path: PathBuf) -> UpdateFile {
|
|
|
|
UpdateFile { path, writer: None }
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn push_document(&mut self, document: &Document) -> Result<()> {
|
|
|
|
if let Some(writer) = self.writer.as_mut() {
|
|
|
|
writer.write_all(&serde_json::to_vec(document)?)?;
|
|
|
|
writer.write_all(b"\n")?;
|
|
|
|
writer.flush()?;
|
|
|
|
} else {
|
|
|
|
dbg!(&self.path);
|
|
|
|
let file = File::create(&self.path).unwrap();
|
|
|
|
self.writer = Some(BufWriter::new(file));
|
|
|
|
self.push_document(document)?;
|
2022-10-02 13:24:59 +02:00
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Writes one index of the dump: its documents, settings and metadata, each in
/// its own file inside `indexes/<index_name>/`.
pub struct IndexWriter {
    // Open handle on `documents.jsonl`: one document per line.
    documents: File,
    // Open handle on `settings.json`.
    settings: File,
}
|
|
|
|
|
|
|
|
impl IndexWriter {
|
2022-10-04 19:13:30 +02:00
|
|
|
pub(self) fn new(path: PathBuf, metadata: &IndexMetadata) -> Result<Self> {
|
2022-10-02 13:24:59 +02:00
|
|
|
std::fs::create_dir(&path)?;
|
|
|
|
|
2022-10-04 19:13:30 +02:00
|
|
|
let metadata_file = File::create(path.join("metadata.json"))?;
|
|
|
|
serde_json::to_writer(metadata_file, metadata)?;
|
|
|
|
|
2022-10-02 13:24:59 +02:00
|
|
|
let documents = File::create(path.join("documents.jsonl"))?;
|
|
|
|
let settings = File::create(path.join("settings.json"))?;
|
|
|
|
|
|
|
|
Ok(IndexWriter {
|
|
|
|
documents,
|
|
|
|
settings,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2022-10-03 18:50:06 +02:00
|
|
|
pub fn push_document(&mut self, document: &Map<String, Value>) -> Result<()> {
|
|
|
|
self.documents.write_all(&serde_json::to_vec(document)?)?;
|
2022-10-02 13:24:59 +02:00
|
|
|
self.documents.write_all(b"\n")?;
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2022-10-03 18:50:06 +02:00
|
|
|
pub fn settings(mut self, settings: &Settings<Checked>) -> Result<()> {
|
2022-10-02 13:24:59 +02:00
|
|
|
self.settings.write_all(&serde_json::to_vec(&settings)?)?;
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
pub(crate) mod test {
    use std::{fmt::Write, io::BufReader, path::Path, str::FromStr};

    use flate2::bufread::GzDecoder;
    use meilisearch_types::settings::Unchecked;

    use crate::{
        reader::Document,
        test::{
            create_test_api_keys, create_test_documents, create_test_dump,
            create_test_instance_uid, create_test_settings, create_test_tasks,
        },
    };

    use super::*;

    /// Renders the directory tree rooted at `dir` as an ASCII-art listing,
    /// starting with a `.` line for the root.
    fn create_directory_hierarchy(dir: &Path) -> String {
        let mut ret = String::new();
        writeln!(ret, ".").unwrap();
        ret.push_str(&_create_directory_hierarchy(dir, 0));
        ret
    }

    /// Recursive worker for [`create_directory_hierarchy`]; `depth` controls
    /// the indentation prefix.
    fn _create_directory_hierarchy(dir: &Path, depth: usize) -> String {
        let mut ret = String::new();

        // the entries are not guaranteed to be returned in the same order thus we need to sort them.
        let mut entries = fs::read_dir(dir)
            .unwrap()
            .collect::<std::result::Result<Vec<_>, _>>()
            .unwrap();

        // I want the directories first and then sort by name.
        entries.sort_by(|a, b| {
            let (aft, bft) = (a.file_type().unwrap(), b.file_type().unwrap());

            if aft.is_dir() && bft.is_dir() {
                a.file_name().cmp(&b.file_name())
            } else if aft.is_file() {
                std::cmp::Ordering::Greater
            } else if bft.is_file() {
                std::cmp::Ordering::Less
            } else {
                a.file_name().cmp(&b.file_name())
            }
        });

        for (idx, entry) in entries.iter().enumerate() {
            let mut ident = String::new();

            for _ in 0..depth {
                ident.push_str("│");
                ident.push_str(&" ".repeat(4));
            }
            if idx == entries.len() - 1 {
                ident.push_str("└");
            } else {
                ident.push_str("├");
            }
            ident.push_str(&"-".repeat(4));

            let name = entry.file_name().into_string().unwrap();
            let file_type = entry.file_type().unwrap();
            let is_dir = file_type.is_dir().then_some("/").unwrap_or("");

            assert!(!file_type.is_symlink());
            writeln!(ret, "{ident} {name}{is_dir}").unwrap();

            if file_type.is_dir() {
                ret.push_str(&_create_directory_hierarchy(&entry.path(), depth + 1));
            }
        }
        ret
    }

    #[test]
    fn test_creating_dump() {
        let file = create_test_dump();
        let mut file = BufReader::new(file);

        // ============ ensuring we wrote everything in the correct place.
        let dump = tempfile::tempdir().unwrap();

        let gz = GzDecoder::new(&mut file);
        let mut tar = tar::Archive::new(gz);
        tar.unpack(dump.path()).unwrap();

        let dump_path = dump.path();

        // ==== checking global file hierarchy (we want to be sure there isn't too many files or too few)
        insta::assert_display_snapshot!(create_directory_hierarchy(dump_path), @r###"
        .
        ├---- indexes/
        │    └---- doggos/
        │    │    ├---- settings.json
        │    │    ├---- documents.jsonl
        │    │    └---- metadata.json
        ├---- tasks/
        │    ├---- update_files/
        │    │    └---- 1.jsonl
        │    └---- queue.jsonl
        ├---- keys.jsonl
        ├---- metadata.json
        └---- instance_uid.uuid
        "###);

        // ==== checking the top level infos
        let metadata = fs::read_to_string(dump_path.join("metadata.json")).unwrap();
        let metadata: Metadata = serde_json::from_str(&metadata).unwrap();
        insta::assert_json_snapshot!(metadata, { ".dumpDate" => "[date]" }, @r###"
        {
          "dumpVersion": "V6",
          "dbVersion": "0.29.0",
          "dumpDate": "[date]"
        }
        "###);

        let instance_uid = fs::read_to_string(dump_path.join("instance_uid.uuid")).unwrap();
        assert_eq!(
            Uuid::from_str(&instance_uid).unwrap(),
            create_test_instance_uid()
        );

        // ==== checking the index
        let docs = fs::read_to_string(dump_path.join("indexes/doggos/documents.jsonl")).unwrap();
        for (document, expected) in docs.lines().zip(create_test_documents()) {
            assert_eq!(
                serde_json::from_str::<Map<String, Value>>(document).unwrap(),
                expected
            );
        }
        let test_settings =
            fs::read_to_string(dump_path.join("indexes/doggos/settings.json")).unwrap();
        assert_eq!(
            serde_json::from_str::<Settings<Unchecked>>(&test_settings).unwrap(),
            create_test_settings().into_unchecked()
        );
        let metadata = fs::read_to_string(dump_path.join("indexes/doggos/metadata.json")).unwrap();
        let metadata: IndexMetadata = serde_json::from_str(&metadata).unwrap();
        insta::assert_json_snapshot!(metadata, { ".createdAt" => "[date]", ".updatedAt" => "[date]" }, @r###"
        {
          "uid": "doggo",
          "primaryKey": null,
          "createdAt": "[date]",
          "updatedAt": "[date]"
        }
        "###);

        // ==== checking the task queue
        let tasks_queue = fs::read_to_string(dump_path.join("tasks/queue.jsonl")).unwrap();
        for (task, mut expected) in tasks_queue.lines().zip(create_test_tasks()) {
            // TODO: This can be removed once `Duration` from the `TaskView` is implemented.
            expected.0.duration = None;
            // TODO: uncomment this one once we write the dump integration in the index-scheduler
            // assert_eq!(serde_json::from_str::<TaskView>(task).unwrap(), expected.0);

            if let Some(expected_update) = expected.1 {
                let path = dump_path.join(format!("tasks/update_files/{}.jsonl", expected.0.uid));
                println!("trying to open {}", path.display());
                let update = fs::read_to_string(path).unwrap();
                let documents: Vec<Document> = update
                    .lines()
                    .map(|line| serde_json::from_str(line).unwrap())
                    .collect();
                assert_eq!(documents, expected_update);
            }
        }

        // ==== checking the keys
        let keys = fs::read_to_string(dump_path.join("keys.jsonl")).unwrap();
        for (key, expected) in keys.lines().zip(create_test_api_keys()) {
            assert_eq!(serde_json::from_str::<Key>(key).unwrap(), expected);
        }
    }
}
|