// MeiliSearch/dump/src/reader/v4/mod.rs (337 lines, 11 KiB, Rust)

2022-10-20 18:00:07 +02:00
use std::fs::{self, File};
use std::io::{BufRead, BufReader, ErrorKind};
2022-10-20 18:00:07 +02:00
use std::path::Path;
2022-10-06 15:49:30 +02:00
use serde::{Deserialize, Serialize};
use tempfile::TempDir;
use time::OffsetDateTime;
use uuid::Uuid;
pub mod errors;
pub mod keys;
pub mod meta;
pub mod settings;
pub mod tasks;
2022-10-06 15:49:30 +02:00
use self::meta::{DumpMeta, IndexMeta, IndexUuid};
2022-10-10 15:16:22 +02:00
use super::compat::v4_to_v5::CompatV4ToV5;
2022-10-20 18:00:07 +02:00
use crate::{Error, IndexMetadata, Result, Version};
2022-10-06 15:49:30 +02:00
pub type Document = serde_json::Map<String, serde_json::Value>;
2022-10-06 16:37:13 +02:00
pub type Settings<T> = settings::Settings<T>;
pub type Checked = settings::Checked;
pub type Unchecked = settings::Unchecked;
pub type Task = tasks::Task;
pub type Key = keys::Key;
// everything related to the settings
pub type Setting<T> = settings::Setting<T>;
// everything related to the api keys
pub type Action = keys::Action;
// everything related to the errors
pub type ResponseError = errors::ResponseError;
pub type Code = errors::Code;
2022-10-06 15:49:30 +02:00
/// Content of the `metadata.json` file located at the root of a v4 dump.
///
/// Field names are (de)serialized in camelCase.
#[derive(Serialize, Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Metadata {
    db_version: String,
    index_db_size: usize,
    update_db_size: usize,
    /// (De)serialized as an RFC 3339 timestamp.
    #[serde(with = "time::serde::rfc3339")]
    dump_date: OffsetDateTime,
}
/// Reader over a v4 dump that has been extracted into a temporary directory.
pub struct V4Reader {
    /// Directory holding the extracted dump.
    dump: TempDir,
    /// Parsed content of `metadata.json`.
    metadata: Metadata,
    /// Line reader over `updates/data.jsonl`.
    tasks: BufReader<File>,
    /// Line reader over the `keys` file.
    keys: BufReader<File>,
    /// Every entry of `index_uuids/data.jsonl`, loaded when the dump is opened.
    index_uuid: Vec<IndexUuid>,
}
impl V4Reader {
pub fn open(dump: TempDir) -> Result<Self> {
let meta_file = fs::read(dump.path().join("metadata.json"))?;
let metadata = serde_json::from_reader(&*meta_file)?;
let index_uuid = File::open(dump.path().join("index_uuids/data.jsonl"))?;
let index_uuid = BufReader::new(index_uuid);
let index_uuid = index_uuid
.lines()
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })
.collect::<Result<Vec<_>>>()?;
Ok(V4Reader {
metadata,
tasks: BufReader::new(
File::open(dump.path().join("updates").join("data.jsonl")).unwrap(),
),
keys: BufReader::new(File::open(dump.path().join("keys"))?),
index_uuid,
dump,
})
}
pub fn to_v5(self) -> CompatV4ToV5 {
CompatV4ToV5::new(self)
}
2022-10-06 15:49:30 +02:00
pub fn version(&self) -> Version {
Version::V4
}
pub fn date(&self) -> Option<OffsetDateTime> {
Some(self.metadata.dump_date)
}
pub fn instance_uid(&self) -> Result<Option<Uuid>> {
match fs::read_to_string(self.dump.path().join("instance-uid")) {
Ok(uuid) => Ok(Some(Uuid::parse_str(&uuid)?)),
Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
Err(e) => Err(e.into()),
}
2022-10-06 15:49:30 +02:00
}
pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V4IndexReader>> + '_> {
Ok(self.index_uuid.iter().map(|index| -> Result<_> {
2022-10-22 16:35:42 +02:00
V4IndexReader::new(
2022-10-06 15:49:30 +02:00
index.uid.clone(),
2022-10-20 18:00:07 +02:00
&self.dump.path().join("indexes").join(index.index_meta.uuid.to_string()),
&index.index_meta,
BufReader::new(self.tasks.get_ref().try_clone().unwrap()),
2022-10-22 16:35:42 +02:00
)
2022-10-06 15:49:30 +02:00
}))
}
pub fn tasks(
&mut self,
) -> Box<dyn Iterator<Item = Result<(Task, Option<Box<super::UpdateFile>>)>> + '_> {
Box::new((&mut self.tasks).lines().map(|line| -> Result<_> {
2022-10-06 15:49:30 +02:00
let task: Task = serde_json::from_str(&line?)?;
if !task.is_finished() {
if let Some(uuid) = task.get_content_uuid() {
let update_file_path = self
.dump
.path()
.join("updates")
.join("updates_files")
.join(uuid.to_string());
Ok((
task,
Some(
Box::new(UpdateFile::new(&update_file_path)?) as Box<super::UpdateFile>
),
))
2022-10-06 15:49:30 +02:00
} else {
Ok((task, None))
}
} else {
Ok((task, None))
}
}))
2022-10-06 15:49:30 +02:00
}
pub fn keys(&mut self) -> Box<dyn Iterator<Item = Result<Key>> + '_> {
Box::new(
2022-10-20 18:00:07 +02:00
(&mut self.keys).lines().map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }),
)
2022-10-06 15:49:30 +02:00
}
}
/// Reader over a single index of a v4 dump.
pub struct V4IndexReader {
    /// Uid, primary key and dates of the index.
    metadata: IndexMetadata,
    /// Settings read from the index's `meta.json`, after `check()`.
    settings: Settings<Checked>,
    /// Line reader over the index's `documents.jsonl`.
    documents: BufReader<File>,
}
impl V4IndexReader {
pub fn new(
name: String,
path: &Path,
index_metadata: &IndexMeta,
tasks: BufReader<File>,
) -> Result<Self> {
2022-10-06 15:49:30 +02:00
let meta = File::open(path.join("meta.json"))?;
let meta: DumpMeta = serde_json::from_reader(meta)?;
let mut created_at = None;
let mut updated_at = None;
for line in tasks.lines() {
let task: Task = serde_json::from_str(&line?)?;
if task.index_uid.to_string() == name {
if updated_at.is_none() {
updated_at = task.updated_at()
}
if created_at.is_none() {
created_at = task.created_at()
}
if task.id as usize == index_metadata.creation_task_id {
2022-11-13 10:12:51 +01:00
created_at = task.processed_at();
break;
}
}
}
2022-10-06 15:49:30 +02:00
let metadata = IndexMetadata {
uid: name,
primary_key: meta.primary_key,
2022-11-12 20:57:27 +01:00
created_at: created_at.unwrap_or_else(OffsetDateTime::now_utc),
updated_at: updated_at.unwrap_or_else(OffsetDateTime::now_utc),
2022-10-06 15:49:30 +02:00
};
let ret = V4IndexReader {
metadata,
settings: meta.settings.check(),
documents: BufReader::new(File::open(path.join("documents.jsonl"))?),
};
Ok(ret)
}
pub fn metadata(&self) -> &IndexMetadata {
&self.metadata
}
pub fn documents(&mut self) -> Result<impl Iterator<Item = Result<Document>> + '_> {
Ok((&mut self.documents)
.lines()
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}
pub fn settings(&mut self) -> Result<Settings<Checked>> {
Ok(self.settings.clone())
}
}
/// Iterator over the documents of an update file, one JSON document per line.
pub struct UpdateFile {
    /// Line reader over the update file.
    reader: BufReader<File>,
}
impl UpdateFile {
fn new(path: &Path) -> Result<Self> {
2022-10-20 18:00:07 +02:00
Ok(UpdateFile { reader: BufReader::new(File::open(path)?) })
}
}
impl Iterator for UpdateFile {
type Item = Result<Document>;
fn next(&mut self) -> Option<Self::Item> {
(&mut self.reader)
.lines()
.map(|line| {
line.map_err(Error::from)
.and_then(|line| serde_json::from_str(&line).map_err(Error::from))
})
.next()
}
}
2022-10-06 15:49:30 +02:00
#[cfg(test)]
pub(crate) mod test {
    use std::fs::File;
    use std::io::BufReader;

    use flate2::bufread::GzDecoder;
    use meili_snap::insta;
    use tempfile::TempDir;

    use super::*;

    /// Unpacks `tests/assets/v4.dump` in a temporary directory and checks
    /// everything the reader exposes: date, instance uid, tasks, update
    /// files, keys, and the metadata, settings and documents of each index.
    #[test]
    #[ignore]
    fn read_dump_v4() {
        let dump = File::open("tests/assets/v4.dump").unwrap();
        let dir = TempDir::new().unwrap();
        let mut dump = BufReader::new(dump);
        let gz = GzDecoder::new(&mut dump);
        let mut archive = tar::Archive::new(gz);
        archive.unpack(dir.path()).unwrap();

        let mut dump = V4Reader::open(dir).unwrap();

        // top level infos
        insta::assert_display_snapshot!(dump.date().unwrap(), @"2022-10-06 12:53:49.131989609 +00:00:00");
        insta::assert_display_snapshot!(dump.instance_uid().unwrap().unwrap(), @"9e15e977-f2ae-4761-943f-1eaf75fd736d");

        // tasks
        let tasks = dump.tasks().collect::<Result<Vec<_>>>().unwrap();
        let (tasks, mut update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
        meili_snap::snapshot_hash!(meili_snap::json_string!(tasks), @"f4efacbea0c1a4400873f4b2ee33f975");
        assert_eq!(update_files.len(), 10);
        assert!(update_files[0].is_some()); // the enqueued document addition
        assert!(update_files[1..].iter().all(|u| u.is_none())); // everything already processed

        let update_file = update_files.remove(0).unwrap().collect::<Result<Vec<_>>>().unwrap();
        meili_snap::snapshot_hash!(meili_snap::json_string!(update_file), @"7b8889539b669c7b9ddba448bafa385d");

        // keys
        let keys = dump.keys().collect::<Result<Vec<_>>>().unwrap();
        meili_snap::snapshot_hash!(meili_snap::json_string!(keys, { "[].uid" => "[uuid]" }), @"9240300dca8f962cdf58359ef4c76f09");

        // indexes
        let mut indexes = dump.indexes().unwrap().collect::<Result<Vec<_>>>().unwrap();
        // the index are not ordered in any way by default
        indexes.sort_by_key(|index| index.metadata().uid.to_string());

        let mut products = indexes.pop().unwrap();
        let mut movies = indexes.pop().unwrap();
        let mut spells = indexes.pop().unwrap();
        assert!(indexes.is_empty());

        // products
        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
        {
          "uid": "products",
          "primaryKey": "sku",
          "createdAt": "[now]",
          "updatedAt": "[now]"
        }
        "###);

        meili_snap::snapshot_hash!(format!("{:#?}", products.settings()), @"65b139c6b9fc251e187073c8557803e2");
        let documents = products.documents().unwrap().collect::<Result<Vec<_>>>().unwrap();
        assert_eq!(documents.len(), 10);
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"b01c8371aea4c7171af0d4d846a2bdca");

        // movies
        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
        {
          "uid": "movies",
          "primaryKey": "id",
          "createdAt": "[now]",
          "updatedAt": "[now]"
        }
        "###);

        meili_snap::snapshot_hash!(format!("{:#?}", movies.settings()), @"06aa1988493485d9b2cda7c751e6bb15");
        let documents = movies.documents().unwrap().collect::<Result<Vec<_>>>().unwrap();
        assert_eq!(documents.len(), 110);
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"786022a66ecb992c8a2a60fee070a5ab");

        // spells
        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
        {
          "uid": "dnd_spells",
          "primaryKey": "index",
          "createdAt": "[now]",
          "updatedAt": "[now]"
        }
        "###);

        meili_snap::snapshot_hash!(format!("{:#?}", spells.settings()), @"7d722fc2629eaa45032ed3deb0c9b4ce");
        let documents = spells.documents().unwrap().collect::<Result<Vec<_>>>().unwrap();
        assert_eq!(documents.len(), 10);
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"235016433dd04262c7f2da01d1e808ce");
    }
}