MeiliSearch/meilisearch-lib/src/tasks/task_store/store.rs

462 lines
14 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#[allow(clippy::upper_case_acronyms)]
type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
const UID_TASK_IDS: &str = "uid_task_id";
const TASKS: &str = "tasks";
use std::borrow::Cow;
use std::collections::BinaryHeap;
use std::convert::TryInto;
use std::mem::size_of;
use std::ops::Range;
use std::result::Result as StdResult;
use std::sync::Arc;
use heed::types::{ByteSlice, OwnedType, SerdeJson, Unit};
use heed::{BytesDecode, BytesEncode, Database, Env, RoTxn, RwTxn};
use crate::tasks::task::{Task, TaskId};
use super::super::Result;
use super::{Pending, TaskFilter};
enum IndexUidTaskIdCodec {}
impl<'a> BytesEncode<'a> for IndexUidTaskIdCodec {
type EItem = (&'a str, TaskId);
fn bytes_encode((s, id): &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
let size = s.len() + std::mem::size_of::<TaskId>() + 1;
if size > 512 {
return None;
}
let mut b = Vec::with_capacity(size);
b.extend_from_slice(s.as_bytes());
// null terminate the string
b.push(0);
b.extend_from_slice(&id.to_be_bytes());
Some(Cow::Owned(b))
}
}
impl<'a> BytesDecode<'a> for IndexUidTaskIdCodec {
type DItem = (&'a str, TaskId);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let len = bytes.len();
let s_end = len.checked_sub(size_of::<TaskId>())?.checked_sub(1)?;
let str_bytes = &bytes[..s_end];
let str = std::str::from_utf8(str_bytes).ok()?;
let id = TaskId::from_be_bytes(bytes[(len - size_of::<TaskId>())..].try_into().ok()?);
Some((str, id))
}
}
pub struct Store {
env: Arc<Env>,
uids_task_ids: Database<IndexUidTaskIdCodec, Unit>,
tasks: Database<OwnedType<BEU64>, SerdeJson<Task>>,
}
impl Drop for Store {
fn drop(&mut self) {
if Arc::strong_count(&self.env) == 1 {
self.env.as_ref().clone().prepare_for_closing();
}
}
}
impl Store {
/// Create a new store from the specified `Path`.
/// Be really cautious when calling this function, the returned `Store` may
/// be in an invalid state, with dangling processing tasks.
/// You want to patch all un-finished tasks and put them in your pending
/// queue with the `reset_and_return_unfinished_update` method.
pub fn new(env: Arc<heed::Env>) -> Result<Self> {
let uids_task_ids = env.create_database(Some(UID_TASK_IDS))?;
let tasks = env.create_database(Some(TASKS))?;
Ok(Self {
env,
uids_task_ids,
tasks,
})
}
/// This function should be called *right after* creating the store.
/// It put back all unfinished update in the `Created` state. This
/// allow us to re-enqueue an update that didn't had the time to finish
/// when Meilisearch closed.
pub fn reset_and_return_unfinished_tasks(&mut self) -> Result<BinaryHeap<Pending<TaskId>>> {
let mut unfinished_tasks: BinaryHeap<Pending<TaskId>> = BinaryHeap::new();
let mut wtxn = self.wtxn()?;
let mut iter = self.tasks.rev_iter_mut(&mut wtxn)?;
while let Some(entry) = iter.next() {
let entry = entry?;
let (id, mut task): (BEU64, Task) = entry;
// Since all tasks are ordered, we can stop iterating when we encounter our first non-finished task.
if task.is_finished() {
break;
}
// we only keep the first state. Its supposed to be a `Created` state.
task.events.drain(1..);
unfinished_tasks.push(Pending::Task(id.get()));
// Since we own the id and the task this is a safe operation.
unsafe {
iter.put_current(&id, &task)?;
}
}
drop(iter);
wtxn.commit()?;
Ok(unfinished_tasks)
}
pub fn wtxn(&self) -> Result<RwTxn> {
Ok(self.env.write_txn()?)
}
pub fn rtxn(&self) -> Result<RoTxn> {
Ok(self.env.read_txn()?)
}
/// Returns the id for the next task.
///
/// The required `mut txn` acts as a reservation system. It guarantees that as long as you commit
/// the task to the store in the same transaction, no one else will hav this task id.
pub fn next_task_id(&self, txn: &mut RwTxn) -> Result<TaskId> {
let id = self
.tasks
.lazily_decode_data()
.last(txn)?
.map(|(id, _)| id.get() + 1)
.unwrap_or(0);
Ok(id)
}
pub fn put(&self, txn: &mut RwTxn, task: &Task) -> Result<()> {
self.tasks.put(txn, &BEU64::new(task.id), task)?;
self.uids_task_ids
.put(txn, &(&task.index_uid, task.id), &())?;
Ok(())
}
pub fn get(&self, txn: &RoTxn, id: TaskId) -> Result<Option<Task>> {
let task = self.tasks.get(txn, &BEU64::new(id))?;
Ok(task)
}
pub fn list_tasks<'a>(
&self,
txn: &'a RoTxn,
from: Option<TaskId>,
filter: Option<TaskFilter>,
limit: Option<usize>,
) -> Result<Vec<Task>> {
let from = from.unwrap_or_default();
let range = from..limit
.map(|limit| (limit as u64).saturating_add(from))
.unwrap_or(u64::MAX);
let iter: Box<dyn Iterator<Item = StdResult<_, heed::Error>>> = match filter {
Some(filter) => {
let iter = self
.compute_candidates(txn, filter, range)?
.into_iter()
.filter_map(|id| self.tasks.get(txn, &BEU64::new(id)).transpose());
Box::new(iter)
}
None => Box::new(
self.tasks
.rev_range(txn, &(BEU64::new(range.start)..BEU64::new(range.end)))?
.map(|r| r.map(|(_, t)| t)),
),
};
// Collect 'limit' task if it exists or all of them.
let tasks = iter
.take(limit.unwrap_or(usize::MAX))
.try_fold::<_, _, StdResult<_, heed::Error>>(Vec::new(), |mut v, task| {
v.push(task?);
Ok(v)
})?;
Ok(tasks)
}
fn compute_candidates(
&self,
txn: &heed::RoTxn,
filter: TaskFilter,
range: Range<TaskId>,
) -> Result<BinaryHeap<TaskId>> {
let mut candidates = BinaryHeap::new();
if let Some(indexes) = filter.indexes {
for index in indexes {
// We need to prefix search the null terminated string to make sure that we only
// get exact matches for the index, and not other uids that would share the same
// prefix, i.e test and test1.
let mut index_uid = index.as_bytes().to_vec();
index_uid.push(0);
self.uids_task_ids
.remap_key_type::<ByteSlice>()
.rev_prefix_iter(txn, &index_uid)?
.map(|entry| -> StdResult<_, heed::Error> {
let (key, _) = entry?;
let (_, id) =
IndexUidTaskIdCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
Ok(id)
})
.skip_while(|entry| {
entry
.as_ref()
.ok()
// we skip all elements till we enter in the range
.map(|key| !range.contains(key))
// if we encounter an error we returns true to collect it later
.unwrap_or(true)
})
.take_while(|entry| {
entry
.as_ref()
.ok()
// as soon as we are out of the range we exit
.map(|key| range.contains(key))
// if we encounter an error we returns true to collect it later
.unwrap_or(true)
})
.try_for_each::<_, StdResult<(), heed::Error>>(|id| {
candidates.push(id?);
Ok(())
})?;
}
}
Ok(candidates)
}
}
#[cfg(test)]
pub mod test {
use heed::EnvOpenOptions;
use itertools::Itertools;
use nelson::Mocker;
use proptest::collection::vec;
use proptest::prelude::*;
use tempfile::TempDir;
use crate::index_resolver::IndexUid;
use crate::tasks::task::TaskContent;
use super::*;
/// TODO: use this mock to test the task store properly.
#[allow(dead_code)]
pub enum MockStore {
Real(Store),
Fake(Mocker),
}
pub struct TmpEnv(TempDir, Arc<heed::Env>);
impl TmpEnv {
pub fn env(&self) -> Arc<heed::Env> {
self.1.clone()
}
}
pub fn tmp_env() -> TmpEnv {
let tmp = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(4096 * 100000);
options.max_dbs(1000);
let env = Arc::new(options.open(tmp.path()).unwrap());
TmpEnv(tmp, env)
}
impl MockStore {
pub fn new(env: Arc<heed::Env>) -> Result<Self> {
Ok(Self::Real(Store::new(env)?))
}
pub fn reset_and_return_unfinished_tasks(&mut self) -> Result<BinaryHeap<Pending<TaskId>>> {
match self {
MockStore::Real(index) => index.reset_and_return_unfinished_tasks(),
MockStore::Fake(_) => todo!(),
}
}
pub fn wtxn(&self) -> Result<RwTxn> {
match self {
MockStore::Real(index) => index.wtxn(),
MockStore::Fake(_) => todo!(),
}
}
pub fn rtxn(&self) -> Result<RoTxn> {
match self {
MockStore::Real(index) => index.rtxn(),
MockStore::Fake(_) => todo!(),
}
}
pub fn next_task_id(&self, txn: &mut RwTxn) -> Result<TaskId> {
match self {
MockStore::Real(index) => index.next_task_id(txn),
MockStore::Fake(_) => todo!(),
}
}
pub fn put(&self, txn: &mut RwTxn, task: &Task) -> Result<()> {
match self {
MockStore::Real(index) => index.put(txn, task),
MockStore::Fake(_) => todo!(),
}
}
pub fn get(&self, txn: &RoTxn, id: TaskId) -> Result<Option<Task>> {
match self {
MockStore::Real(index) => index.get(txn, id),
MockStore::Fake(_) => todo!(),
}
}
pub fn list_tasks<'a>(
&self,
txn: &'a RoTxn,
from: Option<TaskId>,
filter: Option<TaskFilter>,
limit: Option<usize>,
) -> Result<Vec<Task>> {
match self {
MockStore::Real(index) => index.list_tasks(txn, from, filter, limit),
MockStore::Fake(_) => todo!(),
}
}
}
#[test]
fn test_ordered_filtered_updates() {
let tmp = tmp_env();
let store = Store::new(tmp.env()).unwrap();
let tasks = (0..100)
.map(|_| Task {
id: rand::random(),
index_uid: IndexUid::new_unchecked("test".to_string()),
content: TaskContent::IndexDeletion,
events: vec![],
})
.collect::<Vec<_>>();
let mut txn = store.env.write_txn().unwrap();
tasks
.iter()
.try_for_each(|t| store.put(&mut txn, t))
.unwrap();
let mut filter = TaskFilter::default();
filter.filter_index("test".into());
let tasks = store.list_tasks(&txn, None, Some(filter), None).unwrap();
assert!(tasks
.iter()
.map(|t| t.id)
.tuple_windows()
.all(|(a, b)| a > b));
}
#[test]
fn test_filter_same_index_prefix() {
let tmp = tmp_env();
let store = Store::new(tmp.env()).unwrap();
let task_1 = Task {
id: 1,
index_uid: IndexUid::new_unchecked("test".to_string()),
content: TaskContent::IndexDeletion,
events: vec![],
};
let task_2 = Task {
id: 0,
index_uid: IndexUid::new_unchecked("test1".to_string()),
content: TaskContent::IndexDeletion,
events: vec![],
};
let mut txn = store.wtxn().unwrap();
store.put(&mut txn, &task_1).unwrap();
store.put(&mut txn, &task_2).unwrap();
let mut filter = TaskFilter::default();
filter.filter_index("test".into());
let tasks = store.list_tasks(&txn, None, Some(filter), None).unwrap();
txn.abort().unwrap();
assert_eq!(tasks.len(), 1);
assert_eq!(&*tasks.first().unwrap().index_uid, "test");
// same thing but invert the ids
let task_1 = Task {
id: 0,
index_uid: IndexUid::new_unchecked("test".to_string()),
content: TaskContent::IndexDeletion,
events: vec![],
};
let task_2 = Task {
id: 1,
index_uid: IndexUid::new_unchecked("test1".to_string()),
content: TaskContent::IndexDeletion,
events: vec![],
};
let mut txn = store.wtxn().unwrap();
store.put(&mut txn, &task_1).unwrap();
store.put(&mut txn, &task_2).unwrap();
let mut filter = TaskFilter::default();
filter.filter_index("test".into());
let tasks = store.list_tasks(&txn, None, Some(filter), None).unwrap();
assert_eq!(tasks.len(), 1);
assert_eq!(&*tasks.first().unwrap().index_uid, "test");
}
proptest! {
#[test]
fn encode_decode_roundtrip(index_uid in any::<IndexUid>(), task_id in 0..TaskId::MAX) {
let value = (index_uid.as_ref(), task_id);
let bytes = IndexUidTaskIdCodec::bytes_encode(&value).unwrap();
let (index, id) = IndexUidTaskIdCodec::bytes_decode(bytes.as_ref()).unwrap();
assert_eq!(&*index_uid, index);
assert_eq!(task_id, id);
}
#[test]
fn encode_doesnt_crash(index_uid in "\\PC*", task_id in 0..TaskId::MAX) {
let value = (index_uid.as_ref(), task_id);
IndexUidTaskIdCodec::bytes_encode(&value);
}
#[test]
fn decode_doesnt_crash(bytes in vec(any::<u8>(), 0..1000)) {
IndexUidTaskIdCodec::bytes_decode(&bytes);
}
}
}