START THE REWRITE OF THE INDEX SCHEDULER: index & register has been implemented

This commit is contained in:
Tamo 2022-09-06 16:43:59 +02:00 committed by Clément Renault
parent ab1800551f
commit 1a47949063
No known key found for this signature in database
GPG key ID: 92ADA4E935E71FA4
7 changed files with 453 additions and 9 deletions

View file

@ -0,0 +1,15 @@
[package]
name = "index-scheduler"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.64"
bincode = "1.3.3"
milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.33.0" }
roaring = "0.9.0"
serde = { version = "1.0.136", features = ["derive"] }
thiserror = "1.0.30"
time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }

View file

@ -0,0 +1,7 @@
use thiserror::Error;
#[derive(Error, Debug)]
pub enum Error {
#[error("Index not found")]
IndexNotFound,
}

168
index-scheduler/src/lib.rs Normal file
View file

@ -0,0 +1,168 @@
pub mod error;
pub mod task;
use error::Error;
use milli::heed::types::{DecodeIgnore, OwnedType, SerdeBincode, Str};
pub use task::Task;
use task::{Kind, Status};
use std::collections::hash_map::Entry;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use std::{collections::HashMap, sync::RwLock};
use anyhow::Result;
use milli::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn};
use milli::{Index, RoaringBitmapCodec, BEU32};
use roaring::RoaringBitmap;
pub type TaskId = u32;
type IndexName = String;
type IndexUuid = String;
/// This module is responsible for two things;
/// 1. Resolve the name of the indexes.
/// 2. Schedule the tasks.
#[derive(Clone)]
pub struct IndexScheduler {
// Keep track of the opened indexes and is used
// mainly by the index resolver.
index_map: Arc<RwLock<HashMap<IndexUuid, Index>>>,
/// The list of tasks currently processing.
processing_tasks: Arc<RwLock<RoaringBitmap>>,
/// The LMDB environment which the DBs are associated with.
env: Env,
// The main database, it contains all the tasks accessible by their Id.
all_tasks: Database<OwnedType<BEU32>, SerdeBincode<Task>>,
// All the tasks ids grouped by their status.
status: Database<SerdeBincode<Status>, RoaringBitmapCodec>,
// All the tasks ids grouped by their kind.
kind: Database<OwnedType<BEU32>, RoaringBitmapCodec>,
// Map an index name with an indexuuid.
index_name_mapper: Database<Str, Str>,
// Store the tasks associated to an index.
index_tasks: Database<IndexName, RoaringBitmap>,
// set to true when there is work to do.
wake_up: Arc<AtomicBool>,
}
impl IndexScheduler {
pub fn index(&self, name: &str) -> Result<Index> {
let rtxn = self.env.read_txn()?;
let uuid = self
.index_name_mapper
.get(&rtxn, name)?
.ok_or(Error::IndexNotFound)?;
// we clone here to drop the lock before entering the match
let index = self.index_map.read().unwrap().get(&*uuid).cloned();
let index = match index {
Some(index) => index,
// since we're lazy, it's possible that the index doesn't exist yet.
// We need to open it ourselves.
None => {
let mut index_map = self.index_map.write().unwrap();
// between the read lock and the write lock it's not impossible
// that someone already opened the index (eg if two search happens
// at the same time), thus before opening it we check a second time
// if it's not already there.
// Since there is a good chance it's not already there we can use
// the entry method.
match index_map.entry(uuid.to_string()) {
Entry::Vacant(entry) => {
// TODO: TAMO: get the envopenoptions from somewhere
let index = milli::Index::new(EnvOpenOptions::new(), uuid)?;
entry.insert(index.clone());
index
}
Entry::Occupied(entry) => entry.get().clone(),
}
}
};
Ok(index)
}
fn next_task_id(&self, rtxn: &RoTxn) -> Result<TaskId> {
Ok(self
.all_tasks
.remap_data_type::<DecodeIgnore>()
.last(rtxn)?
.map(|(k, _)| k.get())
.unwrap_or(0))
}
/// Register a new task in the scheduler. If it fails and data was associated with the task
/// it tries to delete the file.
pub fn register(&self, task: Task) -> Result<()> {
let mut wtxn = self.env.write_txn()?;
let task_id = self.next_task_id(&wtxn)?;
self.all_tasks
.append(&mut wtxn, &BEU32::new(task_id), &task)?;
self.update_status(&mut wtxn, Status::Enqueued, |mut bitmap| {
bitmap.insert(task_id);
bitmap
})?;
self.update_kind(&mut wtxn, &task.kind, |mut bitmap| {
bitmap.insert(task_id);
bitmap
})?;
// we persist the file in last to be sure everything before was applied successfuly
task.persist()?;
match wtxn.commit() {
Ok(()) => (),
e @ Err(_) => {
task.remove_data()?;
e?;
}
}
self.notify();
Ok(())
}
pub fn notify(&self) {
self.wake_up
.store(true, std::sync::atomic::Ordering::Relaxed);
}
fn update_status(
&self,
wtxn: &mut RwTxn,
status: Status,
f: impl Fn(RoaringBitmap) -> RoaringBitmap,
) -> Result<()> {
let tasks = self.status.get(&wtxn, &status)?.unwrap_or_default();
let tasks = f(tasks);
self.status.put(wtxn, &status, &tasks)?;
Ok(())
}
fn update_kind(
&self,
wtxn: &mut RwTxn,
kind: &Kind,
f: impl Fn(RoaringBitmap) -> RoaringBitmap,
) -> Result<()> {
let kind = BEU32::new(kind.to_u32());
let tasks = self.kind.get(&wtxn, &kind)?.unwrap_or_default();
let tasks = f(tasks);
self.kind.put(wtxn, &kind, &tasks)?;
Ok(())
}
}

View file

@ -0,0 +1,3 @@
fn main() {
println!("Hello, world!");
}

141
index-scheduler/src/task.rs Normal file
View file

@ -0,0 +1,141 @@
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use time::OffsetDateTime;
use crate::TaskId;
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum Status {
Enqueued,
Processing,
Succeeded,
Failed,
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Task {
#[serde(with = "time::serde::rfc3339::option")]
pub enqueued_at: Option<OffsetDateTime>,
#[serde(with = "time::serde::rfc3339::option")]
pub started_at: Option<OffsetDateTime>,
#[serde(with = "time::serde::rfc3339::option")]
pub finished_at: Option<OffsetDateTime>,
pub status: Status,
pub kind: Kind,
}
impl Task {
pub fn persist(&self) -> Result<()> {
self.kind.persist()
}
pub fn remove_data(&self) -> Result<()> {
self.kind.remove_data()
}
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum Kind {
DumpExport {
output: PathBuf,
},
DocumentAddition {
index_name: String,
content_file: String,
},
DocumentDeletion {
index_name: String,
documents_ids: Vec<String>,
},
ClearAllDocuments {
index_name: String,
},
// TODO: TAMO: uncomment the settings
// Settings {
// index_name: String,
// new_settings: Settings,
// },
RenameIndex {
index_name: String,
new_name: String,
},
CreateIndex {
index_name: String,
primary_key: Option<String>,
},
DeleteIndex {
index_name: String,
},
SwapIndex {
lhs: String,
rhs: String,
},
CancelTask {
tasks: Vec<TaskId>,
},
}
impl Kind {
pub fn persist(&self) -> Result<()> {
match self {
Kind::DocumentAddition {
index_name,
content_file,
} => {
// TODO: TAMO: persist the file
// content_file.persist();
Ok(())
}
// There is nothing to persist for all these tasks
Kind::DumpExport { .. }
| Kind::DocumentDeletion { .. }
| Kind::ClearAllDocuments { .. }
| Kind::RenameIndex { .. }
| Kind::CreateIndex { .. }
| Kind::DeleteIndex { .. }
| Kind::SwapIndex { .. }
| Kind::CancelTask { .. } => Ok(()),
}
}
pub fn remove_data(&self) -> Result<()> {
match self {
Kind::DocumentAddition {
index_name,
content_file,
} => {
// TODO: TAMO: delete the file
// content_file.delete();
Ok(())
}
// There is no data associated with all these tasks
Kind::DumpExport { .. }
| Kind::DocumentDeletion { .. }
| Kind::ClearAllDocuments { .. }
| Kind::RenameIndex { .. }
| Kind::CreateIndex { .. }
| Kind::DeleteIndex { .. }
| Kind::SwapIndex { .. }
| Kind::CancelTask { .. } => Ok(()),
}
}
pub fn to_u32(&self) -> u32 {
match self {
Kind::DumpExport { .. } => 0,
Kind::DocumentAddition { .. } => 1,
Kind::DocumentDeletion { .. } => 2,
Kind::ClearAllDocuments { .. } => 3,
Kind::RenameIndex { .. } => 4,
Kind::CreateIndex { .. } => 5,
Kind::DeleteIndex { .. } => 6,
Kind::SwapIndex { .. } => 7,
Kind::CancelTask { .. } => 8,
}
}
}