WIP: refactor IndexController

change the architecture of the index controller to allow it to own an
index store.
This commit is contained in:
mpostma 2021-01-16 15:09:48 +01:00
parent 686f987180
commit 6a3f625e11
No known key found for this signature in database
GPG key ID: CBC8A7C1D7A28C3A
15 changed files with 1197 additions and 287 deletions

View file

@ -0,0 +1,255 @@
use std::fs::File;
use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::collections::HashMap;
use anyhow::Result;
use chrono::{DateTime, Utc};
use dashmap::DashMap;
use heed::types::{Str, SerdeBincode};
use heed::{EnvOpenOptions, Env, Database};
use milli::{Index, FieldsIdsMap, SearchResult, FieldId, facet::FacetType};
use serde::{Serialize, Deserialize};
use ouroboros::self_referencing;
use crate::data::SearchQuery;
/// Name of the JSON file, inside the store directory, that holds the serialized
/// `IndexStoreMeta`.
const CONTROLLER_META_FILENAME: &str = "index_controller_meta";
/// Name under which the heed environment holding the indexes database is opened.
const INDEXES_CONTROLLER_FILENAME: &str = "indexes_db";
/// Name of the database, inside that environment, mapping index names to `IndexMetadata`.
const INDEXES_DB_NAME: &str = "indexes_db";
/// Metadata describing how the index store was opened.
///
/// It is written as JSON by `IndexStoreMeta::to_path` and read back by
/// `IndexStoreMeta::from_path`.
#[derive(Debug, Serialize, Deserialize)]
struct IndexStoreMeta {
/// Options used to open the environment of the store.
open_options: EnvOpenOptions,
/// Time at which this metadata was created.
created_at: DateTime<Utc>,
}
impl IndexStoreMeta {
    /// Loads the store metadata from the `index_controller_meta` file located in `path`.
    ///
    /// Returns `Ok(None)` when that file does not exist.
    ///
    /// # Errors
    ///
    /// Returns an error if the file exists but cannot be opened or read, or if its content is
    /// not valid JSON for an `IndexStoreMeta`.
    fn from_path(path: impl AsRef<Path>) -> Result<Option<IndexStoreMeta>> {
        let path = path.as_ref().join(CONTROLLER_META_FILENAME);
        // Open directly and treat `NotFound` as "no metadata yet" instead of checking `exists()`
        // first: the file could appear or vanish between the check and the open.
        let mut file = match File::open(&path) {
            Ok(file) => file,
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
            Err(e) => return Err(e.into()),
        };
        let mut buffer = Vec::new();
        file.read_to_end(&mut buffer)?;
        let meta: IndexStoreMeta = serde_json::from_slice(&buffer)?;
        Ok(Some(meta))
    }

    /// Writes the metadata as JSON to the `index_controller_meta` file located in `path`.
    ///
    /// # Errors
    ///
    /// Returns an error if the metadata file already exists (it is never overwritten), if the
    /// metadata cannot be serialized, or if the file cannot be created or written.
    fn to_path(self, path: impl AsRef<Path>) -> Result<()> {
        let path = path.as_ref().join(CONTROLLER_META_FILENAME);
        // Serialize before touching the filesystem, so that a serialization failure does not
        // leave an empty metadata file behind.
        let json = serde_json::to_vec(&self)?;
        // `create_new` makes "fail if it already exists" atomic, where `exists()` followed by
        // `File::create` could truncate a file created in between.
        let mut file = match std::fs::OpenOptions::new()
            .write(true)
            .create_new(true)
            .open(&path)
        {
            Ok(file) => file,
            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
                return Err(anyhow::anyhow!("Index controller metadata already exists"));
            }
            Err(e) => return Err(e.into()),
        };
        file.write_all(&json)?;
        Ok(())
    }
}
/// Metadata stored in the indexes database for each index; it holds what is needed to open the
/// index (see `IndexMetadata::open_index`).
#[derive(Debug, Serialize, Deserialize)]
pub struct IndexMetadata {
/// Time at which the index was created.
created_at: DateTime<Utc>,
/// Options used to open the index.
open_options: EnvOpenOptions,
/// Identifier of the index, used as the name of its directory under `indexes`.
uuid: String,
}
impl IndexMetadata {
    /// Opens the index described by this metadata.
    ///
    /// The index is looked up in `<path>/indexes/<uuid>` and opened with the stored
    /// `open_options`. The metadata is consumed in the process.
    fn open_index(self, path: impl AsRef<Path>) -> Result<Index> {
        let index_path = path.as_ref().join("indexes").join(&self.uuid);
        let index = Index::new(self.open_options, index_path)?;
        Ok(index)
    }
}
/// Read-only view over an index.
///
/// Bundles a shared handle to an index with a read transaction opened on it; every read method
/// of the view goes through that transaction. The struct is self-referencing: `txn` borrows
/// from `index`.
#[self_referencing]
pub struct IndexView {
// Shared handle to the underlying index.
pub index: Arc<Index>,
// Read transaction borrowed from `index`.
#[borrows(index)]
#[covariant]
pub txn: heed::RoTxn<'this>,
// Uuid of the index, as recorded in its `IndexMetadata`.
uuid: String,
}
impl IndexView {
    /// Runs `search_query` against the index, inside the read transaction held by this view.
    ///
    /// The query string and the offset are only applied when present in `search_query`; the
    /// limit is always applied.
    pub fn search(&self, search_query: &SearchQuery) -> Result<SearchResult> {
        let mut search = self.borrow_index().search(self.borrow_txn());

        if let Some(query) = search_query.q.as_ref() {
            search.query(query);
        }

        if let Some(offset) = search_query.offset {
            search.offset(offset);
        }

        search.limit(search_query.limit);

        let result = search.execute()?;
        Ok(result)
    }

    /// Returns the fields ids map of the index.
    #[inline]
    pub fn fields_ids_map(&self) -> Result<FieldsIdsMap> {
        let map = self.borrow_index().fields_ids_map(self.borrow_txn())?;
        Ok(map)
    }

    /// Returns the ids of the displayed fields, or `None` if the index reports none.
    #[inline]
    pub fn displayed_fields_ids(&self) -> Result<Option<Vec<FieldId>>> {
        let ids = self.borrow_index().displayed_fields_ids(self.borrow_txn())?;
        Ok(ids)
    }

    /// Returns the names of the displayed fields as owned strings, or `None` if the index
    /// reports none.
    #[inline]
    pub fn displayed_fields(&self) -> Result<Option<Vec<String>>> {
        let fields = self.borrow_index().displayed_fields(self.borrow_txn())?;
        Ok(fields.map(|names| names.into_iter().map(String::from).collect()))
    }

    /// Returns the names of the searchable fields as owned strings, or `None` if the index
    /// reports none.
    #[inline]
    pub fn searchable_fields(&self) -> Result<Option<Vec<String>>> {
        let fields = self.borrow_index().searchable_fields(self.borrow_txn())?;
        Ok(fields.map(|names| names.into_iter().map(String::from).collect()))
    }

    /// Returns the faceted fields of the index, together with their facet type.
    #[inline]
    pub fn faceted_fields(&self) -> Result<HashMap<String, FacetType>> {
        let faceted = self.borrow_index().faceted_fields(self.borrow_txn())?;
        Ok(faceted)
    }

    /// Fetches the documents with the given internal `ids`.
    ///
    /// The returned readers borrow from this view.
    pub fn documents(&self, ids: &[u32]) -> Result<Vec<(u32, obkv::KvReader<'_>)>> {
        let documents = self
            .borrow_index()
            .documents(self.borrow_txn(), ids.iter().copied())?;
        Ok(documents)
    }

    // TODO: draft of the document addition entry point, kept commented out for now.
    //
    // pub async fn add_documents<B, E>(
    //     &self,
    //     method: IndexDocumentsMethod,
    //     format: UpdateFormat,
    //     mut stream: impl futures::Stream<Item=Result<B, E>> + Unpin,
    // ) -> anyhow::Result<UpdateStatusResponse>
    // where
    //     B: Deref<Target = [u8]>,
    //     E: std::error::Error + Send + Sync + 'static,
    // {
    //     let file = tokio::task::spawn_blocking(tempfile::tempfile).await?;
    //     let file = tokio::fs::File::from_std(file?);
    //     let mut encoder = GzipEncoder::new(file);
    //     while let Some(result) = stream.next().await {
    //         let bytes = &*result?;
    //         encoder.write_all(&bytes[..]).await?;
    //     }
    //     encoder.shutdown().await?;
    //     let mut file = encoder.into_inner();
    //     file.sync_all().await?;
    //     let file = file.into_std().await;
    //     let mmap = unsafe { memmap::Mmap::map(&file)? };
    //     let meta = UpdateMeta::DocumentsAddition { method, format };
    //     let index = self.index.clone();
    //     let queue = self.update_store.clone();
    //     let update = tokio::task::spawn_blocking(move || queue.register_update(index, meta, &mmap[..])).await??;
    //     Ok(update.into())
    // }
}
/// Store giving access to the indexes by name.
///
/// Index metadata is persisted in a heed database, and the indexes that have already been opened
/// are cached in memory.
pub struct IndexStore {
/// Root path of the store; indexes are opened relative to it.
path: PathBuf,
/// Environment holding `indexes_db`.
env: Env,
/// Database mapping an index name to its `IndexMetadata`.
indexes_db: Database<Str, SerdeBincode<IndexMetadata>>,
/// Cache of the opened indexes: index name -> (index uuid, shared index handle).
indexes: DashMap<String, (String, Arc<Index>)>,
}
impl IndexStore {
/// Open the index controller from meta found at path, and create a new one if no meta is
/// found.
pub fn new(path: impl AsRef<Path>) -> Result<Self> {
// If index controller metadata is present, we return the env, otherwise, we create a new
// metadata from scratch before returning a new env.
let path = path.as_ref().to_path_buf();
let env = match IndexStoreMeta::from_path(&path)? {
Some(meta) => meta.open_options.open(INDEXES_CONTROLLER_FILENAME)?,
None => {
let mut open_options = EnvOpenOptions::new();
open_options.map_size(page_size::get() * 1000);
let env = open_options.open(INDEXES_CONTROLLER_FILENAME)?;
let created_at = Utc::now();
let meta = IndexStoreMeta { open_options: open_options.clone(), created_at };
meta.to_path(&path)?;
env
}
};
let indexes = DashMap::new();
let indexes_db = match env.open_database(Some(INDEXES_DB_NAME))? {
Some(indexes_db) => indexes_db,
None => env.create_database(Some(INDEXES_DB_NAME))?,
};
Ok(Self { env, indexes, indexes_db, path })
}
pub fn get_or_create<S: AsRef<str>>(&self, _name: S) -> Result<IndexView> {
todo!()
}
/// Get an index with read access to the db. The index are lazily loaded, meaning that we first
/// check for its exixtence in the indexes map, and if it doesn't exist, the index db is check
/// for metadata to launch the index.
pub fn get<S: AsRef<str>>(&self, name: S) -> Result<Option<IndexView>> {
match self.indexes.get(name.as_ref()) {
Some(entry) => {
let index = entry.1.clone();
let uuid = entry.0.clone();
let view = IndexView::try_new(index, |index| index.read_txn(), uuid)?;
Ok(Some(view))
}
None => {
let txn = self.env.read_txn()?;
match self.indexes_db.get(&txn, name.as_ref())? {
Some(meta) => {
let uuid = meta.uuid.clone();
let index = Arc::new(meta.open_index(&self.path)?);
self.indexes.insert(name.as_ref().to_owned(), (uuid.clone(), index.clone()));
let view = IndexView::try_new(index, |index| index.read_txn(), uuid)?;
Ok(Some(view))
}
None => Ok(None)
}
}
}
}
pub fn get_mut<S: AsRef<str>>(&self, _name: S) -> Result<Option<IndexView>> {
todo!()
}
pub async fn delete_index<S: AsRef<str>>(&self, _name:S) -> Result<()> {
todo!()
}
pub async fn list_indices(&self) -> Result<Vec<(String, IndexMetadata)>> {
todo!()
}
pub async fn rename_index(&self, _old: &str, _new: &str) -> Result<()> {
todo!()
}
}

View file

@ -1,196 +1,145 @@
use std::fs::File;
use std::io::{Read, Write};
use std::path::{Path, PathBuf};
mod index_store;
mod update_store;
pub use index_store::IndexStore;
pub use update_store::UpdateStore;
use std::num::NonZeroUsize;
use std::ops::Deref;
use std::collections::HashMap;
use anyhow::Result;
use chrono::{DateTime, Utc};
use dashmap::DashMap;
use dashmap::mapref::one::Ref;
use heed::types::{Str, SerdeBincode};
use heed::{EnvOpenOptions, Env, Database};
use milli::{Index, FieldsIdsMap, SearchResult, FieldId};
use serde::{Serialize, Deserialize};
use milli::update::{IndexDocumentsMethod, UpdateFormat};
use milli::update_store::{Processed, Processing, Failed, Pending, Aborted};
use serde::{Serialize, Deserialize, de::Deserializer};
use crate::data::SearchQuery;
/// Concrete `UpdateStatus` returned by the write operations of `IndexController`.
pub type UpdateStatusResponse = UpdateStatus<UpdateMeta, UpdateResult, String>;
const CONTROLLER_META_FILENAME: &str = "index_controller_meta";
const INDEXES_CONTROLLER_FILENAME: &str = "indexes_db";
const INDEXES_DB_NAME: &str = "indexes_db";
pub trait UpdateStore {}
pub struct IndexController<U> {
path: PathBuf,
update_store: U,
env: Env,
indexes_db: Database<Str, SerdeBincode<IndexMetadata>>,
indexes: DashMap<String, Index>,
/// Kind of update registered for an index, along with its parameters.
///
/// Serialized as an internally tagged enum: the variant name is stored under the `type` key.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum UpdateMeta {
/// Addition of documents, with the indexing method and the format of the payload.
DocumentsAddition { method: IndexDocumentsMethod, format: UpdateFormat },
/// Removal of all the documents.
ClearDocuments,
/// Update of the index settings.
Settings(Settings),
/// Update of the facets parameters.
Facets(Facets),
}
#[derive(Debug, Serialize, Deserialize)]
struct IndexControllerMeta {
open_options: EnvOpenOptions,
created_at: DateTime<Utc>,
/// Parameters of a facets update.
///
/// Keys are camelCase in the serialized form (`levelGroupSize`, `minLevelSize`), and unknown
/// keys are rejected at deserialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct Facets {
// NOTE(review): presumably forwarded as-is to milli's facets update — confirm against the
// update handler.
pub level_group_size: Option<NonZeroUsize>,
pub min_level_size: Option<NonZeroUsize>,
}
impl IndexControllerMeta {
fn from_path(path: impl AsRef<Path>) -> Result<Option<IndexControllerMeta>> {
let mut path = path.as_ref().to_path_buf();
path.push(CONTROLLER_META_FILENAME);
if path.exists() {
let mut file = File::open(path)?;
let mut buffer = Vec::new();
let n = file.read_to_end(&mut buffer)?;
let meta: IndexControllerMeta = serde_json::from_slice(&buffer[..n])?;
Ok(Some(meta))
} else {
Ok(None)
}
}
/// Status of an update, identified by its `update_id`.
///
/// `M` is the update metadata carried by `Pending`, `Processed` and `Aborted`, `P` is the payload
/// of an update in progress, and `N` is the second type parameter of `Processed`.
///
/// Serialized as an internally tagged enum: the variant name is stored under the `type` key.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type")]
pub enum UpdateStatus<M, P, N> {
Pending { update_id: u64, meta: Pending<M> },
Progressing { update_id: u64, meta: P },
Processed { update_id: u64, meta: Processed<M, N> },
Aborted { update_id: u64, meta: Aborted<M> },
}
fn to_path(self, path: impl AsRef<Path>) -> Result<()> {
let mut path = path.as_ref().to_path_buf();
path.push(CONTROLLER_META_FILENAME);
if path.exists() {
Err(anyhow::anyhow!("Index controller metadata already exists"))
} else {
let mut file = File::create(path)?;
let json = serde_json::to_vec(&self)?;
file.write_all(&json)?;
Ok(())
fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
where T: Deserialize<'de>,
D: Deserializer<'de>
{
Deserialize::deserialize(deserializer).map(Some)
}
/// Settings update for an index.
///
/// Every field is an `Option<Option<_>>`, which allows telling three cases apart:
/// - `None`: the key was absent from the payload;
/// - `Some(None)`: the key was present with a `null` value;
/// - `Some(Some(value))`: the key was present with a value.
///
/// Keys are camelCase in the serialized form, unknown keys are rejected at deserialization, and
/// fields set to `None` are skipped at serialization.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct Settings {
    #[serde(
        default,
        deserialize_with = "deserialize_some",
        skip_serializing_if = "Option::is_none",
    )]
    pub displayed_attributes: Option<Option<Vec<String>>>,

    #[serde(
        default,
        deserialize_with = "deserialize_some",
        skip_serializing_if = "Option::is_none",
    )]
    pub searchable_attributes: Option<Option<Vec<String>>>,

    // Same attributes as the other fields: without `deserialize_some`, an explicit `null` would
    // be deserialized as `None` and could not be told apart from a missing key.
    #[serde(
        default,
        deserialize_with = "deserialize_some",
        skip_serializing_if = "Option::is_none",
    )]
    pub faceted_attributes: Option<Option<HashMap<String, String>>>,

    #[serde(
        default,
        deserialize_with = "deserialize_some",
        skip_serializing_if = "Option::is_none",
    )]
    pub criteria: Option<Option<Vec<String>>>,
}
impl Settings {
    /// Returns settings in which every field is `Some(None)`.
    pub fn cleared() -> Self {
        // `Some(None)` for any inner type, so that each field is spelled the same way.
        fn reset<T>() -> Option<Option<T>> {
            Some(None)
        }

        Self {
            displayed_attributes: reset(),
            searchable_attributes: reset(),
            faceted_attributes: reset(),
            criteria: reset(),
        }
    }
}
#[derive(Debug, Serialize, Deserialize)]
struct IndexMetadata {
created_at: DateTime<Utc>,
open_options: EnvOpenOptions,
id: String,
/// Result attached to an update once it has been processed.
///
/// `Other` is the only variant for now; the `DocumentsAddition` variant is commented out.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateResult {
//DocumentsAddition(DocumentAdditionResult),
Other,
}
impl IndexMetadata {
fn open_index(&self, path: impl AsRef<Path>) -> Result<Index> {
// create a path in the form "db_path/indexes/index_id"
let mut path = path.as_ref().to_path_buf();
path.push("indexes");
path.push(&self.id);
Ok(Index::new(self.open_options, path)?)
}
}
/// The `IndexController` is in charge of the access to the underlying indices. It splits the logic
/// for read access which is provided, and write access which must be provided. This allows the
/// implementer to define the behaviour of write accesses to the indices, and abstract the
/// scheduling of the updates. The implementer must be able to provide an instance of `IndexStore`
pub trait IndexController: Deref<Target = IndexStore> {
struct IndexView<'a, U> {
txn: heed::RoTxn<'a>,
index: Ref<'a, String, Index>,
update_store: &'a U,
}
/*
* Write operations
*
* Logic for the write operation need to be provided by the implementer, since they can be made
* asynchronous thanks to an update_store for example.
*
* */
impl<'a, U: UpdateStore> IndexView<'a, U> {
pub fn search(&self, search_query: SearchQuery) -> Result<SearchResult> {
let mut search = self.index.search(&self.txn);
if let Some(query) = &search_query.q {
search.query(query);
}
/// Perform document addition on the database. If the provided index does not exist, it will be
/// created when the addition is applied to the index.
fn add_documents<S: AsRef<str>>(
&self,
index: S,
method: IndexDocumentsMethod,
format: UpdateFormat,
data: &[u8],
) -> anyhow::Result<UpdateStatusResponse>;
if let Some(offset) = search_query.offset {
search.offset(offset);
}
/// Updates an index settings. If the index does not exist, it will be created when the update
/// is applied to the index.
fn update_settings<S: AsRef<str>>(&self, index_uid: S, settings: Settings) -> anyhow::Result<UpdateStatusResponse>;
let limit = search_query.limit;
search.limit(limit);
/// Create an index with the given `index_uid`.
fn create_index<S: AsRef<str>>(&self, index_uid: S) -> Result<()>;
Ok(search.execute()?)
}
/// Delete index with the given `index_uid`, attempting to close it beforehand.
fn delete_index<S: AsRef<str>>(&self, index_uid: S) -> Result<()>;
pub fn fields_ids_map(&self) -> Result<FieldsIdsMap> {
Ok(self.index.fields_ids_map(&self.txn)?)
}
/// Swap two indexes, concretely, it simply swaps the index the names point to.
fn swap_indices<S1: AsRef<str>, S2: AsRef<str>>(&self, index1_uid: S1, index2_uid: S2) -> Result<()>;
pub fn fields_displayed_fields_ids(&self) -> Result<Option<Vec<FieldId>>> {
Ok(self.index.displayed_fields_ids(&self.txn)?)
}
pub fn documents(&self, ids: Vec<u32>) -> Result<Vec<(u32, obkv::KvReader<'_>)>> {
Ok(self.index.documents(&self.txn, ids)?)
}
}
impl<U: UpdateStore> IndexController<U> {
/// Open the index controller from meta found at path, and create a new one if no meta is
/// found.
pub fn new(path: impl AsRef<Path>, update_store: U) -> Result<Self> {
// If index controller metadata is present, we return the env, otherwise, we create a new
// metadata from scratch before returning a new env.
let path = path.as_ref().to_path_buf();
let env = match IndexControllerMeta::from_path(&path)? {
Some(meta) => meta.open_options.open(INDEXES_CONTROLLER_FILENAME)?,
None => {
let open_options = EnvOpenOptions::new()
.map_size(page_size::get() * 1000);
let env = open_options.open(INDEXES_CONTROLLER_FILENAME)?;
let created_at = Utc::now();
let meta = IndexControllerMeta { open_options: open_options.clone(), created_at };
meta.to_path(path)?;
env
}
};
let indexes = DashMap::new();
let indexes_db = match env.open_database(Some(INDEXES_DB_NAME))? {
Some(indexes_db) => indexes_db,
None => env.create_database(Some(INDEXES_DB_NAME))?,
};
Ok(Self { env, indexes, indexes_db, update_store, path })
}
pub fn get_or_create<S: AsRef<str>>(&mut self, name: S) -> Result<IndexView<'_, U>> {
todo!()
}
/// Get an index with read access to the db. Indexes are lazily loaded, meaning that we first
/// check for its existence in the indexes map, and if it doesn't exist, the index db is checked
/// for metadata to launch the index.
pub fn get<S: AsRef<str>>(&self, name: S) -> Result<Option<IndexView<'_, U>>> {
let update_store = &self.update_store;
match self.indexes.get(name.as_ref()) {
Some(index) => {
let txn = index.read_txn()?;
Ok(Some(IndexView { index, update_store, txn }))
}
None => {
let txn = self.env.read_txn()?;
match self.indexes_db.get(&txn, name.as_ref())? {
Some(meta) => {
let index = meta.open_index(self.path)?;
self.indexes.insert(name.as_ref().to_owned(), index);
// TODO: create index view
match self.indexes.get(name.as_ref()) {
Some(index) => {
let txn = index.read_txn()?;
Ok(Some(IndexView { index, txn, update_store }))
}
None => Ok(None)
}
}
None => Ok(None)
}
}
}
}
pub fn get_mut<S: AsRef<str>>(&self, name: S) -> Result<Option<IndexView<'_, U>>> {
todo!()
}
pub async fn delete_index<S: AsRef<str>>(&self, name:S) -> Result<()> {
todo!()
}
pub async fn list_indices(&self) -> Result<Vec<(String, IndexMetadata)>> {
todo!()
}
pub async fn rename_index(&self, old: &str, new: &str) -> Result<()> {
/// Apply an update to the given index. This method can be called when an update is ready to be
/// processed
fn handle_update<S: AsRef<str>>(
&self,
_index: S,
_update_id: u64,
_meta: Processing<UpdateMeta>,
_content: &[u8]
) -> Result<Processed<UpdateMeta, UpdateResult>, Failed<UpdateMeta, String>> {
todo!()
}
}

View file

@ -0,0 +1,49 @@
use std::ops::Deref;
use super::{IndexStore, IndexController};
/// `IndexController` implementation wrapping an `IndexStore`.
pub struct UpdateStore {
/// Store this controller dereferences to.
index_store: IndexStore,
}
/// Gives access to the wrapped `IndexStore`, as required by the `Deref<Target = IndexStore>`
/// supertrait of `IndexController`.
impl Deref for UpdateStore {
    type Target = IndexStore;

    fn deref(&self) -> &IndexStore {
        let Self { index_store } = self;
        index_store
    }
}
impl UpdateStore {
pub fn new(index_store: IndexStore) -> Self {
Self { index_store }
}
}
impl IndexController for UpdateStore {
fn add_documents<S: AsRef<str>>(
&self,
_index: S,
_method: milli::update::IndexDocumentsMethod,
_format: milli::update::UpdateFormat,
_data: &[u8],
) -> anyhow::Result<crate::index_controller::UpdateStatusResponse> {
todo!()
}
fn update_settings<S: AsRef<str>>(&self, _index_uid: S, _settings: crate::index_controller::Settings) -> anyhow::Result<crate::index_controller::UpdateStatusResponse> {
todo!()
}
fn create_index<S: AsRef<str>>(&self, _index_uid: S) -> anyhow::Result<()> {
todo!()
}
fn delete_index<S: AsRef<str>>(&self, _index_uid: S) -> anyhow::Result<()> {
todo!()
}
fn swap_indices<S1: AsRef<str>, S2: AsRef<str>>(&self, _index1_uid: S1, _index2_uid: S2) -> anyhow::Result<()> {
todo!()
}
}