MeiliSearch/meilisearch-lib/src/index/mod.rs

288 lines
8.8 KiB
Rust
Raw Normal View History

2021-05-31 16:40:59 +02:00
use std::collections::{BTreeSet, HashSet};
use std::fs::create_dir_all;
use std::marker::PhantomData;
2021-03-04 11:56:32 +01:00
use std::ops::Deref;
2021-05-31 16:40:59 +02:00
use std::path::Path;
use std::sync::Arc;
2021-03-04 11:56:32 +01:00
2021-09-24 11:53:11 +02:00
use chrono::{DateTime, Utc};
2021-05-26 22:52:06 +02:00
use heed::{EnvOpenOptions, RoTxn};
2021-08-24 20:55:29 +02:00
use milli::update::Setting;
2021-09-28 22:22:59 +02:00
use milli::{obkv_to_json, FieldDistribution, FieldId};
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
2021-03-04 14:20:19 +01:00
use error::Result;
2021-06-23 14:48:33 +02:00
pub use search::{default_crop_length, SearchQuery, SearchResult, DEFAULT_SEARCH_LIMIT};
2021-09-28 22:22:59 +02:00
pub use updates::{apply_settings_to_builder, Checked, Facets, Settings, Unchecked};
2021-09-24 11:53:11 +02:00
use uuid::Uuid;
2021-03-04 11:56:32 +01:00
2021-09-14 18:39:02 +02:00
use crate::index_controller::update_file_store::UpdateFileStore;
2021-09-28 22:22:59 +02:00
use crate::EnvSizer;
2021-08-24 20:55:29 +02:00
use self::error::IndexError;
2021-09-24 11:53:11 +02:00
use self::update_handler::UpdateHandler;
pub mod error;
pub mod update_handler;
2021-05-26 22:52:06 +02:00
mod dump;
2021-05-31 16:03:39 +02:00
mod search;
mod updates;
2021-04-01 16:44:42 +02:00
2021-03-04 14:20:19 +01:00
/// A JSON document, represented as a `serde_json` object mapping field names
/// (`String`) to JSON values.
pub type Document = Map<String, Value>;
2021-09-24 11:53:11 +02:00
/// Metadata of an index: its creation date, last update date and primary key.
///
/// (De)serialized with camelCase field names.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct IndexMeta {
/// Creation date, as returned by the index's `created_at`.
created_at: DateTime<Utc>,
/// Last update date, as returned by the index's `updated_at`.
pub updated_at: DateTime<Utc>,
/// Name of the primary key field, or `None` when the index reports none.
pub primary_key: Option<String>,
}
/// Statistics about an index, serialized with camelCase field names.
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct IndexStats {
/// Size of the index as reported by `Index::size`. Skipped during serialization.
#[serde(skip)]
pub size: u64,
/// Number of documents stored in the index.
pub number_of_documents: u64,
/// Whether the current index is performing an update. It is initially `None` when the
/// index returns it, since it is the `UpdateStore` that knows what index is currently indexing. It is
/// later set to either true or false, when we retrieve the information from the `UpdateStore`.
pub is_indexing: Option<bool>,
/// Field distribution of the index, as returned by its `field_distribution`.
pub field_distribution: FieldDistribution,
}
impl IndexMeta {
    /// Reads the metadata of `index` inside a freshly opened read transaction.
    ///
    /// # Errors
    ///
    /// Fails if the read transaction cannot be opened or if any of the
    /// metadata lookups fails.
    pub fn new(index: &Index) -> Result<Self> {
        let rtxn = index.read_txn()?;
        Self::new_txn(index, &rtxn)
    }

    /// Reads the metadata of `index` using the caller-provided read transaction.
    ///
    /// The values are fetched in order: creation date, update date, primary key.
    fn new_txn(index: &Index, txn: &heed::RoTxn) -> Result<Self> {
        Ok(Self {
            created_at: index.created_at(txn)?,
            updated_at: index.updated_at(txn)?,
            // The index hands back a borrowed key; store an owned copy.
            primary_key: index.primary_key(txn)?.map(String::from),
        })
    }
}
2021-09-27 16:48:03 +02:00
#[derive(Clone, derivative::Derivative)]
#[derivative(Debug)]
2021-09-14 18:39:02 +02:00
pub struct Index {
2021-09-24 11:53:11 +02:00
pub uuid: Uuid,
2021-09-28 22:22:59 +02:00
#[derivative(Debug = "ignore")]
2021-09-14 18:39:02 +02:00
pub inner: Arc<milli::Index>,
2021-09-28 22:22:59 +02:00
#[derivative(Debug = "ignore")]
2021-09-14 18:39:02 +02:00
update_file_store: Arc<UpdateFileStore>,
2021-09-28 22:22:59 +02:00
#[derivative(Debug = "ignore")]
2021-09-24 11:53:11 +02:00
update_handler: Arc<UpdateHandler>,
2021-09-14 18:39:02 +02:00
}
2021-03-04 11:56:32 +01:00
/// Lets an [`Index`] be used wherever a `&milli::Index` is expected, so the
/// milli methods can be called directly on the wrapper.
impl Deref for Index {
    type Target = milli::Index;

    fn deref(&self) -> &Self::Target {
        // `&Arc<milli::Index>` coerces to `&milli::Index`.
        &self.inner
    }
}
2021-03-04 12:38:55 +01:00
impl Index {
2021-09-28 22:22:59 +02:00
pub fn open(
path: impl AsRef<Path>,
size: usize,
update_file_store: Arc<UpdateFileStore>,
uuid: Uuid,
update_handler: Arc<UpdateHandler>,
) -> Result<Self> {
2021-05-31 16:40:59 +02:00
create_dir_all(&path)?;
2021-05-26 22:52:06 +02:00
let mut options = EnvOpenOptions::new();
options.map_size(size);
2021-09-14 18:39:02 +02:00
let inner = Arc::new(milli::Index::new(options, &path)?);
2021-09-28 22:22:59 +02:00
Ok(Index {
inner,
update_file_store,
uuid,
update_handler,
})
2021-09-24 11:53:11 +02:00
}
pub fn stats(&self) -> Result<IndexStats> {
let rtxn = self.read_txn()?;
Ok(IndexStats {
size: self.size(),
number_of_documents: self.number_of_documents(&rtxn)?,
is_indexing: None,
field_distribution: self.field_distribution(&rtxn)?,
})
2021-05-26 22:52:06 +02:00
}
2021-09-24 11:53:11 +02:00
/// Returns the metadata of this index, as built by `IndexMeta::new`.
pub fn meta(&self) -> Result<IndexMeta> {
IndexMeta::new(self)
}
/// Returns the current settings of the index, read inside a freshly opened
/// read transaction.
///
/// # Errors
///
/// Fails if the read transaction cannot be opened or if reading a setting fails.
pub fn settings(&self) -> Result<Settings<Checked>> {
    let rtxn = self.read_txn()?;
    self.settings_txn(&rtxn)
}
2021-03-04 12:38:55 +01:00
pub fn settings_txn(&self, txn: &RoTxn) -> Result<Settings<Checked>> {
2021-03-04 12:38:55 +01:00
let displayed_attributes = self
2021-07-29 18:14:36 +02:00
.displayed_fields(txn)?
2021-05-11 11:47:04 +02:00
.map(|fields| fields.into_iter().map(String::from).collect());
2021-03-04 12:38:55 +01:00
let searchable_attributes = self
2021-07-29 18:14:36 +02:00
.searchable_fields(txn)?
2021-05-11 11:47:04 +02:00
.map(|fields| fields.into_iter().map(String::from).collect());
2021-03-04 12:38:55 +01:00
2021-07-29 18:14:36 +02:00
let filterable_attributes = self.filterable_fields(txn)?.into_iter().collect();
2021-03-04 12:38:55 +01:00
let sortable_attributes = self.sortable_fields(txn)?.into_iter().collect();
2021-03-11 22:39:16 +01:00
let criteria = self
2021-07-29 18:14:36 +02:00
.criteria(txn)?
2021-03-11 22:39:16 +01:00
.into_iter()
.map(|c| c.to_string())
.collect();
let stop_words = self
2021-07-29 18:14:36 +02:00
.stop_words(txn)?
.map(|stop_words| -> Result<BTreeSet<_>> {
Ok(stop_words.stream().into_strs()?.into_iter().collect())
})
.transpose()?
.unwrap_or_else(BTreeSet::new);
2021-07-29 18:14:36 +02:00
let distinct_field = self.distinct_field(txn)?.map(String::from);
2021-06-14 10:38:56 +02:00
// in milli each word in the synonyms map were split on their separator. Since we lost
// this information we are going to put space between words.
2021-06-03 14:19:56 +02:00
let synonyms = self
2021-07-29 18:14:36 +02:00
.synonyms(txn)?
2021-06-03 14:19:56 +02:00
.iter()
.map(|(key, values)| {
(
key.join(" "),
values.iter().map(|value| value.join(" ")).collect(),
)
})
.collect();
2021-03-04 12:38:55 +01:00
Ok(Settings {
2021-08-24 20:55:29 +02:00
displayed_attributes: match displayed_attributes {
Some(attrs) => Setting::Set(attrs),
None => Setting::Reset,
},
searchable_attributes: match searchable_attributes {
Some(attrs) => Setting::Set(attrs),
None => Setting::Reset,
},
filterable_attributes: Setting::Set(filterable_attributes),
sortable_attributes: Setting::Set(sortable_attributes),
2021-08-24 20:55:29 +02:00
ranking_rules: Setting::Set(criteria),
stop_words: Setting::Set(stop_words),
distinct_attribute: match distinct_field {
Some(field) => Setting::Set(field),
None => Setting::Reset,
},
synonyms: Setting::Set(synonyms),
2021-05-10 17:30:09 +02:00
_kind: PhantomData,
2021-03-04 12:38:55 +01:00
})
}
2021-03-04 14:20:19 +01:00
2021-03-15 18:11:10 +01:00
pub fn retrieve_documents<S: AsRef<str>>(
2021-03-04 14:20:19 +01:00
&self,
offset: usize,
limit: usize,
attributes_to_retrieve: Option<Vec<S>>,
) -> Result<Vec<Map<String, Value>>> {
2021-03-04 14:20:19 +01:00
let txn = self.read_txn()?;
let fields_ids_map = self.fields_ids_map(&txn)?;
2021-06-17 14:36:32 +02:00
let fields_to_display =
self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?;
2021-03-04 14:20:19 +01:00
let iter = self.documents.range(&txn, &(..))?.skip(offset).take(limit);
let mut documents = Vec::new();
for entry in iter {
let (_id, obkv) = entry?;
2021-06-17 14:36:32 +02:00
let object = obkv_to_json(&fields_to_display, &fields_ids_map, obkv)?;
2021-03-04 14:20:19 +01:00
documents.push(object);
}
Ok(documents)
}
2021-03-04 15:09:00 +01:00
pub fn retrieve_document<S: AsRef<str>>(
&self,
doc_id: String,
attributes_to_retrieve: Option<Vec<S>>,
) -> Result<Map<String, Value>> {
let txn = self.read_txn()?;
2021-03-04 15:09:00 +01:00
let fields_ids_map = self.fields_ids_map(&txn)?;
2021-03-04 15:09:00 +01:00
2021-06-17 14:36:32 +02:00
let fields_to_display =
self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?;
let internal_id = self
2021-06-17 14:36:32 +02:00
.external_documents_ids(&txn)?
.get(doc_id.as_bytes())
.ok_or_else(|| IndexError::DocumentNotFound(doc_id.clone()))?;
let document = self
2021-06-17 14:36:32 +02:00
.documents(&txn, std::iter::once(internal_id))?
.into_iter()
.next()
2021-06-17 14:36:32 +02:00
.map(|(_, d)| d)
.ok_or(IndexError::DocumentNotFound(doc_id))?;
let document = obkv_to_json(&fields_to_display, &fields_ids_map, document)?;
Ok(document)
2021-03-04 15:09:00 +01:00
}
pub fn size(&self) -> u64 {
self.env.size()
2021-04-01 16:44:42 +02:00
}
fn fields_to_display<S: AsRef<str>>(
&self,
txn: &heed::RoTxn,
2021-04-19 16:22:41 +02:00
attributes_to_retrieve: &Option<Vec<S>>,
fields_ids_map: &milli::FieldsIdsMap,
) -> Result<Vec<FieldId>> {
2021-07-29 18:14:36 +02:00
let mut displayed_fields_ids = match self.displayed_fields_ids(txn)? {
Some(ids) => ids.into_iter().collect::<Vec<_>>(),
None => fields_ids_map.iter().map(|(id, _)| id).collect(),
};
let attributes_to_retrieve_ids = match attributes_to_retrieve {
Some(attrs) => attrs
.iter()
.filter_map(|f| fields_ids_map.id(f.as_ref()))
.collect::<HashSet<_>>(),
None => fields_ids_map.iter().map(|(id, _)| id).collect(),
};
displayed_fields_ids.retain(|fid| attributes_to_retrieve_ids.contains(fid));
Ok(displayed_fields_ids)
}
2021-09-27 16:48:03 +02:00
pub fn snapshot(&self, path: impl AsRef<Path>) -> Result<()> {
let mut dst = path.as_ref().join(format!("indexes/{}/", self.uuid));
create_dir_all(&dst)?;
dst.push("data.mdb");
let _txn = self.write_txn()?;
2021-09-28 22:22:59 +02:00
self.inner
.env
.copy_to_path(dst, heed::CompactionOption::Enabled)?;
2021-09-27 16:48:03 +02:00
Ok(())
}
2021-03-04 12:38:55 +01:00
}