mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-06-07 21:37:48 +02:00
336 lines
11 KiB
Rust
336 lines
11 KiB
Rust
use std::collections::BTreeSet;
|
||
use std::fs::create_dir_all;
|
||
use std::marker::PhantomData;
|
||
use std::ops::Deref;
|
||
use std::path::Path;
|
||
use std::sync::Arc;
|
||
|
||
use fst::IntoStreamer;
|
||
use milli::heed::{CompactionOption, EnvOpenOptions, RoTxn};
|
||
use milli::update::{IndexerConfig, Setting};
|
||
use milli::{obkv_to_json, FieldDistribution, DEFAULT_VALUES_PER_FACET};
|
||
use serde::{Deserialize, Serialize};
|
||
use serde_json::{Map, Value};
|
||
use time::OffsetDateTime;
|
||
|
||
use crate::search::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
|
||
|
||
use super::error::IndexError;
|
||
use super::error::Result;
|
||
use super::updates::{FacetingSettings, MinWordSizeTyposSetting, PaginationSettings, TypoSettings};
|
||
use super::{Checked, Settings};
|
||
|
||
/// A document as exposed by this module: a JSON object mapping field names to JSON values.
pub type Document = Map<String, Value>;
|
||
|
||
// @kero, what is this structure? Shouldn't it move entirely to milli?
|
||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||
#[serde(rename_all = "camelCase")]
|
||
pub struct IndexMeta {
|
||
#[serde(with = "time::serde::rfc3339")]
|
||
pub created_at: OffsetDateTime,
|
||
#[serde(with = "time::serde::rfc3339")]
|
||
pub updated_at: OffsetDateTime,
|
||
pub primary_key: Option<String>,
|
||
}
|
||
|
||
impl IndexMeta {
|
||
pub fn new(index: &Index) -> Result<Self> {
|
||
let txn = index.read_txn()?;
|
||
Self::new_txn(index, &txn)
|
||
}
|
||
|
||
pub fn new_txn(index: &Index, txn: &milli::heed::RoTxn) -> Result<Self> {
|
||
let created_at = index.created_at(txn)?;
|
||
let updated_at = index.updated_at(txn)?;
|
||
let primary_key = index.primary_key(txn)?.map(String::from);
|
||
Ok(Self {
|
||
created_at,
|
||
updated_at,
|
||
primary_key,
|
||
})
|
||
}
|
||
}
|
||
|
||
// @kero Maybe this should be entirely generated somewhere else since it doesn't really concern the index?
|
||
#[derive(Serialize, Debug)]
|
||
#[serde(rename_all = "camelCase")]
|
||
pub struct IndexStats {
|
||
#[serde(skip)]
|
||
pub size: u64,
|
||
pub number_of_documents: u64,
|
||
/// Whether the current index is performing an update. It is initially `None` when the
|
||
/// index returns it, since it is the `UpdateStore` that knows what index is currently indexing. It is
|
||
/// later set to either true or false, we we retrieve the information from the `UpdateStore`
|
||
pub is_indexing: Option<bool>,
|
||
pub field_distribution: FieldDistribution,
|
||
}
|
||
|
||
#[derive(Clone, derivative::Derivative)]
|
||
#[derivative(Debug)]
|
||
pub struct Index {
|
||
pub name: String,
|
||
#[derivative(Debug = "ignore")]
|
||
pub inner: Arc<milli::Index>,
|
||
#[derivative(Debug = "ignore")]
|
||
pub indexer_config: Arc<IndexerConfig>,
|
||
}
|
||
|
||
impl Deref for Index {
|
||
type Target = milli::Index;
|
||
|
||
fn deref(&self) -> &Self::Target {
|
||
self.inner.as_ref()
|
||
}
|
||
}
|
||
|
||
impl Index {
|
||
pub fn open(
|
||
path: impl AsRef<Path>,
|
||
name: String,
|
||
size: usize,
|
||
update_handler: Arc<IndexerConfig>,
|
||
) -> Result<Self> {
|
||
log::debug!("opening index in {}", path.as_ref().display());
|
||
create_dir_all(&path)?;
|
||
let mut options = EnvOpenOptions::new();
|
||
options.map_size(size);
|
||
let inner = Arc::new(milli::Index::new(options, &path)?);
|
||
Ok(Index {
|
||
name,
|
||
inner,
|
||
indexer_config: update_handler,
|
||
})
|
||
}
|
||
|
||
/// Asynchronously close the underlying index
|
||
pub fn close(self) {
|
||
self.inner.as_ref().clone().prepare_for_closing();
|
||
}
|
||
|
||
/// Closes the index, waits for the environment to be effectively closed, then removes the
/// index directory from disk.
///
/// # Errors
///
/// Fails if the index directory cannot be removed.
pub fn delete(self) -> Result<()> {
    let path = self.path().to_path_buf();
    let closing_event = self.inner.as_ref().clone().prepare_for_closing();

    // The closing event is only signaled once every handle on the environment has been
    // dropped, so release ours before waiting; otherwise the wait could never complete.
    // NOTE(review): if other clones of this `Index` are still alive, the wait blocks until
    // they are dropped as well -- confirm against heed's closing semantics.
    drop(self);
    closing_event.wait();

    // The environment path is the directory created by `open` (via `create_dir_all`), not
    // a single file, so `remove_file` would fail here.
    std::fs::remove_dir_all(path)?;

    Ok(())
}
|
||
|
||
pub fn stats(&self) -> Result<IndexStats> {
|
||
let rtxn = self.read_txn()?;
|
||
|
||
Ok(IndexStats {
|
||
size: self.size()?,
|
||
number_of_documents: self.number_of_documents(&rtxn)?,
|
||
is_indexing: None,
|
||
field_distribution: self.field_distribution(&rtxn)?,
|
||
})
|
||
}
|
||
|
||
pub fn meta(&self) -> Result<IndexMeta> {
|
||
IndexMeta::new(self)
|
||
}
|
||
/// Returns the settings of this index, read through a freshly opened read transaction.
///
/// See `settings_txn` to reuse an existing transaction instead.
pub fn settings(&self) -> Result<Settings<Checked>> {
    let rtxn = self.read_txn()?;
    self.settings_txn(&rtxn)
}
|
||
|
||
pub fn name(&self) -> &str {
|
||
&self.name
|
||
}
|
||
|
||
pub fn settings_txn(&self, txn: &RoTxn) -> Result<Settings<Checked>> {
|
||
let displayed_attributes = self
|
||
.displayed_fields(txn)?
|
||
.map(|fields| fields.into_iter().map(String::from).collect());
|
||
|
||
let searchable_attributes = self
|
||
.user_defined_searchable_fields(txn)?
|
||
.map(|fields| fields.into_iter().map(String::from).collect());
|
||
|
||
let filterable_attributes = self.filterable_fields(txn)?.into_iter().collect();
|
||
|
||
let sortable_attributes = self.sortable_fields(txn)?.into_iter().collect();
|
||
|
||
let criteria = self
|
||
.criteria(txn)?
|
||
.into_iter()
|
||
.map(|c| c.to_string())
|
||
.collect();
|
||
|
||
let stop_words = self
|
||
.stop_words(txn)?
|
||
.map(|stop_words| -> Result<BTreeSet<_>> {
|
||
Ok(stop_words.stream().into_strs()?.into_iter().collect())
|
||
})
|
||
.transpose()?
|
||
.unwrap_or_default();
|
||
let distinct_field = self.distinct_field(txn)?.map(String::from);
|
||
|
||
// in milli each word in the synonyms map were split on their separator. Since we lost
|
||
// this information we are going to put space between words.
|
||
let synonyms = self
|
||
.synonyms(txn)?
|
||
.iter()
|
||
.map(|(key, values)| {
|
||
(
|
||
key.join(" "),
|
||
values.iter().map(|value| value.join(" ")).collect(),
|
||
)
|
||
})
|
||
.collect();
|
||
|
||
let min_typo_word_len = MinWordSizeTyposSetting {
|
||
one_typo: Setting::Set(self.min_word_len_one_typo(txn)?),
|
||
two_typos: Setting::Set(self.min_word_len_two_typos(txn)?),
|
||
};
|
||
|
||
let disabled_words = match self.exact_words(txn)? {
|
||
Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(),
|
||
None => BTreeSet::new(),
|
||
};
|
||
|
||
let disabled_attributes = self
|
||
.exact_attributes(txn)?
|
||
.into_iter()
|
||
.map(String::from)
|
||
.collect();
|
||
|
||
let typo_tolerance = TypoSettings {
|
||
enabled: Setting::Set(self.authorize_typos(txn)?),
|
||
min_word_size_for_typos: Setting::Set(min_typo_word_len),
|
||
disable_on_words: Setting::Set(disabled_words),
|
||
disable_on_attributes: Setting::Set(disabled_attributes),
|
||
};
|
||
|
||
let faceting = FacetingSettings {
|
||
max_values_per_facet: Setting::Set(
|
||
self.max_values_per_facet(txn)?
|
||
.unwrap_or(DEFAULT_VALUES_PER_FACET),
|
||
),
|
||
};
|
||
|
||
let pagination = PaginationSettings {
|
||
max_total_hits: Setting::Set(
|
||
self.pagination_max_total_hits(txn)?
|
||
.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS),
|
||
),
|
||
};
|
||
|
||
Ok(Settings {
|
||
displayed_attributes: match displayed_attributes {
|
||
Some(attrs) => Setting::Set(attrs),
|
||
None => Setting::Reset,
|
||
},
|
||
searchable_attributes: match searchable_attributes {
|
||
Some(attrs) => Setting::Set(attrs),
|
||
None => Setting::Reset,
|
||
},
|
||
filterable_attributes: Setting::Set(filterable_attributes),
|
||
sortable_attributes: Setting::Set(sortable_attributes),
|
||
ranking_rules: Setting::Set(criteria),
|
||
stop_words: Setting::Set(stop_words),
|
||
distinct_attribute: match distinct_field {
|
||
Some(field) => Setting::Set(field),
|
||
None => Setting::Reset,
|
||
},
|
||
synonyms: Setting::Set(synonyms),
|
||
typo_tolerance: Setting::Set(typo_tolerance),
|
||
faceting: Setting::Set(faceting),
|
||
pagination: Setting::Set(pagination),
|
||
_kind: PhantomData,
|
||
})
|
||
}
|
||
|
||
/// Return the total number of documents contained in the index + the selected documents.
|
||
pub fn retrieve_documents<S: AsRef<str>>(
|
||
&self,
|
||
offset: usize,
|
||
limit: usize,
|
||
attributes_to_retrieve: Option<Vec<S>>,
|
||
) -> Result<(u64, Vec<Document>)> {
|
||
let txn = self.read_txn()?;
|
||
|
||
let fields_ids_map = self.fields_ids_map(&txn)?;
|
||
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
||
|
||
let mut documents = Vec::new();
|
||
for entry in self.all_documents(&txn)?.skip(offset).take(limit) {
|
||
let (_id, obkv) = entry?;
|
||
let document = obkv_to_json(&all_fields, &fields_ids_map, obkv)?;
|
||
let document = match &attributes_to_retrieve {
|
||
Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
|
||
&document,
|
||
attributes_to_retrieve.iter().map(|s| s.as_ref()),
|
||
),
|
||
None => document,
|
||
};
|
||
documents.push(document);
|
||
}
|
||
|
||
let number_of_documents = self.number_of_documents(&txn)?;
|
||
|
||
Ok((number_of_documents, documents))
|
||
}
|
||
|
||
pub fn retrieve_document<S: AsRef<str>>(
|
||
&self,
|
||
doc_id: String,
|
||
attributes_to_retrieve: Option<Vec<S>>,
|
||
) -> Result<Document> {
|
||
let txn = self.read_txn()?;
|
||
|
||
let fields_ids_map = self.fields_ids_map(&txn)?;
|
||
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
||
|
||
let internal_id = self
|
||
.external_documents_ids(&txn)?
|
||
.get(doc_id.as_bytes())
|
||
.ok_or_else(|| IndexError::DocumentNotFound(doc_id.clone()))?;
|
||
|
||
let document = self
|
||
.documents(&txn, std::iter::once(internal_id))?
|
||
.into_iter()
|
||
.next()
|
||
.map(|(_, d)| d)
|
||
.ok_or(IndexError::DocumentNotFound(doc_id))?;
|
||
|
||
let document = obkv_to_json(&all_fields, &fields_ids_map, document)?;
|
||
let document = match &attributes_to_retrieve {
|
||
Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
|
||
&document,
|
||
attributes_to_retrieve.iter().map(|s| s.as_ref()),
|
||
),
|
||
None => document,
|
||
};
|
||
|
||
Ok(document)
|
||
}
|
||
|
||
/// Returns the on-disk size of the underlying milli index.
pub fn size(&self) -> Result<u64> {
    let on_disk = self.inner.on_disk_size()?;
    Ok(on_disk)
}
|
||
|
||
pub fn snapshot(&self, path: impl AsRef<Path>) -> Result<()> {
|
||
let mut dst = path.as_ref().join(format!("indexes/{}/", self.name));
|
||
create_dir_all(&dst)?;
|
||
dst.push("data.mdb");
|
||
let _txn = self.write_txn()?;
|
||
self.inner.copy_to_path(dst, CompactionOption::Enabled)?;
|
||
Ok(())
|
||
}
|
||
}
|
||
|
||
/// When running tests, when a server instance is dropped, the environment is not actually closed,
|
||
/// leaving a lot of open file descriptors.
|
||
impl Drop for Index {
|
||
fn drop(&mut self) {
|
||
// When dropping the last instance of an index, we want to close the index
|
||
// Note that the close is actually performed only if all the instances a effectively
|
||
// dropped
|
||
|
||
if Arc::strong_count(&self.inner) == 1 {
|
||
self.inner.as_ref().clone().prepare_for_closing();
|
||
}
|
||
}
|
||
}
|