MeiliSearch/meilisearch-http/src/index/mod.rs

224 lines
6.8 KiB
Rust
Raw Normal View History

2021-05-24 18:16:35 +02:00
use std::{collections::{BTreeSet, HashSet}, io::Write, marker::PhantomData, path::{Path, PathBuf}};
2021-03-04 11:56:32 +01:00
use std::ops::Deref;
use std::sync::Arc;
2021-05-24 18:16:35 +02:00
use std::fs::File;
2021-03-04 11:56:32 +01:00
2021-03-04 15:09:00 +01:00
use anyhow::{bail, Context};
2021-05-24 18:16:35 +02:00
use heed::RoTxn;
use indexmap::IndexMap;
2021-03-04 14:20:19 +01:00
use milli::obkv_to_json;
use serde_json::{Map, Value};
2021-03-04 14:20:19 +01:00
2021-04-14 17:53:12 +02:00
use crate::helpers::EnvSizer;
2021-03-04 11:56:32 +01:00
pub use search::{SearchQuery, SearchResult, DEFAULT_SEARCH_LIMIT};
pub use updates::{Facets, Settings, Checked, Unchecked};
2021-05-10 20:22:18 +02:00
use serde::{de::Deserializer, Deserialize};
2021-03-04 11:56:32 +01:00
2021-04-01 16:44:42 +02:00
mod search;
mod updates;
2021-03-04 14:20:19 +01:00
pub type Document = Map<String, Value>;
2021-03-04 11:56:32 +01:00
/// Cheaply clonable handle to a [`milli::Index`].
///
/// The wrapped index lives behind an [`Arc`], so cloning an `Index` only
/// clones the `Arc`; every clone refers to the same underlying index.
#[derive(Clone)]
pub struct Index(pub Arc<milli::Index>);

/// Lets callers use the `milli::Index` API directly on the wrapper.
impl Deref for Index {
    type Target = milli::Index;

    fn deref(&self) -> &Self::Target {
        // Reborrow through the `Arc` to reach the shared index.
        &*self.0
    }
}
2021-03-04 12:38:55 +01:00
2021-05-10 20:22:18 +02:00
pub fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
where
T: Deserialize<'de>,
D: Deserializer<'de>,
{
Deserialize::deserialize(deserializer).map(Some)
}
2021-03-04 12:38:55 +01:00
impl Index {
2021-05-10 17:30:09 +02:00
/// Returns the current settings of the index.
///
/// Opens a fresh read transaction and delegates to `settings_txn`.
///
/// # Errors
///
/// Fails if the read transaction cannot be opened or if `settings_txn` fails.
pub fn settings(&self) -> anyhow::Result<Settings<Checked>> {
    let rtxn = self.read_txn()?;
    let settings = self.settings_txn(&rtxn)?;
    Ok(settings)
}
2021-03-04 12:38:55 +01:00
2021-05-24 18:16:35 +02:00
pub fn settings_txn(&self, txn: &RoTxn) -> anyhow::Result<Settings<Checked>> {
2021-03-04 12:38:55 +01:00
let displayed_attributes = self
.displayed_fields(&txn)?
2021-05-11 11:47:04 +02:00
.map(|fields| fields.into_iter().map(String::from).collect());
2021-03-04 12:38:55 +01:00
let searchable_attributes = self
.searchable_fields(&txn)?
2021-05-11 11:47:04 +02:00
.map(|fields| fields.into_iter().map(String::from).collect());
2021-03-04 12:38:55 +01:00
let faceted_attributes = self
.faceted_fields(&txn)?
.into_iter()
.map(|(k, v)| (k, v.to_string()))
.collect();
2021-03-11 22:39:16 +01:00
let criteria = self
.criteria(&txn)?
.into_iter()
.map(|c| c.to_string())
.collect();
let stop_words = self
.stop_words(&txn)?
.map(|stop_words| -> anyhow::Result<BTreeSet<_>> {
Ok(stop_words.stream().into_strs()?.into_iter().collect())
})
.transpose()?
.unwrap_or_else(BTreeSet::new);
2021-04-22 10:14:29 +02:00
let distinct_attribute = self.distinct_attribute(&txn)?.map(String::from);
2021-03-04 12:38:55 +01:00
Ok(Settings {
2021-05-11 11:47:04 +02:00
displayed_attributes: Some(displayed_attributes),
searchable_attributes: Some(searchable_attributes),
attributes_for_faceting: Some(Some(faceted_attributes)),
2021-03-11 22:39:16 +01:00
ranking_rules: Some(Some(criteria)),
stop_words: Some(Some(stop_words)),
2021-03-29 09:22:36 +02:00
distinct_attribute: Some(distinct_attribute),
2021-05-10 17:30:09 +02:00
_kind: PhantomData,
2021-03-04 12:38:55 +01:00
})
}
2021-03-04 14:20:19 +01:00
2021-03-15 18:11:10 +01:00
pub fn retrieve_documents<S: AsRef<str>>(
2021-03-04 14:20:19 +01:00
&self,
offset: usize,
limit: usize,
attributes_to_retrieve: Option<Vec<S>>,
2021-03-15 18:11:10 +01:00
) -> anyhow::Result<Vec<Map<String, Value>>> {
2021-03-04 14:20:19 +01:00
let txn = self.read_txn()?;
let fields_ids_map = self.fields_ids_map(&txn)?;
2021-03-15 18:11:10 +01:00
let fields_to_display =
2021-04-19 16:22:41 +02:00
self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?;
2021-03-04 14:20:19 +01:00
let iter = self.documents.range(&txn, &(..))?.skip(offset).take(limit);
let mut documents = Vec::new();
2021-05-10 20:22:18 +02:00
println!("fields to display: {:?}", fields_to_display);
2021-03-04 14:20:19 +01:00
for entry in iter {
let (_id, obkv) = entry?;
let object = obkv_to_json(&fields_to_display, &fields_ids_map, obkv)?;
2021-03-04 14:20:19 +01:00
documents.push(object);
}
Ok(documents)
}
2021-03-04 15:09:00 +01:00
pub fn retrieve_document<S: AsRef<str>>(
&self,
doc_id: String,
attributes_to_retrieve: Option<Vec<S>>,
) -> anyhow::Result<Map<String, Value>> {
let txn = self.read_txn()?;
2021-03-04 15:09:00 +01:00
let fields_ids_map = self.fields_ids_map(&txn)?;
2021-03-04 15:09:00 +01:00
2021-03-15 18:11:10 +01:00
let fields_to_display =
2021-04-19 16:22:41 +02:00
self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?;
let internal_id = self
.external_documents_ids(&txn)?
.get(doc_id.as_bytes())
.with_context(|| format!("Document with id {} not found", doc_id))?;
let document = self
.documents(&txn, std::iter::once(internal_id))?
.into_iter()
.next()
.map(|(_, d)| d);
match document {
2021-03-15 18:11:10 +01:00
Some(document) => Ok(obkv_to_json(&fields_to_display, &fields_ids_map, document)?),
None => bail!("Document with id {} not found", doc_id),
}
2021-03-04 15:09:00 +01:00
}
pub fn size(&self) -> u64 {
self.env.size()
2021-04-01 16:44:42 +02:00
}
fn fields_to_display<S: AsRef<str>>(
&self,
txn: &heed::RoTxn,
2021-04-19 16:22:41 +02:00
attributes_to_retrieve: &Option<Vec<S>>,
fields_ids_map: &milli::FieldsIdsMap,
) -> anyhow::Result<Vec<u8>> {
let mut displayed_fields_ids = match self.displayed_fields_ids(&txn)? {
Some(ids) => ids.into_iter().collect::<Vec<_>>(),
None => fields_ids_map.iter().map(|(id, _)| id).collect(),
};
let attributes_to_retrieve_ids = match attributes_to_retrieve {
Some(attrs) => attrs
.iter()
.filter_map(|f| fields_ids_map.id(f.as_ref()))
.collect::<HashSet<_>>(),
None => fields_ids_map.iter().map(|(id, _)| id).collect(),
};
displayed_fields_ids.retain(|fid| attributes_to_retrieve_ids.contains(fid));
Ok(displayed_fields_ids)
}
2021-05-24 18:16:35 +02:00
/// Dumps the index into the directory `path`.
///
/// The documents are written by `dump_documents` and the settings by
/// `dump_meta`, both using the same transaction.
///
/// # Errors
///
/// Fails if the write transaction cannot be acquired or if either dump step
/// fails.
pub fn dump(&self, path: PathBuf) -> anyhow::Result<()> {
    // Acquiring a write txn makes sure any ongoing write is finished before
    // we start. It is only read from, and is dropped without being committed
    // when this function returns.
    let wtxn = self.env.write_txn()?;
    let dir = path.as_path();

    self.dump_documents(&wtxn, dir)?;
    self.dump_meta(&wtxn, dir)
}
/// Writes every document of the index to `<path>/documents.jsonl`.
///
/// Each document becomes one JSON object on its own line. Fields whose id
/// has no name in the fields map are skipped.
///
/// # Errors
///
/// Fails if the file cannot be created or written, if an index read fails, or
/// if a stored field value is not valid JSON.
fn dump_documents(&self, txn: &RoTxn, path: impl AsRef<Path>) -> anyhow::Result<()> {
    // NOTE(review): progress is reported on stdout; consider routing it
    // through the project's logging facility.
    println!("dumping documents");

    let document_file_path = path.as_ref().join("documents.jsonl");
    // `serde_json::to_writer` issues many small writes: buffer them rather
    // than hitting the file for each one.
    let mut document_file = std::io::BufWriter::new(File::create(&document_file_path)?);

    let documents = self.all_documents(txn)?;
    let fields_ids_map = self.fields_ids_map(txn)?;

    // Reused across documents to avoid reallocating the map for every line.
    let mut json_map = IndexMap::new();
    for document in documents {
        let (_, reader) = document?;

        for (fid, bytes) in reader.iter() {
            if let Some(name) = fields_ids_map.name(fid) {
                json_map.insert(name, serde_json::from_slice::<serde_json::Value>(bytes)?);
            }
        }

        serde_json::to_writer(&mut document_file, &json_map)?;
        // `write` may write only part of the buffer; `write_all` guarantees
        // the line terminator is written in full.
        document_file.write_all(b"\n")?;
        json_map.clear();
    }

    // Flush explicitly: errors raised while flushing on drop would be lost.
    document_file.flush()?;

    Ok(())
}
fn dump_meta(&self, txn: &RoTxn, path: impl AsRef<Path>) -> anyhow::Result<()> {
println!("dumping settings");
let meta_file_path = path.as_ref().join("meta.json");
let mut meta_file = File::create(&meta_file_path)?;
let settings = self.settings_txn(txn)?;
let json = serde_json::json!({
"settings": settings,
});
serde_json::to_writer(&mut meta_file, &json)?;
Ok(())
}
2021-03-04 12:38:55 +01:00
}