push the document_format and settings I forgot in the previous PR

This commit is contained in:
Tamo 2022-10-11 18:03:10 +02:00 committed by Clément Renault
parent 667c282e19
commit 141a1c9464
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 584 additions and 0 deletions

View File

@ -0,0 +1,155 @@
use std::borrow::Borrow;
use std::fmt::{self, Debug, Display};
use std::io::{self, BufReader, Read, Seek, Write};
use crate::error::{Code, ErrorCode};
use crate::internal_error;
use either::Either;
use milli::documents::{DocumentsBatchBuilder, Error};
use milli::Object;
use serde::Deserialize;
type Result<T> = std::result::Result<T, DocumentFormatError>;
#[derive(Debug)]
pub enum PayloadType {
Ndjson,
Json,
Csv,
}
impl fmt::Display for PayloadType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
PayloadType::Ndjson => f.write_str("ndjson"),
PayloadType::Json => f.write_str("json"),
PayloadType::Csv => f.write_str("csv"),
}
}
}
#[derive(Debug)]
pub enum DocumentFormatError {
Internal(Box<dyn std::error::Error + Send + Sync + 'static>),
MalformedPayload(Error, PayloadType),
}
impl Display for DocumentFormatError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Internal(e) => write!(f, "An internal error has occurred: `{}`.", e),
Self::MalformedPayload(me, b) => match me.borrow() {
Error::Json(se) => {
// https://github.com/meilisearch/meilisearch/issues/2107
// The user input maybe insanely long. We need to truncate it.
let mut serde_msg = se.to_string();
let ellipsis = "...";
if serde_msg.len() > 100 + ellipsis.len() {
serde_msg.replace_range(50..serde_msg.len() - 85, ellipsis);
}
write!(
f,
"The `{}` payload provided is malformed. `Couldn't serialize document value: {}`.",
b, serde_msg
)
}
_ => write!(f, "The `{}` payload provided is malformed: `{}`.", b, me),
},
}
}
}
impl std::error::Error for DocumentFormatError {}
impl From<(PayloadType, Error)> for DocumentFormatError {
fn from((ty, error): (PayloadType, Error)) -> Self {
match error {
Error::Io(e) => Self::Internal(Box::new(e)),
e => Self::MalformedPayload(e, ty),
}
}
}
impl ErrorCode for DocumentFormatError {
fn error_code(&self) -> Code {
match self {
DocumentFormatError::Internal(_) => Code::Internal,
DocumentFormatError::MalformedPayload(_, _) => Code::MalformedPayload,
}
}
}
internal_error!(DocumentFormatError: io::Error);
/// Reads CSV from input and write an obkv batch to writer.
pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
let mut builder = DocumentsBatchBuilder::new(writer);
let csv = csv::Reader::from_reader(input);
builder.append_csv(csv).map_err(|e| (PayloadType::Csv, e))?;
let count = builder.documents_count();
let _ = builder
.into_inner()
.map_err(Into::into)
.map_err(DocumentFormatError::Internal)?;
Ok(count as usize)
}
/// Reads JSON Lines from input and write an obkv batch to writer.
pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
let mut builder = DocumentsBatchBuilder::new(writer);
let reader = BufReader::new(input);
for result in serde_json::Deserializer::from_reader(reader).into_iter() {
let object = result
.map_err(Error::Json)
.map_err(|e| (PayloadType::Ndjson, e))?;
builder
.append_json_object(&object)
.map_err(Into::into)
.map_err(DocumentFormatError::Internal)?;
}
let count = builder.documents_count();
let _ = builder
.into_inner()
.map_err(Into::into)
.map_err(DocumentFormatError::Internal)?;
Ok(count as usize)
}
/// Reads JSON from input and write an obkv batch to writer.
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
let mut builder = DocumentsBatchBuilder::new(writer);
let reader = BufReader::new(input);
#[derive(Deserialize, Debug)]
#[serde(transparent)]
struct ArrayOrSingleObject {
#[serde(with = "either::serde_untagged")]
inner: Either<Vec<Object>, Object>,
}
let content: ArrayOrSingleObject = serde_json::from_reader(reader)
.map_err(Error::Json)
.map_err(|e| (PayloadType::Json, e))?;
for object in content.inner.map_right(|o| vec![o]).into_inner() {
builder
.append_json_object(&object)
.map_err(Into::into)
.map_err(DocumentFormatError::Internal)?;
}
let count = builder.documents_count();
let _ = builder
.into_inner()
.map_err(Into::into)
.map_err(DocumentFormatError::Internal)?;
Ok(count as usize)
}

View File

@ -0,0 +1,429 @@
use std::collections::{BTreeMap, BTreeSet};
use std::marker::PhantomData;
use std::num::NonZeroUsize;
use milli::update::Setting;
use serde::{Deserialize, Serialize, Serializer};
fn serialize_with_wildcard<S>(
field: &Setting<Vec<String>>,
s: S,
) -> std::result::Result<S::Ok, S::Error>
where
S: Serializer,
{
let wildcard = vec!["*".to_string()];
match field {
Setting::Set(value) => Some(value),
Setting::Reset => Some(&wildcard),
Setting::NotSet => None,
}
.serialize(s)
}
#[derive(Clone, Default, Debug, Serialize, PartialEq, Eq)]
pub struct Checked;
#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct Unchecked;
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct MinWordSizeTyposSetting {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub one_typo: Setting<u8>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub two_typos: Setting<u8>,
}
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct TypoSettings {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub enabled: Setting<bool>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub min_word_size_for_typos: Setting<MinWordSizeTyposSetting>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub disable_on_words: Setting<BTreeSet<String>>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub disable_on_attributes: Setting<BTreeSet<String>>,
}
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct FacetingSettings {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub max_values_per_facet: Setting<usize>,
}
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct PaginationSettings {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub max_total_hits: Setting<usize>,
}
/// Holds all the settings for an index. `T` can either be `Checked` if they represents settings
/// whose validity is guaranteed, or `Unchecked` if they need to be validated. In the later case, a
/// call to `check` will return a `Settings<Checked>` from a `Settings<Unchecked>`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
#[serde(bound(serialize = "T: Serialize", deserialize = "T: Deserialize<'static>"))]
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
pub struct Settings<T> {
#[serde(
default,
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Setting::is_not_set"
)]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub displayed_attributes: Setting<Vec<String>>,
#[serde(
default,
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Setting::is_not_set"
)]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub searchable_attributes: Setting<Vec<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub filterable_attributes: Setting<BTreeSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub sortable_attributes: Setting<BTreeSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub ranking_rules: Setting<Vec<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub stop_words: Setting<BTreeSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub synonyms: Setting<BTreeMap<String, Vec<String>>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub distinct_attribute: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub typo_tolerance: Setting<TypoSettings>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub faceting: Setting<FacetingSettings>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub pagination: Setting<PaginationSettings>,
#[serde(skip)]
pub _kind: PhantomData<T>,
}
impl Settings<Checked> {
pub fn cleared() -> Settings<Checked> {
Settings {
displayed_attributes: Setting::Reset,
searchable_attributes: Setting::Reset,
filterable_attributes: Setting::Reset,
sortable_attributes: Setting::Reset,
ranking_rules: Setting::Reset,
stop_words: Setting::Reset,
synonyms: Setting::Reset,
distinct_attribute: Setting::Reset,
typo_tolerance: Setting::Reset,
faceting: Setting::Reset,
pagination: Setting::Reset,
_kind: PhantomData,
}
}
pub fn into_unchecked(self) -> Settings<Unchecked> {
let Self {
displayed_attributes,
searchable_attributes,
filterable_attributes,
sortable_attributes,
ranking_rules,
stop_words,
synonyms,
distinct_attribute,
typo_tolerance,
faceting,
pagination,
..
} = self;
Settings {
displayed_attributes,
searchable_attributes,
filterable_attributes,
sortable_attributes,
ranking_rules,
stop_words,
synonyms,
distinct_attribute,
typo_tolerance,
faceting,
pagination,
_kind: PhantomData,
}
}
}
impl Settings<Unchecked> {
pub fn check(self) -> Settings<Checked> {
let displayed_attributes = match self.displayed_attributes {
Setting::Set(fields) => {
if fields.iter().any(|f| f == "*") {
Setting::Reset
} else {
Setting::Set(fields)
}
}
otherwise => otherwise,
};
let searchable_attributes = match self.searchable_attributes {
Setting::Set(fields) => {
if fields.iter().any(|f| f == "*") {
Setting::Reset
} else {
Setting::Set(fields)
}
}
otherwise => otherwise,
};
Settings {
displayed_attributes,
searchable_attributes,
filterable_attributes: self.filterable_attributes,
sortable_attributes: self.sortable_attributes,
ranking_rules: self.ranking_rules,
stop_words: self.stop_words,
synonyms: self.synonyms,
distinct_attribute: self.distinct_attribute,
typo_tolerance: self.typo_tolerance,
faceting: self.faceting,
pagination: self.pagination,
_kind: PhantomData,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct Facets {
pub level_group_size: Option<NonZeroUsize>,
pub min_level_size: Option<NonZeroUsize>,
}
pub fn apply_settings_to_builder(
settings: &Settings<Checked>,
builder: &mut milli::update::Settings,
) {
match settings.searchable_attributes {
Setting::Set(ref names) => builder.set_searchable_fields(names.clone()),
Setting::Reset => builder.reset_searchable_fields(),
Setting::NotSet => (),
}
match settings.displayed_attributes {
Setting::Set(ref names) => builder.set_displayed_fields(names.clone()),
Setting::Reset => builder.reset_displayed_fields(),
Setting::NotSet => (),
}
match settings.filterable_attributes {
Setting::Set(ref facets) => {
builder.set_filterable_fields(facets.clone().into_iter().collect())
}
Setting::Reset => builder.reset_filterable_fields(),
Setting::NotSet => (),
}
match settings.sortable_attributes {
Setting::Set(ref fields) => builder.set_sortable_fields(fields.iter().cloned().collect()),
Setting::Reset => builder.reset_sortable_fields(),
Setting::NotSet => (),
}
match settings.ranking_rules {
Setting::Set(ref criteria) => builder.set_criteria(criteria.clone()),
Setting::Reset => builder.reset_criteria(),
Setting::NotSet => (),
}
match settings.stop_words {
Setting::Set(ref stop_words) => builder.set_stop_words(stop_words.clone()),
Setting::Reset => builder.reset_stop_words(),
Setting::NotSet => (),
}
match settings.synonyms {
Setting::Set(ref synonyms) => builder.set_synonyms(synonyms.clone().into_iter().collect()),
Setting::Reset => builder.reset_synonyms(),
Setting::NotSet => (),
}
match settings.distinct_attribute {
Setting::Set(ref attr) => builder.set_distinct_field(attr.clone()),
Setting::Reset => builder.reset_distinct_field(),
Setting::NotSet => (),
}
match settings.typo_tolerance {
Setting::Set(ref value) => {
match value.enabled {
Setting::Set(val) => builder.set_autorize_typos(val),
Setting::Reset => builder.reset_authorize_typos(),
Setting::NotSet => (),
}
match value.min_word_size_for_typos {
Setting::Set(ref setting) => {
match setting.one_typo {
Setting::Set(val) => builder.set_min_word_len_one_typo(val),
Setting::Reset => builder.reset_min_word_len_one_typo(),
Setting::NotSet => (),
}
match setting.two_typos {
Setting::Set(val) => builder.set_min_word_len_two_typos(val),
Setting::Reset => builder.reset_min_word_len_two_typos(),
Setting::NotSet => (),
}
}
Setting::Reset => {
builder.reset_min_word_len_one_typo();
builder.reset_min_word_len_two_typos();
}
Setting::NotSet => (),
}
match value.disable_on_words {
Setting::Set(ref words) => {
builder.set_exact_words(words.clone());
}
Setting::Reset => builder.reset_exact_words(),
Setting::NotSet => (),
}
match value.disable_on_attributes {
Setting::Set(ref words) => {
builder.set_exact_attributes(words.iter().cloned().collect())
}
Setting::Reset => builder.reset_exact_attributes(),
Setting::NotSet => (),
}
}
Setting::Reset => {
// all typo settings need to be reset here.
builder.reset_authorize_typos();
builder.reset_min_word_len_one_typo();
builder.reset_min_word_len_two_typos();
builder.reset_exact_words();
builder.reset_exact_attributes();
}
Setting::NotSet => (),
}
match settings.faceting {
Setting::Set(ref value) => match value.max_values_per_facet {
Setting::Set(val) => builder.set_max_values_per_facet(val),
Setting::Reset => builder.reset_max_values_per_facet(),
Setting::NotSet => (),
},
Setting::Reset => builder.reset_max_values_per_facet(),
Setting::NotSet => (),
}
match settings.pagination {
Setting::Set(ref value) => match value.max_total_hits {
Setting::Set(val) => builder.set_pagination_max_total_hits(val),
Setting::Reset => builder.reset_pagination_max_total_hits(),
Setting::NotSet => (),
},
Setting::Reset => builder.reset_pagination_max_total_hits(),
Setting::NotSet => (),
}
}
#[cfg(test)]
pub(crate) mod test {
use proptest::prelude::*;
use super::*;
pub(super) fn setting_strategy<T: Arbitrary + Clone>() -> impl Strategy<Value = Setting<T>> {
prop_oneof![
Just(Setting::NotSet),
Just(Setting::Reset),
any::<T>().prop_map(Setting::Set)
]
}
#[test]
fn test_setting_check() {
// test no changes
let settings = Settings {
displayed_attributes: Setting::Set(vec![String::from("hello")]),
searchable_attributes: Setting::Set(vec![String::from("hello")]),
filterable_attributes: Setting::NotSet,
sortable_attributes: Setting::NotSet,
ranking_rules: Setting::NotSet,
stop_words: Setting::NotSet,
synonyms: Setting::NotSet,
distinct_attribute: Setting::NotSet,
typo_tolerance: Setting::NotSet,
faceting: Setting::NotSet,
pagination: Setting::NotSet,
_kind: PhantomData::<Unchecked>,
};
let checked = settings.clone().check();
assert_eq!(settings.displayed_attributes, checked.displayed_attributes);
assert_eq!(
settings.searchable_attributes,
checked.searchable_attributes
);
// test wildcard
// test no changes
let settings = Settings {
displayed_attributes: Setting::Set(vec![String::from("*")]),
searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]),
filterable_attributes: Setting::NotSet,
sortable_attributes: Setting::NotSet,
ranking_rules: Setting::NotSet,
stop_words: Setting::NotSet,
synonyms: Setting::NotSet,
distinct_attribute: Setting::NotSet,
typo_tolerance: Setting::NotSet,
faceting: Setting::NotSet,
pagination: Setting::NotSet,
_kind: PhantomData::<Unchecked>,
};
let checked = settings.check();
assert_eq!(checked.displayed_attributes, Setting::Reset);
assert_eq!(checked.searchable_attributes, Setting::Reset);
}
}