From 141a1c94647882a4b44bd07ee4732b95d17884a5 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Tue, 11 Oct 2022 18:03:10 +0200
Subject: [PATCH] =?UTF-8?q?push=20the=20document=5Fformat=20and=20settings?=
 =?UTF-8?q?=20I=C2=A0forgot=20in=20the=20previous=20PR?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 meilisearch-types/src/document_formats.rs | 191 ++++++++++
 meilisearch-types/src/settings.rs         | 427 ++++++++++++++++++++++
 2 files changed, 618 insertions(+)
 create mode 100644 meilisearch-types/src/document_formats.rs
 create mode 100644 meilisearch-types/src/settings.rs

diff --git a/meilisearch-types/src/document_formats.rs b/meilisearch-types/src/document_formats.rs
new file mode 100644
index 000000000..5a50bfc0f
--- /dev/null
+++ b/meilisearch-types/src/document_formats.rs
@@ -0,0 +1,191 @@
+use std::borrow::Borrow;
+use std::fmt::{self, Debug, Display};
+use std::io::{self, BufReader, Read, Seek, Write};
+
+use crate::error::{Code, ErrorCode};
+use crate::internal_error;
+use either::Either;
+use milli::documents::{DocumentsBatchBuilder, Error};
+use milli::Object;
+use serde::Deserialize;
+
+type Result<T> = std::result::Result<T, DocumentFormatError>;
+
+/// The payload formats accepted when reading documents.
+#[derive(Debug)]
+pub enum PayloadType {
+    Ndjson,
+    Json,
+    Csv,
+}
+
+impl fmt::Display for PayloadType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            PayloadType::Ndjson => f.write_str("ndjson"),
+            PayloadType::Json => f.write_str("json"),
+            PayloadType::Csv => f.write_str("csv"),
+        }
+    }
+}
+
+/// Errors raised while converting a document payload into an obkv batch.
+#[derive(Debug)]
+pub enum DocumentFormatError {
+    /// A failure that is not the user's fault (I/O errors end up here).
+    Internal(Box<dyn std::error::Error + Send + Sync + 'static>),
+    /// The payload of the given type could not be parsed.
+    MalformedPayload(Error, PayloadType),
+}
+
+/// Shortens `msg` in place by replacing its middle with `...`, keeping about
+/// 50 leading and 85 trailing bytes.
+///
+/// Messages that would not get shorter are left untouched, and both cut points
+/// are snapped to char boundaries so that `replace_range` can never panic.
+fn truncate_middle(msg: &mut String) {
+    const PREFIX_LEN: usize = 50;
+    const SUFFIX_LEN: usize = 85;
+    const ELLIPSIS: &str = "...";
+
+    // Truncating is only worth it when it actually makes the message shorter;
+    // this also guarantees that the removed range below is never inverted.
+    if msg.len() <= PREFIX_LEN + SUFFIX_LEN + ELLIPSIS.len() {
+        return;
+    }
+
+    let mut start = PREFIX_LEN;
+    while !msg.is_char_boundary(start) {
+        start -= 1;
+    }
+    let mut end = msg.len() - SUFFIX_LEN;
+    while !msg.is_char_boundary(end) {
+        end += 1;
+    }
+
+    msg.replace_range(start..end, ELLIPSIS);
+}
+
+impl Display for DocumentFormatError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Internal(e) => write!(f, "An internal error has occurred: `{}`.", e),
+            Self::MalformedPayload(me, b) => match me.borrow() {
+                Error::Json(se) => {
+                    // https://github.com/meilisearch/meilisearch/issues/2107
+                    // The user input may be insanely long. We need to truncate it.
+                    let mut serde_msg = se.to_string();
+                    truncate_middle(&mut serde_msg);
+
+                    write!(
+                        f,
+                        "The `{}` payload provided is malformed. `Couldn't serialize document value: {}`.",
+                        b, serde_msg
+                    )
+                }
+                _ => write!(f, "The `{}` payload provided is malformed: `{}`.", b, me),
+            },
+        }
+    }
+}
+
+impl std::error::Error for DocumentFormatError {}
+
+impl From<(PayloadType, Error)> for DocumentFormatError {
+    fn from((ty, error): (PayloadType, Error)) -> Self {
+        match error {
+            Error::Io(e) => Self::Internal(Box::new(e)),
+            e => Self::MalformedPayload(e, ty),
+        }
+    }
+}
+
+impl ErrorCode for DocumentFormatError {
+    fn error_code(&self) -> Code {
+        match self {
+            DocumentFormatError::Internal(_) => Code::Internal,
+            DocumentFormatError::MalformedPayload(_, _) => Code::MalformedPayload,
+        }
+    }
+}
+
+internal_error!(DocumentFormatError: io::Error);
+
+/// Reads CSV from input and writes an obkv batch to writer.
+///
+/// Returns the number of documents appended to the batch.
+pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
+    let mut builder = DocumentsBatchBuilder::new(writer);
+
+    let csv = csv::Reader::from_reader(input);
+    builder.append_csv(csv).map_err(|e| (PayloadType::Csv, e))?;
+
+    let count = builder.documents_count();
+    let _ = builder
+        .into_inner()
+        .map_err(Into::into)
+        .map_err(DocumentFormatError::Internal)?;
+
+    Ok(count as usize)
+}
+
+/// Reads JSON Lines from input and writes an obkv batch to writer.
+///
+/// Returns the number of documents appended to the batch.
+pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
+    let mut builder = DocumentsBatchBuilder::new(writer);
+    let reader = BufReader::new(input);
+
+    for result in serde_json::Deserializer::from_reader(reader).into_iter() {
+        let object = result
+            .map_err(Error::Json)
+            .map_err(|e| (PayloadType::Ndjson, e))?;
+        builder
+            .append_json_object(&object)
+            .map_err(Into::into)
+            .map_err(DocumentFormatError::Internal)?;
+    }
+
+    let count = builder.documents_count();
+    let _ = builder
+        .into_inner()
+        .map_err(Into::into)
+        .map_err(DocumentFormatError::Internal)?;
+
+    Ok(count as usize)
+}
+
+/// Reads JSON from input and writes an obkv batch to writer.
+///
+/// The payload may be either a single object or an array of objects. Returns
+/// the number of documents appended to the batch.
+pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
+    let mut builder = DocumentsBatchBuilder::new(writer);
+    let reader = BufReader::new(input);
+
+    #[derive(Deserialize, Debug)]
+    #[serde(transparent)]
+    struct ArrayOrSingleObject {
+        #[serde(with = "either::serde_untagged")]
+        inner: Either<Vec<Object>, Object>,
+    }
+
+    let content: ArrayOrSingleObject = serde_json::from_reader(reader)
+        .map_err(Error::Json)
+        .map_err(|e| (PayloadType::Json, e))?;
+
+    for object in content.inner.map_right(|o| vec![o]).into_inner() {
+        builder
+            .append_json_object(&object)
+            .map_err(Into::into)
+            .map_err(DocumentFormatError::Internal)?;
+    }
+
+    let count = builder.documents_count();
+    let _ = builder
+        .into_inner()
+        .map_err(Into::into)
+        .map_err(DocumentFormatError::Internal)?;
+
+    Ok(count as usize)
+}
diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs
new file mode 100644
index 000000000..a6d13d99f
--- /dev/null
+++ b/meilisearch-types/src/settings.rs
@@ -0,0 +1,427 @@
+use std::collections::{BTreeMap, BTreeSet};
+use std::marker::PhantomData;
+use std::num::NonZeroUsize;
+
+use milli::update::Setting;
+use serde::{Deserialize, Serialize, Serializer};
+
+/// Serializes a list setting, writing `Setting::Reset` as the `["*"]` wildcard.
+fn serialize_with_wildcard<S>(
+    field: &Setting<Vec<String>>,
+    s: S,
+) -> std::result::Result<S::Ok, S::Error>
+where
+    S: Serializer,
+{
+    let wildcard = vec!["*".to_string()];
+    match field {
+        Setting::Set(value) => Some(value),
+        Setting::Reset => Some(&wildcard),
+        Setting::NotSet => None,
+    }
+    .serialize(s)
+}
+
+/// Collapses a list that contains the `*` wildcard into `Setting::Reset`.
+fn reset_on_wildcard(setting: Setting<Vec<String>>) -> Setting<Vec<String>> {
+    match setting {
+        Setting::Set(fields) if fields.iter().any(|f| f == "*") => Setting::Reset,
+        otherwise => otherwise,
+    }
+}
+
+/// Marker for [`Settings`] whose validity is guaranteed.
+#[derive(Clone, Default, Debug, Serialize, PartialEq, Eq)]
+pub struct Checked;
+
+/// Marker for [`Settings`] that still need to be validated.
+#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq)]
+pub struct Unchecked;
+
+/// Minimum word sizes from which one or two typos are accepted.
+#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
+#[serde(deny_unknown_fields)]
+#[serde(rename_all = "camelCase")]
+pub struct MinWordSizeTyposSetting {
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    pub one_typo: Setting<u8>,
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    pub two_typos: Setting<u8>,
+}
+
+/// Typo-tolerance part of the index settings.
+#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
+#[serde(deny_unknown_fields)]
+#[serde(rename_all = "camelCase")]
+pub struct TypoSettings {
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    pub enabled: Setting<bool>,
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    pub min_word_size_for_typos: Setting<MinWordSizeTyposSetting>,
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    pub disable_on_words: Setting<BTreeSet<String>>,
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    pub disable_on_attributes: Setting<BTreeSet<String>>,
+}
+
+/// Faceting part of the index settings.
+#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
+#[serde(deny_unknown_fields)]
+#[serde(rename_all = "camelCase")]
+pub struct FacetingSettings {
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    pub max_values_per_facet: Setting<usize>,
+}
+
+/// Pagination part of the index settings.
+#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
+#[serde(deny_unknown_fields)]
+#[serde(rename_all = "camelCase")]
+pub struct PaginationSettings {
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    pub max_total_hits: Setting<usize>,
+}
+
+/// Holds all the settings for an index. `T` can either be `Checked` if they represent settings
+/// whose validity is guaranteed, or `Unchecked` if they need to be validated. In the latter case, a
+/// call to `check` will return a `Settings<Checked>` from a `Settings<Unchecked>`.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
+#[serde(deny_unknown_fields)]
+#[serde(rename_all = "camelCase")]
+#[serde(bound(serialize = "T: Serialize", deserialize = "T: Deserialize<'static>"))]
+#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
+pub struct Settings<T> {
+    #[serde(
+        default,
+        serialize_with = "serialize_with_wildcard",
+        skip_serializing_if = "Setting::is_not_set"
+    )]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub displayed_attributes: Setting<Vec<String>>,
+
+    #[serde(
+        default,
+        serialize_with = "serialize_with_wildcard",
+        skip_serializing_if = "Setting::is_not_set"
+    )]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub searchable_attributes: Setting<Vec<String>>,
+
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub filterable_attributes: Setting<BTreeSet<String>>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub sortable_attributes: Setting<BTreeSet<String>>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub ranking_rules: Setting<Vec<String>>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub stop_words: Setting<BTreeSet<String>>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub synonyms: Setting<BTreeMap<String, Vec<String>>>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub distinct_attribute: Setting<String>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub typo_tolerance: Setting<TypoSettings>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub faceting: Setting<FacetingSettings>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
+    pub pagination: Setting<PaginationSettings>,
+
+    #[serde(skip)]
+    pub _kind: PhantomData<T>,
+}
+
+impl Settings<Checked> {
+    /// Returns settings in which every field is `Setting::Reset`.
+    pub fn cleared() -> Settings<Checked> {
+        Settings {
+            displayed_attributes: Setting::Reset,
+            searchable_attributes: Setting::Reset,
+            filterable_attributes: Setting::Reset,
+            sortable_attributes: Setting::Reset,
+            ranking_rules: Setting::Reset,
+            stop_words: Setting::Reset,
+            synonyms: Setting::Reset,
+            distinct_attribute: Setting::Reset,
+            typo_tolerance: Setting::Reset,
+            faceting: Setting::Reset,
+            pagination: Setting::Reset,
+            _kind: PhantomData,
+        }
+    }
+
+    /// Drops the `Checked` marker without touching any value.
+    pub fn into_unchecked(self) -> Settings<Unchecked> {
+        let Self {
+            displayed_attributes,
+            searchable_attributes,
+            filterable_attributes,
+            sortable_attributes,
+            ranking_rules,
+            stop_words,
+            synonyms,
+            distinct_attribute,
+            typo_tolerance,
+            faceting,
+            pagination,
+            ..
+        } = self;
+
+        Settings {
+            displayed_attributes,
+            searchable_attributes,
+            filterable_attributes,
+            sortable_attributes,
+            ranking_rules,
+            stop_words,
+            synonyms,
+            distinct_attribute,
+            typo_tolerance,
+            faceting,
+            pagination,
+            _kind: PhantomData,
+        }
+    }
+}
+
+impl Settings<Unchecked> {
+    /// Validates the settings: a `displayed_attributes` or `searchable_attributes` list that
+    /// contains the `*` wildcard is turned into `Setting::Reset`; everything else is kept as is.
+    pub fn check(self) -> Settings<Checked> {
+        Settings {
+            displayed_attributes: reset_on_wildcard(self.displayed_attributes),
+            searchable_attributes: reset_on_wildcard(self.searchable_attributes),
+            filterable_attributes: self.filterable_attributes,
+            sortable_attributes: self.sortable_attributes,
+            ranking_rules: self.ranking_rules,
+            stop_words: self.stop_words,
+            synonyms: self.synonyms,
+            distinct_attribute: self.distinct_attribute,
+            typo_tolerance: self.typo_tolerance,
+            faceting: self.faceting,
+            pagination: self.pagination,
+            _kind: PhantomData,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(deny_unknown_fields)]
+#[serde(rename_all = "camelCase")]
+pub struct Facets {
+    pub level_group_size: Option<NonZeroUsize>,
+    pub min_level_size: Option<NonZeroUsize>,
+}
+
+/// Forwards every field of `settings` to the milli settings `builder`: `Set` values are set,
+/// `Reset` fields are reset and `NotSet` fields leave the builder untouched.
+pub fn apply_settings_to_builder(
+    settings: &Settings<Checked>,
+    builder: &mut milli::update::Settings,
+) {
+    match settings.searchable_attributes {
+        Setting::Set(ref names) => builder.set_searchable_fields(names.clone()),
+        Setting::Reset => builder.reset_searchable_fields(),
+        Setting::NotSet => (),
+    }
+
+    match settings.displayed_attributes {
+        Setting::Set(ref names) => builder.set_displayed_fields(names.clone()),
+        Setting::Reset => builder.reset_displayed_fields(),
+        Setting::NotSet => (),
+    }
+
+    match settings.filterable_attributes {
+        Setting::Set(ref facets) => {
+            builder.set_filterable_fields(facets.clone().into_iter().collect())
+        }
+        Setting::Reset => builder.reset_filterable_fields(),
+        Setting::NotSet => (),
+    }
+
+    match settings.sortable_attributes {
+        Setting::Set(ref fields) => builder.set_sortable_fields(fields.iter().cloned().collect()),
+        Setting::Reset => builder.reset_sortable_fields(),
+        Setting::NotSet => (),
+    }
+
+    match settings.ranking_rules {
+        Setting::Set(ref criteria) => builder.set_criteria(criteria.clone()),
+        Setting::Reset => builder.reset_criteria(),
+        Setting::NotSet => (),
+    }
+
+    match settings.stop_words {
+        Setting::Set(ref stop_words) => builder.set_stop_words(stop_words.clone()),
+        Setting::Reset => builder.reset_stop_words(),
+        Setting::NotSet => (),
+    }
+
+    match settings.synonyms {
+        Setting::Set(ref synonyms) => builder.set_synonyms(synonyms.clone().into_iter().collect()),
+        Setting::Reset => builder.reset_synonyms(),
+        Setting::NotSet => (),
+    }
+
+    match settings.distinct_attribute {
+        Setting::Set(ref attr) => builder.set_distinct_field(attr.clone()),
+        Setting::Reset => builder.reset_distinct_field(),
+        Setting::NotSet => (),
+    }
+
+    match settings.typo_tolerance {
+        Setting::Set(ref value) => {
+            match value.enabled {
+                Setting::Set(val) => builder.set_autorize_typos(val),
+                Setting::Reset => builder.reset_authorize_typos(),
+                Setting::NotSet => (),
+            }
+
+            match value.min_word_size_for_typos {
+                Setting::Set(ref setting) => {
+                    match setting.one_typo {
+                        Setting::Set(val) => builder.set_min_word_len_one_typo(val),
+                        Setting::Reset => builder.reset_min_word_len_one_typo(),
+                        Setting::NotSet => (),
+                    }
+                    match setting.two_typos {
+                        Setting::Set(val) => builder.set_min_word_len_two_typos(val),
+                        Setting::Reset => builder.reset_min_word_len_two_typos(),
+                        Setting::NotSet => (),
+                    }
+                }
+                Setting::Reset => {
+                    builder.reset_min_word_len_one_typo();
+                    builder.reset_min_word_len_two_typos();
+                }
+                Setting::NotSet => (),
+            }
+
+            match value.disable_on_words {
+                Setting::Set(ref words) => {
+                    builder.set_exact_words(words.clone());
+                }
+                Setting::Reset => builder.reset_exact_words(),
+                Setting::NotSet => (),
+            }
+
+            match value.disable_on_attributes {
+                Setting::Set(ref words) => {
+                    builder.set_exact_attributes(words.iter().cloned().collect())
+                }
+                Setting::Reset => builder.reset_exact_attributes(),
+                Setting::NotSet => (),
+            }
+        }
+        Setting::Reset => {
+            // all typo settings need to be reset here.
+            builder.reset_authorize_typos();
+            builder.reset_min_word_len_one_typo();
+            builder.reset_min_word_len_two_typos();
+            builder.reset_exact_words();
+            builder.reset_exact_attributes();
+        }
+        Setting::NotSet => (),
+    }
+
+    match settings.faceting {
+        Setting::Set(ref value) => match value.max_values_per_facet {
+            Setting::Set(val) => builder.set_max_values_per_facet(val),
+            Setting::Reset => builder.reset_max_values_per_facet(),
+            Setting::NotSet => (),
+        },
+        Setting::Reset => builder.reset_max_values_per_facet(),
+        Setting::NotSet => (),
+    }
+
+    match settings.pagination {
+        Setting::Set(ref value) => match value.max_total_hits {
+            Setting::Set(val) => builder.set_pagination_max_total_hits(val),
+            Setting::Reset => builder.reset_pagination_max_total_hits(),
+            Setting::NotSet => (),
+        },
+        Setting::Reset => builder.reset_pagination_max_total_hits(),
+        Setting::NotSet => (),
+    }
+}
+
+#[cfg(test)]
+pub(crate) mod test {
+    use proptest::prelude::*;
+
+    use super::*;
+
+    pub(super) fn setting_strategy<T: Arbitrary + Clone>() -> impl Strategy<Value = Setting<T>> {
+        prop_oneof![
+            Just(Setting::NotSet),
+            Just(Setting::Reset),
+            any::<T>().prop_map(Setting::Set)
+        ]
+    }
+
+    #[test]
+    fn test_setting_check() {
+        // test no changes
+        let settings = Settings {
+            displayed_attributes: Setting::Set(vec![String::from("hello")]),
+            searchable_attributes: Setting::Set(vec![String::from("hello")]),
+            filterable_attributes: Setting::NotSet,
+            sortable_attributes: Setting::NotSet,
+            ranking_rules: Setting::NotSet,
+            stop_words: Setting::NotSet,
+            synonyms: Setting::NotSet,
+            distinct_attribute: Setting::NotSet,
+            typo_tolerance: Setting::NotSet,
+            faceting: Setting::NotSet,
+            pagination: Setting::NotSet,
+            _kind: PhantomData::<Unchecked>,
+        };
+
+        let checked = settings.clone().check();
+        assert_eq!(settings.displayed_attributes, checked.displayed_attributes);
+        assert_eq!(
+            settings.searchable_attributes,
+            checked.searchable_attributes
+        );
+
+        // test wildcard
+        let settings = Settings {
+            displayed_attributes: Setting::Set(vec![String::from("*")]),
+            searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]),
+            filterable_attributes: Setting::NotSet,
+            sortable_attributes: Setting::NotSet,
+            ranking_rules: Setting::NotSet,
+            stop_words: Setting::NotSet,
+            synonyms: Setting::NotSet,
+            distinct_attribute: Setting::NotSet,
+            typo_tolerance: Setting::NotSet,
+            faceting: Setting::NotSet,
+            pagination: Setting::NotSet,
+            _kind: PhantomData::<Unchecked>,
+        };
+
+        let checked = settings.check();
+        assert_eq!(checked.displayed_attributes, Setting::Reset);
+        assert_eq!(checked.searchable_attributes, Setting::Reset);
+    }
+}