2259: disable typos on words r=MarinPostma a=MarinPostma

Introduce the disable typo setting as per https://github.com/meilisearch/specifications/pull/117.

waiting for https://github.com/meilisearch/milli/pull/474.


Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
bors[bot] 2022-04-06 17:40:08 +00:00 committed by GitHub
commit c321ac61b5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 77 additions and 7 deletions

View File

@ -120,6 +120,7 @@ pub enum Code {
IndexAlreadyExists,
IndexNotFound,
InvalidIndexUid,
InvalidMinWordLengthForTypo,
// invalid state error
InvalidState,
@ -271,6 +272,9 @@ impl Code {
InvalidApiKeyDescription => {
ErrCode::invalid("invalid_api_key_description", StatusCode::BAD_REQUEST)
}
InvalidMinWordLengthForTypo => {
ErrCode::invalid("invalid_min_word_length_for_typo", StatusCode::BAD_REQUEST)
}
}
}

View File

@ -41,7 +41,9 @@ impl ErrorCode for MilliError<'_> {
UserError::CriterionError(_) => Code::InvalidRankingRule,
UserError::InvalidGeoField { .. } => Code::InvalidGeoField,
UserError::SortError(_) => Code::Sort,
UserError::InvalidMinTypoWordLenSetting(_, _) => unreachable!(),
UserError::InvalidMinTypoWordLenSetting(_, _) => {
Code::InvalidMinWordLengthForTypo
}
}
}
}

View File

@ -5,6 +5,7 @@ use std::ops::Deref;
use std::path::Path;
use std::sync::Arc;
use fst::IntoStreamer;
use milli::heed::{EnvOpenOptions, RoTxn};
use milli::update::{IndexerConfig, Setting};
use milli::{obkv_to_json, FieldDistribution, FieldId};
@ -17,7 +18,7 @@ use crate::EnvSizer;
use super::error::IndexError;
use super::error::Result;
use super::updates::TypoSettings;
use super::updates::{MinWordLengthTypoSetting, TypoSettings};
use super::{Checked, Settings};
pub type Document = Map<String, Value>;
@ -169,8 +170,22 @@ impl Index {
})
.collect();
let min_typo_word_len = MinWordLengthTypoSetting {
one_typo: Setting::Set(self.min_word_len_one_typo(txn)?),
two_typos: Setting::Set(self.min_word_len_two_typos(txn)?),
};
let disabled_words = self
.exact_words(txn)?
.into_stream()
.into_strs()?
.into_iter()
.collect();
let typo_tolerance = TypoSettings {
enabled: Setting::Set(self.authorize_typos(txn)?),
min_word_length_for_typo: Setting::Set(min_typo_word_len),
disable_on_words: Setting::Set(disabled_words),
};
Ok(Settings {

View File

@ -37,14 +37,33 @@ pub struct Checked;
#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq)]
pub struct Unchecked;
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct MinWordLengthTypoSetting {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub one_typo: Setting<u8>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub two_typos: Setting<u8>,
}
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct TypoSettings {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub enabled: Setting<bool>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub min_word_length_for_typo: Setting<MinWordLengthTypoSetting>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub disable_on_words: Setting<BTreeSet<String>>,
}
/// Holds all the settings for an index. `T` can either be `Checked` if they represents settings
/// whose validity is guaranteed, or `Unchecked` if they need to be validated. In the later case, a
@ -352,14 +371,44 @@ pub fn apply_settings_to_builder(
}
match settings.typo {
Setting::Set(ref value) => match value.enabled {
Setting::Set(val) => builder.set_autorize_typos(val),
Setting::Reset => builder.reset_authorize_typos(),
Setting::NotSet => (),
},
Setting::Set(ref value) => {
match value.enabled {
Setting::Set(val) => builder.set_autorize_typos(val),
Setting::Reset => builder.reset_authorize_typos(),
Setting::NotSet => (),
}
match value.min_word_length_for_typo {
Setting::Set(ref setting) => {
match setting.one_typo {
Setting::Set(val) => builder.set_min_word_len_one_typo(val),
Setting::Reset => builder.reset_min_word_len_one_typo(),
Setting::NotSet => (),
}
match setting.two_typos {
Setting::Set(val) => builder.set_min_word_len_two_typos(val),
Setting::Reset => builder.reset_min_word_len_two_typos(),
Setting::NotSet => (),
}
}
Setting::Reset => {
builder.reset_min_word_len_one_typo();
builder.reset_min_word_len_two_typos();
}
Setting::NotSet => (),
}
match value.disable_on_words {
Setting::Set(ref words) => {
builder.set_exact_words(words.clone());
}
Setting::Reset => builder.reset_exact_words(),
Setting::NotSet => (),
}
}
Setting::Reset => {
// all typo settings need to be reset here.
builder.reset_authorize_typos();
builder.reset_min_word_len_one_typo();
builder.reset_min_word_len_two_typos();
}
Setting::NotSet => (),
}