introduce word len for typo setting

This commit is contained in:
ad hoc 2022-03-21 13:03:06 +01:00
parent 9fe40df960
commit 5a24e60572
No known key found for this signature in database
GPG Key ID: 4F00A782990CC643
3 changed files with 96 additions and 0 deletions

View File

@ -72,6 +72,7 @@ pub enum UserError {
SerdeJson(serde_json::Error),
SortError(SortError),
UnknownInternalDocumentId { document_id: DocumentId },
InvalidMinTypoWordSetting(u8, u8),
}
impl From<io::Error> for Error {
@ -291,6 +292,7 @@ ranking rules settings to use the sort parameter at search time.",
Self::UnknownInternalDocumentId { document_id } => {
write!(f, "An unknown internal document id have been used: `{}`.", document_id)
}
Self::InvalidMinTypoWordSetting(one, two) => write!(f, "Invalid settings for MinWordLenForTypo, expected 0 < 1-typo < 2-typos < 255, but found 1-typo: {} and 2-typo: {}", one, two),
}
}
}

View File

@ -23,6 +23,9 @@ use crate::{
Search, StrBEU32Codec, StrStrU8Codec, BEU32,
};
/// Fallback returned by `Index::min_word_len_1_typo` when no value is stored under
/// `main_key::ONE_TYPO_WORD_LEN`.
// NOTE(review): presumably the minimum length a word needs before one typo is tolerated —
// confirm against the search-side consumer.
pub const DEFAULT_MIN_WORD_LEN_1_TYPO: u8 = 5;
/// Fallback returned by `Index::min_word_len_2_typo` when no value is stored under
/// `main_key::TWO_TYPOS_WORD_LEN`.
// NOTE(review): presumably the minimum length a word needs before two typos are tolerated —
// confirm against the search-side consumer.
pub const DEFAULT_MIN_WORD_LEN_2_TYPOS: u8 = 9;
pub mod main_key {
pub const CRITERIA_KEY: &str = "criteria";
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
@ -47,6 +50,8 @@ pub mod main_key {
pub const CREATED_AT_KEY: &str = "created-at";
pub const UPDATED_AT_KEY: &str = "updated-at";
pub const AUTHORIZE_TYPOS: &str = "authorize-typos";
pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
}
pub mod db_name {
@ -886,6 +891,42 @@ impl Index {
Ok(())
}
/// Returns the 1-typo word length stored in the `main` database.
///
/// The value is read as a plain `u8` from the `main_key::ONE_TYPO_WORD_LEN` entry. When
/// that entry is absent, `DEFAULT_MIN_WORD_LEN_1_TYPO` is returned instead.
///
/// # Errors
/// Propagates any `heed` error raised while reading the entry.
pub fn min_word_len_1_typo(&self, txn: &RoTxn) -> heed::Result<u8> {
    let stored = self.main.get::<_, Str, OwnedType<u8>>(txn, main_key::ONE_TYPO_WORD_LEN)?;
    // A missing key means the setting was never written: use the crate default.
    Ok(stored.unwrap_or(DEFAULT_MIN_WORD_LEN_1_TYPO))
}
/// Stores `val` as the 1-typo word length under `main_key::ONE_TYPO_WORD_LEN`.
///
/// The value is written as-is; this method performs no validation. The ordering check
/// against the 2-typos length is done by `Settings::update_min_typo_word_len` before it
/// calls this method.
///
/// # Errors
/// Propagates any `heed` error raised while writing the entry.
pub(crate) fn put_min_word_len_1_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> {
    self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::ONE_TYPO_WORD_LEN, &val)
}
/// Returns the 2-typos word length stored in the `main` database.
///
/// The value is read as a plain `u8` from the `main_key::TWO_TYPOS_WORD_LEN` entry. When
/// that entry is absent, `DEFAULT_MIN_WORD_LEN_2_TYPOS` is returned instead.
///
/// # Errors
/// Propagates any `heed` error raised while reading the entry.
pub fn min_word_len_2_typo(&self, txn: &RoTxn) -> heed::Result<u8> {
    let stored = self.main.get::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN)?;
    // A missing key means the setting was never written: use the crate default.
    Ok(stored.unwrap_or(DEFAULT_MIN_WORD_LEN_2_TYPOS))
}
/// Stores `val` as the 2-typos word length under `main_key::TWO_TYPOS_WORD_LEN`.
///
/// The value is written as-is; this method performs no validation. The ordering check
/// against the 1-typo length is done by `Settings::update_min_typo_word_len` before it
/// calls this method.
///
/// # Errors
/// Propagates any `heed` error raised while writing the entry.
pub(crate) fn put_min_word_len_2_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> {
    self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)
}
}
#[cfg(test)]

View File

@ -90,6 +90,8 @@ pub struct Settings<'a, 't, 'u, 'i> {
synonyms: Setting<HashMap<String, Vec<String>>>,
primary_key: Setting<String>,
authorize_typos: Setting<bool>,
min_2_typos_word_len: Setting<u8>,
min_1_typo_word_len: Setting<u8>,
}
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@ -112,6 +114,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
primary_key: Setting::NotSet,
authorize_typos: Setting::NotSet,
indexer_config,
min_2_typos_word_len: Setting::Reset,
min_1_typo_word_len: Setting::Reset,
}
}
@ -196,6 +200,22 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.authorize_typos = Setting::Reset;
}
/// Requests that the 2-typos word length be changed to `val`.
///
/// Nothing is written here: the value is only recorded on the builder. It is validated and
/// persisted by `update_min_typo_word_len`, which `execute` calls, and which rejects the
/// update unless the 1-typo length is strictly smaller than `val`.
pub fn set_min_2_typos_word_len(&mut self, val: u8) {
self.min_2_typos_word_len = Setting::Set(val);
}
/// Marks the 2-typos word length as `Setting::Reset` on the builder.
///
// NOTE(review): `update_min_typo_word_len` only acts on `Setting::Set`, so a `Reset` here
// currently leaves the stored value untouched rather than restoring the default — confirm
// whether that is intended.
pub fn reset_min_2_typos_word_len(&mut self) {
self.min_2_typos_word_len = Setting::Reset;
}
/// Requests that the 1-typo word length be changed to `val`.
///
/// Nothing is written here: the value is only recorded on the builder. It is validated and
/// persisted by `update_min_typo_word_len`, which `execute` calls, and which rejects the
/// update unless `val` is strictly smaller than the 2-typos length.
pub fn set_min_1_typo_word_len(&mut self, val: u8) {
self.min_1_typo_word_len = Setting::Set(val);
}
/// Marks the 1-typo word length as `Setting::Reset` on the builder.
///
/// This is the consistently named counterpart of `set_min_1_typo_word_len`.
///
// NOTE(review): `update_min_typo_word_len` only acts on `Setting::Set`, so a `Reset` here
// currently leaves the stored value untouched rather than restoring the default — confirm
// whether that is intended.
pub fn reset_min_1_typo_word_len(&mut self) {
    self.min_1_typo_word_len = Setting::Reset;
}

/// Alias of `reset_min_1_typo_word_len`, kept so existing callers keep compiling.
///
/// The name uses `typos` where the matching setter and the field use `typo`; prefer
/// `reset_min_1_typo_word_len` in new code.
pub fn reset_min_1_typos_word_len(&mut self) {
    self.reset_min_1_typo_word_len();
}
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
where
F: Fn(UpdateIndexingStep) + Sync,
@ -474,6 +494,38 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
}
/// Validates and persists the requested 1-typo / 2-typos word lengths.
///
/// Only `Setting::Set` values are treated as a request. When a single length is requested,
/// it is checked against the counterpart currently returned by the index (stored value or
/// default). The update is accepted only if the 1-typo length is strictly smaller than the
/// 2-typos length; only the requested lengths are then written.
///
/// # Errors
/// Returns `UserError::InvalidMinTypoWordSetting` carrying the offending pair when the
/// ordering check fails, and propagates any error from reading or writing the index.
fn update_min_typo_word_len(&mut self) -> Result<()> {
    // Any variant other than `Set` (including `Reset`) is handled as "leave untouched".
    let requested_one = match self.min_1_typo_word_len {
        Setting::Set(len) => Some(len),
        _ => None,
    };
    let requested_two = match self.min_2_typos_word_len {
        Setting::Set(len) => Some(len),
        _ => None,
    };

    // Build the pair to validate, reading the missing side from the index if needed.
    let (one, two) = match (requested_one, requested_two) {
        (None, None) => return Ok(()),
        (Some(one), Some(two)) => (one, two),
        (Some(one), None) => (one, self.index.min_word_len_2_typo(&self.wtxn)?),
        (None, Some(two)) => (self.index.min_word_len_1_typo(&self.wtxn)?, two),
    };

    if one >= two {
        return Err(UserError::InvalidMinTypoWordSetting(one, two).into());
    }

    // Write back only what the caller asked to change, 1-typo first.
    if requested_one.is_some() {
        self.index.put_min_word_len_1_typo(&mut self.wtxn, one)?;
    }
    if requested_two.is_some() {
        self.index.put_min_word_len_2_typo(&mut self.wtxn, two)?;
    }
    Ok(())
}
pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
where
F: Fn(UpdateIndexingStep) + Sync,
@ -490,6 +542,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.update_criteria()?;
self.update_primary_key()?;
self.update_authorize_typos()?;
self.update_min_typo_word_len()?;
// If there is new faceted fields we indicate that we must reindex as we must
// index new fields as facets. It means that the distinct attribute,