From 5a24e605728c7c6b2a80b5d90c1dc553ebe3f9ba Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 13:03:06 +0100 Subject: [PATCH] introduce word len for typo setting --- milli/src/error.rs | 2 ++ milli/src/index.rs | 41 ++++++++++++++++++++++++++++ milli/src/update/settings.rs | 53 ++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/milli/src/error.rs b/milli/src/error.rs index e6fbc0605..3ef6aa81d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -72,6 +72,7 @@ pub enum UserError { SerdeJson(serde_json::Error), SortError(SortError), UnknownInternalDocumentId { document_id: DocumentId }, + InvalidMinTypoWordSetting(u8, u8), } impl From for Error { @@ -291,6 +292,7 @@ ranking rules settings to use the sort parameter at search time.", Self::UnknownInternalDocumentId { document_id } => { write!(f, "An unknown internal document id have been used: `{}`.", document_id) } + Self::InvalidMinTypoWordSetting(one, two) => write!(f, "Invalid settings for MinWordLenForTypo, expected 0 < 1-typo < 2-typos < 255, but found 1-typo: {} and 2-typo: {}", one, two), } } } diff --git a/milli/src/index.rs b/milli/src/index.rs index badcac0e5..3c1ba948f 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,6 +23,9 @@ use crate::{ Search, StrBEU32Codec, StrStrU8Codec, BEU32, }; +pub const DEFAULT_MIN_WORD_LEN_1_TYPO: u8 = 5; +pub const DEFAULT_MIN_WORD_LEN_2_TYPOS: u8 = 9; + pub mod main_key { pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; @@ -47,6 +50,8 @@ pub mod main_key { pub const CREATED_AT_KEY: &str = "created-at"; pub const UPDATED_AT_KEY: &str = "updated-at"; pub const AUTHORIZE_TYPOS: &str = "authorize-typos"; + pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; + pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; } pub mod db_name { @@ -886,6 +891,42 @@ impl Index { Ok(()) } + + pub fn min_word_len_1_typo(&self, txn: &RoTxn) -> heed::Result { + // It 
reads the value stored under `main_key::ONE_TYPO_WORD_LEN`, which holds the minimum word + // length setting for one typo. When no value is stored, we fall back to the default, + // `DEFAULT_MIN_WORD_LEN_1_TYPO`. + Ok(self + .main + .get::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN)? + .unwrap_or(DEFAULT_MIN_WORD_LEN_1_TYPO)) + } + + pub(crate) fn put_min_word_len_1_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { + // Stores `val` under `main_key::ONE_TYPO_WORD_LEN` as the minimum word length setting for + // one typo. The value is written as-is: the `1-typo < 2-typos` ordering is checked in + // `Settings::update_min_typo_word_len`, not here. + self.main.put::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN, &val)?; + Ok(()) + } + + pub fn min_word_len_2_typo(&self, txn: &RoTxn) -> heed::Result { + // Reads the value stored under `main_key::TWO_TYPOS_WORD_LEN`, which holds the minimum word + // length setting for two typos. When no value is stored, we fall back to the default, + // `DEFAULT_MIN_WORD_LEN_2_TYPOS`. + Ok(self + .main + .get::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN)? + .unwrap_or(DEFAULT_MIN_WORD_LEN_2_TYPOS)) + } + + pub(crate) fn put_min_word_len_2_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { + // Stores `val` under `main_key::TWO_TYPOS_WORD_LEN` as the minimum word length setting for + // two typos. The value is written as-is: the `1-typo < 2-typos` ordering is checked in + // `Settings::update_min_typo_word_len`, not here. 
+ self.main.put::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?; + Ok(()) + } } #[cfg(test)] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 17924da8a..72b416b02 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -90,6 +90,8 @@ pub struct Settings<'a, 't, 'u, 'i> { synonyms: Setting>>, primary_key: Setting, authorize_typos: Setting, + min_2_typos_word_len: Setting, + min_1_typo_word_len: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -112,6 +114,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, indexer_config, + min_2_typos_word_len: Setting::Reset, + min_1_typo_word_len: Setting::Reset, } } @@ -196,6 +200,22 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.authorize_typos = Setting::Reset; } + pub fn set_min_2_typos_word_len(&mut self, val: u8) { + self.min_2_typos_word_len = Setting::Set(val); + } + + pub fn reset_min_2_typos_word_len(&mut self) { + self.min_2_typos_word_len = Setting::Reset; + } + + pub fn set_min_1_typo_word_len(&mut self, val: u8) { + self.min_1_typo_word_len = Setting::Set(val); + } + + pub fn reset_min_1_typos_word_len(&mut self) { + self.min_1_typo_word_len = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -474,6 +494,38 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + fn update_min_typo_word_len(&mut self) -> Result<()> { + match (&self.min_1_typo_word_len, &self.min_2_typos_word_len) { + (Setting::Set(one), Setting::Set(two)) => { + if one < two { + self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; + self.index.put_min_word_len_2_typo(&mut self.wtxn, *two)?; + } else { + return Err(UserError::InvalidMinTypoWordSetting(*one, *two).into()); + } + } + (Setting::Set(one), _) => { + let two = self.index.min_word_len_2_typo(&self.wtxn)?; + if *one < two { + 
self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; + } else { + return Err(UserError::InvalidMinTypoWordSetting(*one, two).into()); + } + } + (_, Setting::Set(two)) => { + let one = self.index.min_word_len_1_typo(&self.wtxn)?; + if one < *two { + self.index.put_min_word_len_2_typo(&mut self.wtxn, *two)?; + } else { + return Err(UserError::InvalidMinTypoWordSetting(one, *two).into()); + } + } + _ => (), + } + + Ok(()) + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -490,6 +542,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_criteria()?; self.update_primary_key()?; self.update_authorize_typos()?; + self.update_min_typo_word_len()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute,