473: set minimum word len for typos r=MarinPostma a=MarinPostma

this PR allows the configuration on the minimum word length for typos.

The default values are the same as previously.

## steps
- [x] introduce settings for the minimum word length for 1 and 2 typos
- [x] update the settings update flow to set this setting
- [x] create a structure `TypoConfig` to configure typo tolerance in the query builder
- [x] in `typo`, use the configuration to create the appropriate query tree node.
- [x] extend `Context` to return the setting for minimum word length for typos
- [x] return correct error message for wrong settings.
- [x] merge #469 

Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
bors[bot] 2022-04-04 17:53:14 +00:00 committed by GitHub
commit 48a5ce7434
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 301 additions and 8 deletions

View File

@ -72,6 +72,7 @@ pub enum UserError {
SerdeJson(serde_json::Error), SerdeJson(serde_json::Error),
SortError(SortError), SortError(SortError),
UnknownInternalDocumentId { document_id: DocumentId }, UnknownInternalDocumentId { document_id: DocumentId },
InvalidMinTypoWordLenSetting(u8, u8),
} }
impl From<io::Error> for Error { impl From<io::Error> for Error {
@ -291,6 +292,7 @@ ranking rules settings to use the sort parameter at search time.",
Self::UnknownInternalDocumentId { document_id } => { Self::UnknownInternalDocumentId { document_id } => {
write!(f, "An unknown internal document id have been used: `{}`.", document_id) write!(f, "An unknown internal document id have been used: `{}`.", document_id)
} }
Self::InvalidMinTypoWordLenSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {}` and twoTypos: {}`.", one, two),
} }
} }
} }

View File

@ -23,6 +23,9 @@ use crate::{
Search, StrBEU32Codec, StrStrU8Codec, BEU32, Search, StrBEU32Codec, StrStrU8Codec, BEU32,
}; };
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
pub mod main_key { pub mod main_key {
pub const CRITERIA_KEY: &str = "criteria"; pub const CRITERIA_KEY: &str = "criteria";
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
@ -47,6 +50,8 @@ pub mod main_key {
pub const CREATED_AT_KEY: &str = "created-at"; pub const CREATED_AT_KEY: &str = "created-at";
pub const UPDATED_AT_KEY: &str = "updated-at"; pub const UPDATED_AT_KEY: &str = "updated-at";
pub const AUTHORIZE_TYPOS: &str = "authorize-typos"; pub const AUTHORIZE_TYPOS: &str = "authorize-typos";
pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
} }
pub mod db_name { pub mod db_name {
@ -886,6 +891,42 @@ impl Index {
Ok(()) Ok(())
} }
pub fn min_word_len_one_typo(&self, txn: &RoTxn) -> heed::Result<u8> {
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
// identify 0 as being false, and anything else as true. The absence of a value is true,
// because by default, we authorize typos.
Ok(self
.main
.get::<_, Str, OwnedType<u8>>(txn, main_key::ONE_TYPO_WORD_LEN)?
.unwrap_or(DEFAULT_MIN_WORD_LEN_ONE_TYPO))
}
pub(crate) fn put_min_word_len_one_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> {
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
// identify 0 as being false, and anything else as true. The absence of a value is true,
// because by default, we authorize typos.
self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::ONE_TYPO_WORD_LEN, &val)?;
Ok(())
}
pub fn min_word_len_two_typos(&self, txn: &RoTxn) -> heed::Result<u8> {
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
// identify 0 as being false, and anything else as true. The absence of a value is true,
// because by default, we authorize typos.
Ok(self
.main
.get::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN)?
.unwrap_or(DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
}
pub(crate) fn put_min_word_len_two_typos(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> {
// It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We
// identify 0 as being false, and anything else as true. The absence of a value is true,
// because by default, we authorize typos.
self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?;
Ok(())
}
} }
#[cfg(test)] #[cfg(test)]
@ -896,6 +937,7 @@ pub(crate) mod tests {
use maplit::btreemap; use maplit::btreemap;
use tempfile::TempDir; use tempfile::TempDir;
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig}; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig};
use crate::Index; use crate::Index;
@ -1023,4 +1065,22 @@ pub(crate) mod tests {
let txn = index.read_txn().unwrap(); let txn = index.read_txn().unwrap();
assert!(!index.authorize_typos(&txn).unwrap()); assert!(!index.authorize_typos(&txn).unwrap());
} }
#[test]
fn set_min_word_len_for_typos() {
let index = TempIndex::new();
let mut txn = index.write_txn().unwrap();
assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO);
assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS);
index.put_min_word_len_one_typo(&mut txn, 3).unwrap();
index.put_min_word_len_two_typos(&mut txn, 15).unwrap();
txn.commit().unwrap();
let txn = index.read_txn().unwrap();
assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 3);
assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 15);
}
} }

View File

@ -155,6 +155,8 @@ trait Context {
None => Ok(None), None => Ok(None),
} }
} }
/// Returns the minimum word len for 1 and 2 typos.
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
} }
/// The query tree builder is the interface to build a query tree. /// The query tree builder is the interface to build a query tree.
@ -178,6 +180,12 @@ impl<'a> Context for QueryTreeBuilder<'a> {
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> { fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
self.index.word_documents_count(self.rtxn, word) self.index.word_documents_count(self.rtxn, word)
} }
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
let one = self.index.min_word_len_one_typo(&self.rtxn)?;
let two = self.index.min_word_len_two_typos(&self.rtxn)?;
Ok((one, two))
}
} }
impl<'a> QueryTreeBuilder<'a> { impl<'a> QueryTreeBuilder<'a> {
@ -256,14 +264,24 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
Ok(best.map(|(_, left, right)| Operation::Phrase(vec![left.to_string(), right.to_string()]))) Ok(best.map(|(_, left, right)| Operation::Phrase(vec![left.to_string(), right.to_string()])))
} }
#[derive(Clone)]
pub struct TypoConfig {
pub max_typos: u8,
pub word_len_one_typo: u8,
pub word_len_two_typo: u8,
}
/// Return the `QueryKind` of a word depending on `authorize_typos` /// Return the `QueryKind` of a word depending on `authorize_typos`
/// and the provided word length. /// and the provided word length.
fn typos(word: String, authorize_typos: bool, max_typos: u8) -> QueryKind { fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind {
if authorize_typos { if authorize_typos {
match word.chars().count() { let count = word.chars().count().min(u8::MAX as usize) as u8;
0..=4 => QueryKind::exact(word), if count < config.word_len_one_typo {
5..=8 => QueryKind::tolerant(1.min(max_typos), word), QueryKind::exact(word)
_ => QueryKind::tolerant(2.min(max_typos), word), } else if count < config.word_len_two_typo {
QueryKind::tolerant(1.min(config.max_typos), word)
} else {
QueryKind::tolerant(2.min(config.max_typos), word)
} }
} else { } else {
QueryKind::exact(word) QueryKind::exact(word)
@ -314,9 +332,11 @@ fn create_query_tree(
if let Some(child) = split_best_frequency(ctx, &word)? { if let Some(child) = split_best_frequency(ctx, &word)? {
children.push(child); children.push(child);
} }
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo };
children.push(Operation::Query(Query { children.push(Operation::Query(Query {
prefix, prefix,
kind: typos(word, authorize_typos, 2), kind: typos(word, authorize_typos, config),
})); }));
Ok(Operation::or(false, children)) Ok(Operation::or(false, children))
} }
@ -363,9 +383,13 @@ fn create_query_tree(
.collect(); .collect();
let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
let concat = words.concat(); let concat = words.concat();
let (word_len_one_typo, word_len_two_typo) =
ctx.min_word_len_for_typo()?;
let config =
TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo };
let query = Query { let query = Query {
prefix: is_prefix, prefix: is_prefix,
kind: typos(concat, authorize_typos, 1), kind: typos(concat, authorize_typos, config),
}; };
operations.push(Operation::Query(query)); operations.push(Operation::Query(query));
and_op_children.push(Operation::or(false, operations)); and_op_children.push(Operation::or(false, operations));
@ -541,6 +565,7 @@ mod test {
use rand::{Rng, SeedableRng}; use rand::{Rng, SeedableRng};
use super::*; use super::*;
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
#[derive(Debug)] #[derive(Debug)]
struct TestContext { struct TestContext {
@ -576,6 +601,10 @@ mod test {
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
Ok(self.synonyms.get(&words).cloned()) Ok(self.synonyms.get(&words).cloned())
} }
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
}
} }
impl Default for TestContext { impl Default for TestContext {
@ -1193,4 +1222,24 @@ mod test {
assert_eq!(expected, query_tree); assert_eq!(expected, query_tree);
} }
#[test]
fn test_min_word_len_typo() {
let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 };
assert_eq!(
typos("hello".to_string(), true, config.clone()),
QueryKind::Tolerant { typo: 1, word: "hello".to_string() }
);
assert_eq!(
typos("hell".to_string(), true, config.clone()),
QueryKind::exact("hell".to_string())
);
assert_eq!(
typos("verylongword".to_string(), true, config.clone()),
QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
);
}
} }

View File

@ -14,7 +14,7 @@ use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
use crate::{FieldsIdsMap, Index, Result}; use crate::{FieldsIdsMap, Index, Result};
#[derive(Debug, Clone, PartialEq)] #[derive(Debug, Clone, PartialEq, Copy)]
pub enum Setting<T> { pub enum Setting<T> {
Set(T), Set(T),
Reset, Reset,
@ -90,6 +90,8 @@ pub struct Settings<'a, 't, 'u, 'i> {
synonyms: Setting<HashMap<String, Vec<String>>>, synonyms: Setting<HashMap<String, Vec<String>>>,
primary_key: Setting<String>, primary_key: Setting<String>,
authorize_typos: Setting<bool>, authorize_typos: Setting<bool>,
min_word_len_two_typos: Setting<u8>,
min_word_len_one_typo: Setting<u8>,
} }
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@ -112,6 +114,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
primary_key: Setting::NotSet, primary_key: Setting::NotSet,
authorize_typos: Setting::NotSet, authorize_typos: Setting::NotSet,
indexer_config, indexer_config,
min_word_len_two_typos: Setting::Reset,
min_word_len_one_typo: Setting::Reset,
} }
} }
@ -196,6 +200,22 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.authorize_typos = Setting::Reset; self.authorize_typos = Setting::Reset;
} }
pub fn set_min_word_len_two_typos(&mut self, val: u8) {
self.min_word_len_two_typos = Setting::Set(val);
}
pub fn reset_min_word_len_two_typos(&mut self) {
self.min_word_len_two_typos = Setting::Reset;
}
pub fn set_min_word_len_one_typo(&mut self, val: u8) {
self.min_word_len_one_typo = Setting::Set(val);
}
pub fn reset_min_word_len_one_typo(&mut self) {
self.min_word_len_one_typo = Setting::Reset;
}
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
where where
F: Fn(UpdateIndexingStep) + Sync, F: Fn(UpdateIndexingStep) + Sync,
@ -474,6 +494,38 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} }
} }
fn update_min_typo_word_len(&mut self) -> Result<()> {
match (self.min_word_len_one_typo, self.min_word_len_two_typos) {
(Setting::Set(one), Setting::Set(two)) => {
if one > two {
return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into());
} else {
self.index.put_min_word_len_one_typo(&mut self.wtxn, one)?;
self.index.put_min_word_len_two_typos(&mut self.wtxn, two)?;
}
}
(Setting::Set(one), _) => {
let two = self.index.min_word_len_two_typos(&self.wtxn)?;
if one > two {
return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into());
} else {
self.index.put_min_word_len_one_typo(&mut self.wtxn, one)?;
}
}
(_, Setting::Set(two)) => {
let one = self.index.min_word_len_one_typo(&self.wtxn)?;
if one > two {
return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into());
} else {
self.index.put_min_word_len_two_typos(&mut self.wtxn, two)?;
}
}
_ => (),
}
Ok(())
}
pub fn execute<F>(mut self, progress_callback: F) -> Result<()> pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
where where
F: Fn(UpdateIndexingStep) + Sync, F: Fn(UpdateIndexingStep) + Sync,
@ -490,6 +542,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.update_criteria()?; self.update_criteria()?;
self.update_primary_key()?; self.update_primary_key()?;
self.update_authorize_typos()?; self.update_authorize_typos()?;
self.update_min_typo_word_len()?;
// If there is new faceted fields we indicate that we must reindex as we must // If there is new faceted fields we indicate that we must reindex as we must
// index new fields as facets. It means that the distinct attribute, // index new fields as facets. It means that the distinct attribute,
@ -1233,4 +1286,37 @@ mod tests {
builder.execute(|_| ()).unwrap(); builder.execute(|_| ()).unwrap();
assert!(!index.authorize_typos(&txn).unwrap()); assert!(!index.authorize_typos(&txn).unwrap());
} }
#[test]
fn update_min_word_len_for_typo() {
let index = TempIndex::new();
let config = IndexerConfig::default();
// Set the genres setting
let mut txn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut txn, &index, &config);
builder.set_min_word_len_one_typo(8);
builder.set_min_word_len_two_typos(8);
builder.execute(|_| ()).unwrap();
txn.commit().unwrap();
let txn = index.read_txn().unwrap();
assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 8);
assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 8);
}
#[test]
fn update_invalid_min_word_len_for_typo() {
let index = TempIndex::new();
let config = IndexerConfig::default();
// Set the genres setting
let mut txn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut txn, &index, &config);
builder.set_min_word_len_one_typo(10);
builder.set_min_word_len_two_typos(7);
assert!(builder.execute(|_| ()).is_err());
}
} }

View File

@ -16,6 +16,7 @@ mod distinct;
mod filters; mod filters;
mod query_criteria; mod query_criteria;
mod sort; mod sort;
mod typo_tolerance;
pub const TEST_QUERY: &'static str = "hello world america"; pub const TEST_QUERY: &'static str = "hello world america";

View File

@ -0,0 +1,95 @@
use milli::update::{IndexerConfig, Settings};
use milli::{Criterion, Search};
use Criterion::*;
#[test]
fn test_typo_tolerance_one_typo() {
let criteria = [Typo];
let index = super::setup_search_index_with_criteria(&criteria);
// basic typo search with default typo settings
{
let txn = index.read_txn().unwrap();
let mut search = Search::new(&txn, &index);
search.query("zeal");
search.limit(10);
search.authorize_typos(true);
search.optional_words(true);
let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1);
let mut search = Search::new(&txn, &index);
search.query("zean");
search.limit(10);
search.authorize_typos(true);
search.optional_words(true);
let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 0);
}
let mut txn = index.write_txn().unwrap();
let config = IndexerConfig::default();
let mut builder = Settings::new(&mut txn, &index, &config);
builder.set_min_word_len_one_typo(4);
builder.execute(|_| ()).unwrap();
// typo is now supported for 4 letters words
let mut search = Search::new(&txn, &index);
search.query("zean");
search.limit(10);
search.authorize_typos(true);
search.optional_words(true);
let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1);
}
#[test]
fn test_typo_tolerance_two_typo() {
let criteria = [Typo];
let index = super::setup_search_index_with_criteria(&criteria);
// basic typo search with default typo settings
{
let txn = index.read_txn().unwrap();
let mut search = Search::new(&txn, &index);
search.query("zealand");
search.limit(10);
search.authorize_typos(true);
search.optional_words(true);
let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1);
let mut search = Search::new(&txn, &index);
search.query("zealemd");
search.limit(10);
search.authorize_typos(true);
search.optional_words(true);
let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 0);
}
let mut txn = index.write_txn().unwrap();
let config = IndexerConfig::default();
let mut builder = Settings::new(&mut txn, &index, &config);
builder.set_min_word_len_two_typos(7);
builder.execute(|_| ()).unwrap();
// typo is now supported for 4 letters words
let mut search = Search::new(&txn, &index);
search.query("zealemd");
search.limit(10);
search.authorize_typos(true);
search.optional_words(true);
let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1);
}