From a1a3a49bc9493c91c38b86d11370d2c66d8d348f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 13:29:59 +0100 Subject: [PATCH] dynamic minimum word len for typos in query tree builder --- milli/src/search/query_tree.rs | 40 ++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 5437199e1..6db2ce7a7 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -155,6 +155,8 @@ trait Context { None => Ok(None), } } + /// Returns the minimum word len for 1 and 2 typos. + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; } /// The query tree builder is the interface to build a query tree. @@ -178,6 +180,12 @@ impl<'a> Context for QueryTreeBuilder<'a> { fn word_documents_count(&self, word: &str) -> heed::Result> { self.index.word_documents_count(self.rtxn, word) } + + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { + let one = self.index.min_word_len_1_typo(&self.rtxn)?; + let two = self.index.min_word_len_2_typo(&self.rtxn)?; + Ok((one, two)) + } } impl<'a> QueryTreeBuilder<'a> { @@ -256,14 +264,23 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result QueryKind { +fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { if authorize_typos { - match word.chars().count() { - 0..=4 => QueryKind::exact(word), - 5..=8 => QueryKind::tolerant(1.min(max_typos), word), - _ => QueryKind::tolerant(2.min(max_typos), word), + let count = word.chars().count().min(u8::MAX as usize) as u8; + if (0..config.word_len_1_typo).contains(&count) { + QueryKind::exact(word) + } else if (config.word_len_1_typo..config.word_len_2_typo).contains(&count) { + QueryKind::tolerant(1.min(config.max_typos), word) + } else { + QueryKind::tolerant(2.min(config.max_typos), word) } } else { QueryKind::exact(word) @@ -314,9 +331,11 @@ fn create_query_tree( if let Some(child) = split_best_frequency(ctx, &word)? { children.push(child); } + let (word_len_1_typo, word_len_2_typo) = ctx.min_word_len_for_typo()?; + let config = TypoConfig { max_typos: 2, word_len_1_typo, word_len_2_typo }; children.push(Operation::Query(Query { prefix, - kind: typos(word, authorize_typos, 2), + kind: typos(word, authorize_typos, config), })); Ok(Operation::or(false, children)) } @@ -363,9 +382,12 @@ fn create_query_tree( .collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); + let (word_len_1_typo, word_len_2_typo) = ctx.min_word_len_for_typo()?; + let config = + TypoConfig { max_typos: 1, word_len_1_typo, word_len_2_typo }; let query = Query { prefix: is_prefix, - kind: typos(concat, authorize_typos, 1), + kind: typos(concat, authorize_typos, config), }; operations.push(Operation::Query(query)); and_op_children.push(Operation::or(false, operations)); @@ -576,6 +598,10 @@ mod test { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms.get(&words).cloned()) } + + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { + Ok((5, 9)) + } } impl Default for TestContext {