From 9bbffb8fee9ab73fb59eab731f1e739c85e536dd Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 14:03:31 +0100 Subject: [PATCH 1/8] add exact words setting --- milli/src/index.rs | 22 ++++++++++++++++++++++ milli/src/update/settings.rs | 27 +++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index 853e7537d..c0be985da 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -52,6 +52,7 @@ pub mod main_key { pub const AUTHORIZE_TYPOS: &str = "authorize-typos"; pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; + pub const EXACT_WORDS: &str = "exact-words"; } pub mod db_name { @@ -927,6 +928,27 @@ impl Index { self.main.put::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?; Ok(()) } + + /// List the words on which typo are not allowed + pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result>> { + match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? 
{ + Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), + None => Ok(fst::Set::default().map_data(Cow::Owned)?), + } + } + + pub(crate) fn put_exact_words>( + &self, + txn: &mut RwTxn, + words: &fst::Set, + ) -> Result<()> { + self.main.put::<_, Str, ByteSlice>( + txn, + main_key::EXACT_WORDS, + words.as_fst().as_bytes(), + )?; + Ok(()) + } } #[cfg(test)] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c03d6e0ae..513dee42c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -92,6 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> { authorize_typos: Setting, min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, + exact_words: Setting>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -113,6 +114,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { synonyms: Setting::NotSet, primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, + exact_words: Setting::NotSet, indexer_config, min_word_len_two_typos: Setting::Reset, min_word_len_one_typo: Setting::Reset, @@ -216,6 +218,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.min_word_len_one_typo = Setting::Reset; } + pub fn set_exact_words(&mut self, words: Vec) { + self.exact_words = Setting::Set(words); + } + + pub fn reset_exact_words(&mut self) { + self.exact_words = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -526,6 +536,22 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_exact_words(&mut self) -> Result<()> { + match self.exact_words { + Setting::Set(ref mut words) => { + words.sort_unstable(); + let words = fst::Set::from_iter(words)?; + self.index.put_exact_words(&mut self.wtxn, &words)?; + } + Setting::Reset => { + self.index.put_exact_words(&mut self.wtxn, &fst::Set::default())?; + } + Setting::NotSet => (), + } + + Ok(()) + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where 
F: Fn(UpdateIndexingStep) + Sync, @@ -543,6 +569,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_primary_key()?; self.update_authorize_typos()?; self.update_min_typo_word_len()?; + self.update_exact_words()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, From 774fa8f06578d7dd0d660efe2f084429f4fb31c6 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 16:25:15 +0100 Subject: [PATCH 2/8] disable typos on exact words --- milli/src/search/query_tree.rs | 35 +++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 934d2fd9b..a31a71590 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,4 +1,4 @@ -use std::{cmp, fmt, mem}; +use std::{borrow::Cow, cmp, fmt, mem}; use fst::Set; use meilisearch_tokenizer::token::SeparatorKind; @@ -157,6 +157,7 @@ trait Context { } /// Returns the minimum word len for 1 and 2 typos. fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; + fn exact_words(&self) -> crate::Result>>; } /// The query tree builder is the interface to build a query tree. @@ -186,6 +187,10 @@ impl<'a> Context for QueryTreeBuilder<'a> { let two = self.index.min_word_len_two_typos(&self.rtxn)?; Ok((one, two)) } + + fn exact_words(&self) -> crate::Result>> { + self.index.exact_words(self.rtxn) + } } impl<'a> QueryTreeBuilder<'a> { @@ -265,15 +270,16 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result { pub max_typos: u8, pub word_len_one_typo: u8, pub word_len_two_typo: u8, + pub exact_words: fst::Set>, } /// Return the `QueryKind` of a word depending on `authorize_typos` /// and the provided word length. 
-fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { +fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind { if authorize_typos { let count = word.chars().count().min(u8::MAX as usize) as u8; if count < config.word_len_one_typo { @@ -333,7 +339,9 @@ fn create_query_tree( children.push(child); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo }; + let exact_words = ctx.exact_words()?; + let config = + TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos, config), @@ -385,8 +393,13 @@ fn create_query_tree( let concat = words.concat(); let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let config = - TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo }; + let exact_words = ctx.exact_words()?; + let config = TypoConfig { + max_typos: 1, + word_len_one_typo, + word_len_two_typo, + exact_words, + }; let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos, config), @@ -605,6 +618,12 @@ mod test { fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) } + + fn exact_words(&self) -> crate::Result>> { + let builder = fst::SetBuilder::new(Vec::new()).unwrap(); + let data = builder.into_inner().unwrap(); + Ok(fst::Set::new(Cow::Owned(data)).unwrap()) + } } impl Default for TestContext { @@ -1225,7 +1244,9 @@ mod test { #[test] fn test_min_word_len_typo() { - let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 }; + let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap(); + let config = + TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words }; assert_eq!( typos("hello".to_string(), true, 
config.clone()), From 8b1e5d9c6d654be95159519e8c233e8868694e1b Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 22 Mar 2022 09:55:49 +0100 Subject: [PATCH 3/8] add test for exact words --- milli/src/search/query_tree.rs | 25 ++++++++++++++++++++++--- milli/src/update/settings.rs | 9 ++++----- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index a31a71590..0014075d4 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -584,6 +584,8 @@ mod test { struct TestContext { synonyms: HashMap, Vec>>, postings: HashMap, + // Raw bytes for the exact word fst Set + exact_words: Vec, } impl TestContext { @@ -620,9 +622,7 @@ mod test { } fn exact_words(&self) -> crate::Result>> { - let builder = fst::SetBuilder::new(Vec::new()).unwrap(); - let data = builder.into_inner().unwrap(); - Ok(fst::Set::new(Cow::Owned(data)).unwrap()) + Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap()) } } @@ -640,6 +640,8 @@ mod test { RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() } + let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap(); + TestContext { synonyms: hashmap! 
{ vec![String::from("hello")] => vec![ @@ -679,6 +681,7 @@ mod test { String::from("good") => random_postings(rng, 1250), String::from("morning") => random_postings(rng, 125), }, + exact_words, } } } @@ -1263,4 +1266,20 @@ mod test { QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() } ); } + #[test] + fn disable_typo_on_word() { + let query = "goodbye"; + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + let result = analyzer.analyze(query); + + let tokens = result.tokens(); + let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); + let context = TestContext { exact_words, ..Default::default() }; + let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap(); + + assert!(matches!( + query_tree, + Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } }) + )); + } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 513dee42c..503fbd06e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -92,7 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> { authorize_typos: Setting, min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, - exact_words: Setting>, + exact_words: Setting>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -115,9 +115,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, exact_words: Setting::NotSet, - indexer_config, min_word_len_two_typos: Setting::Reset, min_word_len_one_typo: Setting::Reset, + indexer_config, } } @@ -218,7 +218,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.min_word_len_one_typo = Setting::Reset; } - pub fn set_exact_words(&mut self, words: Vec) { + pub fn set_exact_words(&mut self, words: BTreeSet) { self.exact_words = Setting::Set(words); } @@ -539,8 +539,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_exact_words(&mut self) -> Result<()> { match self.exact_words { Setting::Set(ref 
mut words) => { - words.sort_unstable(); - let words = fst::Set::from_iter(words)?; + let words = fst::Set::from_iter(words.iter())?; self.index.put_exact_words(&mut self.wtxn, &words)?; } Setting::Reset => { From 559e46be5e23d5fca45856a5201dfa223bfa3d29 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 11:02:43 +0200 Subject: [PATCH 4/8] fix bad rebase bug --- milli/src/search/query_tree.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 0014075d4..585f4fbf3 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -280,7 +280,7 @@ pub struct TypoConfig<'a> { /// Return the `QueryKind` of a word depending on `authorize_typos` /// and the provided word length. fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind { - if authorize_typos { + if authorize_typos && !config.exact_words.contains(&word) { let count = word.chars().count().min(u8::MAX as usize) as u8; if count < config.word_len_one_typo { QueryKind::exact(word) @@ -1278,7 +1278,7 @@ mod test { let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap(); assert!(matches!( - query_tree, + dbg!(query_tree), Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. 
} }) )); } From 0fd55db21c1fbe3471e8210b943dc426f9741b5e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 11:52:35 +0200 Subject: [PATCH 5/8] fmt --- milli/src/search/query_tree.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 585f4fbf3..4eccae8ce 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,4 +1,5 @@ -use std::{borrow::Cow, cmp, fmt, mem}; +use std::borrow::Cow; +use std::{cmp, fmt, mem}; use fst::Set; use meilisearch_tokenizer::token::SeparatorKind; @@ -1278,7 +1279,7 @@ mod test { let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap(); assert!(matches!( - dbg!(query_tree), + query_tree, Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } }) )); } From 30a2711bacfdfbe1fee4a9b52a840c1c7b890c8e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 13:51:50 +0200 Subject: [PATCH 6/8] rename serde module to serde_impl module needed because of issues with rustfmt --- milli/src/documents/builder.rs | 2 +- milli/src/documents/mod.rs | 4 ++-- milli/src/documents/{serde.rs => serde_impl.rs} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename milli/src/documents/{serde.rs => serde_impl.rs} (100%) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 2860c4b86..2be7c1dd8 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -6,7 +6,7 @@ use byteorder::{BigEndian, WriteBytesExt}; use serde::Deserializer; use serde_json::Value; -use super::serde::DocumentVisitor; +use super::serde_impl::DocumentVisitor; use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; use crate::FieldId; diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 14d97ee7d..8fd018328 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -5,15 +5,15 @@ mod builder; /// The 
`DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can /// later be read by milli using the `DocumentBatchReader` interface. mod reader; -mod serde; +mod serde_impl; use std::fmt::{self, Debug}; use std::io; -use ::serde::{Deserialize, Serialize}; use bimap::BiHashMap; pub use builder::DocumentBatchBuilder; pub use reader::DocumentBatchReader; +use serde::{Deserialize, Serialize}; use crate::FieldId; diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde_impl.rs similarity index 100% rename from milli/src/documents/serde.rs rename to milli/src/documents/serde_impl.rs From 284d8a24e0caf2376a68aeb1fd63691e4b2270c9 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 13:59:29 +0200 Subject: [PATCH 7/8] add integration test for disabled typos on word --- milli/tests/search/typo_tolerance.rs | 81 +++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 00e6853cc..7d19e4ab0 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -1,5 +1,10 @@ -use milli::update::{IndexerConfig, Settings}; -use milli::{Criterion, Search}; +use std::collections::BTreeSet; + +use heed::EnvOpenOptions; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; +use milli::{Criterion, Index, Search}; +use serde_json::json; +use tempfile::tempdir; use Criterion::*; #[test] @@ -93,3 +98,75 @@ fn test_typo_tolerance_two_typo() { let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); } + +#[test] +fn test_typo_disabled_on_word() { + let tmp = tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(4096 * 100); + let index = Index::new(options, tmp.path()).unwrap(); + + let documents = json!([ + { + "id": 1usize, + "data": "zealand", + }, + { + "id": 2usize, + "data": "zearand", + }, + ]); + + let mut
writer = std::io::Cursor::new(Vec::new()); + let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); + let documents = serde_json::to_vec(&documents).unwrap(); + builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); + builder.finish().unwrap(); + + writer.set_position(0); + + let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap(); + + let mut txn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()); + + builder.add_documents(documents).unwrap(); + + builder.execute().unwrap(); + txn.commit().unwrap(); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + let mut exact_words = BTreeSet::new(); + // sealand doesn't allow typos anymore + exact_words.insert("zealand".to_string()); + builder.set_exact_words(exact_words); + builder.execute(|_| ()).unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); +} From 3e67d8818cc84127881883f0dca17e95e365b511 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 20:34:23 +0200 Subject: [PATCH 8/8] fix typo in test comment --- milli/tests/search/typo_tolerance.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 7d19e4ab0..df15fb768 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -156,7 +156,7 @@ fn test_typo_disabled_on_word() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); let mut exact_words = BTreeSet::new(); - // sealand doesn't allow typos anymore + // `zealand` doesn't allow typos anymore exact_words.insert("zealand".to_string()); builder.set_exact_words(exact_words); builder.execute(|_| ()).unwrap();