diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 2860c4b86..2be7c1dd8 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -6,7 +6,7 @@ use byteorder::{BigEndian, WriteBytesExt}; use serde::Deserializer; use serde_json::Value; -use super::serde::DocumentVisitor; +use super::serde_impl::DocumentVisitor; use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; use crate::FieldId; diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 14d97ee7d..8fd018328 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -5,15 +5,15 @@ mod builder; /// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can /// later be read by milli using the `DocumentBatchReader` interface. mod reader; -mod serde; +mod serde_impl; use std::fmt::{self, Debug}; use std::io; -use ::serde::{Deserialize, Serialize}; use bimap::BiHashMap; pub use builder::DocumentBatchBuilder; pub use reader::DocumentBatchReader; +use serde::{Deserialize, Serialize}; use crate::FieldId; diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde_impl.rs similarity index 100% rename from milli/src/documents/serde.rs rename to milli/src/documents/serde_impl.rs diff --git a/milli/src/index.rs b/milli/src/index.rs index 853e7537d..c0be985da 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -52,6 +52,7 @@ pub mod main_key { pub const AUTHORIZE_TYPOS: &str = "authorize-typos"; pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; + pub const EXACT_WORDS: &str = "exact-words"; } pub mod db_name { @@ -927,6 +928,27 @@ impl Index { self.main.put::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?; Ok(()) } + + /// List the words on which typo are not allowed + pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result>> { + match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? { + Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), + None => Ok(fst::Set::default().map_data(Cow::Owned)?), + } + } + + pub(crate) fn put_exact_words>( + &self, + txn: &mut RwTxn, + words: &fst::Set, + ) -> Result<()> { + self.main.put::<_, Str, ByteSlice>( + txn, + main_key::EXACT_WORDS, + words.as_fst().as_bytes(), + )?; + Ok(()) + } } #[cfg(test)] diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 934d2fd9b..4eccae8ce 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::{cmp, fmt, mem}; use fst::Set; @@ -157,6 +158,7 @@ trait Context { } /// Returns the minimum word len for 1 and 2 typos. fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; + fn exact_words(&self) -> crate::Result>>; } /// The query tree builder is the interface to build a query tree. @@ -186,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> { let two = self.index.min_word_len_two_typos(&self.rtxn)?; Ok((one, two)) } + + fn exact_words(&self) -> crate::Result>> { + self.index.exact_words(self.rtxn) + } } impl<'a> QueryTreeBuilder<'a> { @@ -265,16 +271,17 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result { pub max_typos: u8, pub word_len_one_typo: u8, pub word_len_two_typo: u8, + pub exact_words: fst::Set>, } /// Return the `QueryKind` of a word depending on `authorize_typos` /// and the provided word length. -fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { - if authorize_typos { +fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind { + if authorize_typos && !config.exact_words.contains(&word) { let count = word.chars().count().min(u8::MAX as usize) as u8; if count < config.word_len_one_typo { QueryKind::exact(word) @@ -333,7 +340,9 @@ fn create_query_tree( children.push(child); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo }; + let exact_words = ctx.exact_words()?; + let config = + TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos, config), @@ -385,8 +394,13 @@ fn create_query_tree( let concat = words.concat(); let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let config = - TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo }; + let exact_words = ctx.exact_words()?; + let config = TypoConfig { + max_typos: 1, + word_len_one_typo, + word_len_two_typo, + exact_words, + }; let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos, config), @@ -571,6 +585,8 @@ mod test { struct TestContext { synonyms: HashMap, Vec>>, postings: HashMap, + // Raw bytes for the exact word fst Set + exact_words: Vec, } impl TestContext { @@ -605,6 +621,10 @@ mod test { fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) } + + fn exact_words(&self) -> crate::Result>> { + Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap()) + } } impl Default for TestContext { @@ -621,6 +641,8 @@ mod test { RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() } + let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap(); + TestContext { synonyms: hashmap! { vec![String::from("hello")] => vec![ @@ -660,6 +682,7 @@ mod test { String::from("good") => random_postings(rng, 1250), String::from("morning") => random_postings(rng, 125), }, + exact_words, } } } @@ -1225,7 +1248,9 @@ mod test { #[test] fn test_min_word_len_typo() { - let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 }; + let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap(); + let config = + TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words }; assert_eq!( typos("hello".to_string(), true, config.clone()), @@ -1242,4 +1267,20 @@ mod test { QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() } ); } + #[test] + fn disable_typo_on_word() { + let query = "goodbye"; + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + let result = analyzer.analyze(query); + + let tokens = result.tokens(); + let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); + let context = TestContext { exact_words, ..Default::default() }; + let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap(); + + assert!(matches!( + query_tree, + Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } }) + )); + } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c03d6e0ae..503fbd06e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -92,6 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> { authorize_typos: Setting, min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, + exact_words: Setting>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -113,9 +114,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { synonyms: Setting::NotSet, primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, - indexer_config, + exact_words: Setting::NotSet, min_word_len_two_typos: Setting::Reset, min_word_len_one_typo: Setting::Reset, + indexer_config, } } @@ -216,6 +218,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.min_word_len_one_typo = Setting::Reset; } + pub fn set_exact_words(&mut self, words: BTreeSet) { + self.exact_words = Setting::Set(words); + } + + pub fn reset_exact_words(&mut self) { + self.exact_words = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -526,6 +536,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_exact_words(&mut self) -> Result<()> { + match self.exact_words { + Setting::Set(ref mut words) => { + let words = fst::Set::from_iter(words.iter())?; + self.index.put_exact_words(&mut self.wtxn, &words)?; + } + Setting::Reset => { + self.index.put_exact_words(&mut self.wtxn, &fst::Set::default())?; + } + Setting::NotSet => (), + } + + Ok(()) + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -543,6 +568,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_primary_key()?; self.update_authorize_typos()?; self.update_min_typo_word_len()?; + self.update_exact_words()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 00e6853cc..df15fb768 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -1,5 +1,10 @@ -use milli::update::{IndexerConfig, Settings}; -use milli::{Criterion, Search}; +use std::collections::BTreeSet; + +use heed::EnvOpenOptions; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; +use milli::{Criterion, Index, Search}; +use serde_json::json; +use tempfile::tempdir; use Criterion::*; #[test] @@ -93,3 +98,75 @@ fn test_typo_tolerance_two_typo() { let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); } + +#[test] +fn test_typo_disabled_on_word() { + let tmp = tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(4096 * 100); + let index = Index::new(options, tmp.path()).unwrap(); + + let documents = json!([ + { + "id": 1usize, + "data": "zealand", + }, + { + "id": 2usize, + "data": "zearand", + }, + ]); + + let mut writer = std::io::Cursor::new(Vec::new()); + let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); + let documents = serde_json::to_vec(&documents).unwrap(); + builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); + builder.finish().unwrap(); + + writer.set_position(0); + + let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap(); + + let mut txn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()); + + builder.add_documents(documents).unwrap(); + + builder.execute().unwrap(); + txn.commit().unwrap(); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + let mut exact_words = BTreeSet::new(); + // `zealand` doesn't allow typos anymore + exact_words.insert("zealand".to_string()); + builder.set_exact_words(exact_words); + builder.execute(|_| ()).unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); +}