Merge #474

474: Disable typos on exact word r=MarinPostma a=MarinPostma This PR introduces the `exact_words` setting to disable typo tolerance on custom words. If a user query contains a word from `exact_words`, no typo derivation will be made for that particular word. I have chosen to store the words in an FST, to save on deserialization and allow for fast lookups. I had some trouble with the `serde` module, and had to rename it `serde_impl`. ## steps: - [x] introduce new settings to register words to disable typos on - [x] in `typos`, return an exact match if the current word is one of the words to disable typos for. - [x] update `Context` to return the exact words dictionary. - [x] merge #473 Co-authored-by: ad hoc <postma.marin@protonmail.com>
2025-07-04 20:37:15 +02:00 · 2022-04-04 18:39:43 +00:00 · 2022-04-04 18:39:43 +00:00 · 900825bac0
commit 900825bac0
parent 48a5ce7434 3e67d8818c
7 changed files with 179 additions and 13 deletions
--- a/milli/src/documents/builder.rs
+++ b/milli/src/documents/builder.rs
@ -6,7 +6,7 @@ use byteorder::{BigEndian, WriteBytesExt};
 use serde::Deserializer;
 use serde_json::Value;

-use super::serde::DocumentVisitor;
+use super::serde_impl::DocumentVisitor;
 use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
 use crate::FieldId;

--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@ -5,15 +5,15 @@ mod builder;
 /// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can
 /// later be read by milli using the `DocumentBatchReader` interface.
 mod reader;
-mod serde;
+mod serde_impl;

 use std::fmt::{self, Debug};
 use std::io;

-use ::serde::{Deserialize, Serialize};
 use bimap::BiHashMap;
 pub use builder::DocumentBatchBuilder;
 pub use reader::DocumentBatchReader;
+use serde::{Deserialize, Serialize};

 use crate::FieldId;

--- a/milli/src/documents/serde_impl.rs
+++ b/milli/src/documents/serde_impl.rs
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -52,6 +52,7 @@ pub mod main_key {
    pub const AUTHORIZE_TYPOS: &str = "authorize-typos";
    pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
    pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
+    pub const EXACT_WORDS: &str = "exact-words";
 }

 pub mod db_name {
@ -927,6 +928,27 @@ impl Index {
        self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?;
        Ok(())
    }
+
+    /// List the words on which typo are not allowed
+    pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
+        match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? {
+            Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
+            None => Ok(fst::Set::default().map_data(Cow::Owned)?),
+        }
+    }
+
+    pub(crate) fn put_exact_words<A: AsRef<[u8]>>(
+        &self,
+        txn: &mut RwTxn,
+        words: &fst::Set<A>,
+    ) -> Result<()> {
+        self.main.put::<_, Str, ByteSlice>(
+            txn,
+            main_key::EXACT_WORDS,
+            words.as_fst().as_bytes(),
+        )?;
+        Ok(())
+    }
 }

 #[cfg(test)]
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::{cmp, fmt, mem};

 use fst::Set;
@ -157,6 +158,7 @@ trait Context {
    }
    /// Returns the minimum word len for 1 and 2 typos.
    fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
+    fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>>;
 }

 /// The query tree builder is the interface to build a query tree.
@ -186,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
        let two = self.index.min_word_len_two_typos(&self.rtxn)?;
        Ok((one, two))
    }
+
+    fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
+        self.index.exact_words(self.rtxn)
+    }
 }

 impl<'a> QueryTreeBuilder<'a> {
@ -265,16 +271,17 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
 }

 #[derive(Clone)]
-pub struct TypoConfig {
+pub struct TypoConfig<'a> {
    pub max_typos: u8,
    pub word_len_one_typo: u8,
    pub word_len_two_typo: u8,
+    pub exact_words: fst::Set<Cow<'a, [u8]>>,
 }

 /// Return the `QueryKind` of a word depending on `authorize_typos`
 /// and the provided word length.
-fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind {
-    if authorize_typos {
+fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind {
+    if authorize_typos && !config.exact_words.contains(&word) {
        let count = word.chars().count().min(u8::MAX as usize) as u8;
        if count < config.word_len_one_typo {
            QueryKind::exact(word)
@ -333,7 +340,9 @@ fn create_query_tree(
                    children.push(child);
                }
                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
-                let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo };
+                let exact_words = ctx.exact_words()?;
+                let config =
+                    TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
                children.push(Operation::Query(Query {
                    prefix,
                    kind: typos(word, authorize_typos, config),
@ -385,8 +394,13 @@ fn create_query_tree(
                            let concat = words.concat();
                            let (word_len_one_typo, word_len_two_typo) =
                                ctx.min_word_len_for_typo()?;
-                            let config =
-                                TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo };
+                            let exact_words = ctx.exact_words()?;
+                            let config = TypoConfig {
+                                max_typos: 1,
+                                word_len_one_typo,
+                                word_len_two_typo,
+                                exact_words,
+                            };
                            let query = Query {
                                prefix: is_prefix,
                                kind: typos(concat, authorize_typos, config),
@ -571,6 +585,8 @@ mod test {
    struct TestContext {
        synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
        postings: HashMap<String, RoaringBitmap>,
+        // Raw bytes for the exact word fst Set
+        exact_words: Vec<u8>,
    }

    impl TestContext {
@ -605,6 +621,10 @@ mod test {
        fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
            Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
        }
+
+        fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
+            Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap())
+        }
    }

    impl Default for TestContext {
@ -621,6 +641,8 @@ mod test {
                RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap()
            }

+            let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
+
            TestContext {
                synonyms: hashmap! {
                    vec![String::from("hello")] => vec![
@ -660,6 +682,7 @@ mod test {
                    String::from("good")       => random_postings(rng,   1250),
                    String::from("morning")    => random_postings(rng,    125),
                },
+                exact_words,
            }
        }
    }
@ -1225,7 +1248,9 @@ mod test {

    #[test]
    fn test_min_word_len_typo() {
-        let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 };
+        let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
+        let config =
+            TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words };

        assert_eq!(
            typos("hello".to_string(), true, config.clone()),
@ -1242,4 +1267,20 @@ mod test {
            QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
        );
    }
+    #[test]
+    fn disable_typo_on_word() {
+        let query = "goodbye";
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
+        let result = analyzer.analyze(query);
+
+        let tokens = result.tokens();
+        let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
+        let context = TestContext { exact_words, ..Default::default() };
+        let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap();
+
+        assert!(matches!(
+            query_tree,
+            Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
+        ));
+    }
 }
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@ -92,6 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
    authorize_typos: Setting<bool>,
    min_word_len_two_typos: Setting<u8>,
    min_word_len_one_typo: Setting<u8>,
+    exact_words: Setting<BTreeSet<String>>,
 }

 impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@ -113,9 +114,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
            synonyms: Setting::NotSet,
            primary_key: Setting::NotSet,
            authorize_typos: Setting::NotSet,
-            indexer_config,
+            exact_words: Setting::NotSet,
            min_word_len_two_typos: Setting::Reset,
            min_word_len_one_typo: Setting::Reset,
+            indexer_config,
        }
    }

@ -216,6 +218,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        self.min_word_len_one_typo = Setting::Reset;
    }

+    pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
+        self.exact_words = Setting::Set(words);
+    }
+
+    pub fn reset_exact_words(&mut self) {
+        self.exact_words = Setting::Reset;
+    }
+
    fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
    where
        F: Fn(UpdateIndexingStep) + Sync,
@ -526,6 +536,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        Ok(())
    }

+    fn update_exact_words(&mut self) -> Result<()> {
+        match self.exact_words {
+            Setting::Set(ref mut words) => {
+                let words = fst::Set::from_iter(words.iter())?;
+                self.index.put_exact_words(&mut self.wtxn, &words)?;
+            }
+            Setting::Reset => {
+                self.index.put_exact_words(&mut self.wtxn, &fst::Set::default())?;
+            }
+            Setting::NotSet => (),
+        }
+
+        Ok(())
+    }
+
    pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
    where
        F: Fn(UpdateIndexingStep) + Sync,
@ -543,6 +568,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        self.update_primary_key()?;
        self.update_authorize_typos()?;
        self.update_min_typo_word_len()?;
+        self.update_exact_words()?;

        // If there is new faceted fields we indicate that we must reindex as we must
        // index new fields as facets. It means that the distinct attribute,
--- a/milli/tests/search/typo_tolerance.rs
+++ b/milli/tests/search/typo_tolerance.rs
@ -1,5 +1,10 @@
-use milli::update::{IndexerConfig, Settings};
-use milli::{Criterion, Search};
+use std::collections::BTreeSet;
+
+use heed::EnvOpenOptions;
+use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
+use milli::{Criterion, Index, Search};
+use serde_json::json;
+use tempfile::tempdir;
 use Criterion::*;

 #[test]
@ -93,3 +98,75 @@ fn test_typo_tolerance_two_typo() {
    let result = search.execute().unwrap();
    assert_eq!(result.documents_ids.len(), 1);
 }
+
+#[test]
+fn test_typo_disabled_on_word() {
+    let tmp = tempdir().unwrap();
+    let mut options = EnvOpenOptions::new();
+    options.map_size(4096 * 100);
+    let index = Index::new(options, tmp.path()).unwrap();
+
+    let documents = json!([
+        {
+            "id": 1usize,
+            "data": "zealand",
+        },
+        {
+            "id": 2usize,
+            "data": "zearand",
+        },
+    ]);
+
+    let mut writer = std::io::Cursor::new(Vec::new());
+    let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
+    let documents = serde_json::to_vec(&documents).unwrap();
+    builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
+    builder.finish().unwrap();
+
+    writer.set_position(0);
+
+    let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
+
+    let mut txn = index.write_txn().unwrap();
+    let config = IndexerConfig::default();
+    let indexing_config = IndexDocumentsConfig::default();
+    let mut builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ());
+
+    builder.add_documents(documents).unwrap();
+
+    builder.execute().unwrap();
+    txn.commit().unwrap();
+
+    // basic typo search with default typo settings
+    {
+        let txn = index.read_txn().unwrap();
+
+        let mut search = Search::new(&txn, &index);
+        search.query("zealand");
+        search.limit(10);
+        search.authorize_typos(true);
+        search.optional_words(true);
+
+        let result = search.execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+    }
+
+    let mut txn = index.write_txn().unwrap();
+
+    let config = IndexerConfig::default();
+    let mut builder = Settings::new(&mut txn, &index, &config);
+    let mut exact_words = BTreeSet::new();
+    // `zealand` doesn't allow typos anymore
+    exact_words.insert("zealand".to_string());
+    builder.set_exact_words(exact_words);
+    builder.execute(|_| ()).unwrap();
+
+    let mut search = Search::new(&txn, &index);
+    search.query("zealand");
+    search.limit(10);
+    search.authorize_typos(true);
+    search.optional_words(true);
+
+    let result = search.execute().unwrap();
+    assert_eq!(result.documents_ids.len(), 1);
+}