Merge #474

474: Disable typos on exact word r=MarinPostma a=MarinPostma This PR introduces the `exact_words` setting to disable typo tolerance on custom words. If a user query contains a word from `exact_words`, no typo derivation will be made for that particular word. I have chosen to store the words in an FST, to save on deserialization and to allow for fast lookups. I had some trouble with the `serde` module, and had to rename it to `serde_impl`. ## steps: - [x] introduce a new setting to register the words to disable typos on - [x] in `typos`, return an exact match if the current word is one of the words to disable typos on. - [x] update `Context` to return the exact words dictionary. - [x] merge #473 Co-authored-by: ad hoc <postma.marin@protonmail.com>
2025-06-18 04:37:35 +02:00 · 2022-04-04 18:39:43 +00:00 · 2022-04-04 18:39:43 +00:00 · 900825bac0
commit 900825bac0
parent 48a5ce7434 3e67d8818c
7 changed files with 179 additions and 13 deletions
--- a/milli/src/documents/builder.rs
+++ b/milli/src/documents/builder.rs
@ -6,7 +6,7 @@ use byteorder::{BigEndian, WriteBytesExt};
 use serde::Deserializer;
 use serde_json::Value;
-use super::serde::DocumentVisitor;
+use super::serde_impl::DocumentVisitor;
 use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
 use crate::FieldId;
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@ -5,15 +5,15 @@ mod builder;
 /// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can
 /// later be read by milli using the `DocumentBatchReader` interface.
 mod reader;
-mod serde;
+mod serde_impl;
 use std::fmt::{self, Debug};
 use std::io;
 use ::serde::{Deserialize, Serialize};
 use bimap::BiHashMap;
 pub use builder::DocumentBatchBuilder;
 pub use reader::DocumentBatchReader;
 use serde::{Deserialize, Serialize};
 use crate::FieldId;
--- a/milli/src/documents/serde_impl.rs
+++ b/milli/src/documents/serde_impl.rs
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -52,6 +52,7 @@ pub mod main_key {
    pub const AUTHORIZE_TYPOS: &str = "authorize-typos";
    pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
    pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
    pub const EXACT_WORDS: &str = "exact-words";
 }
 pub mod db_name {
@ -927,6 +928,27 @@ impl Index {
        self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?;
        Ok(())
    }
    /// Returns the set of words on which typos are not allowed.
    ///
    /// The set is read from the main database under `main_key::EXACT_WORDS` and borrows
    /// the stored bytes for the lifetime of `txn`. When no entry is stored, an empty set
    /// is returned instead.
    ///
    /// # Errors
    ///
    /// Fails if the database lookup fails or if the stored bytes are not a valid fst.
    pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
        let stored = self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)?;
        let set = match stored {
            Some(bytes) => fst::Set::new(bytes)?.map_data(Cow::Borrowed)?,
            None => fst::Set::default().map_data(Cow::Owned)?,
        };
        Ok(set)
    }
    /// Stores the given set of exact words in the main database.
    ///
    /// The raw bytes of the underlying fst are written under `main_key::EXACT_WORDS`,
    /// replacing any previously stored value.
    pub(crate) fn put_exact_words<A: AsRef<[u8]>>(
        &self,
        txn: &mut RwTxn,
        words: &fst::Set<A>,
    ) -> Result<()> {
        let raw_fst = words.as_fst().as_bytes();
        self.main.put::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS, raw_fst)?;
        Ok(())
    }
 }
 #[cfg(test)]
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@ -1,3 +1,4 @@
 use std::borrow::Cow;
 use std::{cmp, fmt, mem};
 use fst::Set;
@ -157,6 +158,7 @@ trait Context {
    }
    /// Returns the minimum word len for 1 and 2 typos.
    fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
    /// Returns the set of words on which typo tolerance must not be applied.
    fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>>;
 }
 /// The query tree builder is the interface to build a query tree.
@ -186,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
        let two = self.index.min_word_len_two_typos(&self.rtxn)?;
        Ok((one, two))
    }
    /// Fetches the exact words set from the index using this builder's read transaction.
    fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
        self.index.exact_words(self.rtxn)
    }
 }
 impl<'a> QueryTreeBuilder<'a> {
@ -265,16 +271,17 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
 }
 #[derive(Clone)]
-pub struct TypoConfig {
+pub struct TypoConfig<'a> {
    pub max_typos: u8,
    pub word_len_one_typo: u8,
    pub word_len_two_typo: u8,
    pub exact_words: fst::Set<Cow<'a, [u8]>>,
 }
 /// Return the `QueryKind` of a word depending on `authorize_typos`
 /// and the provided word length.
-fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind {
+fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind {
-    if authorize_typos {
+    if authorize_typos && !config.exact_words.contains(&word) {
        let count = word.chars().count().min(u8::MAX as usize) as u8;
        if count < config.word_len_one_typo {
            QueryKind::exact(word)
@ -333,7 +340,9 @@ fn create_query_tree(
                    children.push(child);
                }
                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
-                let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo };
+                let exact_words = ctx.exact_words()?;
                let config =
                    TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
                children.push(Operation::Query(Query {
                    prefix,
                    kind: typos(word, authorize_typos, config),
@ -385,8 +394,13 @@ fn create_query_tree(
                            let concat = words.concat();
                            let (word_len_one_typo, word_len_two_typo) =
                                ctx.min_word_len_for_typo()?;
-                            let config =
+                            let exact_words = ctx.exact_words()?;
-                                TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo };
+                            let config = TypoConfig {
                                max_typos: 1,
                                word_len_one_typo,
                                word_len_two_typo,
                                exact_words,
                            };
                            let query = Query {
                                prefix: is_prefix,
                                kind: typos(concat, authorize_typos, config),
@ -571,6 +585,8 @@ mod test {
    struct TestContext {
        synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
        postings: HashMap<String, RoaringBitmap>,
        // Raw bytes for the exact word fst Set
        exact_words: Vec<u8>,
    }
    impl TestContext {
@ -605,6 +621,10 @@ mod test {
        fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
            Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
        }
        /// Builds an fst set that borrows the raw bytes held by the test context.
        ///
        /// Panics if those bytes are not a valid fst.
        fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
            Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap())
        }
    }
    impl Default for TestContext {
@ -621,6 +641,8 @@ mod test {
                RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap()
            }
            let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
            TestContext {
                synonyms: hashmap! {
                    vec![String::from("hello")] => vec![
@ -660,6 +682,7 @@ mod test {
                    String::from("good")       => random_postings(rng,   1250),
                    String::from("morning")    => random_postings(rng,    125),
                },
                exact_words,
            }
        }
    }
@ -1225,7 +1248,9 @@ mod test {
    #[test]
    fn test_min_word_len_typo() {
-        let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 };
+        let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
        let config =
            TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words };
        assert_eq!(
            typos("hello".to_string(), true, config.clone()),
@ -1242,4 +1267,20 @@ mod test {
            QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
        );
    }
    /// A query word that is listed in the exact words set must produce an exact query.
    #[test]
    fn disable_typo_on_word() {
        let query = "goodbye";
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let result = analyzer.analyze(query);

        let tokens = result.tokens();
        // Raw fst bytes of a set containing only the queried word.
        let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
        let context = TestContext { exact_words, ..Default::default() };
        // NOTE(review): the second argument is presumably `authorize_typos` — confirm
        // against `TestContext::build`.
        let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap();

        // The whole tree must be a single prefix query of the `Exact` kind.
        assert!(matches!(
            query_tree,
            Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
        ));
    }
 }
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@ -92,6 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
    authorize_typos: Setting<bool>,
    min_word_len_two_typos: Setting<u8>,
    min_word_len_one_typo: Setting<u8>,
    exact_words: Setting<BTreeSet<String>>,
 }
 impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@ -113,9 +114,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
            synonyms: Setting::NotSet,
            primary_key: Setting::NotSet,
            authorize_typos: Setting::NotSet,
-            indexer_config,
+            exact_words: Setting::NotSet,
            min_word_len_two_typos: Setting::Reset,
            min_word_len_one_typo: Setting::Reset,
            indexer_config,
        }
    }
@ -216,6 +218,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        self.min_word_len_one_typo = Setting::Reset;
    }
    /// Registers the words on which typos must be disabled; applied by `update_exact_words`.
    pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
        self.exact_words = Setting::Set(words);
    }
    /// Marks the exact words setting for reset; `update_exact_words` then stores an empty set.
    pub fn reset_exact_words(&mut self) {
        self.exact_words = Setting::Reset;
    }
    fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
    where
        F: Fn(UpdateIndexingStep) + Sync,
@ -526,6 +536,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        Ok(())
    }
    /// Writes the pending `exact_words` setting to the index.
    ///
    /// * `Setting::Set`: builds an fst set from the given words and stores it.
    /// * `Setting::Reset`: stores an empty set.
    /// * `Setting::NotSet`: leaves the index untouched.
    fn update_exact_words(&mut self) -> Result<()> {
        // Work out which set has to be stored, or bail out when nothing changed.
        let new_set = match self.exact_words {
            Setting::Set(ref mut words) => fst::Set::from_iter(words.iter())?,
            Setting::Reset => fst::Set::default(),
            Setting::NotSet => return Ok(()),
        };

        self.index.put_exact_words(&mut self.wtxn, &new_set)?;
        Ok(())
    }
    pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
    where
        F: Fn(UpdateIndexingStep) + Sync,
@ -543,6 +568,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        self.update_primary_key()?;
        self.update_authorize_typos()?;
        self.update_min_typo_word_len()?;
        self.update_exact_words()?;
        // If there is new faceted fields we indicate that we must reindex as we must
        // index new fields as facets. It means that the distinct attribute,
--- a/milli/tests/search/typo_tolerance.rs
+++ b/milli/tests/search/typo_tolerance.rs
@ -1,5 +1,10 @@
-use milli::update::{IndexerConfig, Settings};
+use std::collections::BTreeSet;
-use milli::{Criterion, Search};
+
 use heed::EnvOpenOptions;
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::{Criterion, Index, Search};
 use serde_json::json;
 use tempfile::tempdir;
 use Criterion::*;
 #[test]
@ -93,3 +98,75 @@ fn test_typo_tolerance_two_typo() {
    let result = search.execute().unwrap();
    assert_eq!(result.documents_ids.len(), 1);
 }
 /// Checks that a word registered through the `exact_words` setting stops matching
 /// other words through typo tolerance.
 #[test]
 fn test_typo_disabled_on_word() {
    let tmp = tempdir().unwrap();
    let mut options = EnvOpenOptions::new();
    options.map_size(4096 * 100);
    let index = Index::new(options, tmp.path()).unwrap();
    // Two documents whose `data` fields differ by a single letter.
    let documents = json!([
        {
            "id": 1usize,
            "data": "zealand",
        },
        {
            "id": 2usize,
            "data": "zearand",
        },
    ]);
    // Serialize the documents into an in-memory batch, then rewind it for reading.
    let mut writer = std::io::Cursor::new(Vec::new());
    let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
    let documents = serde_json::to_vec(&documents).unwrap();
    builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
    builder.finish().unwrap();
    writer.set_position(0);
    let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
    // Index the batch and commit it so that the read transaction below can see it.
    let mut txn = index.write_txn().unwrap();
    let config = IndexerConfig::default();
    let indexing_config = IndexDocumentsConfig::default();
    let mut builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ());
    builder.add_documents(documents).unwrap();
    builder.execute().unwrap();
    txn.commit().unwrap();
    // basic typo search with default typo settings
    {
        let txn = index.read_txn().unwrap();
        let mut search = Search::new(&txn, &index);
        search.query("zealand");
        search.limit(10);
        search.authorize_typos(true);
        search.optional_words(true);
        let result = search.execute().unwrap();
        // Both documents are expected to match while typos are tolerated.
        assert_eq!(result.documents_ids.len(), 2);
    }
    let mut txn = index.write_txn().unwrap();
    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut txn, &index, &config);
    let mut exact_words = BTreeSet::new();
    // `zealand` doesn't allow typos anymore
    exact_words.insert("zealand".to_string());
    builder.set_exact_words(exact_words);
    builder.execute(|_| ()).unwrap();
    // The search below runs on the still-open write transaction, which is never committed.
    let mut search = Search::new(&txn, &index);
    search.query("zealand");
    search.limit(10);
    search.authorize_typos(true);
    search.optional_words(true);
    let result = search.execute().unwrap();
    // Only one document is expected to match once `zealand` is an exact word.
    assert_eq!(result.documents_ids.len(), 1);
 }