474: Disable typos on exact word r=MarinPostma a=MarinPostma

This PR introduces the `exact_words` setting to disable typo tolerance on custom words.

If a user query contains a word from `exact_words`, no typo derivation will be made for that particular word.

I have chosen to store the words in an FST, to save on deserialization and allow for fast lookups.

I had some trouble with the `serde` module, and had to rename it `serde_impl`.

## steps:
- [x] introduce new settings to register words to disable typos on
- [x] in `typos`, return an exact match if the current word is one of the words to disable typos for.
- [x] update `Context` to return the exact words dictionary.
- [x] merge #473 


Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
bors[bot] 2022-04-04 18:39:43 +00:00 committed by GitHub
commit 900825bac0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 179 additions and 13 deletions

View File

@ -6,7 +6,7 @@ use byteorder::{BigEndian, WriteBytesExt};
use serde::Deserializer;
use serde_json::Value;
use super::serde::DocumentVisitor;
use super::serde_impl::DocumentVisitor;
use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
use crate::FieldId;

View File

@ -5,15 +5,15 @@ mod builder;
/// The `DocumentBatchBuilder` interface allows writing batches of documents to a writer, which can
/// later be read by milli using the `DocumentBatchReader` interface.
mod reader;
mod serde;
mod serde_impl;
use std::fmt::{self, Debug};
use std::io;
use ::serde::{Deserialize, Serialize};
use bimap::BiHashMap;
pub use builder::DocumentBatchBuilder;
pub use reader::DocumentBatchReader;
use serde::{Deserialize, Serialize};
use crate::FieldId;

View File

@ -52,6 +52,7 @@ pub mod main_key {
pub const AUTHORIZE_TYPOS: &str = "authorize-typos";
pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
pub const EXACT_WORDS: &str = "exact-words";
}
pub mod db_name {
@ -927,6 +928,27 @@ impl Index {
self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?;
Ok(())
}
/// Returns the set of words on which typos are not allowed.
///
/// The set is read from the main database under `main_key::EXACT_WORDS` and
/// borrows the stored bytes for the lifetime of `txn`. When no entry exists,
/// an empty set is returned instead.
pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
    let stored = self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)?;
    let set: fst::Set<Cow<'t, [u8]>> = match stored {
        // Wrap the stored bytes without copying them.
        Some(bytes) => fst::Set::new(bytes)?.map_data(Cow::Borrowed)?,
        // Nothing stored yet: fall back to an owned, empty set.
        None => fst::Set::default().map_data(Cow::Owned)?,
    };
    Ok(set)
}
/// Stores `words` as the set of words on which typos are not allowed.
///
/// The raw bytes of the underlying FST are written to the main database under
/// `main_key::EXACT_WORDS`, replacing any previous value.
pub(crate) fn put_exact_words<A: AsRef<[u8]>>(
    &self,
    txn: &mut RwTxn,
    words: &fst::Set<A>,
) -> Result<()> {
    let raw = words.as_fst().as_bytes();
    self.main.put::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS, raw)?;
    Ok(())
}
}
#[cfg(test)]

View File

@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::{cmp, fmt, mem};
use fst::Set;
@ -157,6 +158,7 @@ trait Context {
}
/// Returns the minimum word len for 1 and 2 typos.
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
/// Returns the set of words on which typos are not allowed.
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>>;
}
/// The query tree builder is the interface to build a query tree.
@ -186,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
let two = self.index.min_word_len_two_typos(&self.rtxn)?;
Ok((one, two))
}
/// Reads the exact-words set from the index through this builder's transaction.
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
    let rtxn = self.rtxn;
    self.index.exact_words(rtxn)
}
}
impl<'a> QueryTreeBuilder<'a> {
@ -265,16 +271,17 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
}
/// Settings handed to `typos` to decide how much typo tolerance a word gets.
#[derive(Clone)]
pub struct TypoConfig {
pub struct TypoConfig<'a> {
/// Typo budget chosen by the caller: `create_query_tree` passes 2 for a single
/// word and 1 for concatenated words.
/// NOTE(review): presumably an upper bound on the tolerated typos — confirm in `typos`.
pub max_typos: u8,
/// Minimum word length for one typo: words with fewer characters are matched exactly.
pub word_len_one_typo: u8,
/// Minimum word length for two typos, as returned by `Context::min_word_len_for_typo`.
pub word_len_two_typo: u8,
/// Words on which typos are not allowed: `typos` only considers typo tolerance
/// for words that are not contained in this set.
pub exact_words: fst::Set<Cow<'a, [u8]>>,
}
/// Return the `QueryKind` of a word depending on `authorize_typos`,
/// the configured exact words, and the provided word length.
fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind {
if authorize_typos {
fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind {
if authorize_typos && !config.exact_words.contains(&word) {
let count = word.chars().count().min(u8::MAX as usize) as u8;
if count < config.word_len_one_typo {
QueryKind::exact(word)
@ -333,7 +340,9 @@ fn create_query_tree(
children.push(child);
}
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo };
let exact_words = ctx.exact_words()?;
let config =
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
children.push(Operation::Query(Query {
prefix,
kind: typos(word, authorize_typos, config),
@ -385,8 +394,13 @@ fn create_query_tree(
let concat = words.concat();
let (word_len_one_typo, word_len_two_typo) =
ctx.min_word_len_for_typo()?;
let config =
TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo };
let exact_words = ctx.exact_words()?;
let config = TypoConfig {
max_typos: 1,
word_len_one_typo,
word_len_two_typo,
exact_words,
};
let query = Query {
prefix: is_prefix,
kind: typos(concat, authorize_typos, config),
@ -571,6 +585,8 @@ mod test {
// In-memory context used by the query tree tests.
struct TestContext {
// Maps a sequence of words to a list of word sequences
// (presumably its synonyms — confirm against the `Context` impl).
synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
// Maps a word to a bitmap (presumably the ids of the documents containing it).
postings: HashMap<String, RoaringBitmap>,
// Raw bytes for the exact word fst Set
exact_words: Vec<u8>,
}
impl TestContext {
@ -605,6 +621,10 @@ mod test {
/// Always reports the default minimum word lengths for one and two typos.
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
}
/// Builds the exact-words set on top of the raw FST bytes held by the test context.
///
/// Panics if the stored bytes are not a valid FST.
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
    let raw: &[u8] = self.exact_words.as_slice();
    let set = fst::Set::new(Cow::Borrowed(raw)).unwrap();
    Ok(set)
}
}
impl Default for TestContext {
@ -621,6 +641,8 @@ mod test {
RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap()
}
let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
TestContext {
synonyms: hashmap! {
vec![String::from("hello")] => vec![
@ -660,6 +682,7 @@ mod test {
String::from("good") => random_postings(rng, 1250),
String::from("morning") => random_postings(rng, 125),
},
exact_words,
}
}
}
@ -1225,7 +1248,9 @@ mod test {
#[test]
fn test_min_word_len_typo() {
let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 };
let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
let config =
TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words };
assert_eq!(
typos("hello".to_string(), true, config.clone()),
@ -1242,4 +1267,20 @@ mod test {
QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
);
}
/// A query word registered as an exact word must produce an exact query,
/// even though typos are authorized.
#[test]
fn disable_typo_on_word() {
    // Tokenize the query.
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let analyzed = analyzer.analyze("goodbye");
    let tokens = analyzed.tokens();

    // Register the queried word as the only exact word.
    let exact_words =
        fst::Set::from_iter(std::iter::once("goodbye")).unwrap().into_fst().into_inner();
    let context = TestContext { exact_words, ..Default::default() };

    let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap();

    assert!(matches!(
        query_tree,
        Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
    ));
}
}

View File

@ -92,6 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
authorize_typos: Setting<bool>,
min_word_len_two_typos: Setting<u8>,
min_word_len_one_typo: Setting<u8>,
exact_words: Setting<BTreeSet<String>>,
}
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@ -113,9 +114,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
synonyms: Setting::NotSet,
primary_key: Setting::NotSet,
authorize_typos: Setting::NotSet,
indexer_config,
exact_words: Setting::NotSet,
min_word_len_two_typos: Setting::Reset,
min_word_len_one_typo: Setting::Reset,
indexer_config,
}
}
@ -216,6 +218,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.min_word_len_one_typo = Setting::Reset;
}
/// Registers `words` as the words on which typos are not allowed.
///
/// Nothing is written here; the set is stored when `execute` runs
/// `update_exact_words`.
pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
self.exact_words = Setting::Set(words);
}
/// Marks the exact words setting for reset.
///
/// Nothing is written here; `update_exact_words` stores an empty set when it
/// sees `Setting::Reset`.
pub fn reset_exact_words(&mut self) {
self.exact_words = Setting::Reset;
}
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
where
F: Fn(UpdateIndexingStep) + Sync,
@ -526,6 +536,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
Ok(())
}
/// Applies the pending `exact_words` setting to the index.
///
/// - `Setting::Set`: builds an FST set from the given words and stores it.
/// - `Setting::Reset`: stores an empty set.
/// - `Setting::NotSet`: leaves the index untouched.
fn update_exact_words(&mut self) -> Result<()> {
    let words = match &self.exact_words {
        Setting::NotSet => return Ok(()),
        Setting::Reset => fst::Set::default(),
        // A `BTreeSet` iterates in sorted order, which is what the FST builder requires.
        Setting::Set(words) => fst::Set::from_iter(words.iter())?,
    };
    self.index.put_exact_words(&mut self.wtxn, &words)?;
    Ok(())
}
pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
where
F: Fn(UpdateIndexingStep) + Sync,
@ -543,6 +568,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.update_primary_key()?;
self.update_authorize_typos()?;
self.update_min_typo_word_len()?;
self.update_exact_words()?;
// If there is new faceted fields we indicate that we must reindex as we must
// index new fields as facets. It means that the distinct attribute,

View File

@ -1,5 +1,10 @@
use milli::update::{IndexerConfig, Settings};
use milli::{Criterion, Search};
use std::collections::BTreeSet;
use heed::EnvOpenOptions;
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::{Criterion, Index, Search};
use serde_json::json;
use tempfile::tempdir;
use Criterion::*;
#[test]
@ -93,3 +98,75 @@ fn test_typo_tolerance_two_typo() {
let result = search.execute().unwrap();
assert_eq!(result.documents_ids.len(), 1);
}
/// Checks that registering a word through `set_exact_words` removes typo-based
/// matches for that word: the same query returns two documents before the
/// setting is applied and only one afterwards.
#[test]
fn test_typo_disabled_on_word() {
// Create a fresh index in a temporary directory.
let tmp = tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(4096 * 100);
let index = Index::new(options, tmp.path()).unwrap();
// Two documents whose `data` values differ by a single letter.
let documents = json!([
{
"id": 1usize,
"data": "zealand",
},
{
"id": 2usize,
"data": "zearand",
},
]);
// Serialize the JSON documents into a document batch, then rewind the
// cursor so the batch can be read back from the start.
let mut writer = std::io::Cursor::new(Vec::new());
let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
let documents = serde_json::to_vec(&documents).unwrap();
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
builder.finish().unwrap();
writer.set_position(0);
let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
// Index both documents and commit the write transaction.
let mut txn = index.write_txn().unwrap();
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
let mut builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ());
builder.add_documents(documents).unwrap();
builder.execute().unwrap();
txn.commit().unwrap();
// basic typo search with default typo settings
{
let txn = index.read_txn().unwrap();
let mut search = Search::new(&txn, &index);
search.query("zealand");
search.limit(10);
search.authorize_typos(true);
search.optional_words(true);
let result = search.execute().unwrap();
// With typos allowed, both documents are returned.
assert_eq!(result.documents_ids.len(), 2);
}
// Register `zealand` as an exact word through the settings builder.
let mut txn = index.write_txn().unwrap();
let config = IndexerConfig::default();
let mut builder = Settings::new(&mut txn, &index, &config);
let mut exact_words = BTreeSet::new();
// `zealand` doesn't allow typos anymore
exact_words.insert("zealand".to_string());
builder.set_exact_words(exact_words);
builder.execute(|_| ()).unwrap();
// Search through the still-open write transaction (it is not committed in this test).
let mut search = Search::new(&txn, &index);
search.query("zealand");
search.limit(10);
search.authorize_typos(true);
search.optional_words(true);
let result = search.execute().unwrap();
// Only one document is returned once `zealand` is an exact word.
assert_eq!(result.documents_ids.len(), 1);
}