mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 14:04:31 +01:00
Merge #474
474: Disable typos on exact word r=MarinPostma a=MarinPostma This PR introduces the `exact_words` setting to disable typo tolerance on custom words. If a user query contains a word from `exact_words`, no typo derivation will be made for that particular word. I have chosen to store the words in an FST, to save on deserialization and allow for fast lookups. I had some trouble with the `serde` module, and had to rename it `serde_impl`. ## steps: - [x] introduce new settings to register words to disable typos on - [x] in `typos`, return an exact match if the current word is one of the words to disable typos for. - [x] update `Context` to return the exact words dictionary. - [x] merge #473 Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
commit
900825bac0
@ -6,7 +6,7 @@ use byteorder::{BigEndian, WriteBytesExt};
|
||||
use serde::Deserializer;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::serde::DocumentVisitor;
|
||||
use super::serde_impl::DocumentVisitor;
|
||||
use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
|
||||
use crate::FieldId;
|
||||
|
||||
|
@ -5,15 +5,15 @@ mod builder;
|
||||
/// The `DocumentBatchBuilder` interface allows writing batches of documents to a writer, which can
|
||||
/// later be read by milli using the `DocumentBatchReader` interface.
|
||||
mod reader;
|
||||
mod serde;
|
||||
mod serde_impl;
|
||||
|
||||
use std::fmt::{self, Debug};
|
||||
use std::io;
|
||||
|
||||
use ::serde::{Deserialize, Serialize};
|
||||
use bimap::BiHashMap;
|
||||
pub use builder::DocumentBatchBuilder;
|
||||
pub use reader::DocumentBatchReader;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::FieldId;
|
||||
|
||||
|
@ -52,6 +52,7 @@ pub mod main_key {
|
||||
pub const AUTHORIZE_TYPOS: &str = "authorize-typos";
|
||||
pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
|
||||
pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
|
||||
pub const EXACT_WORDS: &str = "exact-words";
|
||||
}
|
||||
|
||||
pub mod db_name {
|
||||
@ -927,6 +928,27 @@ impl Index {
|
||||
self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// List the words on which typos are not allowed.
|
||||
pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
|
||||
match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? {
|
||||
Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
|
||||
None => Ok(fst::Set::default().map_data(Cow::Owned)?),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn put_exact_words<A: AsRef<[u8]>>(
|
||||
&self,
|
||||
txn: &mut RwTxn,
|
||||
words: &fst::Set<A>,
|
||||
) -> Result<()> {
|
||||
self.main.put::<_, Str, ByteSlice>(
|
||||
txn,
|
||||
main_key::EXACT_WORDS,
|
||||
words.as_fst().as_bytes(),
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -1,3 +1,4 @@
|
||||
use std::borrow::Cow;
|
||||
use std::{cmp, fmt, mem};
|
||||
|
||||
use fst::Set;
|
||||
@ -157,6 +158,7 @@ trait Context {
|
||||
}
|
||||
/// Returns the minimum word len for 1 and 2 typos.
|
||||
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
|
||||
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>>;
|
||||
}
|
||||
|
||||
/// The query tree builder is the interface to build a query tree.
|
||||
@ -186,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
|
||||
let two = self.index.min_word_len_two_typos(&self.rtxn)?;
|
||||
Ok((one, two))
|
||||
}
|
||||
|
||||
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
|
||||
self.index.exact_words(self.rtxn)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> QueryTreeBuilder<'a> {
|
||||
@ -265,16 +271,17 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TypoConfig {
|
||||
pub struct TypoConfig<'a> {
|
||||
pub max_typos: u8,
|
||||
pub word_len_one_typo: u8,
|
||||
pub word_len_two_typo: u8,
|
||||
pub exact_words: fst::Set<Cow<'a, [u8]>>,
|
||||
}
|
||||
|
||||
/// Return the `QueryKind` of a word depending on `authorize_typos`
|
||||
/// and the provided word length.
|
||||
fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind {
|
||||
if authorize_typos {
|
||||
fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind {
|
||||
if authorize_typos && !config.exact_words.contains(&word) {
|
||||
let count = word.chars().count().min(u8::MAX as usize) as u8;
|
||||
if count < config.word_len_one_typo {
|
||||
QueryKind::exact(word)
|
||||
@ -333,7 +340,9 @@ fn create_query_tree(
|
||||
children.push(child);
|
||||
}
|
||||
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
||||
let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo };
|
||||
let exact_words = ctx.exact_words()?;
|
||||
let config =
|
||||
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
|
||||
children.push(Operation::Query(Query {
|
||||
prefix,
|
||||
kind: typos(word, authorize_typos, config),
|
||||
@ -385,8 +394,13 @@ fn create_query_tree(
|
||||
let concat = words.concat();
|
||||
let (word_len_one_typo, word_len_two_typo) =
|
||||
ctx.min_word_len_for_typo()?;
|
||||
let config =
|
||||
TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo };
|
||||
let exact_words = ctx.exact_words()?;
|
||||
let config = TypoConfig {
|
||||
max_typos: 1,
|
||||
word_len_one_typo,
|
||||
word_len_two_typo,
|
||||
exact_words,
|
||||
};
|
||||
let query = Query {
|
||||
prefix: is_prefix,
|
||||
kind: typos(concat, authorize_typos, config),
|
||||
@ -571,6 +585,8 @@ mod test {
|
||||
struct TestContext {
|
||||
synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
|
||||
postings: HashMap<String, RoaringBitmap>,
|
||||
// Raw bytes for the exact word fst Set
|
||||
exact_words: Vec<u8>,
|
||||
}
|
||||
|
||||
impl TestContext {
|
||||
@ -605,6 +621,10 @@ mod test {
|
||||
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
|
||||
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
|
||||
}
|
||||
|
||||
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
|
||||
Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TestContext {
|
||||
@ -621,6 +641,8 @@ mod test {
|
||||
RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap()
|
||||
}
|
||||
|
||||
let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
|
||||
|
||||
TestContext {
|
||||
synonyms: hashmap! {
|
||||
vec![String::from("hello")] => vec![
|
||||
@ -660,6 +682,7 @@ mod test {
|
||||
String::from("good") => random_postings(rng, 1250),
|
||||
String::from("morning") => random_postings(rng, 125),
|
||||
},
|
||||
exact_words,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1225,7 +1248,9 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_min_word_len_typo() {
|
||||
let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 };
|
||||
let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
|
||||
let config =
|
||||
TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words };
|
||||
|
||||
assert_eq!(
|
||||
typos("hello".to_string(), true, config.clone()),
|
||||
@ -1242,4 +1267,20 @@ mod test {
|
||||
QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
|
||||
);
|
||||
}
|
||||
#[test]
|
||||
fn disable_typo_on_word() {
|
||||
let query = "goodbye";
|
||||
let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
|
||||
let result = analyzer.analyze(query);
|
||||
|
||||
let tokens = result.tokens();
|
||||
let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
|
||||
let context = TestContext { exact_words, ..Default::default() };
|
||||
let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap();
|
||||
|
||||
assert!(matches!(
|
||||
query_tree,
|
||||
Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
|
||||
));
|
||||
}
|
||||
}
|
||||
|
@ -92,6 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
|
||||
authorize_typos: Setting<bool>,
|
||||
min_word_len_two_typos: Setting<u8>,
|
||||
min_word_len_one_typo: Setting<u8>,
|
||||
exact_words: Setting<BTreeSet<String>>,
|
||||
}
|
||||
|
||||
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
@ -113,9 +114,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
synonyms: Setting::NotSet,
|
||||
primary_key: Setting::NotSet,
|
||||
authorize_typos: Setting::NotSet,
|
||||
indexer_config,
|
||||
exact_words: Setting::NotSet,
|
||||
min_word_len_two_typos: Setting::Reset,
|
||||
min_word_len_one_typo: Setting::Reset,
|
||||
indexer_config,
|
||||
}
|
||||
}
|
||||
|
||||
@ -216,6 +218,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
self.min_word_len_one_typo = Setting::Reset;
|
||||
}
|
||||
|
||||
pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
|
||||
self.exact_words = Setting::Set(words);
|
||||
}
|
||||
|
||||
pub fn reset_exact_words(&mut self) {
|
||||
self.exact_words = Setting::Reset;
|
||||
}
|
||||
|
||||
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
@ -526,6 +536,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_exact_words(&mut self) -> Result<()> {
|
||||
match self.exact_words {
|
||||
Setting::Set(ref mut words) => {
|
||||
let words = fst::Set::from_iter(words.iter())?;
|
||||
self.index.put_exact_words(&mut self.wtxn, &words)?;
|
||||
}
|
||||
Setting::Reset => {
|
||||
self.index.put_exact_words(&mut self.wtxn, &fst::Set::default())?;
|
||||
}
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
@ -543,6 +568,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||
self.update_primary_key()?;
|
||||
self.update_authorize_typos()?;
|
||||
self.update_min_typo_word_len()?;
|
||||
self.update_exact_words()?;
|
||||
|
||||
// If there is new faceted fields we indicate that we must reindex as we must
|
||||
// index new fields as facets. It means that the distinct attribute,
|
||||
|
@ -1,5 +1,10 @@
|
||||
use milli::update::{IndexerConfig, Settings};
|
||||
use milli::{Criterion, Search};
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use heed::EnvOpenOptions;
|
||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||
use milli::{Criterion, Index, Search};
|
||||
use serde_json::json;
|
||||
use tempfile::tempdir;
|
||||
use Criterion::*;
|
||||
|
||||
#[test]
|
||||
@ -93,3 +98,75 @@ fn test_typo_tolerance_two_typo() {
|
||||
let result = search.execute().unwrap();
|
||||
assert_eq!(result.documents_ids.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_typo_disabled_on_word() {
|
||||
let tmp = tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(4096 * 100);
|
||||
let index = Index::new(options, tmp.path()).unwrap();
|
||||
|
||||
let documents = json!([
|
||||
{
|
||||
"id": 1usize,
|
||||
"data": "zealand",
|
||||
},
|
||||
{
|
||||
"id": 2usize,
|
||||
"data": "zearand",
|
||||
},
|
||||
]);
|
||||
|
||||
let mut writer = std::io::Cursor::new(Vec::new());
|
||||
let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
|
||||
let documents = serde_json::to_vec(&documents).unwrap();
|
||||
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
|
||||
builder.finish().unwrap();
|
||||
|
||||
writer.set_position(0);
|
||||
|
||||
let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
|
||||
|
||||
let mut txn = index.write_txn().unwrap();
|
||||
let config = IndexerConfig::default();
|
||||
let indexing_config = IndexDocumentsConfig::default();
|
||||
let mut builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ());
|
||||
|
||||
builder.add_documents(documents).unwrap();
|
||||
|
||||
builder.execute().unwrap();
|
||||
txn.commit().unwrap();
|
||||
|
||||
// basic typo search with default typo settings
|
||||
{
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut search = Search::new(&txn, &index);
|
||||
search.query("zealand");
|
||||
search.limit(10);
|
||||
search.authorize_typos(true);
|
||||
search.optional_words(true);
|
||||
|
||||
let result = search.execute().unwrap();
|
||||
assert_eq!(result.documents_ids.len(), 2);
|
||||
}
|
||||
|
||||
let mut txn = index.write_txn().unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let mut builder = Settings::new(&mut txn, &index, &config);
|
||||
let mut exact_words = BTreeSet::new();
|
||||
// `zealand` doesn't allow typos anymore
|
||||
exact_words.insert("zealand".to_string());
|
||||
builder.set_exact_words(exact_words);
|
||||
builder.execute(|_| ()).unwrap();
|
||||
|
||||
let mut search = Search::new(&txn, &index);
|
||||
search.query("zealand");
|
||||
search.limit(10);
|
||||
search.authorize_typos(true);
|
||||
search.optional_words(true);
|
||||
|
||||
let result = search.execute().unwrap();
|
||||
assert_eq!(result.documents_ids.len(), 1);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user