mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-24 21:50:07 +01:00
Merge #474
474: Disable typos on exact word r=MarinPostma a=MarinPostma This PR introduces the `exact_words` setting to disable typo tolerance on custom words. If a user query contains a word from `exact_words`, no typo derivation will be made for that particular word. I have chosen to store the words in an FST, to save on deserialization and to allow for fast lookups. I had some trouble with the `serde` module, and had to rename it `serde_impl`. ## steps: - [x] introduce new settings to register words to disable typos on - [x] in `typos`, return an exact match if the current word is one of the words to disable typos for. - [x] update `Context` to return the exact words dictionary. - [x] merge #473 Co-authored-by: ad hoc <postma.marin@protonmail.com>
This commit is contained in:
commit
900825bac0
@ -6,7 +6,7 @@ use byteorder::{BigEndian, WriteBytesExt};
|
|||||||
use serde::Deserializer;
|
use serde::Deserializer;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use super::serde::DocumentVisitor;
|
use super::serde_impl::DocumentVisitor;
|
||||||
use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
|
use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
|
||||||
use crate::FieldId;
|
use crate::FieldId;
|
||||||
|
|
||||||
|
@ -5,15 +5,15 @@ mod builder;
|
|||||||
/// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can
|
/// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can
|
||||||
/// later be read by milli using the `DocumentBatchReader` interface.
|
/// later be read by milli using the `DocumentBatchReader` interface.
|
||||||
mod reader;
|
mod reader;
|
||||||
mod serde;
|
mod serde_impl;
|
||||||
|
|
||||||
use std::fmt::{self, Debug};
|
use std::fmt::{self, Debug};
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
use ::serde::{Deserialize, Serialize};
|
|
||||||
use bimap::BiHashMap;
|
use bimap::BiHashMap;
|
||||||
pub use builder::DocumentBatchBuilder;
|
pub use builder::DocumentBatchBuilder;
|
||||||
pub use reader::DocumentBatchReader;
|
pub use reader::DocumentBatchReader;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::FieldId;
|
use crate::FieldId;
|
||||||
|
|
||||||
|
@ -52,6 +52,7 @@ pub mod main_key {
|
|||||||
pub const AUTHORIZE_TYPOS: &str = "authorize-typos";
|
pub const AUTHORIZE_TYPOS: &str = "authorize-typos";
|
||||||
pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
|
pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len";
|
||||||
pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
|
pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len";
|
||||||
|
pub const EXACT_WORDS: &str = "exact-words";
|
||||||
}
|
}
|
||||||
|
|
||||||
pub mod db_name {
|
pub mod db_name {
|
||||||
@ -927,6 +928,27 @@ impl Index {
|
|||||||
self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?;
|
self.main.put::<_, Str, OwnedType<u8>>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// List the words on which typos are not allowed.
|
||||||
|
pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
|
||||||
|
match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? {
|
||||||
|
Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
|
||||||
|
None => Ok(fst::Set::default().map_data(Cow::Owned)?),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn put_exact_words<A: AsRef<[u8]>>(
|
||||||
|
&self,
|
||||||
|
txn: &mut RwTxn,
|
||||||
|
words: &fst::Set<A>,
|
||||||
|
) -> Result<()> {
|
||||||
|
self.main.put::<_, Str, ByteSlice>(
|
||||||
|
txn,
|
||||||
|
main_key::EXACT_WORDS,
|
||||||
|
words.as_fst().as_bytes(),
|
||||||
|
)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
use std::{cmp, fmt, mem};
|
use std::{cmp, fmt, mem};
|
||||||
|
|
||||||
use fst::Set;
|
use fst::Set;
|
||||||
@ -157,6 +158,7 @@ trait Context {
|
|||||||
}
|
}
|
||||||
/// Returns the minimum word len for 1 and 2 typos.
|
/// Returns the minimum word len for 1 and 2 typos.
|
||||||
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
|
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
|
||||||
|
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The query tree builder is the interface to build a query tree.
|
/// The query tree builder is the interface to build a query tree.
|
||||||
@ -186,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
|
|||||||
let two = self.index.min_word_len_two_typos(&self.rtxn)?;
|
let two = self.index.min_word_len_two_typos(&self.rtxn)?;
|
||||||
Ok((one, two))
|
Ok((one, two))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
|
||||||
|
self.index.exact_words(self.rtxn)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> QueryTreeBuilder<'a> {
|
impl<'a> QueryTreeBuilder<'a> {
|
||||||
@ -265,16 +271,17 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct TypoConfig {
|
pub struct TypoConfig<'a> {
|
||||||
pub max_typos: u8,
|
pub max_typos: u8,
|
||||||
pub word_len_one_typo: u8,
|
pub word_len_one_typo: u8,
|
||||||
pub word_len_two_typo: u8,
|
pub word_len_two_typo: u8,
|
||||||
|
pub exact_words: fst::Set<Cow<'a, [u8]>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the `QueryKind` of a word depending on `authorize_typos`
|
/// Return the `QueryKind` of a word depending on `authorize_typos`
|
||||||
/// and the provided word length.
|
/// and the provided word length.
|
||||||
fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind {
|
fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind {
|
||||||
if authorize_typos {
|
if authorize_typos && !config.exact_words.contains(&word) {
|
||||||
let count = word.chars().count().min(u8::MAX as usize) as u8;
|
let count = word.chars().count().min(u8::MAX as usize) as u8;
|
||||||
if count < config.word_len_one_typo {
|
if count < config.word_len_one_typo {
|
||||||
QueryKind::exact(word)
|
QueryKind::exact(word)
|
||||||
@ -333,7 +340,9 @@ fn create_query_tree(
|
|||||||
children.push(child);
|
children.push(child);
|
||||||
}
|
}
|
||||||
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
|
||||||
let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo };
|
let exact_words = ctx.exact_words()?;
|
||||||
|
let config =
|
||||||
|
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
|
||||||
children.push(Operation::Query(Query {
|
children.push(Operation::Query(Query {
|
||||||
prefix,
|
prefix,
|
||||||
kind: typos(word, authorize_typos, config),
|
kind: typos(word, authorize_typos, config),
|
||||||
@ -385,8 +394,13 @@ fn create_query_tree(
|
|||||||
let concat = words.concat();
|
let concat = words.concat();
|
||||||
let (word_len_one_typo, word_len_two_typo) =
|
let (word_len_one_typo, word_len_two_typo) =
|
||||||
ctx.min_word_len_for_typo()?;
|
ctx.min_word_len_for_typo()?;
|
||||||
let config =
|
let exact_words = ctx.exact_words()?;
|
||||||
TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo };
|
let config = TypoConfig {
|
||||||
|
max_typos: 1,
|
||||||
|
word_len_one_typo,
|
||||||
|
word_len_two_typo,
|
||||||
|
exact_words,
|
||||||
|
};
|
||||||
let query = Query {
|
let query = Query {
|
||||||
prefix: is_prefix,
|
prefix: is_prefix,
|
||||||
kind: typos(concat, authorize_typos, config),
|
kind: typos(concat, authorize_typos, config),
|
||||||
@ -571,6 +585,8 @@ mod test {
|
|||||||
struct TestContext {
|
struct TestContext {
|
||||||
synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
|
synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
|
||||||
postings: HashMap<String, RoaringBitmap>,
|
postings: HashMap<String, RoaringBitmap>,
|
||||||
|
// Raw bytes for the exact word fst Set
|
||||||
|
exact_words: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TestContext {
|
impl TestContext {
|
||||||
@ -605,6 +621,10 @@ mod test {
|
|||||||
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
|
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
|
||||||
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
|
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
|
||||||
|
Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for TestContext {
|
impl Default for TestContext {
|
||||||
@ -621,6 +641,8 @@ mod test {
|
|||||||
RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap()
|
RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
|
||||||
|
|
||||||
TestContext {
|
TestContext {
|
||||||
synonyms: hashmap! {
|
synonyms: hashmap! {
|
||||||
vec![String::from("hello")] => vec![
|
vec![String::from("hello")] => vec![
|
||||||
@ -660,6 +682,7 @@ mod test {
|
|||||||
String::from("good") => random_postings(rng, 1250),
|
String::from("good") => random_postings(rng, 1250),
|
||||||
String::from("morning") => random_postings(rng, 125),
|
String::from("morning") => random_postings(rng, 125),
|
||||||
},
|
},
|
||||||
|
exact_words,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1225,7 +1248,9 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_min_word_len_typo() {
|
fn test_min_word_len_typo() {
|
||||||
let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 };
|
let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
|
||||||
|
let config =
|
||||||
|
TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words };
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
typos("hello".to_string(), true, config.clone()),
|
typos("hello".to_string(), true, config.clone()),
|
||||||
@ -1242,4 +1267,20 @@ mod test {
|
|||||||
QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
|
QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
#[test]
|
||||||
|
fn disable_typo_on_word() {
|
||||||
|
let query = "goodbye";
|
||||||
|
let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
|
||||||
|
let result = analyzer.analyze(query);
|
||||||
|
|
||||||
|
let tokens = result.tokens();
|
||||||
|
let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
|
||||||
|
let context = TestContext { exact_words, ..Default::default() };
|
||||||
|
let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap();
|
||||||
|
|
||||||
|
assert!(matches!(
|
||||||
|
query_tree,
|
||||||
|
Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -92,6 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
|
|||||||
authorize_typos: Setting<bool>,
|
authorize_typos: Setting<bool>,
|
||||||
min_word_len_two_typos: Setting<u8>,
|
min_word_len_two_typos: Setting<u8>,
|
||||||
min_word_len_one_typo: Setting<u8>,
|
min_word_len_one_typo: Setting<u8>,
|
||||||
|
exact_words: Setting<BTreeSet<String>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
||||||
@ -113,9 +114,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
synonyms: Setting::NotSet,
|
synonyms: Setting::NotSet,
|
||||||
primary_key: Setting::NotSet,
|
primary_key: Setting::NotSet,
|
||||||
authorize_typos: Setting::NotSet,
|
authorize_typos: Setting::NotSet,
|
||||||
indexer_config,
|
exact_words: Setting::NotSet,
|
||||||
min_word_len_two_typos: Setting::Reset,
|
min_word_len_two_typos: Setting::Reset,
|
||||||
min_word_len_one_typo: Setting::Reset,
|
min_word_len_one_typo: Setting::Reset,
|
||||||
|
indexer_config,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -216,6 +218,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
self.min_word_len_one_typo = Setting::Reset;
|
self.min_word_len_one_typo = Setting::Reset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
|
||||||
|
self.exact_words = Setting::Set(words);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn reset_exact_words(&mut self) {
|
||||||
|
self.exact_words = Setting::Reset;
|
||||||
|
}
|
||||||
|
|
||||||
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
|
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
|
||||||
where
|
where
|
||||||
F: Fn(UpdateIndexingStep) + Sync,
|
F: Fn(UpdateIndexingStep) + Sync,
|
||||||
@ -526,6 +536,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn update_exact_words(&mut self) -> Result<()> {
|
||||||
|
match self.exact_words {
|
||||||
|
Setting::Set(ref mut words) => {
|
||||||
|
let words = fst::Set::from_iter(words.iter())?;
|
||||||
|
self.index.put_exact_words(&mut self.wtxn, &words)?;
|
||||||
|
}
|
||||||
|
Setting::Reset => {
|
||||||
|
self.index.put_exact_words(&mut self.wtxn, &fst::Set::default())?;
|
||||||
|
}
|
||||||
|
Setting::NotSet => (),
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
|
pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
|
||||||
where
|
where
|
||||||
F: Fn(UpdateIndexingStep) + Sync,
|
F: Fn(UpdateIndexingStep) + Sync,
|
||||||
@ -543,6 +568,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
|
|||||||
self.update_primary_key()?;
|
self.update_primary_key()?;
|
||||||
self.update_authorize_typos()?;
|
self.update_authorize_typos()?;
|
||||||
self.update_min_typo_word_len()?;
|
self.update_min_typo_word_len()?;
|
||||||
|
self.update_exact_words()?;
|
||||||
|
|
||||||
// If there is new faceted fields we indicate that we must reindex as we must
|
// If there is new faceted fields we indicate that we must reindex as we must
|
||||||
// index new fields as facets. It means that the distinct attribute,
|
// index new fields as facets. It means that the distinct attribute,
|
||||||
|
@ -1,5 +1,10 @@
|
|||||||
use milli::update::{IndexerConfig, Settings};
|
use std::collections::BTreeSet;
|
||||||
use milli::{Criterion, Search};
|
|
||||||
|
use heed::EnvOpenOptions;
|
||||||
|
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||||
|
use milli::{Criterion, Index, Search};
|
||||||
|
use serde_json::json;
|
||||||
|
use tempfile::tempdir;
|
||||||
use Criterion::*;
|
use Criterion::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -93,3 +98,75 @@ fn test_typo_tolerance_two_typo() {
|
|||||||
let result = search.execute().unwrap();
|
let result = search.execute().unwrap();
|
||||||
assert_eq!(result.documents_ids.len(), 1);
|
assert_eq!(result.documents_ids.len(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_typo_disabled_on_word() {
|
||||||
|
let tmp = tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(4096 * 100);
|
||||||
|
let index = Index::new(options, tmp.path()).unwrap();
|
||||||
|
|
||||||
|
let documents = json!([
|
||||||
|
{
|
||||||
|
"id": 1usize,
|
||||||
|
"data": "zealand",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2usize,
|
||||||
|
"data": "zearand",
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
|
||||||
|
let mut writer = std::io::Cursor::new(Vec::new());
|
||||||
|
let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
|
||||||
|
let documents = serde_json::to_vec(&documents).unwrap();
|
||||||
|
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
|
||||||
|
builder.finish().unwrap();
|
||||||
|
|
||||||
|
writer.set_position(0);
|
||||||
|
|
||||||
|
let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
|
||||||
|
|
||||||
|
let mut txn = index.write_txn().unwrap();
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
let mut builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ());
|
||||||
|
|
||||||
|
builder.add_documents(documents).unwrap();
|
||||||
|
|
||||||
|
builder.execute().unwrap();
|
||||||
|
txn.commit().unwrap();
|
||||||
|
|
||||||
|
// basic typo search with default typo settings
|
||||||
|
{
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut search = Search::new(&txn, &index);
|
||||||
|
search.query("zealand");
|
||||||
|
search.limit(10);
|
||||||
|
search.authorize_typos(true);
|
||||||
|
search.optional_words(true);
|
||||||
|
|
||||||
|
let result = search.execute().unwrap();
|
||||||
|
assert_eq!(result.documents_ids.len(), 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut txn = index.write_txn().unwrap();
|
||||||
|
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
let mut builder = Settings::new(&mut txn, &index, &config);
|
||||||
|
let mut exact_words = BTreeSet::new();
|
||||||
|
// `zealand` doesn't allow typos anymore
|
||||||
|
exact_words.insert("zealand".to_string());
|
||||||
|
builder.set_exact_words(exact_words);
|
||||||
|
builder.execute(|_| ()).unwrap();
|
||||||
|
|
||||||
|
let mut search = Search::new(&txn, &index);
|
||||||
|
search.query("zealand");
|
||||||
|
search.limit(10);
|
||||||
|
search.authorize_typos(true);
|
||||||
|
search.optional_words(true);
|
||||||
|
|
||||||
|
let result = search.execute().unwrap();
|
||||||
|
assert_eq!(result.documents_ids.len(), 1);
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user