mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-03-20 06:30:38 +01:00
Stop using the FST to find a word without any typo
This commit is contained in:
parent
b0b1888ef9
commit
bf144a94d8
@ -1755,6 +1755,19 @@ impl Index {
|
|||||||
}
|
}
|
||||||
Ok(stats)
|
Ok(stats)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns whether `word` is present as a key in this index's word databases.
|
||||||
|
///
|
||||||
|
/// Looks `word` up in `word_docids` and, only if it is absent there, in `exact_word_docids`; yields `Ok(true)` as soon as either lookup finds an entry. Both lookups remap the data type to `DecodeIgnore` and only test the result with `is_some()`, so just the presence of the key matters. Any error returned by a lookup is propagated with `?`.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `rtxn`: The read transaction used for both lookups.
|
||||||
|
/// * `word`: The word to look up.
|
||||||
|
pub fn contains_word(&self, rtxn: &RoTxn<'_>, word: &str) -> Result<bool> {
|
||||||
|
Ok(self.word_docids.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some()
|
||||||
|
|| self.exact_word_docids.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Serialize)]
|
#[derive(Debug, Deserialize, Serialize)]
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
use std::cmp::Ordering;
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::ops::ControlFlow;
|
use std::ops::ControlFlow;
|
||||||
|
|
||||||
use fst::automaton::Str;
|
use fst::automaton::Str;
|
||||||
use fst::{Automaton, IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use heed::types::DecodeIgnore;
|
use heed::types::DecodeIgnore;
|
||||||
|
use itertools::{merge_join_by, EitherOrBoth};
|
||||||
|
|
||||||
use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
|
use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
|
||||||
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||||
@ -16,16 +18,10 @@ use crate::{Result, MAX_WORD_LENGTH};
|
|||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
pub enum NumberOfTypos {
|
pub enum NumberOfTypos {
|
||||||
Zero,
|
|
||||||
One,
|
One,
|
||||||
Two,
|
Two,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub enum ZeroOrOneTypo {
|
|
||||||
Zero,
|
|
||||||
One,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Interned<QueryTerm> {
|
impl Interned<QueryTerm> {
|
||||||
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
|
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
|
||||||
let s = ctx.term_interner.get_mut(self);
|
let s = ctx.term_interner.get_mut(self);
|
||||||
@ -47,19 +43,27 @@ impl Interned<QueryTerm> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn find_zero_typo_prefix_derivations(
|
fn find_zero_typo_prefix_derivations(
|
||||||
|
ctx: &mut SearchContext<'_>,
|
||||||
word_interned: Interned<String>,
|
word_interned: Interned<String>,
|
||||||
fst: fst::Set<Cow<'_, [u8]>>,
|
|
||||||
word_interner: &mut DedupInterner<String>,
|
|
||||||
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
|
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let word = word_interner.get(word_interned).to_owned();
|
let word = ctx.word_interner.get(word_interned).to_owned();
|
||||||
let word = word.as_str();
|
let word = word.as_str();
|
||||||
let prefix = Str::new(word).starts_with();
|
|
||||||
let mut stream = fst.search(prefix).into_stream();
|
|
||||||
|
|
||||||
while let Some(derived_word) = stream.next() {
|
let words =
|
||||||
let derived_word = std::str::from_utf8(derived_word)?.to_owned();
|
ctx.index.word_docids.remap_data_type::<DecodeIgnore>().prefix_iter(ctx.txn, word)?;
|
||||||
let derived_word_interned = word_interner.insert(derived_word);
|
let exact_words =
|
||||||
|
ctx.index.exact_word_docids.remap_data_type::<DecodeIgnore>().prefix_iter(ctx.txn, word)?;
|
||||||
|
|
||||||
|
for eob in merge_join_by(words, exact_words, |lhs, rhs| match (lhs, rhs) {
|
||||||
|
(Ok((word, _)), Ok((exact_word, _))) => word.cmp(exact_word),
|
||||||
|
(Err(_), _) | (_, Err(_)) => Ordering::Equal,
|
||||||
|
}) {
|
||||||
|
match eob {
|
||||||
|
EitherOrBoth::Both(kv, _) | EitherOrBoth::Left(kv) | EitherOrBoth::Right(kv) => {
|
||||||
|
let (derived_word, _) = kv?;
|
||||||
|
let derived_word = derived_word.to_string();
|
||||||
|
let derived_word_interned = ctx.word_interner.insert(derived_word);
|
||||||
if derived_word_interned != word_interned {
|
if derived_word_interned != word_interned {
|
||||||
let cf = visit(derived_word_interned)?;
|
let cf = visit(derived_word_interned)?;
|
||||||
if cf.is_break() {
|
if cf.is_break() {
|
||||||
@ -67,14 +71,17 @@ fn find_zero_typo_prefix_derivations(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn find_zero_one_typo_derivations(
|
fn find_one_typo_derivations(
|
||||||
ctx: &mut SearchContext<'_>,
|
ctx: &mut SearchContext<'_>,
|
||||||
word_interned: Interned<String>,
|
word_interned: Interned<String>,
|
||||||
is_prefix: bool,
|
is_prefix: bool,
|
||||||
mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
|
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let fst = ctx.get_words_fst()?;
|
let fst = ctx.get_words_fst()?;
|
||||||
let word = ctx.word_interner.get(word_interned).to_owned();
|
let word = ctx.word_interner.get(word_interned).to_owned();
|
||||||
@ -89,16 +96,9 @@ fn find_zero_one_typo_derivations(
|
|||||||
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
|
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
|
||||||
let d = dfa.distance(state.1);
|
let d = dfa.distance(state.1);
|
||||||
match d.to_u8() {
|
match d.to_u8() {
|
||||||
0 => {
|
0 => (),
|
||||||
if derived_word != word_interned {
|
|
||||||
let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
|
|
||||||
if cf.is_break() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
1 => {
|
1 => {
|
||||||
let cf = visit(derived_word, ZeroOrOneTypo::One)?;
|
let cf = visit(derived_word)?;
|
||||||
if cf.is_break() {
|
if cf.is_break() {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -111,7 +111,7 @@ fn find_zero_one_typo_derivations(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn find_zero_one_two_typo_derivations(
|
fn find_one_two_typo_derivations(
|
||||||
word_interned: Interned<String>,
|
word_interned: Interned<String>,
|
||||||
is_prefix: bool,
|
is_prefix: bool,
|
||||||
fst: fst::Set<Cow<'_, [u8]>>,
|
fst: fst::Set<Cow<'_, [u8]>>,
|
||||||
@ -144,14 +144,7 @@ fn find_zero_one_two_typo_derivations(
|
|||||||
// correct distance
|
// correct distance
|
||||||
let d = second_dfa.distance((state.1).0);
|
let d = second_dfa.distance((state.1).0);
|
||||||
match d.to_u8() {
|
match d.to_u8() {
|
||||||
0 => {
|
0 => (),
|
||||||
if derived_word_interned != word_interned {
|
|
||||||
let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
|
|
||||||
if cf.is_break() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
1 => {
|
1 => {
|
||||||
let cf = visit(derived_word_interned, NumberOfTypos::One)?;
|
let cf = visit(derived_word_interned, NumberOfTypos::One)?;
|
||||||
if cf.is_break() {
|
if cf.is_break() {
|
||||||
@ -194,8 +187,6 @@ pub fn partially_initialized_term_from_word(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let fst = ctx.index.words_fst(ctx.txn)?;
|
|
||||||
|
|
||||||
let use_prefix_db = is_prefix
|
let use_prefix_db = is_prefix
|
||||||
&& (ctx
|
&& (ctx
|
||||||
.index
|
.index
|
||||||
@ -215,24 +206,19 @@ pub fn partially_initialized_term_from_word(
|
|||||||
let mut zero_typo = None;
|
let mut zero_typo = None;
|
||||||
let mut prefix_of = BTreeSet::new();
|
let mut prefix_of = BTreeSet::new();
|
||||||
|
|
||||||
if fst.contains(word) || ctx.index.exact_word_docids.get(ctx.txn, word)?.is_some() {
|
if ctx.index.contains_word(ctx.txn, word)? {
|
||||||
zero_typo = Some(word_interned);
|
zero_typo = Some(word_interned);
|
||||||
}
|
}
|
||||||
|
|
||||||
if is_prefix && use_prefix_db.is_none() {
|
if is_prefix && use_prefix_db.is_none() {
|
||||||
find_zero_typo_prefix_derivations(
|
find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| {
|
||||||
word_interned,
|
|
||||||
fst,
|
|
||||||
&mut ctx.word_interner,
|
|
||||||
|derived_word| {
|
|
||||||
if prefix_of.len() < limits::MAX_PREFIX_COUNT {
|
if prefix_of.len() < limits::MAX_PREFIX_COUNT {
|
||||||
prefix_of.insert(derived_word);
|
prefix_of.insert(derived_word);
|
||||||
Ok(ControlFlow::Continue(()))
|
Ok(ControlFlow::Continue(()))
|
||||||
} else {
|
} else {
|
||||||
Ok(ControlFlow::Break(()))
|
Ok(ControlFlow::Break(()))
|
||||||
}
|
}
|
||||||
},
|
})?;
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
let synonyms = ctx.index.synonyms(ctx.txn)?;
|
let synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||||
let mut synonym_word_count = 0;
|
let mut synonym_word_count = 0;
|
||||||
@ -295,18 +281,13 @@ impl Interned<QueryTerm> {
|
|||||||
let mut one_typo_words = BTreeSet::new();
|
let mut one_typo_words = BTreeSet::new();
|
||||||
|
|
||||||
if *max_nbr_typos > 0 {
|
if *max_nbr_typos > 0 {
|
||||||
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
|
find_one_typo_derivations(ctx, original, is_prefix, |derived_word| {
|
||||||
match nbr_typos {
|
|
||||||
ZeroOrOneTypo::Zero => {}
|
|
||||||
ZeroOrOneTypo::One => {
|
|
||||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||||
one_typo_words.insert(derived_word);
|
one_typo_words.insert(derived_word);
|
||||||
} else {
|
|
||||||
return Ok(ControlFlow::Break(()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(ControlFlow::Continue(()))
|
Ok(ControlFlow::Continue(()))
|
||||||
|
} else {
|
||||||
|
Ok(ControlFlow::Break(()))
|
||||||
|
}
|
||||||
})?;
|
})?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -357,7 +338,7 @@ impl Interned<QueryTerm> {
|
|||||||
let mut two_typo_words = BTreeSet::new();
|
let mut two_typo_words = BTreeSet::new();
|
||||||
|
|
||||||
if *max_nbr_typos > 0 {
|
if *max_nbr_typos > 0 {
|
||||||
find_zero_one_two_typo_derivations(
|
find_one_two_typo_derivations(
|
||||||
*original,
|
*original,
|
||||||
*is_prefix,
|
*is_prefix,
|
||||||
ctx.index.words_fst(ctx.txn)?,
|
ctx.index.words_fst(ctx.txn)?,
|
||||||
@ -370,7 +351,6 @@ impl Interned<QueryTerm> {
|
|||||||
return Ok(ControlFlow::Break(()));
|
return Ok(ControlFlow::Break(()));
|
||||||
}
|
}
|
||||||
match nbr_typos {
|
match nbr_typos {
|
||||||
NumberOfTypos::Zero => {}
|
|
||||||
NumberOfTypos::One => {
|
NumberOfTypos::One => {
|
||||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||||
one_typo_words.insert(derived_word);
|
one_typo_words.insert(derived_word);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user