mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-25 20:57:35 +01:00
custom fst automatons
This commit is contained in:
parent
628c835a22
commit
3f24555c3d
@ -18,6 +18,3 @@ opt-level = 3
|
|||||||
opt-level = 3
|
opt-level = 3
|
||||||
[profile.test.build-override]
|
[profile.test.build-override]
|
||||||
opt-level = 3
|
opt-level = 3
|
||||||
|
|
||||||
[patch.crates-io]
|
|
||||||
fst = { git = "https://github.com/MarinPostma/fst.git", rev = "e6c606b7507e8cb5e502d1609f9b909b8690bac5" }
|
|
||||||
|
187
milli/src/search/fst_utils.rs
Normal file
187
milli/src/search/fst_utils.rs
Normal file
@ -0,0 +1,187 @@
|
|||||||
|
//! This module is necessary until https://github.com/BurntSushi/fst/pull/137 gets merged.
//! All credits for this code go to BurntSushi.
|
||||||
|
use fst::Automaton;
|
||||||
|
|
||||||
|
/// An automaton that matches any input that begins with something the wrapped
/// automaton matches.
///
/// The wrapped automaton is stored in the public field `0`. As soon as it
/// reports a match, the `StartsWith` automaton stays in a matching state for
/// every following byte.
#[derive(Clone, Debug)]
pub struct StartsWith<A>(pub A);
|
||||||
|
|
||||||
|
/// The `Automaton` state for `StartsWith<A>`.
|
||||||
|
pub struct StartsWithState<A: Automaton>(pub StartsWithStateKind<A>);
|
||||||
|
|
||||||
|
impl<A: Automaton> Clone for StartsWithState<A>
|
||||||
|
where
|
||||||
|
A::State: Clone,
|
||||||
|
{
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
Self(self.0.clone())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The inner state of a `StartsWithState<A>`.
|
||||||
|
pub enum StartsWithStateKind<A: Automaton> {
|
||||||
|
/// Sink state that is reached when the automaton has matched the prefix.
|
||||||
|
Done,
|
||||||
|
/// State in which the automaton is while it hasn't matched the prefix.
|
||||||
|
Running(A::State),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<A: Automaton> Clone for StartsWithStateKind<A>
|
||||||
|
where
|
||||||
|
A::State: Clone,
|
||||||
|
{
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
match self {
|
||||||
|
StartsWithStateKind::Done => StartsWithStateKind::Done,
|
||||||
|
StartsWithStateKind::Running(inner) => StartsWithStateKind::Running(inner.clone()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<A: Automaton> Automaton for StartsWith<A> {
|
||||||
|
type State = StartsWithState<A>;
|
||||||
|
|
||||||
|
fn start(&self) -> StartsWithState<A> {
|
||||||
|
StartsWithState({
|
||||||
|
let inner = self.0.start();
|
||||||
|
if self.0.is_match(&inner) {
|
||||||
|
StartsWithStateKind::Done
|
||||||
|
} else {
|
||||||
|
StartsWithStateKind::Running(inner)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
fn is_match(&self, state: &StartsWithState<A>) -> bool {
|
||||||
|
match state.0 {
|
||||||
|
StartsWithStateKind::Done => true,
|
||||||
|
StartsWithStateKind::Running(_) => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn can_match(&self, state: &StartsWithState<A>) -> bool {
|
||||||
|
match state.0 {
|
||||||
|
StartsWithStateKind::Done => true,
|
||||||
|
StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
|
||||||
|
match state.0 {
|
||||||
|
StartsWithStateKind::Done => true,
|
||||||
|
StartsWithStateKind::Running(_) => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn accept(&self, state: &StartsWithState<A>, byte: u8) -> StartsWithState<A> {
|
||||||
|
StartsWithState(match state.0 {
|
||||||
|
StartsWithStateKind::Done => StartsWithStateKind::Done,
|
||||||
|
StartsWithStateKind::Running(ref inner) => {
|
||||||
|
let next_inner = self.0.accept(inner, byte);
|
||||||
|
if self.0.is_match(&next_inner) {
|
||||||
|
StartsWithStateKind::Done
|
||||||
|
} else {
|
||||||
|
StartsWithStateKind::Running(next_inner)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// Combines two automata into one that matches whenever at least one of them
/// matches.
///
/// The two component automata are stored in the public fields `0` and `1`.
#[derive(Debug, Clone)]
pub struct Union<A, B>(pub A, pub B);
|
||||||
|
|
||||||
|
/// The `Automaton` state for `Union<A, B>`.
|
||||||
|
pub struct UnionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);
|
||||||
|
|
||||||
|
impl<A: Automaton, B: Automaton> Clone for UnionState<A, B>
|
||||||
|
where
|
||||||
|
A::State: Clone,
|
||||||
|
B::State: Clone,
|
||||||
|
{
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
Self(self.0.clone(), self.1.clone())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
|
||||||
|
type State = UnionState<A, B>;
|
||||||
|
fn start(&self) -> UnionState<A, B> {
|
||||||
|
UnionState(self.0.start(), self.1.start())
|
||||||
|
}
|
||||||
|
fn is_match(&self, state: &UnionState<A, B>) -> bool {
|
||||||
|
self.0.is_match(&state.0) || self.1.is_match(&state.1)
|
||||||
|
}
|
||||||
|
fn can_match(&self, state: &UnionState<A, B>) -> bool {
|
||||||
|
self.0.can_match(&state.0) || self.1.can_match(&state.1)
|
||||||
|
}
|
||||||
|
fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
|
||||||
|
self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1)
|
||||||
|
}
|
||||||
|
fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
|
||||||
|
UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// Combines two automata into one that matches only when both of them match.
///
/// The two component automata are stored in the public fields `0` and `1`.
#[derive(Debug, Clone)]
pub struct Intersection<A, B>(pub A, pub B);
|
||||||
|
|
||||||
|
/// The `Automaton` state for `Intersection<A, B>`.
|
||||||
|
pub struct IntersectionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);
|
||||||
|
|
||||||
|
impl<A: Automaton, B: Automaton> Clone for IntersectionState<A, B>
|
||||||
|
where
|
||||||
|
A::State: Clone,
|
||||||
|
B::State: Clone,
|
||||||
|
{
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
Self(self.0.clone(), self.1.clone())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
|
||||||
|
type State = IntersectionState<A, B>;
|
||||||
|
fn start(&self) -> IntersectionState<A, B> {
|
||||||
|
IntersectionState(self.0.start(), self.1.start())
|
||||||
|
}
|
||||||
|
fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
|
||||||
|
self.0.is_match(&state.0) && self.1.is_match(&state.1)
|
||||||
|
}
|
||||||
|
fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
|
||||||
|
self.0.can_match(&state.0) && self.1.can_match(&state.1)
|
||||||
|
}
|
||||||
|
fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
|
||||||
|
self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1)
|
||||||
|
}
|
||||||
|
fn accept(&self, state: &IntersectionState<A, B>, byte: u8) -> IntersectionState<A, B> {
|
||||||
|
IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// Inverts an automaton: the result matches exactly the inputs the wrapped
/// automaton does not match.
///
/// The wrapped automaton is stored in the public field `0`.
#[derive(Debug, Clone)]
pub struct Complement<A>(pub A);
|
||||||
|
|
||||||
|
/// The `Automaton` state for `Complement<A>`.
|
||||||
|
pub struct ComplementState<A: Automaton>(pub A::State);
|
||||||
|
|
||||||
|
impl<A: Automaton> Clone for ComplementState<A>
|
||||||
|
where
|
||||||
|
A::State: Clone,
|
||||||
|
{
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
Self(self.0.clone())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<A: Automaton> Automaton for Complement<A> {
|
||||||
|
type State = ComplementState<A>;
|
||||||
|
fn start(&self) -> ComplementState<A> {
|
||||||
|
ComplementState(self.0.start())
|
||||||
|
}
|
||||||
|
fn is_match(&self, state: &ComplementState<A>) -> bool {
|
||||||
|
!self.0.is_match(&state.0)
|
||||||
|
}
|
||||||
|
fn can_match(&self, state: &ComplementState<A>) -> bool {
|
||||||
|
!self.0.will_always_match(&state.0)
|
||||||
|
}
|
||||||
|
fn will_always_match(&self, state: &ComplementState<A>) -> bool {
|
||||||
|
!self.0.can_match(&state.0)
|
||||||
|
}
|
||||||
|
fn accept(&self, state: &ComplementState<A>, byte: u8) -> ComplementState<A> {
|
||||||
|
ComplementState(self.0.accept(&state.0, byte))
|
||||||
|
}
|
||||||
|
}
|
@ -16,6 +16,7 @@ use once_cell::sync::Lazy;
|
|||||||
use roaring::bitmap::RoaringBitmap;
|
use roaring::bitmap::RoaringBitmap;
|
||||||
|
|
||||||
pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
|
pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
|
||||||
|
use self::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||||
pub use self::matching_words::MatchingWords;
|
pub use self::matching_words::MatchingWords;
|
||||||
use self::query_tree::QueryTreeBuilder;
|
use self::query_tree::QueryTreeBuilder;
|
||||||
use crate::error::UserError;
|
use crate::error::UserError;
|
||||||
@ -30,6 +31,7 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
|
|||||||
mod criteria;
|
mod criteria;
|
||||||
mod distinct;
|
mod distinct;
|
||||||
mod facet;
|
mod facet;
|
||||||
|
mod fst_utils;
|
||||||
mod matching_words;
|
mod matching_words;
|
||||||
mod query_tree;
|
mod query_tree;
|
||||||
|
|
||||||
@ -70,7 +72,6 @@ impl<'a> Search<'a> {
|
|||||||
|
|
||||||
pub fn offset(&mut self, offset: usize) -> &mut Search<'a> {
|
pub fn offset(&mut self, offset: usize) -> &mut Search<'a> {
|
||||||
self.offset = offset;
|
self.offset = offset;
|
||||||
|
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -301,8 +302,9 @@ pub fn word_derivations<'c>(
|
|||||||
} else {
|
} else {
|
||||||
if max_typo == 1 {
|
if max_typo == 1 {
|
||||||
let dfa = build_dfa(word, 1, is_prefix);
|
let dfa = build_dfa(word, 1, is_prefix);
|
||||||
let starts = Str::new(get_first(word)).starts_with();
|
let starts = StartsWith(Str::new(get_first(word)));
|
||||||
let mut stream = fst.search_with_state(starts.intersection(&dfa)).into_stream();
|
let mut stream =
|
||||||
|
fst.search_with_state(Intersection(starts, &dfa)).into_stream();
|
||||||
|
|
||||||
while let Some((word, state)) = stream.next() {
|
while let Some((word, state)) = stream.next() {
|
||||||
let word = std::str::from_utf8(word)?;
|
let word = std::str::from_utf8(word)?;
|
||||||
@ -310,11 +312,11 @@ pub fn word_derivations<'c>(
|
|||||||
derived_words.push((word.to_string(), d.to_u8()));
|
derived_words.push((word.to_string(), d.to_u8()));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
let starts = Str::new(get_first(word)).starts_with();
|
let starts = StartsWith(Str::new(get_first(word)));
|
||||||
let first = build_dfa(word, 1, is_prefix).intersection((&starts).complement());
|
let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
|
||||||
let second_dfa = build_dfa(word, 2, is_prefix);
|
let second_dfa = build_dfa(word, 2, is_prefix);
|
||||||
let second = (&second_dfa).intersection(&starts);
|
let second = Intersection(&second_dfa, &starts);
|
||||||
let automaton = first.union(&second);
|
let automaton = Union(first, &second);
|
||||||
|
|
||||||
let mut stream = fst.search_with_state(automaton).into_stream();
|
let mut stream = fst.search_with_state(automaton).into_stream();
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user