From 5aa49d232c3fa62ec34799640a4d2405d84c2eaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 27 Jun 2019 15:16:32 +0200 Subject: [PATCH] feat: Rewrite Automaton generation related code --- meilidb-core/src/automaton.rs | 59 +++------------------ meilidb-core/src/query_builder.rs | 87 ++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 84 deletions(-) diff --git a/meilidb-core/src/automaton.rs b/meilidb-core/src/automaton.rs index 972b2ce51..1ab845933 100644 --- a/meilidb-core/src/automaton.rs +++ b/meilidb-core/src/automaton.rs @@ -1,8 +1,7 @@ -use fst::Automaton; use lazy_static::lazy_static; use levenshtein_automata::{ LevenshteinAutomatonBuilder as LevBuilder, - DFA, Distance, + DFA, }; lazy_static! { @@ -11,55 +10,16 @@ lazy_static! { static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false); } -pub struct DfaExt { - query_len: usize, - automaton: DFA, -} - -impl Automaton for DfaExt { - type State = ::State; - - fn start(&self) -> Self::State { - self.automaton.start() - } - - fn is_match(&self, state: &Self::State) -> bool { - self.automaton.is_match(state) - } - - fn can_match(&self, state: &Self::State) -> bool { - self.automaton.can_match(state) - } - - fn will_always_match(&self, state: &Self::State) -> bool { - self.automaton.will_always_match(state) - } - - fn accept(&self, state: &Self::State, byte: u8) -> Self::State { - self.automaton.accept(state, byte) - } -} - -impl AutomatonExt for DfaExt { - fn eval>(&self, s: B) -> Distance { - self.automaton.eval(s) - } - - fn query_len(&self) -> usize { - self.query_len - } -} - #[derive(Copy, Clone)] enum PrefixSetting { Prefix, NoPrefix, } -fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt { +fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA { use self::PrefixSetting::{Prefix, NoPrefix}; - let dfa = match query.len() { + match query.len() { 0 ..= 4 => match setting { Prefix => LEVDIST0.build_prefix_dfa(query), NoPrefix => LEVDIST0.build_dfa(query), @@ -72,20 +32,13 @@ fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt { Prefix => LEVDIST2.build_prefix_dfa(query), NoPrefix => LEVDIST2.build_dfa(query), }, - }; - - DfaExt { query_len: query.len(), automaton: dfa } + } } -pub fn build_prefix_dfa(query: &str) -> DfaExt { +pub fn build_prefix_dfa(query: &str) -> DFA { build_dfa_with_setting(query, PrefixSetting::Prefix) } -pub fn build_dfa(query: &str) -> DfaExt { +pub fn build_dfa(query: &str) -> DFA { build_dfa_with_setting(query, PrefixSetting::NoPrefix) } - -pub trait AutomatonExt: Automaton { - fn eval>(&self, s: B) -> Distance; - fn query_len(&self) -> usize; -} diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index b48d4b696..378a06aa1 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -11,8 +11,9 @@ use meilidb_tokenizer::{is_cjk, split_query_string}; use rayon::slice::ParallelSliceMut; use sdset::SetBuf; use slice_group_by::GroupByMut; +use levenshtein_automata::DFA; -use crate::automaton::{DfaExt, AutomatonExt, build_dfa, build_prefix_dfa}; +use crate::automaton::{build_dfa, build_prefix_dfa}; use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::criterion::Criteria; use crate::raw_documents_from_matches; @@ -21,18 +22,38 @@ use crate::{Match, DocumentId, Store, RawDocument, Document}; const NGRAMS: usize = 3; struct Automaton { - index: usize, + query_index: usize, + query_len: usize, is_exact: bool, - dfa: DfaExt, + dfa: DFA, } impl Automaton { - fn exact(index: usize, dfa: DfaExt) -> Automaton { - Automaton { index, is_exact: true, dfa } + fn exact(query_index: usize, query: &str) -> Automaton { + Automaton { + query_index, + query_len: query.len(), + is_exact: true, + dfa: build_dfa(query), + } } - fn non_exact(index: usize, dfa: DfaExt) -> Automaton { - Automaton { index, is_exact: false, dfa } + fn prefix_exact(query_index: usize, query: &str) -> Automaton { + Automaton { + query_index, + query_len: query.len(), + is_exact: true, + dfa: build_prefix_dfa(query), + } + } + + fn non_exact(query_index: usize, query: &str) -> Automaton { + Automaton { + query_index, + query_len: query.len(), + is_exact: false, + dfa: build_dfa(query), + } } } @@ -54,7 +75,7 @@ fn generate_automatons(query: &str, store: &S) -> Result(query: &str, store: &S) -> Result(query: &str, store: &S) -> Result(query: &str, store: &S) -> Result bool> { store: S, criteria: Criteria<'c>, @@ -184,9 +217,9 @@ where S: Store, while let Some((input, indexed_values)) = stream.next() { for iv in indexed_values { - let Automaton { index, is_exact, ref dfa } = automatons[iv.index]; + let Automaton { query_index, is_exact, query_len, ref dfa } = automatons[iv.index]; let distance = dfa.eval(input).to_u8(); - let is_exact = is_exact && distance == 0 && input.len() == dfa.query_len(); + let is_exact = is_exact && distance == 0 && input.len() == query_len; let doc_indexes = self.store.word_indexes(input)?; let doc_indexes = match doc_indexes { @@ -197,8 +230,8 @@ where S: Store, for di in doc_indexes.as_slice() { if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) { let match_ = Match { - query_index: index as u32, - distance: distance, + query_index: query_index as u32, + distance, attribute: di.attribute, word_index: di.word_index, is_exact, @@ -206,23 +239,15 @@ where S: Store, char_length: di.char_length, }; matches.push((di.document_id, match_)); + } } } } + // rewrite the matched positions for next criteria evaluations matches.par_sort_unstable(); - - for document_matches in matches.linear_group_by_mut(|(a, _), (b, _)| a == b) { - let mut offset = 0; - for query_indexes in document_matches.linear_group_by_mut(|(_, a), (_, b)| a.query_index == b.query_index) { - let word_index = query_indexes[0].1.word_index - offset as u16; - for (_, match_) in query_indexes.iter_mut() { - match_.word_index = word_index; - } - offset += query_indexes.len() - 1; - } - } + rewrite_matched_positions(&mut matches); let total_matches = matches.len(); let padded_matches = SetBuf::from_dirty(matches);