diff --git a/meilisearch-core/src/automaton/dfa.rs b/meilisearch-core/src/automaton/dfa.rs index 6258da424..da1a6eb39 100644 --- a/meilisearch-core/src/automaton/dfa.rs +++ b/meilisearch-core/src/automaton/dfa.rs @@ -46,3 +46,8 @@ pub fn build_prefix_dfa(query: &str) -> DFA { pub fn build_dfa(query: &str) -> DFA { build_dfa_with_setting(query, PrefixSetting::NoPrefix) } + +pub fn build_exact_dfa(query: &str) -> DFA { + let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true)); + builder.build_dfa(query) +} diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs index 406d72ce2..ecf99ee1c 100644 --- a/meilisearch-core/src/automaton/mod.rs +++ b/meilisearch-core/src/automaton/mod.rs @@ -13,7 +13,7 @@ use crate::database::MainT; use crate::error::MResult; use crate::store; -pub use self::dfa::{build_dfa, build_prefix_dfa}; +pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa}; pub use self::query_enhancer::QueryEnhancer; pub use self::query_enhancer::QueryEnhancerBuilder; diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 8e4612c22..7477ff383 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -19,7 +19,7 @@ use slice_group_by::{GroupBy, GroupByMut}; use crate::automaton::NGRAMS; use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; -use crate::automaton::{build_dfa, build_prefix_dfa}; +use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; use crate::automaton::{normalize_str, split_best_frequency}; use crate::criterion2::*; @@ -41,6 +41,8 @@ pub fn bucket_sort<'c>( let (automatons, query_enhancer) = construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?; + debug!("{:?}", query_enhancer); + let before_postings_lists_fetching = Instant::now(); mk_arena!(arena); let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?; @@ -74,7 
+76,7 @@ pub fn bucket_sort<'c>( let criteria = [ Box::new(Typo) as Box, - Box::new(Words), + Box::new(Words) as Box, Box::new(Proximity), Box::new(Attribute), Box::new(WordsPosition), @@ -88,7 +90,7 @@ pub fn bucket_sort<'c>( for mut group in tmp_groups { let before_criterion_preparation = Instant::now(); - criterion.prepare(&mut group, &mut arena, &query_enhancer); + criterion.prepare(&mut group, &mut arena, &query_enhancer, &automatons); debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); let before_criterion_sort = Instant::now(); @@ -116,6 +118,7 @@ pub fn bucket_sort<'c>( let postings_list = &arena[sm.postings_list]; let input = postings_list.input(); let query = &automatons[sm.query_index as usize].query; + debug!("{:?} contains {:?}", d.raw_matches[0].document_id, query); postings_list.iter().map(move |m| { let covered_area = if query.len() > input.len() { input.len() @@ -126,6 +129,8 @@ pub fn bucket_sort<'c>( }) }).collect(); + debug!("{:?} contains {:?}", d.raw_matches[0].document_id, d.processed_distances); + Document { id: d.raw_matches[0].document_id, highlights, @@ -233,7 +238,7 @@ fn fetch_matches<'txn, 'tag>( for (query_index, automaton) in automatons.iter().enumerate() { let before_dfa = Instant::now(); let dfa = automaton.dfa(); - let QueryWordAutomaton { query, is_exact, is_prefix } = automaton; + let QueryWordAutomaton { query, is_exact, is_prefix, .. } = automaton; dfa_time += before_dfa.elapsed(); let mut number_of_words = 0; @@ -294,28 +299,48 @@ fn fetch_matches<'txn, 'tag>( #[derive(Debug)] pub struct QueryWordAutomaton { - query: String, + pub query: String, /// Is it a word that must be considered exact /// or is it some derived word (i.e. 
a synonym) - is_exact: bool, - is_prefix: bool, + pub is_exact: bool, + pub is_prefix: bool, + /// If it's a phrase query and what is + /// its index and the length of the phrase + pub phrase_query: Option<(u16, u16)>, } impl QueryWordAutomaton { pub fn exact(query: &str) -> QueryWordAutomaton { - QueryWordAutomaton { query: query.to_string(), is_exact: true, is_prefix: false } + QueryWordAutomaton { + query: query.to_string(), + is_exact: true, + is_prefix: false, + phrase_query: None, + } } pub fn exact_prefix(query: &str) -> QueryWordAutomaton { - QueryWordAutomaton { query: query.to_string(), is_exact: true, is_prefix: true } + QueryWordAutomaton { + query: query.to_string(), + is_exact: true, + is_prefix: true, + phrase_query: None, + } } pub fn non_exact(query: &str) -> QueryWordAutomaton { - QueryWordAutomaton { query: query.to_string(), is_exact: false, is_prefix: false } + QueryWordAutomaton { + query: query.to_string(), + is_exact: false, + is_prefix: false, + phrase_query: None, + } } pub fn dfa(&self) -> DFA { - if self.is_prefix { + if self.phrase_query.is_some() { + build_exact_dfa(&self.query) + } else if self.is_prefix { build_prefix_dfa(&self.query) } else { build_dfa(&self.query) @@ -411,16 +436,17 @@ fn construct_automatons2( if n == 1 { if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)?
{ - let left_automaton = QueryWordAutomaton::exact(left); + let mut left_automaton = QueryWordAutomaton::exact(left); + left_automaton.phrase_query = Some((0, 2)); enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); automaton_index += 1; automatons.push(left_automaton); - let right_automaton = QueryWordAutomaton::exact(right); + let mut right_automaton = QueryWordAutomaton::exact(right); + right_automaton.phrase_query = Some((1, 2)); enhancer_builder.declare(query_range.clone(), automaton_index, &[right]); automaton_index += 1; automatons.push(right_automaton); - } } else { // automaton of concatenation of query words diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs index 4adb69dea..dd7769261 100644 --- a/meilisearch-core/src/criterion2.rs +++ b/meilisearch-core/src/criterion2.rs @@ -5,9 +5,10 @@ use std::sync::atomic::{self, AtomicUsize}; use slice_group_by::{GroupBy, GroupByMut}; use compact_arena::SmallArena; use sdset::{Set, SetBuf}; +use log::debug; use crate::{DocIndex, DocumentId}; -use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView}; +use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView, QueryWordAutomaton}; use crate::automaton::QueryEnhancer; type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>; @@ -20,6 +21,7 @@ pub trait Criterion { documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ); fn evaluate<'a, 'tag, 'txn>( @@ -77,6 +79,7 @@ impl Criterion for Typo { documents: &mut [RawDocument], postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { prepare_query_distances(documents, query_enhancer); } @@ -134,6 +137,7 @@ impl Criterion for Words { documents: &mut [RawDocument], postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, + 
automatons: &[QueryWordAutomaton], ) { prepare_query_distances(documents, query_enhancer); } @@ -161,6 +165,7 @@ fn prepare_raw_matches<'a, 'tag, 'txn>( documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { for document in documents { if !document.processed_matches.is_empty() { continue } @@ -181,7 +186,7 @@ fn prepare_raw_matches<'a, 'tag, 'txn>( } } - let processed = multiword_rewrite_matches(&mut processed, query_enhancer); + let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); document.processed_matches = processed.into_vec(); } } @@ -196,8 +201,9 @@ impl Criterion for Proximity { documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { - prepare_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); } fn evaluate<'a, 'tag, 'txn>( @@ -264,8 +270,9 @@ impl Criterion for Attribute { documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { - prepare_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); } fn evaluate<'a, 'tag, 'txn>( @@ -276,16 +283,16 @@ impl Criterion for Attribute { ) -> Ordering { #[inline] - fn sum_attribute(matches: &[SimpleMatch]) -> usize { - let mut sum_attribute = 0; + fn best_attribute(matches: &[SimpleMatch]) -> u16 { + let mut best_attribute = u16::max_value(); for group in matches.linear_group_by_key(|bm| bm.query_index) { - sum_attribute += group[0].attribute as usize; + best_attribute = cmp::min(best_attribute, group[0].attribute); } - sum_attribute + best_attribute } - let lhs = 
sum_attribute(&lhs.processed_matches); - let rhs = sum_attribute(&rhs.processed_matches); + let lhs = best_attribute(&lhs.processed_matches); + let rhs = best_attribute(&rhs.processed_matches); lhs.cmp(&rhs) } @@ -301,8 +308,9 @@ impl Criterion for WordsPosition { documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { - prepare_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); } fn evaluate<'a, 'tag, 'txn>( @@ -338,6 +346,7 @@ impl Criterion for Exact { documents: &mut [RawDocument], postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { for document in documents { document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); @@ -379,6 +388,7 @@ impl Criterion for StableDocId { documents: &mut [RawDocument], postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { // ... 
} @@ -398,17 +408,58 @@ impl Criterion for StableDocId { } pub fn multiword_rewrite_matches( - matches: &mut [SimpleMatch], + simple_matches: &mut [SimpleMatch], query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) -> SetBuf { - let mut padded_matches = Vec::with_capacity(matches.len()); + let mut matches = Vec::with_capacity(simple_matches.len()); // let before_sort = Instant::now(); // we sort the matches by word index to make them rewritable - matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); + simple_matches.sort_unstable_by_key(|m| (m.attribute, m.query_index, m.word_index)); // debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); + for same_attribute in simple_matches.linear_group_by_key(|m| m.attribute) { + let iter = same_attribute.linear_group_by_key(|m| m.query_index); + let mut iter = iter.peekable(); + + while let Some(same_query_index) = iter.next() { + let query_index = same_query_index[0].query_index; + + // TODO we need to support phrase query of longer length + if let Some((i, len)) = automatons[query_index as usize].phrase_query { + if i != 0 { continue } + + // is the next query_index group the required one + if iter.peek().map_or(false, |g| g[0].query_index == query_index + 1) { + if let Some(next) = iter.next() { + for ma in same_query_index { + for mb in next { + if ma.word_index == mb.word_index + 1 { + matches.push(*ma); + matches.push(*mb); + } + } + } + } + } + } else { + matches.extend_from_slice(same_query_index); + } + } + } + + // let is_phrase_query = automatons[match_.query_index as usize].phrase_query_len.is_some(); + // let next_query_index = match_.query_index + 1; + // if is_phrase_query && iter.remainder().iter().find(|m| m.query_index == next_query_index).is_none() { + // continue + // } + + matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); + + let mut padded_matches = Vec::with_capacity(matches.len()); + // let before_padding = Instant::now(); // for each attribute of 
each document for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {