From 11f3d7782d375c326dbc1b0e326a04524a5d8c0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2019 17:01:23 +0100 Subject: [PATCH 01/23] Introduce the AttrCount type --- meilisearch-core/src/criterion/exact.rs | 2 +- meilisearch-core/src/lib.rs | 2 +- meilisearch-core/src/query_builder.rs | 2 +- meilisearch-core/src/raw_document.rs | 4 ++-- meilisearch-core/src/serde/serializer.rs | 2 +- .../src/store/documents_fields_counts.rs | 16 ++++++++-------- meilisearch-types/src/lib.rs | 8 ++++++++ 7 files changed, 22 insertions(+), 14 deletions(-) diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs index e9ae1b5dc..c3e7aba9c 100644 --- a/meilisearch-core/src/criterion/exact.rs +++ b/meilisearch-core/src/criterion/exact.rs @@ -12,7 +12,7 @@ fn number_exact_matches( query_index: &[u32], attribute: &[u16], is_exact: &[bool], - fields_counts: &Set<(SchemaAttr, u64)>, + fields_counts: &Set<(SchemaAttr, u16)>, ) -> usize { let mut count = 0; let mut index = 0; diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index e9ba84a41..0bc07e27e 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -25,7 +25,7 @@ pub use self::ranked_map::RankedMap; pub use self::raw_document::RawDocument; pub use self::store::Index; pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; -pub use meilisearch_types::{DocIndex, DocumentId, Highlight}; +pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount}; #[doc(hidden)] #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 132dda557..87b4e9021 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -815,7 +815,7 @@ mod tests { let mut words_fst = BTreeSet::new(); let mut postings_lists = HashMap::new(); - let mut fields_counts = HashMap::<_, u64>::new(); + let mut fields_counts = HashMap::<_, u16>::new(); for (word, indexes) in iter { let word = word.to_lowercase().into_bytes(); diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs index 291e532be..1ecb9322d 100644 --- a/meilisearch-core/src/raw_document.rs +++ b/meilisearch-core/src/raw_document.rs @@ -12,7 +12,7 @@ pub struct RawDocument { pub id: DocumentId, pub matches: SharedMatches, pub highlights: Vec, - pub fields_counts: SetBuf<(SchemaAttr, u64)>, + pub fields_counts: SetBuf<(SchemaAttr, u16)>, } impl RawDocument { @@ -101,7 +101,7 @@ impl fmt::Debug for RawDocument { pub fn raw_documents_from( matches: SetBuf<(DocumentId, TmpMatch)>, highlights: SetBuf<(DocumentId, Highlight)>, - fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>, + fields_counts: SetBuf<(DocumentId, SchemaAttr, u16)>, ) -> Vec { let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); let mut matches2 = Matches::with_capacity(matches.len()); diff --git a/meilisearch-core/src/serde/serializer.rs b/meilisearch-core/src/serde/serializer.rs index c083991f5..2016cd314 100644 --- a/meilisearch-core/src/serde/serializer.rs +++ b/meilisearch-core/src/serde/serializer.rs @@ -325,7 +325,7 @@ where txn, document_id, attribute, - number_of_words as u64, + number_of_words as u16, )?; } } diff --git a/meilisearch-core/src/store/documents_fields_counts.rs b/meilisearch-core/src/store/documents_fields_counts.rs index 72ac7a2f8..0a7eb1bbf 100644 --- a/meilisearch-core/src/store/documents_fields_counts.rs 
+++ b/meilisearch-core/src/store/documents_fields_counts.rs
@@ -7,7 +7,7 @@ use meilisearch_schema::SchemaAttr;
 
 #[derive(Copy, Clone)]
 pub struct DocumentsFieldsCounts {
-    pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
+    pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u16>>,
 }
 
 impl DocumentsFieldsCounts {
@@ -16,7 +16,7 @@ impl DocumentsFieldsCounts {
         writer: &mut heed::RwTxn,
         document_id: DocumentId,
         attribute: SchemaAttr,
-        value: u64,
+        value: u16,
     ) -> ZResult<()> {
         let key = DocumentAttrKey::new(document_id, attribute);
         self.documents_fields_counts.put(writer, &key, &value)
@@ -42,7 +42,7 @@ impl DocumentsFieldsCounts {
         reader: &heed::RoTxn,
         document_id: DocumentId,
         attribute: SchemaAttr,
-    ) -> ZResult<Option<u64>> {
+    ) -> ZResult<Option<u16>> {
         let key = DocumentAttrKey::new(document_id, attribute);
         match self.documents_fields_counts.get(reader, &key)? {
             Some(count) => Ok(Some(count)),
@@ -79,11 +79,11 @@ impl DocumentsFieldsCounts {
 }
 
 pub struct DocumentFieldsCountsIter<'txn> {
-    iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
+    iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
 }
 
 impl Iterator for DocumentFieldsCountsIter<'_> {
-    type Item = ZResult<(SchemaAttr, u64)>;
+    type Item = ZResult<(SchemaAttr, u16)>;
 
     fn next(&mut self) -> Option<Self::Item> {
         match self.iter.next() {
@@ -99,7 +99,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
 
 pub struct DocumentsIdsIter<'txn> {
     last_seen_id: Option<DocumentId>,
-    iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
+    iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
 }
 
 impl Iterator for DocumentsIdsIter<'_> {
@@ -123,11 +123,11 @@ impl Iterator for DocumentsIdsIter<'_> {
 }
 
 pub struct AllDocumentsFieldsCountsIter<'txn> {
-    iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
+    iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}
 
 impl Iterator for AllDocumentsFieldsCountsIter<'_> {
-    type Item = ZResult<(DocumentId, SchemaAttr, u64)>;
+    type Item = ZResult<(DocumentId, SchemaAttr, u16)>;
 
     fn next(&mut self) -> Option<Self::Item> {
         match self.iter.next() {
diff --git a/meilisearch-types/src/lib.rs b/meilisearch-types/src/lib.rs
index c02281a5f..3419c61fd 100644
--- a/meilisearch-types/src/lib.rs
+++ b/meilisearch-types/src/lib.rs
@@ -63,3 +63,11 @@ pub struct Highlight {
     /// without needing to run the tokenizer again.
 pub char_length: u16,
 }
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
+#[repr(C)]
+pub struct AttrCount {
+    pub attr: u16,
+    pub count: u16,
+}

From ef6a4db18246e26d0fe13ac84b8f9074cafc9869 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Fri, 29 Nov 2019 16:31:47 +0100
Subject: [PATCH 02/23] Before improving fields AttrCount

Removing the fields_count fetching halved the search time; we should
look at lazily pulling them from the criteria that need them.

ugly-test: Make the fields_count fetching lazy, just before running the
exactness criterion
---
 meilisearch-core/src/automaton/mod.rs   | 11 ++++-
 meilisearch-core/src/criterion/exact.rs | 21 ++++----
 meilisearch-core/src/query_builder.rs   | 64 ++++++++++++++-----------
 meilisearch-core/src/raw_document.rs    | 43 +++++++++--------
 4 files changed, 79 insertions(+), 60 deletions(-)

diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs
index a803eee8e..782049942 100644
--- a/meilisearch-core/src/automaton/mod.rs
+++ b/meilisearch-core/src/automaton/mod.rs
@@ -2,7 +2,7 @@ mod dfa;
 mod query_enhancer;
 
 use std::cmp::Reverse;
-use std::{cmp, vec};
+use std::{cmp, fmt, vec};
 
 use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::DFA;
@@ -68,7 +68,6 @@ impl AutomatonGroup {
     }
 }
 
-#[derive(Debug)]
 pub struct Automaton {
     pub index: usize,
     pub ngram: usize,
@@ -78,6 +77,14 @@ pub struct Automaton {
     pub query: String,
 }
 
+impl fmt::Debug for Automaton {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Automaton")
+            .field("query", &self.query)
+            .finish()
+    }
+}
+
 impl Automaton {
     pub fn dfa(&self) -> DFA {
         if self.is_prefix {
diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs
index c3e7aba9c..55a19001b 100644
--- a/meilisearch-core/src/criterion/exact.rs
+++ b/meilisearch-core/src/criterion/exact.rs
@@ -1,18 +1,17 @@
 use std::cmp::Ordering;
 
-use meilisearch_schema::SchemaAttr;
 use sdset::Set;
 use slice_group_by::GroupBy;
 
 use crate::criterion::Criterion;
-use crate::RawDocument;
+use crate::{AttrCount, RawDocument};
 
 #[inline]
 fn number_exact_matches(
     query_index: &[u32],
     attribute: &[u16],
     is_exact: &[bool],
-    fields_counts: &Set<(SchemaAttr, u16)>,
+    fields_counts: &Set<AttrCount>,
 ) -> usize {
     let mut count = 0;
     let mut index = 0;
@@ -25,8 +24,8 @@ fn number_exact_matches(
             if *is_exact {
                 found_exact = true;
                 let attr = &attribute[index + pos];
-                if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
-                    let (_, count) = fields_counts[pos];
+                if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) {
+                    let AttrCount { count, ..
} = fields_counts[pos]; if count == 1 { return usize::max_value(); } @@ -50,7 +49,7 @@ impl Criterion for Exact { let query_index = lhs.query_index(); let is_exact = lhs.is_exact(); let attribute = lhs.attribute(); - let fields_counts = &lhs.fields_counts; + let fields_counts = lhs.fields_counts.as_ref().unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; @@ -59,7 +58,7 @@ impl Criterion for Exact { let query_index = rhs.query_index(); let is_exact = rhs.is_exact(); let attribute = rhs.attribute(); - let fields_counts = &rhs.fields_counts; + let fields_counts = rhs.fields_counts.as_ref().unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; @@ -86,7 +85,7 @@ mod tests { let query_index = &[0]; let attribute = &[0]; let is_exact = &[true]; - let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); + let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; @@ -95,7 +94,7 @@ mod tests { let query_index = &[0]; let attribute = &[0]; let is_exact = &[false]; - let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); + let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; @@ -113,7 +112,7 @@ mod tests { let query_index = &[0]; let attribute = &[0]; let is_exact = &[true]; - let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap(); + let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; @@ -122,7 +121,7 @@ mod tests { let query_index = &[0]; let attribute = &[0]; let is_exact = &[true]; - let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap(); + let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 87b4e9021..489b0db43 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -6,6 +6,7 @@ use std::time::{Duration, Instant}; use std::{cmp, mem}; use fst::{IntoStreamer, Streamer}; +use log::debug; use sdset::SetBuf; use slice_group_by::{GroupBy, GroupByMut}; @@ -14,7 +15,7 @@ use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhanc use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; use crate::levenshtein::prefix_damerau_levenshtein; use crate::raw_document::{raw_documents_from, RawDocument}; -use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch}; +use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount}; use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; pub struct QueryBuilder<'c, 'f, 'd> { @@ -146,27 +147,18 @@ fn fetch_raw_documents( searchables: Option<&ReorderedAttrs>, main_store: store::Main, postings_lists_store: store::PostingsLists, - documents_fields_counts_store: store::DocumentsFieldsCounts, ) -> MResult> { let mut matches = Vec::new(); let mut highlights = Vec::new(); + let before_automatons_groups_loop = Instant::now(); for group in automatons_groups { - let AutomatonGroup { - is_phrase_query, - automatons, - } = group; + let AutomatonGroup { is_phrase_query, automatons } = group; let phrase_query_len = automatons.len(); let mut tmp_matches = Vec::new(); for (id, automaton) in automatons.into_iter().enumerate() 
{
-            let Automaton {
-                index,
-                is_exact,
-                query_len,
-                query,
-                ..
-            } = automaton;
+            let Automaton { index, is_exact, query_len, query, .. } = automaton;
             let dfa = automaton.dfa();
 
             let words = match main_store.words_fst(reader)? {
@@ -250,26 +242,26 @@ fn fetch_raw_documents(
             }
         }
     }
+    debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
 
+    let before_multiword_rewrite_matches = Instant::now();
     let matches = multiword_rewrite_matches(matches, &query_enhancer);
+    debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed());
+
+    let before_highlight_sorting = Instant::now();
     let highlights = {
         highlights.sort_unstable_by_key(|(id, _)| *id);
         SetBuf::new_unchecked(highlights)
     };
+    debug!("highlight_sorting took {:.02?}", before_highlight_sorting.elapsed());
 
-    let fields_counts = {
-        let mut fields_counts = Vec::new();
-        for group in matches.linear_group_by_key(|(id, ..)| *id) {
-            let id = group[0].0;
-            for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
-                let (attr, count) = result?;
-                fields_counts.push((id, attr, count));
-            }
-        }
-        SetBuf::new(fields_counts).unwrap()
-    };
-    Ok(raw_documents_from(matches, highlights, fields_counts))
+    let before_raw_documents = Instant::now();
+    let raw_documents = raw_documents_from(matches, highlights);
+    debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
+    debug!("documents to worry about: {}", raw_documents.len());
+
+    Ok(raw_documents)
 }
 
 impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
@@ -434,6 +426,11 @@ where
     for auts in automaton_producer {
         automatons.push(auts);
 
+        for (i, group) in automatons.iter().enumerate() {
+            debug!("group {} automatons {:?}", i, group.automatons);
+        }
+
+        let before_fetch_raw_documents = Instant::now();
         // we must retrieve the documents associated
         // with the current automatons
         let mut raw_documents = fetch_raw_documents(
@@ -443,8 +440,8 @@ where
             searchable_attrs.as_ref(),
             main_store,
             postings_lists_store,
-            documents_fields_counts_store,
         )?;
+        debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed());
 
         // stop processing when time is running out
         if let Some(timeout) = timeout {
@@ -468,6 +465,20 @@ where
                 continue;
             }
 
+            // we must pull the fields counts of these documents
+            // TODO it would be great to have a "dependency" thing for each criterion
+            // and make it so that we can be lazy on pulling/computing some data.
+            if criterion.name() == "Exact" {
+                for document in group.iter_mut() {
+                    let mut fields_counts = Vec::new();
+                    for result in documents_fields_counts_store.document_fields_counts(reader, document.id)?
{ + let (attr, count) = result?; + fields_counts.push(AttrCount { attr: attr.0, count }); + } + document.fields_counts = Some(SetBuf::new(fields_counts).unwrap()); + } + } + group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { @@ -561,7 +572,6 @@ where searchable_attrs.as_ref(), main_store, postings_lists_store, - documents_fields_counts_store, )?; // stop processing when time is running out diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs index 1ecb9322d..a37531131 100644 --- a/meilisearch-core/src/raw_document.rs +++ b/meilisearch-core/src/raw_document.rs @@ -1,18 +1,18 @@ use std::fmt; use std::sync::Arc; -use meilisearch_schema::SchemaAttr; use sdset::SetBuf; use slice_group_by::GroupBy; +use log::debug; -use crate::{DocumentId, Highlight, TmpMatch}; +use crate::{DocumentId, Highlight, TmpMatch, AttrCount}; #[derive(Clone)] pub struct RawDocument { pub id: DocumentId, pub matches: SharedMatches, pub highlights: Vec, - pub fields_counts: SetBuf<(SchemaAttr, u16)>, + pub fields_counts: Option>, } impl RawDocument { @@ -100,44 +100,47 @@ impl fmt::Debug for RawDocument { pub fn raw_documents_from( matches: SetBuf<(DocumentId, TmpMatch)>, - highlights: SetBuf<(DocumentId, Highlight)>, - fields_counts: SetBuf<(DocumentId, SchemaAttr, u16)>, + highlights: SetBuf<(DocumentId, Highlight)> ) -> Vec { let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); let mut matches2 = Matches::with_capacity(matches.len()); let matches = matches.linear_group_by_key(|(id, _)| *id); let highlights = highlights.linear_group_by_key(|(id, _)| *id); - let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id); - for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) { - debug_assert_eq!(mgroup[0].0, hgroup[0].0); - debug_assert_eq!(mgroup[0].0, fgroup[0].0); + let mut loops_count = 0; + + for (mgroup, hgroup) in matches.zip(highlights) { + loops_count += 1; + assert_eq!(mgroup[0].0, hgroup[0].0); let document_id = mgroup[0].0; let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0); let end = start + mgroup.len(); let highlights = hgroup.iter().map(|(_, h)| *h).collect(); - let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap(); + let fields_counts = None; docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); + // TODO we could try to keep both data + // - the data oriented one and the raw one, + // - the one that comes from the arguments of this function + // This way we would be able to only produce data oriented lazily. + // + // For example the default first criterion is `SumOfTypos` + // and just needs the `query_index` and the `distance` fields. + // It would probably be good to avoid wasting time sorting other fields of documents + // that will never ever reach the second criterion. 
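+        //
+        // (A purely hypothetical shape for that idea: each criterion could
+        // declare the fields it depends on, e.g. a `Criterion::needs()` set,
+        // and a field column would only be materialized the first time a
+        // criterion declares that it needs it.)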
matches2.extend_from_slice(mgroup); } + debug!("loops_counts number is {}", loops_count); + let matches = Arc::new(matches2); docs_ranges .into_iter() .map(|(id, range, highlights, fields_counts)| { - let matches = SharedMatches { - range, - matches: matches.clone(), - }; - RawDocument { - id, - matches, - highlights, - fields_counts, - } + let matches = SharedMatches { range, matches: matches.clone() }; + RawDocument { id, matches, highlights, fields_counts } }) .collect() } From d17d4dc5ec4192e3e02b85e23652abedefccc26d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 30 Nov 2019 16:33:48 +0100 Subject: [PATCH 03/23] Add more debug infos --- meilisearch-core/src/automaton/mod.rs | 7 + .../src/automaton/query_enhancer.rs | 2 + meilisearch-core/src/query_builder.rs | 30 +- meilisearch-core/src/query_enhancer.rs | 398 ------------------ meilisearch-core/src/raw_document.rs | 10 +- 5 files changed, 39 insertions(+), 408 deletions(-) delete mode 100644 meilisearch-core/src/query_enhancer.rs diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs index 782049942..3fd86c73d 100644 --- a/meilisearch-core/src/automaton/mod.rs +++ b/meilisearch-core/src/automaton/mod.rs @@ -7,6 +7,7 @@ use std::{cmp, fmt, vec}; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::DFA; use meilisearch_tokenizer::{is_cjk, split_query_string}; +use log::debug; use crate::database::MainT; use crate::error::MResult; @@ -38,6 +39,10 @@ impl AutomatonProducer { synonyms_store, )?; + for (i, group) in automatons.iter().enumerate() { + debug!("all automatons: group {} automatons {:?}", i, group.automatons); + } + Ok((AutomatonProducer { automatons }, query_enhancer)) } @@ -80,7 +85,9 @@ pub struct Automaton { impl fmt::Debug for Automaton { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Automaton") + .field("index", &self.index) .field("query", &self.query) + .field("is_prefix", &self.is_prefix) .finish() } } diff --git a/meilisearch-core/src/automaton/query_enhancer.rs b/meilisearch-core/src/automaton/query_enhancer.rs index 2194f3ff1..3b88b1157 100644 --- a/meilisearch-core/src/automaton/query_enhancer.rs +++ b/meilisearch-core/src/automaton/query_enhancer.rs @@ -58,6 +58,7 @@ where type Origin = usize; type RealLength = usize; +#[derive(Debug)] struct FakeIntervalTree { intervals: Vec<(Range, (Origin, RealLength))>, } @@ -154,6 +155,7 @@ impl> QueryEnhancerBuilder<'_, S> { } } +#[derive(Debug)] pub struct QueryEnhancer { origins: Vec, real_to_origin: FakeIntervalTree, diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 489b0db43..44f1a1028 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -34,6 +34,14 @@ fn multiword_rewrite_matches( mut matches: Vec<(DocumentId, TmpMatch)>, query_enhancer: &QueryEnhancer, ) -> SetBuf<(DocumentId, TmpMatch)> { + if true { + let before_sort = Instant::now(); + matches.sort_unstable(); + let matches = SetBuf::new_unchecked(matches); + debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); + return matches; + } + let mut padded_matches = Vec::with_capacity(matches.len()); // we sort the matches by word index to make them rewritable @@ -137,6 +145,10 @@ fn multiword_rewrite_matches( document_matches.sort_unstable(); } + // With this check we can see that the loop above takes something + // like 43% of the search time even when no rewrite is needed. 
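+    // In other words, when no rewrite applies the padding loop should be a
+    // no-op; the commented assert below would verify exactly that, assuming
+    // a hypothetical `before_matches` copy taken before the loop runs.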
+ // assert_eq!(before_matches, padded_matches); + SetBuf::new_unchecked(padded_matches) } @@ -236,16 +248,28 @@ fn fetch_raw_documents( } } } else { + let before_rerewrite = Instant::now(); for (id, _, match_, highlight) in tmp_matches { matches.push((id, match_)); highlights.push((id, highlight)); } + debug!("rerewrite took {:.02?}", before_rerewrite.elapsed()); } } debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed()); + { + let mut cloned = matches.clone(); + let before_sort_test = Instant::now(); + cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance)); + debug!("sorting test took {:.02?}", before_sort_test.elapsed()); + } + let before_multiword_rewrite_matches = Instant::now(); + debug!("number of matches before rewrite {}", matches.len()); + debug!("{:?}", query_enhancer); let matches = multiword_rewrite_matches(matches, &query_enhancer); + debug!("number of matches after rewrite {}", matches.len()); debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed()); let before_highlight_sorting = Instant::now(); @@ -299,9 +323,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { synonyms_store: synonyms, } } -} -impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { pub fn with_filter(&mut self, function: F) where F: Fn(DocumentId) -> bool + 'f, @@ -479,12 +501,16 @@ where } } + group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { + debug!("criterion {} produced a group of size {}", criterion.name(), group.len()); + documents_seen += group.len(); groups.push(group); + // we have sort enough documents if the last document sorted is after // the end of the requested range, we can continue to the next criterion if documents_seen >= range.end { diff --git a/meilisearch-core/src/query_enhancer.rs b/meilisearch-core/src/query_enhancer.rs deleted file mode 100644 index 165c1b094..000000000 --- a/meilisearch-core/src/query_enhancer.rs +++ /dev/null @@ -1,398 +0,0 @@ -use std::ops::Range; -use std::cmp::Ordering::{Less, Greater, Equal}; - -/// Return `true` if the specified range can accept the given replacements words. -/// Returns `false` if the replacements words are already present in the original query -/// or if there is fewer replacement words than the range to replace. 
-// -// -// ## Ignored because already present in original -// -// new york city subway -// -------- ^^^^ -// / \ -// [new york city] -// -// -// ## Ignored because smaller than the original -// -// new york city subway -// ------------- -// \ / -// [new york] -// -// -// ## Accepted because bigger than the original -// -// NYC subway -// --- -// / \ -// / \ -// / \ -// / \ -// / \ -// [new york city] -// -fn rewrite_range_with(query: &[S], range: Range, words: &[T]) -> bool -where S: AsRef, - T: AsRef, -{ - if words.len() <= range.len() { - // there is fewer or equal replacement words - // than there is already in the replaced range - return false - } - - // retrieve the part to rewrite but with the length - // of the replacement part - let original = query.iter().skip(range.start).take(words.len()); - - // check if the original query doesn't already contain - // the replacement words - !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref)) -} - -type Origin = usize; -type RealLength = usize; - -struct FakeIntervalTree { - intervals: Vec<(Range, (Origin, RealLength))>, -} - -impl FakeIntervalTree { - fn new(mut intervals: Vec<(Range, (Origin, RealLength))>) -> FakeIntervalTree { - intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end)); - FakeIntervalTree { intervals } - } - - fn query(&self, point: usize) -> Option<(Range, (Origin, RealLength))> { - let element = self.intervals.binary_search_by(|(r, _)| { - if point >= r.start { - if point < r.end { Equal } else { Less } - } else { Greater } - }); - - let n = match element { Ok(n) => n, Err(n) => n }; - - match self.intervals.get(n) { - Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)), - _otherwise => None, - } - } -} - -pub struct QueryEnhancerBuilder<'a, S> { - query: &'a [S], - origins: Vec, - real_to_origin: Vec<(Range, (Origin, RealLength))>, -} - -impl> QueryEnhancerBuilder<'_, S> { - pub fn new(query: &[S]) -> QueryEnhancerBuilder { - // we initialize origins query indices based on their positions - let origins: Vec<_> = (0..query.len() + 1).collect(); - let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect(); - - QueryEnhancerBuilder { query, origins, real_to_origin } - } - - /// Update the final real to origin query indices mapping. - /// - /// `range` is the original words range that this `replacement` words replace - /// and `real` is the first real query index of these replacement words. - pub fn declare(&mut self, range: Range, real: usize, replacement: &[T]) - where T: AsRef, - { - // check if the range of original words - // can be rewritten with the replacement words - if rewrite_range_with(self.query, range.clone(), replacement) { - - // this range can be replaced so we need to - // modify the origins accordingly - let offset = replacement.len() - range.len(); - - let previous_padding = self.origins[range.end - 1]; - let current_offset = (self.origins[range.end] - 1) - previous_padding; - let diff = offset.saturating_sub(current_offset); - self.origins[range.end] += diff; - - for r in &mut self.origins[range.end + 1..] 
{ - *r += diff; - } - } - - // we need to store the real number and origins relations - // this way it will be possible to know by how many - // we need to pad real query indices - let real_range = real..real + replacement.len().max(range.len()); - let real_length = replacement.len(); - self.real_to_origin.push((real_range, (range.start, real_length))); - } - - pub fn build(self) -> QueryEnhancer { - QueryEnhancer { - origins: self.origins, - real_to_origin: FakeIntervalTree::new(self.real_to_origin), - } - } -} - -pub struct QueryEnhancer { - origins: Vec, - real_to_origin: FakeIntervalTree, -} - -impl QueryEnhancer { - /// Returns the query indices to use to replace this real query index. - pub fn replacement(&self, real: u32) -> Range { - let real = real as usize; - - // query the fake interval tree with the real query index - let (range, (origin, real_length)) = - self.real_to_origin - .query(real) - .expect("real has never been declared"); - - // if `real` is the end bound of the range - if (range.start + real_length - 1) == real { - let mut count = range.len(); - let mut new_origin = origin; - for (i, slice) in self.origins[new_origin..].windows(2).enumerate() { - let len = slice[1] - slice[0]; - count = count.saturating_sub(len); - if count == 0 { new_origin = origin + i; break } - } - - let n = real - range.start; - let start = self.origins[origin]; - let end = self.origins[new_origin + 1]; - let remaining = (end - start) - n; - - Range { start: (start + n) as u32, end: (start + n + remaining) as u32 } - - } else { - // just return the origin along with - // the real position of the word - let n = real as usize - range.start; - let origin = self.origins[origin]; - - Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn original_unmodified() { - let query = ["new", "york", "city", "subway"]; - // 0 1 2 3 - let mut builder = QueryEnhancerBuilder::new(&query); - - // new york = new york city - builder.declare(0..2, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // new - assert_eq!(enhancer.replacement(1), 1..2); // york - assert_eq!(enhancer.replacement(2), 2..3); // city - assert_eq!(enhancer.replacement(3), 3..4); // subway - assert_eq!(enhancer.replacement(4), 0..1); // new - assert_eq!(enhancer.replacement(5), 1..2); // york - assert_eq!(enhancer.replacement(6), 2..3); // city - } - - #[test] - fn simple_growing() { - let query = ["new", "york", "subway"]; - // 0 1 2 - let mut builder = QueryEnhancerBuilder::new(&query); - - // new york = new york city - builder.declare(0..2, 3, &["new", "york", "city"]); - // ^ 3 4 5 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // new - assert_eq!(enhancer.replacement(1), 1..3); // york - assert_eq!(enhancer.replacement(2), 3..4); // subway - assert_eq!(enhancer.replacement(3), 0..1); // new - assert_eq!(enhancer.replacement(4), 1..2); // york - assert_eq!(enhancer.replacement(5), 2..3); // city - } - - #[test] - fn same_place_growings() { - let query = ["NY", "subway"]; - // 0 1 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NY = new york - builder.declare(0..1, 2, &["new", "york"]); - // ^ 2 3 - - // NY = new york city - builder.declare(0..1, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - // NY = NYC - builder.declare(0..1, 7, &["NYC"]); - // ^ 7 - - // NY = new york city - builder.declare(0..1, 8, &["new", "york", "city"]); - // 
^ 8 9 10 - - // subway = underground train - builder.declare(1..2, 11, &["underground", "train"]); - // ^ 11 12 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..3); // NY - assert_eq!(enhancer.replacement(1), 3..5); // subway - assert_eq!(enhancer.replacement(2), 0..1); // new - assert_eq!(enhancer.replacement(3), 1..3); // york - assert_eq!(enhancer.replacement(4), 0..1); // new - assert_eq!(enhancer.replacement(5), 1..2); // york - assert_eq!(enhancer.replacement(6), 2..3); // city - assert_eq!(enhancer.replacement(7), 0..3); // NYC - assert_eq!(enhancer.replacement(8), 0..1); // new - assert_eq!(enhancer.replacement(9), 1..2); // york - assert_eq!(enhancer.replacement(10), 2..3); // city - assert_eq!(enhancer.replacement(11), 3..4); // underground - assert_eq!(enhancer.replacement(12), 4..5); // train - } - - #[test] - fn bigger_growing() { - let query = ["NYC", "subway"]; - // 0 1 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(0..1, 2, &["new", "york", "city"]); - // ^ 2 3 4 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..3); // NYC - assert_eq!(enhancer.replacement(1), 3..4); // subway - assert_eq!(enhancer.replacement(2), 0..1); // new - assert_eq!(enhancer.replacement(3), 1..2); // york - assert_eq!(enhancer.replacement(4), 2..3); // city - } - - #[test] - fn middle_query_growing() { - let query = ["great", "awesome", "NYC", "subway"]; - // 0 1 2 3 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(2..3, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // great - assert_eq!(enhancer.replacement(1), 1..2); // awesome - assert_eq!(enhancer.replacement(2), 2..5); // NYC - assert_eq!(enhancer.replacement(3), 5..6); // subway - assert_eq!(enhancer.replacement(4), 2..3); // new - assert_eq!(enhancer.replacement(5), 3..4); // york - assert_eq!(enhancer.replacement(6), 4..5); // city - } - - #[test] - fn end_query_growing() { - let query = ["NYC", "subway"]; - // 0 1 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(1..2, 2, &["underground", "train"]); - // ^ 2 3 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // NYC - assert_eq!(enhancer.replacement(1), 1..3); // subway - assert_eq!(enhancer.replacement(2), 1..2); // underground - assert_eq!(enhancer.replacement(3), 2..3); // train - } - - #[test] - fn multiple_growings() { - let query = ["great", "awesome", "NYC", "subway"]; - // 0 1 2 3 - let mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(2..3, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - // subway = underground train - builder.declare(3..4, 7, &["underground", "train"]); - // ^ 7 8 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // great - assert_eq!(enhancer.replacement(1), 1..2); // awesome - assert_eq!(enhancer.replacement(2), 2..5); // NYC - assert_eq!(enhancer.replacement(3), 5..7); // subway - assert_eq!(enhancer.replacement(4), 2..3); // new - assert_eq!(enhancer.replacement(5), 3..4); // york - assert_eq!(enhancer.replacement(6), 4..5); // city - assert_eq!(enhancer.replacement(7), 5..6); // underground - assert_eq!(enhancer.replacement(8), 6..7); // train - } - - #[test] - fn multiple_probable_growings() { - let query = ["great", "awesome", "NYC", "subway"]; - // 0 1 2 3 - let 
mut builder = QueryEnhancerBuilder::new(&query); - - // NYC = new york city - builder.declare(2..3, 4, &["new", "york", "city"]); - // ^ 4 5 6 - - // subway = underground train - builder.declare(3..4, 7, &["underground", "train"]); - // ^ 7 8 - - // great awesome = good - builder.declare(0..2, 9, &["good"]); - // ^ 9 - - // awesome NYC = NY - builder.declare(1..3, 10, &["NY"]); - // ^^ 10 - - // NYC subway = metro - builder.declare(2..4, 11, &["metro"]); - // ^^ 11 - - let enhancer = builder.build(); - - assert_eq!(enhancer.replacement(0), 0..1); // great - assert_eq!(enhancer.replacement(1), 1..2); // awesome - assert_eq!(enhancer.replacement(2), 2..5); // NYC - assert_eq!(enhancer.replacement(3), 5..7); // subway - assert_eq!(enhancer.replacement(4), 2..3); // new - assert_eq!(enhancer.replacement(5), 3..4); // york - assert_eq!(enhancer.replacement(6), 4..5); // city - assert_eq!(enhancer.replacement(7), 5..6); // underground - assert_eq!(enhancer.replacement(8), 6..7); // train - assert_eq!(enhancer.replacement(9), 0..2); // good - assert_eq!(enhancer.replacement(10), 1..5); // NY - assert_eq!(enhancer.replacement(11), 2..5); // metro - } -} diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs index a37531131..5ba660d11 100644 --- a/meilisearch-core/src/raw_document.rs +++ b/meilisearch-core/src/raw_document.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use sdset::SetBuf; use slice_group_by::GroupBy; -use log::debug; use crate::{DocumentId, Highlight, TmpMatch, AttrCount}; @@ -108,10 +107,7 @@ pub fn raw_documents_from( let matches = matches.linear_group_by_key(|(id, _)| *id); let highlights = highlights.linear_group_by_key(|(id, _)| *id); - let mut loops_count = 0; - for (mgroup, hgroup) in matches.zip(highlights) { - loops_count += 1; assert_eq!(mgroup[0].0, hgroup[0].0); let document_id = mgroup[0].0; @@ -122,8 +118,8 @@ pub fn raw_documents_from( docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); // TODO we could try to keep both data - // - the data oriented one and the raw one, - // - the one that comes from the arguments of this function + // - the data oriented one and, + // - the raw one, the one that comes from the arguments of this function // This way we would be able to only produce data oriented lazily. 
// // For example the default first criterion is `SumOfTypos` @@ -133,8 +129,6 @@ pub fn raw_documents_from( matches2.extend_from_slice(mgroup); } - debug!("loops_counts number is {}", loops_count); - let matches = Arc::new(matches2); docs_ranges .into_iter() From 902625601aa8bd803c4a13b309c54354abf4c8a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 30 Nov 2019 16:53:34 +0100 Subject: [PATCH 04/23] Work in progress: It seems like we support synonyms, split and concat words --- Cargo.lock | 8 + meilisearch-core/Cargo.toml | 2 + meilisearch-core/examples/from_file.rs | 6 +- meilisearch-core/src/automaton/mod.rs | 8 +- .../src/automaton/query_enhancer.rs | 5 +- meilisearch-core/src/bucket_sort.rs | 467 +++++++++++++++++ meilisearch-core/src/criterion2.rs | 479 ++++++++++++++++++ meilisearch-core/src/lib.rs | 4 + meilisearch-core/src/query_builder.rs | 95 ++-- 9 files changed, 1026 insertions(+), 48 deletions(-) create mode 100644 meilisearch-core/src/bucket_sort.rs create mode 100644 meilisearch-core/src/criterion2.rs diff --git a/Cargo.lock b/Cargo.lock index 2dedeb04a..8034a4add 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -257,6 +257,11 @@ dependencies = [ "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "compact_arena" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "const-random" version = "0.1.6" @@ -937,6 +942,7 @@ dependencies = [ "bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)", + "compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -946,6 +952,7 @@ dependencies = [ "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", "meilisearch-schema 0.8.4", @@ -2648,6 +2655,7 @@ dependencies = [ "checksum chunked_transfer 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f98beb6554de08a14bd7b5c6014963c79d6a25a1c66b1d4ecb9e733ccba51d6c" "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +"checksum compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4ab08c5bed92075075d5db5149887a477b2dc0318c40882a0dfbd34315ac6141" "checksum const-random 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b641a8c9867e341f3295564203b1c250eb8ce6cb6126e007941f78c4d2ed7fe" "checksum const-random-macro 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c750ec12b83377637110d5a57f5ae08e895b06c4b16e2bdbf1a94ef717428c59" "checksum cookie 0.12.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5" diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index a268c6605..62da7cfb8 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -10,6 +10,7 @@ arc-swap = "0.4.3" bincode = "1.1.4" byteorder = "1.3.2" chrono = { version = "0.4.9", features = ["serde"] } +compact_arena = "0.4.0" crossbeam-channel = "0.4.0" deunicode = "1.0.0" env_logger = "0.7.0" @@ -35,6 +36,7 @@ assert_matches = "1.3" criterion = "0.3" csv = "1.0.7" indexmap = { version = "1.2.0", features = ["serde-1"] } +jemallocator = "0.3.2" rustyline = { version = "5.0.0", default-features = false } structopt = "0.3.2" tempfile = "3.1.0" diff --git a/meilisearch-core/examples/from_file.rs b/meilisearch-core/examples/from_file.rs index dff8d1b2a..c0b50362c 100644 --- a/meilisearch-core/examples/from_file.rs +++ b/meilisearch-core/examples/from_file.rs @@ -1,5 +1,5 @@ -use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::HashSet; +use std::collections::btree_map::{BTreeMap, Entry}; use std::error::Error; use std::io::{Read, Write}; use std::iter::FromIterator; @@ -15,6 +15,10 @@ use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; use meilisearch_core::{Database, Highlight, ProcessedUpdateResult}; use meilisearch_schema::SchemaAttr; +// #[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + #[derive(Debug, StructOpt)] struct IndexCommand { /// The destination where the database must be created. diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs index 3fd86c73d..406d72ce2 100644 --- a/meilisearch-core/src/automaton/mod.rs +++ b/meilisearch-core/src/automaton/mod.rs @@ -13,11 +13,11 @@ use crate::database::MainT; use crate::error::MResult; use crate::store; -use self::dfa::{build_dfa, build_prefix_dfa}; +pub use self::dfa::{build_dfa, build_prefix_dfa}; pub use self::query_enhancer::QueryEnhancer; -use self::query_enhancer::QueryEnhancerBuilder; +pub use self::query_enhancer::QueryEnhancerBuilder; -const NGRAMS: usize = 3; +pub const NGRAMS: usize = 3; pub struct AutomatonProducer { automatons: Vec, @@ -145,7 +145,7 @@ pub fn normalize_str(string: &str) -> String { string } -fn split_best_frequency<'a>( +pub fn split_best_frequency<'a>( reader: &heed::RoTxn, word: &'a str, postings_lists_store: store::PostingsLists, diff --git a/meilisearch-core/src/automaton/query_enhancer.rs b/meilisearch-core/src/automaton/query_enhancer.rs index 3b88b1157..5f2ac53cf 100644 --- a/meilisearch-core/src/automaton/query_enhancer.rs +++ b/meilisearch-core/src/automaton/query_enhancer.rs @@ -143,8 +143,7 @@ impl> QueryEnhancerBuilder<'_, S> { // we need to pad real query indices let real_range = real..real + replacement.len().max(range.len()); let real_length = replacement.len(); - self.real_to_origin - .push((real_range, (range.start, real_length))); + self.real_to_origin.push((real_range, (range.start, real_length))); } pub fn build(self) -> QueryEnhancer { @@ -162,7 +161,7 @@ pub struct QueryEnhancer { } impl QueryEnhancer { - /// Returns the query indices to use to replace this real query index. + /// Returns the query indices that represent this real query index. 
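+    ///
+    /// For example (adapted from the tests deleted in the previous patch):
+    /// for the query ["great", "awesome", "NYC", "subway"], after
+    /// `declare(2..3, 4, &["new", "york", "city"])`, `replacement(2)`
+    /// returns `2..5`, the enhanced indices now covered by "NYC".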
pub fn replacement(&self, real: u32) -> Range { let real = real as usize; diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs new file mode 100644 index 000000000..0fb1fed3b --- /dev/null +++ b/meilisearch-core/src/bucket_sort.rs @@ -0,0 +1,467 @@ +use std::ops::Deref; +use std::borrow::Cow; +use std::cmp::Ordering; +use std::collections::HashSet; +use std::io::Write; +use std::mem; +use std::ops::Range; +use std::rc::Rc; +use std::time::{Duration, Instant}; + +use compact_arena::{SmallArena, Idx32, mk_arena}; +use fst::{IntoStreamer, Streamer}; +use levenshtein_automata::DFA; +use log::debug; +use meilisearch_tokenizer::{is_cjk, split_query_string}; +use meilisearch_types::{DocIndex, Highlight}; +use sdset::Set; +use slice_group_by::{GroupBy, GroupByMut}; + +use crate::automaton::NGRAMS; +use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; +use crate::automaton::{build_dfa, build_prefix_dfa}; +use crate::automaton::{normalize_str, split_best_frequency}; + +use crate::criterion2::*; +use crate::levenshtein::prefix_damerau_levenshtein; +use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; +use crate::{store, Document, DocumentId, MResult}; + +pub fn bucket_sort<'c>( + reader: &heed::RoTxn, + query: &str, + range: Range, + main_store: store::Main, + postings_lists_store: store::PostingsLists, + documents_fields_counts_store: store::DocumentsFieldsCounts, + synonyms_store: store::Synonyms, +) -> MResult> +{ + // let automatons = construct_automatons(query); + let (automatons, query_enhancer) = + construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?; + + let before_postings_lists_fetching = Instant::now(); + mk_arena!(arena); + let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?; + debug!("bare matches ({}) retrieved in {:.02?}", + bare_matches.len(), + before_postings_lists_fetching.elapsed(), + ); + + let before_raw_documents_presort = Instant::now(); + bare_matches.sort_unstable_by_key(|sm| sm.document_id); + debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed()); + + let before_raw_documents_building = Instant::now(); + let mut raw_documents = Vec::new(); + for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { + raw_documents.push(RawDocument { raw_matches, processed_matches: None }); + } + debug!("creating {} candidates documents took {:.02?}", + raw_documents.len(), + before_raw_documents_building.elapsed(), + ); + + dbg!(mem::size_of::()); + dbg!(mem::size_of::()); + + let mut groups = vec![raw_documents.as_mut_slice()]; + + let criteria = [ + Box::new(Typo) as Box, + Box::new(Words), + Box::new(Proximity), + Box::new(Attribute), + Box::new(WordsPosition), + Box::new(Exact), + Box::new(StableDocId), + ]; + + 'criteria: for criterion in &criteria { + let tmp_groups = mem::replace(&mut groups, Vec::new()); + let mut documents_seen = 0; + + for mut group in tmp_groups { + let before_criterion_preparation = Instant::now(); + criterion.prepare(&mut group, &mut arena, &query_enhancer); + debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); + + let before_criterion_sort = Instant::now(); + group.sort_unstable_by(|a, b| criterion.evaluate(a, b, &arena)); + debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); + + for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, &arena)) { + debug!("{:?} produced a group 
of size {}", criterion.name(), group.len()); + + documents_seen += group.len(); + groups.push(group); + + // we have sort enough documents if the last document sorted is after + // the end of the requested range, we can continue to the next criterion + if documents_seen >= range.end { + continue 'criteria; + } + } + } + } + + let iter = raw_documents.into_iter().skip(range.start).take(range.len()); + let iter = iter.map(|d| { + let highlights = d.raw_matches.iter().flat_map(|sm| { + let postings_list = &arena[sm.postings_list]; + let input = postings_list.input(); + let query = &automatons[sm.query_index as usize].query; + postings_list.iter().map(move |m| { + let covered_area = if query.len() > input.len() { + input.len() + } else { + prefix_damerau_levenshtein(query.as_bytes(), input).1 + }; + Highlight { attribute: m.attribute, char_index: m.char_index, char_length: covered_area as u16 } + }) + }).collect(); + + Document { + id: d.raw_matches[0].document_id, + highlights, + #[cfg(test)] matches: Vec::new(), + } + }); + + Ok(iter.collect()) +} + +pub struct RawDocument<'a, 'tag> { + pub raw_matches: &'a mut [BareMatch<'tag>], + pub processed_matches: Option>, +} + +pub struct BareMatch<'tag> { + pub document_id: DocumentId, + pub query_index: u16, + pub distance: u8, + pub is_exact: bool, + pub postings_list: Idx32<'tag>, +} + +// TODO remove that +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct SimpleMatch { + pub query_index: u16, + pub distance: u8, + pub attribute: u16, + pub word_index: u16, + pub is_exact: bool, +} + +#[derive(Clone)] +pub struct PostingsListView<'txn> { + input: Rc<[u8]>, + postings_list: Rc>>, + offset: usize, + len: usize, +} + +impl<'txn> PostingsListView<'txn> { + pub fn new(input: Rc<[u8]>, postings_list: Rc>>) -> PostingsListView<'txn> { + let len = postings_list.len(); + PostingsListView { input, postings_list, offset: 0, len } + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn input(&self) -> &[u8] { + &self.input + } + + pub fn range(&self, offset: usize, len: usize) -> PostingsListView<'txn> { + assert!(offset + len <= self.len); + PostingsListView { + input: self.input.clone(), + postings_list: self.postings_list.clone(), + offset: self.offset + offset, + len: len, + } + } +} + +impl AsRef> for PostingsListView<'_> { + fn as_ref(&self) -> &Set { + Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len]) + } +} + +impl Deref for PostingsListView<'_> { + type Target = Set; + + fn deref(&self) -> &Set { + Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len]) + } +} + +fn fetch_matches<'txn, 'tag>( + reader: &'txn heed::RoTxn, + automatons: &[QueryWordAutomaton], + arena: &mut SmallArena<'tag, PostingsListView<'txn>>, + main_store: store::Main, + postings_lists_store: store::PostingsLists, +) -> MResult>> +{ + let mut before_words_fst = Instant::now(); + let words = match main_store.words_fst(reader)? 
{ + Some(words) => words, + None => return Ok(Vec::new()), + }; + debug!("words fst took {:.02?}", before_words_fst.elapsed()); + + let mut total_postings_lists = Vec::new(); + + let mut dfa_time = Duration::default(); + let mut stream_next_time = Duration::default(); + let mut postings_lists_fetching_time = Duration::default(); + + for (query_index, automaton) in automatons.iter().enumerate() { + let before_dfa = Instant::now(); + let dfa = automaton.dfa(); + let QueryWordAutomaton { index, query, is_exact, is_prefix } = automaton; + dfa_time += before_dfa.elapsed(); + + let mut number_of_words = 0; + + let before_fst_search = Instant::now(); + let mut stream = words.search(&dfa).into_stream(); + debug!("fst search took {:.02?}", before_fst_search.elapsed()); + + // while let Some(input) = stream.next() { + loop { + let before_stream_next = Instant::now(); + let input = match stream.next() { + Some(input) => input, + None => break, + }; + stream_next_time += before_stream_next.elapsed(); + + number_of_words += 1; + + let distance = dfa.eval(input).to_u8(); + let is_exact = *is_exact && distance == 0 && input.len() == query.len(); + + let before_postings_lists_fetching = Instant::now(); + if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? { + + let input = Rc::from(input); + let postings_list = Rc::new(postings_list); + let postings_list_view = PostingsListView::new(input, postings_list); + let mut offset = 0; + for group in postings_list_view.linear_group_by_key(|di| di.document_id) { + + let posting_list_index = arena.add(postings_list_view.range(offset, group.len())); + let document_id = group[0].document_id; + let stuffed = BareMatch { + document_id, + query_index: query_index as u16, + distance, + is_exact, + postings_list: posting_list_index, + }; + + total_postings_lists.push(stuffed); + offset += group.len(); + } + } + postings_lists_fetching_time += before_postings_lists_fetching.elapsed(); + } + + debug!("{:?} gives {} words", query, number_of_words); + } + + debug!("stream next took {:.02?}", stream_next_time); + debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time); + debug!("dfa creation took {:.02?}", dfa_time); + + Ok(total_postings_lists) +} + +#[derive(Debug)] +pub struct QueryWordAutomaton { + index: usize, + query: String, + /// Is it a word that must be considered exact + /// or is it some derived word (i.e. 
a synonym) + is_exact: bool, + is_prefix: bool, +} + +impl QueryWordAutomaton { + pub fn exact(query: &str, index: usize) -> QueryWordAutomaton { + QueryWordAutomaton { index, query: query.to_string(), is_exact: true, is_prefix: false } + } + + pub fn exact_prefix(query: &str, index: usize) -> QueryWordAutomaton { + QueryWordAutomaton { index, query: query.to_string(), is_exact: true, is_prefix: true } + } + + pub fn non_exact(query: &str, index: usize) -> QueryWordAutomaton { + QueryWordAutomaton { index, query: query.to_string(), is_exact: false, is_prefix: false } + } + + pub fn dfa(&self) -> DFA { + if self.is_prefix { + build_prefix_dfa(&self.query) + } else { + build_dfa(&self.query) + } + } +} + +// fn construct_automatons(query: &str) -> Vec { +// let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); +// let mut original_words = split_query_string(query).map(str::to_lowercase).peekable(); +// let mut automatons = Vec::new(); + +// while let Some(word) = original_words.next() { +// let has_following_word = original_words.peek().is_some(); +// let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); + +// let automaton = if not_prefix_dfa { +// QueryWordAutomaton::exact(word) +// } else { +// QueryWordAutomaton::exact_prefix(word) +// }; + +// automatons.push(automaton); +// } + +// automatons +// } + +fn construct_automatons2( + reader: &heed::RoTxn, + query: &str, + main_store: store::Main, + postings_lists_store: store::PostingsLists, + synonym_store: store::Synonyms, +) -> MResult<(Vec, QueryEnhancer)> { + let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); + let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); + let synonyms = match main_store.synonyms_fst(reader)? { + Some(synonym) => synonym, + None => fst::Set::default(), + }; + + let mut automaton_index = 0; + let mut automatons = Vec::new(); + let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words); + + // We must not declare the original words to the query enhancer + // *but* we need to push them in the automatons list first + let mut original_words = query_words.iter().peekable(); + while let Some(word) = original_words.next() { + let has_following_word = original_words.peek().is_some(); + let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); + + let automaton = if not_prefix_dfa { + QueryWordAutomaton::exact(word, automaton_index) + } else { + QueryWordAutomaton::exact_prefix(word, automaton_index) + }; + automaton_index += 1; + automatons.push(automaton); + } + + for n in 1..=NGRAMS { + let mut ngrams = query_words.windows(n).enumerate().peekable(); + while let Some((query_index, ngram_slice)) = ngrams.next() { + let query_range = query_index..query_index + n; + let ngram_nb_words = ngram_slice.len(); + let ngram = ngram_slice.join(" "); + + let has_following_word = ngrams.peek().is_some(); + let not_prefix_dfa = + has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); + + // automaton of synonyms of the ngrams + let normalized = normalize_str(&ngram); + let lev = if not_prefix_dfa { + build_dfa(&normalized) + } else { + build_prefix_dfa(&normalized) + }; + + let mut stream = synonyms.search(&lev).into_stream(); + while let Some(base) = stream.next() { + // only trigger alternatives when the last word has been typed + // i.e. 
"new " do not but "new yo" triggers alternatives to "new york" + let base = std::str::from_utf8(base).unwrap(); + let base_nb_words = split_query_string(base).count(); + if ngram_nb_words != base_nb_words { + continue; + } + + if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? { + let mut stream = synonyms.into_stream(); + while let Some(synonyms) = stream.next() { + let synonyms = std::str::from_utf8(synonyms).unwrap(); + let synonyms_words: Vec<_> = split_query_string(synonyms).collect(); + let nb_synonym_words = synonyms_words.len(); + + let real_query_index = automaton_index; + enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words); + + for synonym in synonyms_words { + let automaton = if nb_synonym_words == 1 { + QueryWordAutomaton::exact(synonym, automaton_index) + } else { + QueryWordAutomaton::non_exact(synonym, automaton_index) + }; + automaton_index += 1; + automatons.push(automaton); + } + } + } + } + + if n == 1 { + if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? { + let left_automaton = QueryWordAutomaton::exact(left, automaton_index); + enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); + automaton_index += 1; + automatons.push(left_automaton); + + let right_automaton = QueryWordAutomaton::exact(right, automaton_index); + enhancer_builder.declare(query_range.clone(), automaton_index, &[right]); + automaton_index += 1; + automatons.push(right_automaton); + + } + } else { + // automaton of concatenation of query words + let concat = ngram_slice.concat(); + let normalized = normalize_str(&concat); + + let real_query_index = automaton_index; + enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); + + let automaton = QueryWordAutomaton::exact(&normalized, automaton_index); + automaton_index += 1; + automatons.push(automaton); + } + } + } + + // // order automatons, the most important first, + // // we keep the original automatons at the front. 
+ // automatons[1..].sort_by_key(|group| { + // let a = group.automatons.first().unwrap(); + // ( + // Reverse(a.is_exact), + // a.ngram, + // Reverse(group.automatons.len()), + // ) + // }); + + Ok((automatons, enhancer_builder.build())) +} diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs new file mode 100644 index 000000000..469f936fa --- /dev/null +++ b/meilisearch-core/src/criterion2.rs @@ -0,0 +1,479 @@ +use std::cmp::{self, Ordering, Reverse}; +use std::borrow::Cow; +use std::sync::atomic::{self, AtomicUsize}; + +use slice_group_by::{GroupBy, GroupByMut}; +use compact_arena::SmallArena; +use sdset::{Set, SetBuf}; + +use crate::{DocIndex, DocumentId}; +use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView}; +use crate::automaton::QueryEnhancer; + +type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>; + +pub trait Criterion { + fn name(&self) -> &str; + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut PostingsListsArena<'tag, 'txn>, + query_enhancer: &QueryEnhancer, + ); + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &PostingsListsArena<'tag, 'txn>, + ) -> Ordering; + + #[inline] + fn eq<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &PostingsListsArena<'tag, 'txn>, + ) -> bool + { + self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal + } +} + +pub struct Typo; + +impl Criterion for Typo { + fn name(&self) -> &str { "typo" } + + fn prepare( + &self, + documents: &mut [RawDocument], + postings_lists: &mut PostingsListsArena, + query_enhancer: &QueryEnhancer, + ) { + for document in documents { + document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, bm.distance)); + } + } + + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &PostingsListsArena, + ) -> Ordering + { + // This function is a wrong logarithmic 10 function. + // It is safe to panic on input number higher than 3, + // the number of typos is never bigger than that. 
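+ // (In effect it returns log10(n + 1) from a small lookup table;
+ // e.g. custom_log10(1) == 0.30102, which is log10(2).)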
+ #[inline] + fn custom_log10(n: u8) -> f32 { + match n { + 0 => 0.0, // log(1) + 1 => 0.30102, // log(2) + 2 => 0.47712, // log(3) + 3 => 0.60205, // log(4) + _ => panic!("invalid number"), + } + } + + #[inline] + fn compute_typos(matches: &[BareMatch]) -> usize { + let mut number_words: usize = 0; + let mut sum_typos = 0.0; + + for group in matches.linear_group_by_key(|bm| bm.query_index) { + sum_typos += custom_log10(group[0].distance); + number_words += 1; + } + + (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize + } + + let lhs = compute_typos(&lhs.raw_matches); + let rhs = compute_typos(&rhs.raw_matches); + + lhs.cmp(&rhs).reverse() + } +} + +pub struct Words; + +impl Criterion for Words { + fn name(&self) -> &str { "words" } + + fn prepare( + &self, + documents: &mut [RawDocument], + postings_lists: &mut PostingsListsArena, + query_enhancer: &QueryEnhancer, + ) { + for document in documents { + document.raw_matches.sort_unstable_by_key(|bm| bm.query_index); + } + } + + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &PostingsListsArena, + ) -> Ordering + { + #[inline] + fn number_of_query_words(matches: &[BareMatch]) -> usize { + matches.linear_group_by_key(|bm| bm.query_index).count() + } + + let lhs = number_of_query_words(&lhs.raw_matches); + let rhs = number_of_query_words(&rhs.raw_matches); + + lhs.cmp(&rhs).reverse() + } +} + +fn process_raw_matches<'a, 'tag, 'txn>( + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut PostingsListsArena<'tag, 'txn>, + query_enhancer: &QueryEnhancer, +) { + for document in documents { + if document.processed_matches.is_some() { continue } + + let mut processed = Vec::new(); + let document_id = document.raw_matches[0].document_id; + + for m in document.raw_matches.iter() { + let postings_list = &postings_lists[m.postings_list]; + processed.reserve(postings_list.len()); + for di in postings_list.as_ref() { + let simple_match = SimpleMatch { + query_index: m.query_index, + distance: m.distance, + attribute: di.attribute, + word_index: di.word_index, + is_exact: m.is_exact, + }; + processed.push(simple_match); + } + } + + let processed = multiword_rewrite_matches(&mut processed, query_enhancer); + document.processed_matches = Some(processed.into_vec()); + } +} + +pub struct Proximity; + +impl Criterion for Proximity { + fn name(&self) -> &str { "proximity" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut PostingsListsArena<'tag, 'txn>, + query_enhancer: &QueryEnhancer, + ) { + process_raw_matches(documents, postings_lists, query_enhancer); + } + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &PostingsListsArena<'tag, 'txn>, + ) -> Ordering + { + const MAX_DISTANCE: u16 = 8; + + fn index_proximity(lhs: u16, rhs: u16) -> u16 { + if lhs < rhs { + cmp::min(rhs - lhs, MAX_DISTANCE) + } else { + cmp::min(lhs - rhs, MAX_DISTANCE) + 1 + } + } + + fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 { + if lhs.attribute != rhs.attribute { MAX_DISTANCE } + else { index_proximity(lhs.word_index, rhs.word_index) } + } + + fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 { + let mut min_prox = u16::max_value(); + for a in lhs { + for b in rhs { + let prox = attribute_proximity(*a, *b); + min_prox = cmp::min(min_prox, prox); + } + } + min_prox + } + + fn matches_proximity(matches: &[SimpleMatch],) -> u16 { + let mut proximity = 0; + let mut iter 
= matches.linear_group_by_key(|m| m.query_index); + + // iterate over groups by windows of size 2 + let mut last = iter.next(); + while let (Some(lhs), Some(rhs)) = (last, iter.next()) { + proximity += min_proximity(lhs, rhs); + last = Some(rhs); + } + + proximity + } + + let lhs = matches_proximity(&lhs.processed_matches.as_ref().unwrap()); + let rhs = matches_proximity(&rhs.processed_matches.as_ref().unwrap()); + + lhs.cmp(&rhs) + } +} + +pub struct Attribute; + +impl Criterion for Attribute { + fn name(&self) -> &str { "attribute" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut PostingsListsArena<'tag, 'txn>, + query_enhancer: &QueryEnhancer, + ) { + process_raw_matches(documents, postings_lists, query_enhancer); + } + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &PostingsListsArena<'tag, 'txn>, + ) -> Ordering + { + #[inline] + fn sum_attribute(matches: &[SimpleMatch]) -> usize { + let mut sum_attribute = 0; + for group in matches.linear_group_by_key(|bm| bm.query_index) { + sum_attribute += group[0].attribute as usize; + } + sum_attribute + } + + let lhs = sum_attribute(&lhs.processed_matches.as_ref().unwrap()); + let rhs = sum_attribute(&rhs.processed_matches.as_ref().unwrap()); + + lhs.cmp(&rhs) + } +} + +pub struct WordsPosition; + +impl Criterion for WordsPosition { + fn name(&self) -> &str { "words position" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut PostingsListsArena<'tag, 'txn>, + query_enhancer: &QueryEnhancer, + ) { + process_raw_matches(documents, postings_lists, query_enhancer); + } + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &PostingsListsArena<'tag, 'txn>, + ) -> Ordering + { + #[inline] + fn sum_words_position(matches: &[SimpleMatch]) -> usize { + let mut sum_words_position = 0; + for group in matches.linear_group_by_key(|bm| bm.query_index) { + sum_words_position += group[0].word_index as usize; + } + sum_words_position + } + + let lhs = sum_words_position(&lhs.processed_matches.as_ref().unwrap()); + let rhs = sum_words_position(&rhs.processed_matches.as_ref().unwrap()); + + lhs.cmp(&rhs) + } +} + +pub struct Exact; + +impl Criterion for Exact { + fn name(&self) -> &str { "exact" } + + fn prepare( + &self, + documents: &mut [RawDocument], + postings_lists: &mut PostingsListsArena, + query_enhancer: &QueryEnhancer, + ) { + for document in documents { + document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); + } + } + + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &PostingsListsArena, + ) -> Ordering + { + #[inline] + fn sum_exact_query_words(matches: &[BareMatch]) -> usize { + let mut sum_exact_query_words = 0; + + for group in matches.linear_group_by_key(|bm| bm.query_index) { + sum_exact_query_words += group[0].is_exact as usize; + } + + sum_exact_query_words + } + + let lhs = sum_exact_query_words(&lhs.raw_matches); + let rhs = sum_exact_query_words(&rhs.raw_matches); + + lhs.cmp(&rhs).reverse() + } +} + +pub struct StableDocId; + +impl Criterion for StableDocId { + fn name(&self) -> &str { "stable document id" } + + fn prepare( + &self, + documents: &mut [RawDocument], + postings_lists: &mut PostingsListsArena, + query_enhancer: &QueryEnhancer, + ) { + // ... 
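+ // nothing to prepare here: bucket_sort already groups the bare
+ // matches by document id, so `evaluate` below simply compares
+ // the id of the first raw match of each document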
+ } + + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &PostingsListsArena, + ) -> Ordering + { + let lhs = &lhs.raw_matches[0].document_id; + let rhs = &rhs.raw_matches[0].document_id; + + lhs.cmp(rhs) + } +} + +pub fn multiword_rewrite_matches( + matches: &mut [SimpleMatch], + query_enhancer: &QueryEnhancer, +) -> SetBuf +{ + let mut padded_matches = Vec::with_capacity(matches.len()); + + // let before_sort = Instant::now(); + // we sort the matches by word index to make them rewritable + matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); + // debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); + + // let before_padding = Instant::now(); + // for each attribute of each document + for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { + // padding will only be applied + // to word indices in the same attribute + let mut padding = 0; + let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index); + + // for each match at the same position + // in this document attribute + while let Some(same_word_index) = iter.next() { + // find the biggest padding + let mut biggest = 0; + for match_ in same_word_index { + let mut replacement = query_enhancer.replacement(match_.query_index as u32); + let replacement_len = replacement.len(); + let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); + + if let Some(query_index) = replacement.next() { + let word_index = match_.word_index + padding as u16; + let query_index = query_index as u16; + let match_ = SimpleMatch { query_index, word_index, ..*match_ }; + padded_matches.push(match_); + } + + let mut found = false; + + // look ahead and if there already is a match + // corresponding to this padding word, abort the padding + 'padding: for (x, next_group) in nexts.enumerate() { + for (i, query_index) in replacement.clone().enumerate().skip(x) { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let query_index = query_index as u16; + let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; + + for nmatch_ in next_group { + let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); + let query_index = rep.next().unwrap() as u16; + if query_index == padmatch.query_index { + if !found { + // if we find a corresponding padding for the + // first time we must push preceding paddings + for (i, query_index) in replacement.clone().enumerate().take(i) + { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let query_index = query_index as u16; + let match_ = SimpleMatch { query_index, word_index, ..*match_ }; + padded_matches.push(match_); + biggest = biggest.max(i + 1); + } + } + + padded_matches.push(padmatch); + found = true; + continue 'padding; + } + } + } + + // if we do not find a corresponding padding in the + // next groups so stop here and pad what was found + break; + } + + if !found { + // if no padding was found in the following matches + // we must insert the entire padding + for (i, query_index) in replacement.enumerate() { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let query_index = query_index as u16; + let match_ = SimpleMatch { query_index, word_index, ..*match_ }; + padded_matches.push(match_); + } + + biggest = biggest.max(replacement_len - 1); + } + } + + padding += biggest; + } + } + + // debug!("padding matches took {:.02?}", before_padding.elapsed()); + + // With this check we can see that the loop above takes something + // like 
43% of the search time even when no rewrite is needed. + // assert_eq!(before_matches, padded_matches); + + SetBuf::from_dirty(padded_matches) +} diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index 0bc07e27e..3a54168b4 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -18,6 +18,10 @@ pub mod serde; pub mod store; mod update; +// TODO replace +mod bucket_sort; +mod criterion2; + pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT}; pub use self::error::{Error, MResult}; pub use self::number::{Number, ParseNumberError}; diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 44f1a1028..7edda5294 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -10,7 +10,7 @@ use log::debug; use sdset::SetBuf; use slice_group_by::{GroupBy, GroupByMut}; -use crate::database::MainT; +use crate::{bucket_sort::bucket_sort, database::MainT}; use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer}; use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; use crate::levenshtein::prefix_damerau_levenshtein; @@ -34,19 +34,14 @@ fn multiword_rewrite_matches( mut matches: Vec<(DocumentId, TmpMatch)>, query_enhancer: &QueryEnhancer, ) -> SetBuf<(DocumentId, TmpMatch)> { - if true { - let before_sort = Instant::now(); - matches.sort_unstable(); - let matches = SetBuf::new_unchecked(matches); - debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); - return matches; - } - let mut padded_matches = Vec::with_capacity(matches.len()); + let before_sort = Instant::now(); // we sort the matches by word index to make them rewritable matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); + debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); + let before_padding = Instant::now(); // for each attribute of each document for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { // padding will only be applied @@ -145,6 +140,8 @@ fn multiword_rewrite_matches( document_matches.sort_unstable(); } + debug!("padding matches took {:.02?}", before_padding.elapsed()); + // With this check we can see that the loop above takes something // like 43% of the search time even when no rewrite is needed. // assert_eq!(before_matches, padded_matches); @@ -163,7 +160,18 @@ fn fetch_raw_documents( let mut matches = Vec::new(); let mut highlights = Vec::new(); + let words = match main_store.words_fst(reader)? { + Some(words) => words, + None => return Ok(Vec::new()), + }; + let before_automatons_groups_loop = Instant::now(); + let mut doc_indexes_rewrite = Duration::default(); + let mut retrieve_postings_lists = Duration::default(); + let mut stream_reserve = Duration::default(); + let mut covered_area_time = Duration::default(); + let mut eval_time = Duration::default(); + for group in automatons_groups { let AutomatonGroup { is_phrase_query, automatons } = group; let phrase_query_len = automatons.len(); @@ -173,29 +181,39 @@ fn fetch_raw_documents( let Automaton { index, is_exact, query_len, query, .. } = automaton; let dfa = automaton.dfa(); - let words = match main_store.words_fst(reader)? 
{ - Some(words) => words, - None => return Ok(Vec::new()), - }; + let before_stream_loop = Instant::now(); + let mut stream_count = 0; let mut stream = words.search(&dfa).into_stream(); while let Some(input) = stream.next() { + let before_eval_time = Instant::now(); let distance = dfa.eval(input).to_u8(); + eval_time += before_eval_time.elapsed(); + let is_exact = *is_exact && distance == 0 && input.len() == *query_len; + stream_count += 1; + + let before_covered_area = Instant::now(); let covered_area = if *query_len > input.len() { input.len() } else { prefix_damerau_levenshtein(query.as_bytes(), input).1 }; + covered_area_time += before_covered_area.elapsed(); + let before_retrieve_postings_lists = Instant::now(); let doc_indexes = match postings_lists_store.postings_list(reader, input)? { Some(doc_indexes) => doc_indexes, None => continue, }; + retrieve_postings_lists += before_retrieve_postings_lists.elapsed(); + let before_stream_reserve = Instant::now(); tmp_matches.reserve(doc_indexes.len()); + stream_reserve += before_stream_reserve.elapsed(); + let before_doc_indexes_rewrite = Instant::now(); for di in doc_indexes.as_ref() { let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); if let Some(attribute) = attribute { @@ -219,7 +237,9 @@ fn fetch_raw_documents( tmp_matches.push((di.document_id, id, match_, highlight)); } } + doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed(); } + debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count); } if *is_phrase_query { @@ -249,6 +269,10 @@ fn fetch_raw_documents( } } else { let before_rerewrite = Instant::now(); + + matches.reserve(tmp_matches.len()); + highlights.reserve(tmp_matches.len()); + for (id, _, match_, highlight) in tmp_matches { matches.push((id, match_)); highlights.push((id, highlight)); @@ -257,13 +281,18 @@ fn fetch_raw_documents( } } debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed()); + debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite); + debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists); + debug!("stream reserve took {:.02?}", stream_reserve); + debug!("covered area took {:.02?}", covered_area_time); + debug!("eval value took {:.02?}", eval_time); - { - let mut cloned = matches.clone(); - let before_sort_test = Instant::now(); - cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance)); - debug!("sorting test took {:.02?}", before_sort_test.elapsed()); - } + // { + // let mut cloned = matches.clone(); + // let before_sort_test = Instant::now(); + // cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance)); + // debug!("sorting test took {:.02?}", before_sort_test.elapsed()); + // } let before_multiword_rewrite_matches = Instant::now(); debug!("number of matches before rewrite {}", matches.len()); @@ -279,7 +308,6 @@ fn fetch_raw_documents( }; debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed()); - let before_raw_documents = Instant::now(); let raw_documents = raw_documents_from(matches, highlights); debug!("raw_documents took {:.02?}", before_raw_documents.elapsed()); @@ -356,29 +384,12 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { range: Range, ) -> MResult> { match self.distinct { - Some((distinct, distinct_size)) => raw_query_with_distinct( + Some((distinct, distinct_size)) => unimplemented!("distinct"), + None => bucket_sort( reader, query, range, - self.filter, - distinct, - distinct_size, - self.timeout, - self.criteria, - 
self.searchable_attrs, - self.main_store, - self.postings_lists_store, - self.documents_fields_counts_store, - self.synonyms_store, - ), - None => raw_query( - reader, - query, - range, - self.filter, - self.timeout, - self.criteria, - self.searchable_attrs, + // self.criteria, self.main_store, self.postings_lists_store, self.documents_fields_counts_store, @@ -472,6 +483,8 @@ where } } + let before_bucket_sort = Instant::now(); + let mut groups = vec![raw_documents.as_mut_slice()]; 'criteria: for criterion in criteria.as_ref() { @@ -520,6 +533,8 @@ where } } + debug!("bucket_sort took {:.02?}", before_bucket_sort.elapsed()); + // once we classified the documents related to the current // automatons we save that as the next valid result let iter = raw_documents From f87c67fcad80fcb209859d51503f972f09d194c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 6 Dec 2019 12:10:28 +0100 Subject: [PATCH 05/23] Improve the QueryEnhancer by doing a single lookup --- .../src/automaton/query_enhancer.rs | 105 ++++++++++-------- 1 file changed, 59 insertions(+), 46 deletions(-) diff --git a/meilisearch-core/src/automaton/query_enhancer.rs b/meilisearch-core/src/automaton/query_enhancer.rs index 5f2ac53cf..f564239d7 100644 --- a/meilisearch-core/src/automaton/query_enhancer.rs +++ b/meilisearch-core/src/automaton/query_enhancer.rs @@ -147,63 +147,76 @@ impl> QueryEnhancerBuilder<'_, S> { } pub fn build(self) -> QueryEnhancer { - QueryEnhancer { - origins: self.origins, - real_to_origin: FakeIntervalTree::new(self.real_to_origin), + let interval_tree = FakeIntervalTree::new(self.real_to_origin); + let mut table = Vec::new(); + + for real in 0.. { + match replacement(&self.origins, &interval_tree, real) { + Some(range) => table.push(range), + None => break, + } } + + QueryEnhancer { table } + } +} + +/// Returns the query indices that represent this real query index. +fn replacement( + origins: &[usize], + real_to_origin: &FakeIntervalTree, + real: u32, +) -> Option> +{ + let real = real as usize; + + // query the fake interval tree with the real query index + let (range, (origin, real_length)) = real_to_origin.query(real)?; + + // if `real` is the end bound of the range + if (range.start + real_length - 1) == real { + let mut count = range.len(); + let mut new_origin = origin; + for (i, slice) in origins[new_origin..].windows(2).enumerate() { + let len = slice[1] - slice[0]; + count = count.saturating_sub(len); + if count == 0 { + new_origin = origin + i; + break; + } + } + + let n = real - range.start; + let start = origins[origin]; + let end = origins[new_origin + 1]; + let remaining = (end - start) - n; + + Some(Range { + start: (start + n) as u32, + end: (start + n + remaining) as u32, + }) + } else { + // just return the origin along with + // the real position of the word + let n = real as usize - range.start; + let origin = origins[origin]; + + Some(Range { + start: (origin + n) as u32, + end: (origin + n + 1) as u32, + }) } } #[derive(Debug)] pub struct QueryEnhancer { - origins: Vec, - real_to_origin: FakeIntervalTree, + table: Vec>, } impl QueryEnhancer { /// Returns the query indices that represent this real query index. 
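///
/// As a purely hypothetical illustration (indices invented for the
/// example): if the two words `new york` were enhanced with the
/// single-word synonym `nyc`, the real index of `nyc` would map back
/// to the two-element range covering both original words, while a
/// plain word maps to the one-element range of its own position:
///
/// ```ignore
/// // assuming "new" = 0, "york" = 1 and "nyc" = 2
/// assert_eq!(enhancer.replacement(2), 0..2);
/// assert_eq!(enhancer.replacement(0), 0..1);
/// ```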
pub fn replacement(&self, real: u32) -> Range { - let real = real as usize; - - // query the fake interval tree with the real query index - let (range, (origin, real_length)) = self - .real_to_origin - .query(real) - .expect("real has never been declared"); - - // if `real` is the end bound of the range - if (range.start + real_length - 1) == real { - let mut count = range.len(); - let mut new_origin = origin; - for (i, slice) in self.origins[new_origin..].windows(2).enumerate() { - let len = slice[1] - slice[0]; - count = count.saturating_sub(len); - if count == 0 { - new_origin = origin + i; - break; - } - } - - let n = real - range.start; - let start = self.origins[origin]; - let end = self.origins[new_origin + 1]; - let remaining = (end - start) - n; - - Range { - start: (start + n) as u32, - end: (start + n + remaining) as u32, - } - } else { - // just return the origin along with - // the real position of the word - let n = real as usize - range.start; - let origin = self.origins[origin]; - - Range { - start: (origin + n) as u32, - end: (origin + n + 1) as u32, - } - } + self.table[real as usize].clone() } } From 4e91b31b1f68e976779571139bb78949d928f32a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 6 Dec 2019 13:41:22 +0100 Subject: [PATCH 06/23] Make the Typo and Words work with synonyms --- .../src/automaton/query_enhancer.rs | 2 +- meilisearch-core/src/bucket_sort.rs | 72 ++++++---------- meilisearch-core/src/criterion2.rs | 82 ++++++++++++------- 3 files changed, 76 insertions(+), 80 deletions(-) diff --git a/meilisearch-core/src/automaton/query_enhancer.rs b/meilisearch-core/src/automaton/query_enhancer.rs index f564239d7..4b7582dd5 100644 --- a/meilisearch-core/src/automaton/query_enhancer.rs +++ b/meilisearch-core/src/automaton/query_enhancer.rs @@ -188,7 +188,7 @@ fn replacement( let n = real - range.start; let start = origins[origin]; - let end = origins[new_origin + 1]; + let end = origins.get(new_origin + 1)?; let remaining = (end - start) - n; Some(Range { diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 0fb1fed3b..8e4612c22 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -56,7 +56,11 @@ pub fn bucket_sort<'c>( let before_raw_documents_building = Instant::now(); let mut raw_documents = Vec::new(); for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { - raw_documents.push(RawDocument { raw_matches, processed_matches: None }); + raw_documents.push(RawDocument { + raw_matches, + processed_matches: Vec::new(), + processed_distances: Vec::new(), + }); } debug!("creating {} candidates documents took {:.02?}", raw_documents.len(), @@ -134,7 +138,10 @@ pub fn bucket_sort<'c>( pub struct RawDocument<'a, 'tag> { pub raw_matches: &'a mut [BareMatch<'tag>], - pub processed_matches: Option>, + pub processed_matches: Vec, + /// The list of minimum `distance` found + /// where the `query_index` is the index + pub processed_distances: Vec>, } pub struct BareMatch<'tag> { @@ -226,7 +233,7 @@ fn fetch_matches<'txn, 'tag>( for (query_index, automaton) in automatons.iter().enumerate() { let before_dfa = Instant::now(); let dfa = automaton.dfa(); - let QueryWordAutomaton { index, query, is_exact, is_prefix } = automaton; + let QueryWordAutomaton { query, is_exact, is_prefix } = automaton; dfa_time += before_dfa.elapsed(); let mut number_of_words = 0; @@ -287,7 +294,6 @@ fn fetch_matches<'txn, 'tag>( #[derive(Debug)] pub struct QueryWordAutomaton { - 
index: usize, query: String, /// Is it a word that must be considered exact /// or is it some derived word (i.e. a synonym) @@ -296,16 +302,16 @@ pub struct QueryWordAutomaton { } impl QueryWordAutomaton { - pub fn exact(query: &str, index: usize) -> QueryWordAutomaton { - QueryWordAutomaton { index, query: query.to_string(), is_exact: true, is_prefix: false } + pub fn exact(query: &str) -> QueryWordAutomaton { + QueryWordAutomaton { query: query.to_string(), is_exact: true, is_prefix: false } } - pub fn exact_prefix(query: &str, index: usize) -> QueryWordAutomaton { - QueryWordAutomaton { index, query: query.to_string(), is_exact: true, is_prefix: true } + pub fn exact_prefix(query: &str) -> QueryWordAutomaton { + QueryWordAutomaton { query: query.to_string(), is_exact: true, is_prefix: true } } - pub fn non_exact(query: &str, index: usize) -> QueryWordAutomaton { - QueryWordAutomaton { index, query: query.to_string(), is_exact: false, is_prefix: false } + pub fn non_exact(query: &str) -> QueryWordAutomaton { + QueryWordAutomaton { query: query.to_string(), is_exact: false, is_prefix: false } } pub fn dfa(&self) -> DFA { @@ -317,27 +323,6 @@ impl QueryWordAutomaton { } } -// fn construct_automatons(query: &str) -> Vec { -// let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); -// let mut original_words = split_query_string(query).map(str::to_lowercase).peekable(); -// let mut automatons = Vec::new(); - -// while let Some(word) = original_words.next() { -// let has_following_word = original_words.peek().is_some(); -// let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); - -// let automaton = if not_prefix_dfa { -// QueryWordAutomaton::exact(word) -// } else { -// QueryWordAutomaton::exact_prefix(word) -// }; - -// automatons.push(automaton); -// } - -// automatons -// } - fn construct_automatons2( reader: &heed::RoTxn, query: &str, @@ -364,9 +349,9 @@ fn construct_automatons2( let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); let automaton = if not_prefix_dfa { - QueryWordAutomaton::exact(word, automaton_index) + QueryWordAutomaton::exact(word) } else { - QueryWordAutomaton::exact_prefix(word, automaton_index) + QueryWordAutomaton::exact_prefix(word) }; automaton_index += 1; automatons.push(automaton); @@ -413,9 +398,9 @@ fn construct_automatons2( for synonym in synonyms_words { let automaton = if nb_synonym_words == 1 { - QueryWordAutomaton::exact(synonym, automaton_index) + QueryWordAutomaton::exact(synonym) } else { - QueryWordAutomaton::non_exact(synonym, automaton_index) + QueryWordAutomaton::non_exact(synonym) }; automaton_index += 1; automatons.push(automaton); @@ -426,12 +411,12 @@ fn construct_automatons2( if n == 1 { if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? 
{ - let left_automaton = QueryWordAutomaton::exact(left, automaton_index); + let left_automaton = QueryWordAutomaton::exact(left); enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); automaton_index += 1; automatons.push(left_automaton); - let right_automaton = QueryWordAutomaton::exact(right, automaton_index); + let right_automaton = QueryWordAutomaton::exact(right); enhancer_builder.declare(query_range.clone(), automaton_index, &[right]); automaton_index += 1; automatons.push(right_automaton); @@ -445,23 +430,12 @@ fn construct_automatons2( let real_query_index = automaton_index; enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); - let automaton = QueryWordAutomaton::exact(&normalized, automaton_index); + let automaton = QueryWordAutomaton::exact(&normalized); automaton_index += 1; automatons.push(automaton); } } } - // // order automatons, the most important first, - // // we keep the original automatons at the front. - // automatons[1..].sort_by_key(|group| { - // let a = group.automatons.first().unwrap(); - // ( - // Reverse(a.is_exact), - // a.ngram, - // Reverse(group.automatons.len()), - // ) - // }); - Ok((automatons, enhancer_builder.build())) } diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs index 469f936fa..4adb69dea 100644 --- a/meilisearch-core/src/criterion2.rs +++ b/meilisearch-core/src/criterion2.rs @@ -41,6 +41,32 @@ pub trait Criterion { } } +fn prepare_query_distances( + documents: &mut [RawDocument], + query_enhancer: &QueryEnhancer, +) { + for document in documents { + if !document.processed_distances.is_empty() { continue } + + let mut processed = Vec::new(); + for m in document.raw_matches.iter() { + let range = query_enhancer.replacement(m.query_index as u32); + processed.resize(range.end as usize, None); + + for index in range { + let index = index as usize; + processed[index] = match processed[index] { + Some(distance) if distance > m.distance => Some(m.distance), + Some(distance) => Some(distance), + None => Some(m.distance), + }; + } + } + + document.processed_distances = processed; + } +} + pub struct Typo; impl Criterion for Typo { @@ -52,9 +78,7 @@ impl Criterion for Typo { postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, ) { - for document in documents { - document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, bm.distance)); - } + prepare_query_distances(documents, query_enhancer); } fn evaluate( @@ -79,20 +103,22 @@ impl Criterion for Typo { } #[inline] - fn compute_typos(matches: &[BareMatch]) -> usize { + fn compute_typos(distances: &[Option]) -> usize { let mut number_words: usize = 0; let mut sum_typos = 0.0; - for group in matches.linear_group_by_key(|bm| bm.query_index) { - sum_typos += custom_log10(group[0].distance); - number_words += 1; + for distance in distances { + if let Some(distance) = distance { + sum_typos += custom_log10(*distance); + number_words += 1; + } } (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize } - let lhs = compute_typos(&lhs.raw_matches); - let rhs = compute_typos(&rhs.raw_matches); + let lhs = compute_typos(&lhs.processed_distances); + let rhs = compute_typos(&rhs.processed_distances); lhs.cmp(&rhs).reverse() } @@ -109,9 +135,7 @@ impl Criterion for Words { postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, ) { - for document in documents { - document.raw_matches.sort_unstable_by_key(|bm| bm.query_index); - } + prepare_query_distances(documents, query_enhancer); } fn 
evaluate( @@ -122,28 +146,26 @@ impl Criterion for Words { ) -> Ordering { #[inline] - fn number_of_query_words(matches: &[BareMatch]) -> usize { - matches.linear_group_by_key(|bm| bm.query_index).count() + fn number_of_query_words(distances: &[Option]) -> usize { + distances.iter().cloned().filter(Option::is_some).count() } - let lhs = number_of_query_words(&lhs.raw_matches); - let rhs = number_of_query_words(&rhs.raw_matches); + let lhs = number_of_query_words(&lhs.processed_distances); + let rhs = number_of_query_words(&rhs.processed_distances); lhs.cmp(&rhs).reverse() } } -fn process_raw_matches<'a, 'tag, 'txn>( +fn prepare_raw_matches<'a, 'tag, 'txn>( documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, ) { for document in documents { - if document.processed_matches.is_some() { continue } + if !document.processed_matches.is_empty() { continue } let mut processed = Vec::new(); - let document_id = document.raw_matches[0].document_id; - for m in document.raw_matches.iter() { let postings_list = &postings_lists[m.postings_list]; processed.reserve(postings_list.len()); @@ -160,7 +182,7 @@ fn process_raw_matches<'a, 'tag, 'txn>( } let processed = multiword_rewrite_matches(&mut processed, query_enhancer); - document.processed_matches = Some(processed.into_vec()); + document.processed_matches = processed.into_vec(); } } @@ -175,7 +197,7 @@ impl Criterion for Proximity { postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, ) { - process_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer); } fn evaluate<'a, 'tag, 'txn>( @@ -225,8 +247,8 @@ impl Criterion for Proximity { proximity } - let lhs = matches_proximity(&lhs.processed_matches.as_ref().unwrap()); - let rhs = matches_proximity(&rhs.processed_matches.as_ref().unwrap()); + let lhs = matches_proximity(&lhs.processed_matches); + let rhs = matches_proximity(&rhs.processed_matches); lhs.cmp(&rhs) } @@ -243,7 +265,7 @@ impl Criterion for Attribute { postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, ) { - process_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer); } fn evaluate<'a, 'tag, 'txn>( @@ -262,8 +284,8 @@ impl Criterion for Attribute { sum_attribute } - let lhs = sum_attribute(&lhs.processed_matches.as_ref().unwrap()); - let rhs = sum_attribute(&rhs.processed_matches.as_ref().unwrap()); + let lhs = sum_attribute(&lhs.processed_matches); + let rhs = sum_attribute(&rhs.processed_matches); lhs.cmp(&rhs) } @@ -280,7 +302,7 @@ impl Criterion for WordsPosition { postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, ) { - process_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer); } fn evaluate<'a, 'tag, 'txn>( @@ -299,8 +321,8 @@ impl Criterion for WordsPosition { sum_words_position } - let lhs = sum_words_position(&lhs.processed_matches.as_ref().unwrap()); - let rhs = sum_words_position(&rhs.processed_matches.as_ref().unwrap()); + let lhs = sum_words_position(&lhs.processed_matches); + let rhs = sum_words_position(&rhs.processed_matches); lhs.cmp(&rhs) } From 0f698d6bd930f2186aa40e5037923d6d4bbbe174 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 6 Dec 2019 19:15:19 +0100 Subject: [PATCH 07/23] Work in progress: Bad Typo detection I have an 
issue where "speakers" is split into "speaker" and "s": when I compute the distances for the Typo criterion, it takes "s" into account and puts a distance of zero in bucket 0 (the "speakers" bucket), therefore any document matching "s" without typos is reported among the best results.
I need to make sure to ignore "s" when its associated part "speaker" doesn't even exist in the document, or is not in the place it should be ("speaker" directly followed by "s").
It is hard to believe that this will not add as much computation time to the Typo criterion as the previous algorithm did, where I computed the real query/word indexes and removed the invalid ones before sending the documents to the bucket sort.
---
 meilisearch-core/src/automaton/dfa.rs | 5 ++
 meilisearch-core/src/automaton/mod.rs | 2 +-
 meilisearch-core/src/bucket_sort.rs | 54 +++++++++++++-----
 meilisearch-core/src/criterion2.rs | 79 ++++++++++++++++++++++-----
 4 files changed, 111 insertions(+), 29 deletions(-)
diff --git a/meilisearch-core/src/automaton/dfa.rs b/meilisearch-core/src/automaton/dfa.rs
index 6258da424..da1a6eb39 100644
--- a/meilisearch-core/src/automaton/dfa.rs
+++ b/meilisearch-core/src/automaton/dfa.rs
@@ -46,3 +46,8 @@ pub fn build_prefix_dfa(query: &str) -> DFA { pub fn build_dfa(query: &str) -> DFA { build_dfa_with_setting(query, PrefixSetting::NoPrefix) } + +pub fn build_exact_dfa(query: &str) -> DFA { + let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true)); + builder.build_dfa(query) +}
diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs
index 406d72ce2..ecf99ee1c 100644
--- a/meilisearch-core/src/automaton/mod.rs
+++ b/meilisearch-core/src/automaton/mod.rs
@@ -13,7 +13,7 @@ use crate::database::MainT; use crate::error::MResult; use crate::store; -pub use self::dfa::{build_dfa, build_prefix_dfa}; +pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa}; pub use self::query_enhancer::QueryEnhancer; pub use self::query_enhancer::QueryEnhancerBuilder;
diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index 8e4612c22..7477ff383 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -19,7 +19,7 @@ use slice_group_by::{GroupBy, GroupByMut}; use crate::automaton::NGRAMS; use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; -use crate::automaton::{build_dfa, build_prefix_dfa}; +use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; use crate::automaton::{normalize_str, split_best_frequency}; use crate::criterion2::*;
@@ -41,6 +41,8 @@ pub fn bucket_sort<'c>( let (automatons, query_enhancer) = construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?; + debug!("{:?}", query_enhancer); + let before_postings_lists_fetching = Instant::now(); mk_arena!(arena); let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
@@ -74,7 +76,7 @@ pub fn bucket_sort<'c>( let criteria = [ Box::new(Typo) as Box<dyn Criterion>, - Box::new(Words), + Box::new(Words) as Box<dyn Criterion>, Box::new(Proximity), Box::new(Attribute), Box::new(WordsPosition),
@@ -88,7 +90,7 @@ pub fn bucket_sort<'c>( for mut group in tmp_groups { let before_criterion_preparation = Instant::now(); - criterion.prepare(&mut group, &mut arena, &query_enhancer); + criterion.prepare(&mut group, &mut arena, &query_enhancer, &automatons); debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); let before_criterion_sort = Instant::now();
@@ -116,6 +118,7 @@ pub fn bucket_sort<'c>( let postings_list = &arena[sm.postings_list]; let input = postings_list.input(); let query = &automatons[sm.query_index as usize].query; + debug!("{:?} contains {:?}", d.raw_matches[0].document_id, query); postings_list.iter().map(move |m| { let covered_area = if query.len() > input.len() { input.len() } else { prefix_damerau_levenshtein(query.as_bytes(), input).1 };
@@ -126,6 +129,8 @@ pub fn bucket_sort<'c>( }) }).collect(); + debug!("{:?} contains {:?}", d.raw_matches[0].document_id, d.processed_distances); + Document { id: d.raw_matches[0].document_id, highlights,
@@ -233,7 +238,7 @@ fn fetch_matches<'txn, 'tag>( for (query_index, automaton) in automatons.iter().enumerate() { let before_dfa = Instant::now(); let dfa = automaton.dfa(); - let QueryWordAutomaton { query, is_exact, is_prefix } = automaton; + let QueryWordAutomaton { query, is_exact, is_prefix, .. } = automaton; dfa_time += before_dfa.elapsed(); let mut number_of_words = 0;
@@ -294,28 +299,48 @@ fn fetch_matches<'txn, 'tag>( #[derive(Debug)] pub struct QueryWordAutomaton { - query: String, + pub query: String, /// Is it a word that must be considered exact /// or is it some derived word (i.e. a synonym) - is_exact: bool, - is_prefix: bool, + pub is_exact: bool, + pub is_prefix: bool, + /// If it's a phrase query, what its index is + /// and the length of the phrase + pub phrase_query: Option<(u16, u16)>, } impl QueryWordAutomaton { pub fn exact(query: &str) -> QueryWordAutomaton { - QueryWordAutomaton { query: query.to_string(), is_exact: true, is_prefix: false } + QueryWordAutomaton { + query: query.to_string(), + is_exact: true, + is_prefix: false, + phrase_query: None, + } } pub fn exact_prefix(query: &str) -> QueryWordAutomaton { - QueryWordAutomaton { query: query.to_string(), is_exact: true, is_prefix: true } + QueryWordAutomaton { + query: query.to_string(), + is_exact: true, + is_prefix: true, + phrase_query: None, + } } pub fn non_exact(query: &str) -> QueryWordAutomaton { - QueryWordAutomaton { query: query.to_string(), is_exact: false, is_prefix: false } + QueryWordAutomaton { + query: query.to_string(), + is_exact: false, + is_prefix: false, + phrase_query: None, + } } pub fn dfa(&self) -> DFA { - if self.is_prefix { + if self.phrase_query.is_some() { + build_exact_dfa(&self.query) + } else if self.is_prefix { build_prefix_dfa(&self.query) } else { build_dfa(&self.query)
@@ -411,16 +436,17 @@ fn construct_automatons2( if n == 1 { if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)?
{ - let left_automaton = QueryWordAutomaton::exact(left); + let mut left_automaton = QueryWordAutomaton::exact(left); + left_automaton.phrase_query = Some((0, 2)); enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); automaton_index += 1; automatons.push(left_automaton); - let right_automaton = QueryWordAutomaton::exact(right); + let mut right_automaton = QueryWordAutomaton::exact(right); + right_automaton.phrase_query = Some((1, 2)); enhancer_builder.declare(query_range.clone(), automaton_index, &[right]); automaton_index += 1; automatons.push(right_automaton); - } } else { // automaton of concatenation of query words diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs index 4adb69dea..dd7769261 100644 --- a/meilisearch-core/src/criterion2.rs +++ b/meilisearch-core/src/criterion2.rs @@ -5,9 +5,10 @@ use std::sync::atomic::{self, AtomicUsize}; use slice_group_by::{GroupBy, GroupByMut}; use compact_arena::SmallArena; use sdset::{Set, SetBuf}; +use log::debug; use crate::{DocIndex, DocumentId}; -use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView}; +use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView, QueryWordAutomaton}; use crate::automaton::QueryEnhancer; type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>; @@ -20,6 +21,7 @@ pub trait Criterion { documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ); fn evaluate<'a, 'tag, 'txn>( @@ -77,6 +79,7 @@ impl Criterion for Typo { documents: &mut [RawDocument], postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { prepare_query_distances(documents, query_enhancer); } @@ -134,6 +137,7 @@ impl Criterion for Words { documents: &mut [RawDocument], postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { prepare_query_distances(documents, query_enhancer); } @@ -161,6 +165,7 @@ fn prepare_raw_matches<'a, 'tag, 'txn>( documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { for document in documents { if !document.processed_matches.is_empty() { continue } @@ -181,7 +186,7 @@ fn prepare_raw_matches<'a, 'tag, 'txn>( } } - let processed = multiword_rewrite_matches(&mut processed, query_enhancer); + let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); document.processed_matches = processed.into_vec(); } } @@ -196,8 +201,9 @@ impl Criterion for Proximity { documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { - prepare_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); } fn evaluate<'a, 'tag, 'txn>( @@ -264,8 +270,9 @@ impl Criterion for Attribute { documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { - prepare_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); } fn evaluate<'a, 'tag, 'txn>( @@ -276,16 +283,16 @@ impl Criterion for Attribute { ) -> Ordering { #[inline] - fn 
sum_attribute(matches: &[SimpleMatch]) -> usize { - let mut sum_attribute = 0; + fn best_attribute(matches: &[SimpleMatch]) -> u16 { + let mut best_attribute = u16::max_value(); for group in matches.linear_group_by_key(|bm| bm.query_index) { - sum_attribute += group[0].attribute as usize; + best_attribute = cmp::min(best_attribute, group[0].attribute); } - sum_attribute + best_attribute } - let lhs = sum_attribute(&lhs.processed_matches); - let rhs = sum_attribute(&rhs.processed_matches); + let lhs = best_attribute(&lhs.processed_matches); + let rhs = best_attribute(&rhs.processed_matches); lhs.cmp(&rhs) } @@ -301,8 +308,9 @@ impl Criterion for WordsPosition { documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { - prepare_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); } fn evaluate<'a, 'tag, 'txn>( @@ -338,6 +346,7 @@ impl Criterion for Exact { documents: &mut [RawDocument], postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { for document in documents { document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); @@ -379,6 +388,7 @@ impl Criterion for StableDocId { documents: &mut [RawDocument], postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { // ... } @@ -398,17 +408,58 @@ impl Criterion for StableDocId { } pub fn multiword_rewrite_matches( - matches: &mut [SimpleMatch], + simple_matches: &mut [SimpleMatch], query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) -> SetBuf { - let mut padded_matches = Vec::with_capacity(matches.len()); + let mut matches = Vec::with_capacity(simple_matches.len()); // let before_sort = Instant::now(); // we sort the matches by word index to make them rewritable - matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); + simple_matches.sort_unstable_by_key(|m| (m.attribute, m.query_index, m.word_index)); // debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); + for same_attribute in simple_matches.linear_group_by_key(|m| m.attribute) { + let iter = same_attribute.linear_group_by_key(|m| m.query_index); + let mut iter = iter.peekable(); + + while let Some(same_query_index) = iter.next() { + let query_index = same_query_index[0].query_index; + + // TODO we need to support phrase query of longer length + if let Some((i, len)) = automatons[query_index as usize].phrase_query { + if i != 0 { continue } + + // is the next query_index group the required one + if iter.peek().map_or(false, |g| g[0].query_index == query_index + 1) { + if let Some(next) = iter.next() { + for ma in same_query_index { + for mb in next { + if ma.word_index == mb.word_index + 1 { + matches.push(*ma); + matches.push(*mb); + } + } + } + } + } + } else { + matches.extend_from_slice(same_query_index); + } + } + } + + // let is_phrase_query = automatons[match_.query_index as usize].phrase_query_len.is_some(); + // let next_query_index = match_.query_index + 1; + // if is_phrase_query && iter.remainder().iter().find(|m| m.query_index == next_query_index).is_none() { + // continue + // } + + matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); + + let mut padded_matches = Vec::with_capacity(matches.len()); + // let before_padding = Instant::now(); // for each attribute of each document for 
same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { From 22b19c0d9316a93af04e274a015fade56943cd72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 7 Dec 2019 13:32:43 +0100 Subject: [PATCH 08/23] Fix the processed distance algorithm --- meilisearch-core/src/bucket_sort.rs | 24 ++++++++++++++++-------- meilisearch-core/src/criterion2.rs | 14 +++++++++++--- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 7477ff383..303e94e50 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -1,4 +1,5 @@ use std::ops::Deref; +use std::fmt; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::HashSet; @@ -145,7 +146,6 @@ pub struct RawDocument<'a, 'tag> { pub raw_matches: &'a mut [BareMatch<'tag>], pub processed_matches: Vec, /// The list of minimum `distance` found - /// where the `query_index` is the index pub processed_distances: Vec>, } @@ -157,6 +157,17 @@ pub struct BareMatch<'tag> { pub postings_list: Idx32<'tag>, } +impl fmt::Debug for BareMatch<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("BareMatch") + .field("document_id", &self.document_id) + .field("query_index", &self.query_index) + .field("distance", &self.distance) + .field("is_exact", &self.is_exact) + .finish() + } +} + // TODO remove that #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct SimpleMatch { @@ -238,14 +249,11 @@ fn fetch_matches<'txn, 'tag>( for (query_index, automaton) in automatons.iter().enumerate() { let before_dfa = Instant::now(); let dfa = automaton.dfa(); - let QueryWordAutomaton { query, is_exact, is_prefix, .. } = automaton; + let QueryWordAutomaton { query, is_exact, is_prefix, phrase_query } = automaton; dfa_time += before_dfa.elapsed(); let mut number_of_words = 0; - - let before_fst_search = Instant::now(); let mut stream = words.search(&dfa).into_stream(); - debug!("fst search took {:.02?}", before_fst_search.elapsed()); // while let Some(input) = stream.next() { loop { @@ -272,7 +280,7 @@ fn fetch_matches<'txn, 'tag>( let posting_list_index = arena.add(postings_list_view.range(offset, group.len())); let document_id = group[0].document_id; - let stuffed = BareMatch { + let bare_match = BareMatch { document_id, query_index: query_index as u16, distance, @@ -280,7 +288,7 @@ fn fetch_matches<'txn, 'tag>( postings_list: posting_list_index, }; - total_postings_lists.push(stuffed); + total_postings_lists.push(bare_match); offset += group.len(); } } @@ -434,7 +442,7 @@ fn construct_automatons2( } } - if n == 1 { + if false && n == 1 { if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? 
{ let mut left_automaton = QueryWordAutomaton::exact(left); left_automaton.phrase_query = Some((0, 2)); diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs index dd7769261..4c40b9969 100644 --- a/meilisearch-core/src/criterion2.rs +++ b/meilisearch-core/src/criterion2.rs @@ -46,14 +46,22 @@ pub trait Criterion { fn prepare_query_distances( documents: &mut [RawDocument], query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { for document in documents { if !document.processed_distances.is_empty() { continue } let mut processed = Vec::new(); for m in document.raw_matches.iter() { + // FIXME we really need to take splitted words into account + // those must be seen at the same level as the non-splitteds + // if automatons[m.query_index as usize].phrase_query.is_some() { + // continue + // } + let range = query_enhancer.replacement(m.query_index as u32); - processed.resize(range.end as usize, None); + let new_len = cmp::max(range.end as usize, processed.len()); + processed.resize(new_len, None); for index in range { let index = index as usize; @@ -81,7 +89,7 @@ impl Criterion for Typo { query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], ) { - prepare_query_distances(documents, query_enhancer); + prepare_query_distances(documents, query_enhancer, automatons); } fn evaluate( @@ -139,7 +147,7 @@ impl Criterion for Words { query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], ) { - prepare_query_distances(documents, query_enhancer); + prepare_query_distances(documents, query_enhancer, automatons); } fn evaluate( From 9c03bb3428d574a15e3770ddc18d81da477dfb91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 9 Dec 2019 15:30:14 +0100 Subject: [PATCH 09/23] First probably working phrase query doc filtering --- Cargo.lock | 1 + meilisearch-core/Cargo.toml | 1 + meilisearch-core/src/bucket_sort.rs | 71 ++++++++++++++++++++++++++--- meilisearch-core/src/criterion2.rs | 50 +++++++++++++++----- 4 files changed, 106 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8034a4add..ad3f3494b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -952,6 +952,7 @@ dependencies = [ "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index 62da7cfb8..3455f755d 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -17,6 +17,7 @@ env_logger = "0.7.0" fst = { version = "0.3.5", default-features = false } hashbrown = { version = "0.6.0", features = ["serde"] } heed = "0.6.1" +itertools = "0.8.2" # kill me please levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } log = "0.4.8" meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" } diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 303e94e50..ae0fdf63f 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -59,11 +59,9 @@ pub fn bucket_sort<'c>( let 
before_raw_documents_building = Instant::now(); let mut raw_documents = Vec::new(); for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { - raw_documents.push(RawDocument { - raw_matches, - processed_matches: Vec::new(), - processed_distances: Vec::new(), - }); + if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) { + raw_documents.push(raw_document); + } } debug!("creating {} candidates documents took {:.02?}", raw_documents.len(), @@ -149,6 +147,57 @@ pub struct RawDocument<'a, 'tag> { pub processed_distances: Vec>, } +impl<'a, 'tag> RawDocument<'a, 'tag> { + fn new<'txn>( + raw_matches: &'a mut [BareMatch<'tag>], + automatons: &[QueryWordAutomaton], + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> Option> + { + raw_matches.sort_unstable_by_key(|m| m.query_index); + + // debug!("{:?} {:?}", raw_matches[0].document_id, raw_matches); + + let mut previous_word = None; + for i in 0..raw_matches.len() { + let a = &raw_matches[i]; + let auta = &automatons[a.query_index as usize]; + + match auta.phrase_query { + Some((0, _)) => { + previous_word = Some(a.query_index); + let b = raw_matches.get(i + 1)?; + if a.query_index + 1 != b.query_index { + return None; + } + + let pla = &postings_lists[a.postings_list]; + let plb = &postings_lists[b.postings_list]; + + let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { + a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) + }); + + if !iter.any(|eb| eb.is_both()) { return None } + }, + Some((1, _)) => { + if previous_word.take() != Some(a.query_index - 1) { + return None; + } + }, + Some((_, _)) => unreachable!(), + None => (), + } + } + + Some(RawDocument { + raw_matches, + processed_matches: Vec::new(), + processed_distances: Vec::new(), + }) + } +} + pub struct BareMatch<'tag> { pub document_id: DocumentId, pub query_index: u16, @@ -186,6 +235,15 @@ pub struct PostingsListView<'txn> { len: usize, } +impl fmt::Debug for PostingsListView<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PostingsListView") + .field("input", &std::str::from_utf8(&self.input).unwrap()) + .field("postings_list", &self.as_ref()) + .finish() + } +} + impl<'txn> PostingsListView<'txn> { pub fn new(input: Rc<[u8]>, postings_list: Rc>>) -> PostingsListView<'txn> { let len = postings_list.len(); @@ -275,6 +333,7 @@ fn fetch_matches<'txn, 'tag>( let input = Rc::from(input); let postings_list = Rc::new(postings_list); let postings_list_view = PostingsListView::new(input, postings_list); + let mut offset = 0; for group in postings_list_view.linear_group_by_key(|di| di.document_id) { @@ -442,7 +501,7 @@ fn construct_automatons2( } } - if false && n == 1 { + if true && n == 1 { if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? 
{ let mut left_automaton = QueryWordAutomaton::exact(left); left_automaton.phrase_query = Some((0, 2)); diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs index 4c40b9969..3bfbe76ea 100644 --- a/meilisearch-core/src/criterion2.rs +++ b/meilisearch-core/src/criterion2.rs @@ -43,16 +43,42 @@ pub trait Criterion { } } -fn prepare_query_distances( - documents: &mut [RawDocument], +fn prepare_query_distances<'a, 'tag, 'txn>( + documents: &mut [RawDocument<'a, 'tag>], query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], + postings_lists: &PostingsListsArena<'tag, 'txn>, ) { for document in documents { if !document.processed_distances.is_empty() { continue } + // debug!("{:?}", document.raw_matches[0].document_id); + let mut processed = Vec::new(); - for m in document.raw_matches.iter() { + let mut raw_matches = document.raw_matches.iter().peekable(); + while let Some(m) = raw_matches.next() { + + // let automaton = &automatons[m.query_index as usize]; + + // debug!("{:?} {:?}", m, automaton); + // debug!("{:?}", &postings_lists[m.postings_list]); + + // match automaton.phrase_query { + // Some((0, len)) => { + // match raw_matches.peek() { + // Some(BareMatch { query_index, .. }) => { + // if *query_index != m.query_index + 1 { + // raw_matches.next(); + // continue + // } + // }, + // None => continue, + // } + // }, + // Some((_, _)) => continue, + // None => (), + // } + // FIXME we really need to take splitted words into account // those must be seen at the same level as the non-splitteds // if automatons[m.query_index as usize].phrase_query.is_some() { @@ -73,6 +99,8 @@ fn prepare_query_distances( } } + // debug!("{:?}", processed); + document.processed_distances = processed; } } @@ -82,14 +110,14 @@ pub struct Typo; impl Criterion for Typo { fn name(&self) -> &str { "typo" } - fn prepare( + fn prepare<'a, 'tag, 'txn>( &self, - documents: &mut [RawDocument], - postings_lists: &mut PostingsListsArena, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], ) { - prepare_query_distances(documents, query_enhancer, automatons); + prepare_query_distances(documents, query_enhancer, automatons, postings_lists); } fn evaluate( @@ -140,14 +168,14 @@ pub struct Words; impl Criterion for Words { fn name(&self) -> &str { "words" } - fn prepare( + fn prepare<'a, 'tag, 'txn>( &self, - documents: &mut [RawDocument], - postings_lists: &mut PostingsListsArena, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], ) { - prepare_query_distances(documents, query_enhancer, automatons); + prepare_query_distances(documents, query_enhancer, automatons, postings_lists); } fn evaluate( From dd03a6256a9b1d9d0892a21242a10d5184078e32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 9 Dec 2019 16:45:06 +0100 Subject: [PATCH 10/23] Debug pre filtered number of documents --- meilisearch-core/src/bucket_sort.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index ae0fdf63f..fcbe5a262 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -57,14 +57,17 @@ pub fn bucket_sort<'c>( debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed()); let before_raw_documents_building = 
Instant::now(); + let mut prefiltered_documents = 0; let mut raw_documents = Vec::new(); for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { + prefiltered_documents += 1; if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) { raw_documents.push(raw_document); } } - debug!("creating {} candidates documents took {:.02?}", + debug!("creating {} (original {}) candidates documents took {:.02?}", raw_documents.len(), + prefiltered_documents, before_raw_documents_building.elapsed(), ); From 8d71112dcb7924edd8f3a064af3fd213a6c86185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 10 Dec 2019 12:19:38 +0100 Subject: [PATCH 11/23] Rewrite the phrase query postings lists This simplified the multiword_rewrite_matches function a little bit. --- meilisearch-core/src/bucket_sort.rs | 131 +++++++++++++++++++++------- meilisearch-core/src/criterion2.rs | 80 +---------------- 2 files changed, 103 insertions(+), 108 deletions(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index fcbe5a262..ccbf1e756 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -15,8 +15,9 @@ use levenshtein_automata::DFA; use log::debug; use meilisearch_tokenizer::{is_cjk, split_query_string}; use meilisearch_types::{DocIndex, Highlight}; -use sdset::Set; +use sdset::{Set, SetBuf}; use slice_group_by::{GroupBy, GroupByMut}; +use itertools::EitherOrBoth; use crate::automaton::NGRAMS; use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; @@ -61,7 +62,7 @@ pub fn bucket_sort<'c>( let mut raw_documents = Vec::new(); for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { prefiltered_documents += 1; - if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) { + if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) { raw_documents.push(raw_document); } } @@ -78,7 +79,7 @@ pub fn bucket_sort<'c>( let criteria = [ Box::new(Typo) as Box, - Box::new(Words) as Box, + Box::new(Words), Box::new(Proximity), Box::new(Attribute), Box::new(WordsPosition), @@ -154,13 +155,11 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { fn new<'txn>( raw_matches: &'a mut [BareMatch<'tag>], automatons: &[QueryWordAutomaton], - postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, ) -> Option> { raw_matches.sort_unstable_by_key(|m| m.query_index); - // debug!("{:?} {:?}", raw_matches[0].document_id, raw_matches); - let mut previous_word = None; for i in 0..raw_matches.len() { let a = &raw_matches[i]; @@ -168,10 +167,17 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { match auta.phrase_query { Some((0, _)) => { - previous_word = Some(a.query_index); - let b = raw_matches.get(i + 1)?; + let b = match raw_matches.get(i + 1) { + Some(b) => b, + None => { + postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new())); + continue; + } + }; + if a.query_index + 1 != b.query_index { - return None; + postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new())); + continue } let pla = &postings_lists[a.postings_list]; @@ -181,11 +187,31 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) }); - if !iter.any(|eb| eb.is_both()) { return None } + let mut newa = Vec::new(); + let mut newb = Vec::new(); + + for eb in iter { + if let EitherOrBoth::Both(a, b) = eb { + newa.push(*a); + 
newb.push(*b); + } + } + + + if !newa.is_empty() { + previous_word = Some(a.query_index); + postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); + postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); + + } else { + // TODO use SetBuf::default when merged + postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new())); + postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new())); + } }, Some((1, _)) => { if previous_word.take() != Some(a.query_index - 1) { - return None; + postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new())); } }, Some((_, _)) => unreachable!(), @@ -193,6 +219,10 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { } } + if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { + return None + } + Some(RawDocument { raw_matches, processed_matches: Vec::new(), @@ -231,50 +261,84 @@ pub struct SimpleMatch { } #[derive(Clone)] -pub struct PostingsListView<'txn> { - input: Rc<[u8]>, - postings_list: Rc>>, - offset: usize, - len: usize, +pub enum PostingsListView<'txn> { + Original { + input: Rc<[u8]>, + postings_list: Rc>>, + offset: usize, + len: usize, + }, + Rewritten { + input: Rc<[u8]>, + postings_list: SetBuf, + }, } impl fmt::Debug for PostingsListView<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("PostingsListView") - .field("input", &std::str::from_utf8(&self.input).unwrap()) + .field("input", &std::str::from_utf8(&self.input()).unwrap()) .field("postings_list", &self.as_ref()) .finish() } } impl<'txn> PostingsListView<'txn> { - pub fn new(input: Rc<[u8]>, postings_list: Rc>>) -> PostingsListView<'txn> { + pub fn original(input: Rc<[u8]>, postings_list: Rc>>) -> PostingsListView<'txn> { let len = postings_list.len(); - PostingsListView { input, postings_list, offset: 0, len } + PostingsListView::Original { input, postings_list, offset: 0, len } + } + + pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf) -> PostingsListView<'txn> { + PostingsListView::Rewritten { input, postings_list } + } + + pub fn rewrite_with(&mut self, postings_list: SetBuf) { + *self = match self { + PostingsListView::Original { input, .. } => { + PostingsListView::Rewritten { input: input.clone(), postings_list } + }, + PostingsListView::Rewritten { input, .. } => { + PostingsListView::Rewritten { input: input.clone(), postings_list } + }, + }; } pub fn len(&self) -> usize { - self.len + match self { + PostingsListView::Original { len, .. } => *len, + PostingsListView::Rewritten { postings_list, .. } => postings_list.len(), + } } pub fn input(&self) -> &[u8] { - &self.input + match self { + PostingsListView::Original { ref input, .. } => input, + PostingsListView::Rewritten { ref input, .. } => input, + } } - pub fn range(&self, offset: usize, len: usize) -> PostingsListView<'txn> { - assert!(offset + len <= self.len); - PostingsListView { - input: self.input.clone(), - postings_list: self.postings_list.clone(), - offset: self.offset + offset, - len: len, + pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> { + match self { + PostingsListView::Original { input, postings_list, offset, len } => { + assert!(range_offset + range_len <= *len); + PostingsListView::Original { + input: input.clone(), + postings_list: postings_list.clone(), + offset: offset + range_offset, + len: range_len, + } + }, + PostingsListView::Rewritten { .. 
} => { + panic!("Cannot create a range on a rewritten postings list view"); + } } } } impl AsRef> for PostingsListView<'_> { fn as_ref(&self) -> &Set { - Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len]) + self } } @@ -282,7 +346,12 @@ impl Deref for PostingsListView<'_> { type Target = Set; fn deref(&self) -> &Set { - Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len]) + match *self { + PostingsListView::Original { ref postings_list, offset, len, .. } => { + Set::new_unchecked(&postings_list[offset..offset + len]) + }, + PostingsListView::Rewritten { ref postings_list, .. } => postings_list, + } } } @@ -335,7 +404,7 @@ fn fetch_matches<'txn, 'tag>( let input = Rc::from(input); let postings_list = Rc::new(postings_list); - let postings_list_view = PostingsListView::new(input, postings_list); + let postings_list_view = PostingsListView::original(input, postings_list); let mut offset = 0; for group in postings_list_view.linear_group_by_key(|di| di.document_id) { diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs index 3bfbe76ea..a82dbf123 100644 --- a/meilisearch-core/src/criterion2.rs +++ b/meilisearch-core/src/criterion2.rs @@ -52,38 +52,9 @@ fn prepare_query_distances<'a, 'tag, 'txn>( for document in documents { if !document.processed_distances.is_empty() { continue } - // debug!("{:?}", document.raw_matches[0].document_id); - let mut processed = Vec::new(); - let mut raw_matches = document.raw_matches.iter().peekable(); - while let Some(m) = raw_matches.next() { - - // let automaton = &automatons[m.query_index as usize]; - - // debug!("{:?} {:?}", m, automaton); - // debug!("{:?}", &postings_lists[m.postings_list]); - - // match automaton.phrase_query { - // Some((0, len)) => { - // match raw_matches.peek() { - // Some(BareMatch { query_index, .. 
}) => { - // if *query_index != m.query_index + 1 { - // raw_matches.next(); - // continue - // } - // }, - // None => continue, - // } - // }, - // Some((_, _)) => continue, - // None => (), - // } - - // FIXME we really need to take splitted words into account - // those must be seen at the same level as the non-splitteds - // if automatons[m.query_index as usize].phrase_query.is_some() { - // continue - // } + for m in document.raw_matches.iter() { + if postings_lists[m.postings_list].is_empty() { continue } let range = query_enhancer.replacement(m.query_index as u32); let new_len = cmp::max(range.end as usize, processed.len()); @@ -99,8 +70,6 @@ fn prepare_query_distances<'a, 'tag, 'txn>( } } - // debug!("{:?}", processed); - document.processed_distances = processed; } } @@ -444,54 +413,11 @@ impl Criterion for StableDocId { } pub fn multiword_rewrite_matches( - simple_matches: &mut [SimpleMatch], + matches: &mut [SimpleMatch], query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], ) -> SetBuf { - let mut matches = Vec::with_capacity(simple_matches.len()); - - // let before_sort = Instant::now(); - // we sort the matches by word index to make them rewritable - simple_matches.sort_unstable_by_key(|m| (m.attribute, m.query_index, m.word_index)); - // debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); - - for same_attribute in simple_matches.linear_group_by_key(|m| m.attribute) { - let iter = same_attribute.linear_group_by_key(|m| m.query_index); - let mut iter = iter.peekable(); - - while let Some(same_query_index) = iter.next() { - let query_index = same_query_index[0].query_index; - - // TODO we need to support phrase query of longer length - if let Some((i, len)) = automatons[query_index as usize].phrase_query { - if i != 0 { continue } - - // is the next query_index group the required one - if iter.peek().map_or(false, |g| g[0].query_index == query_index + 1) { - if let Some(next) = iter.next() { - for ma in same_query_index { - for mb in next { - if ma.word_index == mb.word_index + 1 { - matches.push(*ma); - matches.push(*mb); - } - } - } - } - } - } else { - matches.extend_from_slice(same_query_index); - } - } - } - - // let is_phrase_query = automatons[match_.query_index as usize].phrase_query_len.is_some(); - // let next_query_index = match_.query_index + 1; - // if is_phrase_query && iter.remainder().iter().find(|m| m.query_index == next_query_index).is_none() { - // continue - // } - matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); let mut padded_matches = Vec::with_capacity(matches.len()); From efc2be0b7b29819f78ebf11f5cecffdb250b5af0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Dec 2019 12:08:30 +0100 Subject: [PATCH 12/23] Bump the sdset dependency to 0.3.6 --- Cargo.lock | 6 +++--- meilisearch-core/Cargo.toml | 2 +- meilisearch-core/src/bucket_sort.rs | 20 ++++++-------------- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ad3f3494b..750cdc30c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -962,7 +962,7 @@ dependencies = [ "once_cell 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "ordered-float 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", "rustyline 5.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "sdset 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.102 
(registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)", "siphasher 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -1692,7 +1692,7 @@ dependencies = [ [[package]] name = "sdset" -version = "0.3.3" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -2807,7 +2807,7 @@ dependencies = [ "checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421" "checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d" "checksum sct 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c" -"checksum sdset 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b6d2447743d6c37b6d67af88d9c0f1fc92989e2d9745d9b2f3d305b906a90195" +"checksum sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "5bfd7aab2bcae693c563b40fbbaf87d60c9b6f2a60d55ed69a9c761e3d4c63c9" "checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" "checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" "checksum serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4b39bd9b0b087684013a792c59e3e07a46a01d2322518d8a1104641a0b1be0" diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index 3455f755d..3b19369f8 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -25,7 +25,7 @@ meilisearch-tokenizer = { path = "../meilisearch-tokenizer", version = "0.8.4" } meilisearch-types = { path = "../meilisearch-types", version = "0.8.4" } once_cell = "1.2.0" ordered-float = { version = "1.0.2", features = ["serde"] } -sdset = "0.3.3" +sdset = "0.3.6" serde = { version = "1.0.101", features = ["derive"] } serde_json = "1.0.41" siphasher = "0.3.1" diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index ccbf1e756..9b9e5ab44 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -121,7 +121,6 @@ pub fn bucket_sort<'c>( let postings_list = &arena[sm.postings_list]; let input = postings_list.input(); let query = &automatons[sm.query_index as usize].query; - debug!("{:?} contains {:?}", d.raw_matches[0].document_id, query); postings_list.iter().map(move |m| { let covered_area = if query.len() > input.len() { input.len() @@ -132,8 +131,6 @@ pub fn bucket_sort<'c>( }) }).collect(); - debug!("{:?} contains {:?}", d.raw_matches[0].document_id, d.processed_distances); - Document { id: d.raw_matches[0].document_id, highlights, @@ -170,13 +167,13 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { let b = match raw_matches.get(i + 1) { Some(b) => b, None => { - postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new())); + postings_lists[a.postings_list].rewrite_with(SetBuf::default()); continue; } }; if a.query_index + 1 != b.query_index { - postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new())); + postings_lists[a.postings_list].rewrite_with(SetBuf::default()); continue } @@ -197,21 +194,16 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { } } - if !newa.is_empty() { previous_word = 
Some(a.query_index); - postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); - postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); - - } else { - // TODO use SetBuf::default when merged - postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new())); - postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new())); } + + postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); + postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); }, Some((1, _)) => { if previous_word.take() != Some(a.query_index - 1) { - postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(Vec::new())); + postings_lists[a.postings_list].rewrite_with(SetBuf::default()); } }, Some((_, _)) => unreachable!(), From ea148575cf2d9901219e44cf406a51e9113f634e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Dec 2019 15:34:30 +0100 Subject: [PATCH 13/23] Remove the raw_query functions --- meilisearch-core/src/bucket_sort.rs | 15 +- meilisearch-core/src/query_builder.rs | 340 -------------------------- 2 files changed, 6 insertions(+), 349 deletions(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 9b9e5ab44..5413db17f 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -39,7 +39,6 @@ pub fn bucket_sort<'c>( synonyms_store: store::Synonyms, ) -> MResult> { - // let automatons = construct_automatons(query); let (automatons, query_enhancer) = construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?; @@ -286,14 +285,11 @@ impl<'txn> PostingsListView<'txn> { } pub fn rewrite_with(&mut self, postings_list: SetBuf) { - *self = match self { - PostingsListView::Original { input, .. } => { - PostingsListView::Rewritten { input: input.clone(), postings_list } - }, - PostingsListView::Rewritten { input, .. } => { - PostingsListView::Rewritten { input: input.clone(), postings_list } - }, + let input = match self { + PostingsListView::Original { input, .. } => input.clone(), + PostingsListView::Rewritten { input, .. } => input.clone(), }; + *self = PostingsListView::rewritten(input, postings_list); } pub fn len(&self) -> usize { @@ -565,7 +561,8 @@ fn construct_automatons2( } } - if true && n == 1 { + if n == 1 { + // automatons for splitted words if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? { let mut left_automaton = QueryWordAutomaton::exact(left); left_automaton.phrase_query = Some((0, 2)); diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 7edda5294..c862ae2a2 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -399,346 +399,6 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { } } -fn raw_query<'c, FI>( - reader: &heed::RoTxn, - - query: &str, - range: Range, - - filter: Option, - timeout: Option, - - criteria: Criteria<'c>, - searchable_attrs: Option, - - main_store: store::Main, - postings_lists_store: store::PostingsLists, - documents_fields_counts_store: store::DocumentsFieldsCounts, - synonyms_store: store::Synonyms, -) -> MResult> -where - FI: Fn(DocumentId) -> bool, -{ - // We delegate the filter work to the distinct query builder, - // specifying a distinct rule that has no effect. 
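// [Editor's note: an illustrative sketch, not part of the patch series.] The
// raw_query function removed below reuses raw_query_with_distinct by passing
// a distinct rule that never yields a key, so the deduplication machinery
// becomes a no-op. The helper and names below are made up for the example;
// only the "no-op distinct" pattern is taken from the patch.
use std::collections::HashSet;

fn dedup_by_key<F>(ids: &[u32], distinct: F, capacity: usize) -> Vec<u32>
where
    F: Fn(u32) -> Option<u64>,
{
    let mut seen = HashSet::with_capacity(capacity);
    ids.iter()
        .copied()
        // documents without a key are always kept, in the spirit of
        // DistinctMap's register_without_key
        .filter(|&id| match distinct(id) {
            Some(key) => seen.insert(key),
            None => true,
        })
        .collect()
}

fn main() {
    let ids = [1, 2, 2, 3];
    // the no-op rule: no key is ever produced, so nothing is deduplicated
    assert_eq!(dedup_by_key(&ids, |_| None, 1), vec![1, 2, 2, 3]);
    // a real rule deduplicates documents sharing the same key
    assert_eq!(dedup_by_key(&ids, |id| Some((id % 2) as u64), 4), vec![1, 2]);
}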
- if filter.is_some() { - let distinct = |_| None; - let distinct_size = 1; - return raw_query_with_distinct( - reader, - query, - range, - filter, - distinct, - distinct_size, - timeout, - criteria, - searchable_attrs, - main_store, - postings_lists_store, - documents_fields_counts_store, - synonyms_store, - ); - } - - let start_processing = Instant::now(); - let mut raw_documents_processed = Vec::with_capacity(range.len()); - - let (automaton_producer, query_enhancer) = AutomatonProducer::new( - reader, - query, - main_store, - postings_lists_store, - synonyms_store, - )?; - - let automaton_producer = automaton_producer.into_iter(); - let mut automatons = Vec::new(); - - // aggregate automatons groups by groups after time - for auts in automaton_producer { - automatons.push(auts); - - for (i, group) in automatons.iter().enumerate() { - debug!("group {} automatons {:?}", i, group.automatons); - } - - let before_fetch_raw_documents = Instant::now(); - // we must retrieve the documents associated - // with the current automatons - let mut raw_documents = fetch_raw_documents( - reader, - &automatons, - &query_enhancer, - searchable_attrs.as_ref(), - main_store, - postings_lists_store, - )?; - debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed()); - - // stop processing when time is running out - if let Some(timeout) = timeout { - if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout { - break; - } - } - - let before_bucket_sort = Instant::now(); - - let mut groups = vec![raw_documents.as_mut_slice()]; - - 'criteria: for criterion in criteria.as_ref() { - let tmp_groups = mem::replace(&mut groups, Vec::new()); - let mut documents_seen = 0; - - for group in tmp_groups { - // if this group does not overlap with the requested range, - // push it without sorting and splitting it - if documents_seen + group.len() < range.start { - documents_seen += group.len(); - groups.push(group); - continue; - } - - // we must pull the fields counts of these documents - // TODO it would be great to had a "dependency" thing for each criterion - // and make it so that we can be lazy on pulling/computing some data. - if criterion.name() == "Exact" { - for document in group.iter_mut() { - let mut fields_counts = Vec::new(); - for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? 
{ - let (attr, count) = result?; - fields_counts.push(AttrCount { attr: attr.0, count }); - } - document.fields_counts = Some(SetBuf::new(fields_counts).unwrap()); - } - } - - - group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); - - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { - debug!("criterion {} produced a group of size {}", criterion.name(), group.len()); - - documents_seen += group.len(); - groups.push(group); - - - // we have sort enough documents if the last document sorted is after - // the end of the requested range, we can continue to the next criterion - if documents_seen >= range.end { - continue 'criteria; - } - } - } - } - - debug!("bucket_sort took {:.02?}", before_bucket_sort.elapsed()); - - // once we classified the documents related to the current - // automatons we save that as the next valid result - let iter = raw_documents - .into_iter() - .skip(range.start) - .take(range.len()); - raw_documents_processed.clear(); - raw_documents_processed.extend(iter); - - // stop processing when time is running out - if let Some(timeout) = timeout { - if start_processing.elapsed() > timeout { - break; - } - } - } - - // make real documents now that we know - // those must be returned - let documents = raw_documents_processed - .into_iter() - .map(Document::from_raw) - .collect(); - - Ok(documents) -} - -fn raw_query_with_distinct<'c, FI, FD>( - reader: &heed::RoTxn, - - query: &str, - range: Range, - - filter: Option, - - distinct: FD, - distinct_size: usize, - timeout: Option, - - criteria: Criteria<'c>, - searchable_attrs: Option, - - main_store: store::Main, - postings_lists_store: store::PostingsLists, - documents_fields_counts_store: store::DocumentsFieldsCounts, - synonyms_store: store::Synonyms, -) -> MResult> -where - FI: Fn(DocumentId) -> bool, - FD: Fn(DocumentId) -> Option, -{ - let start_processing = Instant::now(); - let mut raw_documents_processed = Vec::new(); - - let (automaton_producer, query_enhancer) = AutomatonProducer::new( - reader, - query, - main_store, - postings_lists_store, - synonyms_store, - )?; - - let automaton_producer = automaton_producer.into_iter(); - let mut automatons = Vec::new(); - - // aggregate automatons groups by groups after time - for auts in automaton_producer { - automatons.push(auts); - - // we must retrieve the documents associated - // with the current automatons - let mut raw_documents = fetch_raw_documents( - reader, - &automatons, - &query_enhancer, - searchable_attrs.as_ref(), - main_store, - postings_lists_store, - )?; - - // stop processing when time is running out - if let Some(timeout) = timeout { - if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout { - break; - } - } - - let mut groups = vec![raw_documents.as_mut_slice()]; - let mut key_cache = HashMap::new(); - - let mut filter_map = HashMap::new(); - // these two variables informs on the current distinct map and - // on the raw offset of the start of the group where the - // range.start bound is located according to the distinct function - let mut distinct_map = DistinctMap::new(distinct_size); - let mut distinct_raw_offset = 0; - - 'criteria: for criterion in criteria.as_ref() { - let tmp_groups = mem::replace(&mut groups, Vec::new()); - let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); - let mut documents_seen = 0; - - for group in tmp_groups { - // if this group does not overlap with the requested range, - // push it without sorting and splitting it - if documents_seen + group.len() < 
distinct_raw_offset { - documents_seen += group.len(); - groups.push(group); - continue; - } - - group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); - - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { - // we must compute the real distinguished len of this sub-group - for document in group.iter() { - let filter_accepted = match &filter { - Some(filter) => { - let entry = filter_map.entry(document.id); - *entry.or_insert_with(|| (filter)(document.id)) - } - None => true, - }; - - if filter_accepted { - let entry = key_cache.entry(document.id); - let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new)); - - match key.clone() { - Some(key) => buf_distinct.register(key), - None => buf_distinct.register_without_key(), - }; - } - - // the requested range end is reached: stop computing distinct - if buf_distinct.len() >= range.end { - break; - } - } - - documents_seen += group.len(); - groups.push(group); - - // if this sub-group does not overlap with the requested range - // we must update the distinct map and its start index - if buf_distinct.len() < range.start { - buf_distinct.transfert_to_internal(); - distinct_raw_offset = documents_seen; - } - - // we have sort enough documents if the last document sorted is after - // the end of the requested range, we can continue to the next criterion - if buf_distinct.len() >= range.end { - continue 'criteria; - } - } - } - } - - // once we classified the documents related to the current - // automatons we save that as the next valid result - let mut seen = BufferedDistinctMap::new(&mut distinct_map); - raw_documents_processed.clear(); - - for document in raw_documents.into_iter().skip(distinct_raw_offset) { - let filter_accepted = match &filter { - Some(_) => filter_map.remove(&document.id).unwrap(), - None => true, - }; - - if filter_accepted { - let key = key_cache.remove(&document.id).unwrap(); - let distinct_accepted = match key { - Some(key) => seen.register(key), - None => seen.register_without_key(), - }; - - if distinct_accepted && seen.len() > range.start { - raw_documents_processed.push(document); - if raw_documents_processed.len() == range.len() { - break; - } - } - } - } - - // stop processing when time is running out - if let Some(timeout) = timeout { - if start_processing.elapsed() > timeout { - break; - } - } - } - - // make real documents now that we know - // those must be returned - let documents = raw_documents_processed - .into_iter() - .map(Document::from_raw) - .collect(); - - Ok(documents) -} - #[cfg(test)] mod tests { use super::*; From 248ccfc0d86be2009d0186fb22f225e0ae997386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Dec 2019 17:02:10 +0100 Subject: [PATCH 14/23] Update the criteria to the new ones --- meilisearch-core/src/bucket_sort.rs | 104 +--- meilisearch-core/src/criterion/attribute.rs | 48 ++ meilisearch-core/src/criterion/document_id.rs | 35 +- meilisearch-core/src/criterion/exact.rs | 154 ++---- meilisearch-core/src/criterion/mod.rs | 266 +++++++-- .../src/criterion/number_of_words.rs | 31 -- meilisearch-core/src/criterion/proximity.rs | 79 +++ .../src/criterion/sort_by_attr.rs | 44 +- .../src/criterion/sum_of_typos.rs | 116 ---- .../src/criterion/sum_of_words_attribute.rs | 64 --- .../src/criterion/sum_of_words_position.rs | 64 --- meilisearch-core/src/criterion/typo.rs | 67 +++ meilisearch-core/src/criterion/words.rs | 43 ++ .../src/criterion/words_position.rs | 48 ++ .../src/criterion/words_proximity.rs | 164 ------ 
meilisearch-core/src/criterion2.rs | 514 ------------------ meilisearch-core/src/lib.rs | 54 +- meilisearch-core/src/query_builder.rs | 305 +---------- meilisearch-core/src/raw_document.rs | 248 +++------ meilisearch-http/src/helpers/meilisearch.rs | 20 +- 20 files changed, 693 insertions(+), 1775 deletions(-) create mode 100644 meilisearch-core/src/criterion/attribute.rs delete mode 100644 meilisearch-core/src/criterion/number_of_words.rs create mode 100644 meilisearch-core/src/criterion/proximity.rs delete mode 100644 meilisearch-core/src/criterion/sum_of_typos.rs delete mode 100644 meilisearch-core/src/criterion/sum_of_words_attribute.rs delete mode 100644 meilisearch-core/src/criterion/sum_of_words_position.rs create mode 100644 meilisearch-core/src/criterion/typo.rs create mode 100644 meilisearch-core/src/criterion/words.rs create mode 100644 meilisearch-core/src/criterion/words_position.rs delete mode 100644 meilisearch-core/src/criterion/words_proximity.rs delete mode 100644 meilisearch-core/src/criterion2.rs diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 5413db17f..0c5fbdee3 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -1,9 +1,6 @@ use std::ops::Deref; use std::fmt; use std::borrow::Cow; -use std::cmp::Ordering; -use std::collections::HashSet; -use std::io::Write; use std::mem; use std::ops::Range; use std::rc::Rc; @@ -17,15 +14,15 @@ use meilisearch_tokenizer::{is_cjk, split_query_string}; use meilisearch_types::{DocIndex, Highlight}; use sdset::{Set, SetBuf}; use slice_group_by::{GroupBy, GroupByMut}; -use itertools::EitherOrBoth; use crate::automaton::NGRAMS; use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; use crate::automaton::{normalize_str, split_best_frequency}; -use crate::criterion2::*; +use crate::criterion::Criteria; use crate::levenshtein::prefix_damerau_levenshtein; +use crate::raw_document::RawDocument; use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; use crate::{store, Document, DocumentId, MResult}; @@ -33,6 +30,7 @@ pub fn bucket_sort<'c>( reader: &heed::RoTxn, query: &str, range: Range, + criteria: Criteria<'c>, main_store: store::Main, postings_lists_store: store::PostingsLists, documents_fields_counts_store: store::DocumentsFieldsCounts, @@ -76,17 +74,7 @@ pub fn bucket_sort<'c>( let mut groups = vec![raw_documents.as_mut_slice()]; - let criteria = [ - Box::new(Typo) as Box, - Box::new(Words), - Box::new(Proximity), - Box::new(Attribute), - Box::new(WordsPosition), - Box::new(Exact), - Box::new(StableDocId), - ]; - - 'criteria: for criterion in &criteria { + 'criteria: for criterion in criteria.as_ref() { let tmp_groups = mem::replace(&mut groups, Vec::new()); let mut documents_seen = 0; @@ -131,7 +119,7 @@ pub fn bucket_sort<'c>( }).collect(); Document { - id: d.raw_matches[0].document_id, + id: d.id, highlights, #[cfg(test)] matches: Vec::new(), } @@ -140,88 +128,6 @@ pub fn bucket_sort<'c>( Ok(iter.collect()) } -pub struct RawDocument<'a, 'tag> { - pub raw_matches: &'a mut [BareMatch<'tag>], - pub processed_matches: Vec, - /// The list of minimum `distance` found - pub processed_distances: Vec>, -} - -impl<'a, 'tag> RawDocument<'a, 'tag> { - fn new<'txn>( - raw_matches: &'a mut [BareMatch<'tag>], - automatons: &[QueryWordAutomaton], - postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, - ) -> Option> - { - raw_matches.sort_unstable_by_key(|m| m.query_index); - 
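// [Editor's note: an illustrative sketch, not part of the patch series.] The
// phrase query filter being removed from bucket_sort.rs below (and, per the
// diffstat, reworked in raw_document.rs) joins the postings lists of two
// consecutive query words: a pair of positions survives only when the right
// word sits at word_index + 1 of the left word, within the same attribute.
// This is a standalone reduction of that itertools::merge_join_by call, with
// a simplified stand-in for the real DocIndex entries.
use itertools::{merge_join_by, EitherOrBoth};

#[derive(Debug, Clone, Copy)]
struct MiniIndex { attribute: u16, word_index: u16 }

// both slices must be sorted by (attribute, word_index), as postings lists are
fn consecutive_pairs(left: &[MiniIndex], right: &[MiniIndex]) -> Vec<(MiniIndex, MiniIndex)> {
    merge_join_by(left.iter(), right.iter(), |a, b| {
        a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
    })
    .filter_map(|either| match either {
        // Both means the two words are adjacent: keep the pair
        EitherOrBoth::Both(a, b) => Some((*a, *b)),
        // a posting without an adjacent partner is dropped
        _ => None,
    })
    .collect()
}

fn main() {
    let left = [MiniIndex { attribute: 0, word_index: 3 }, MiniIndex { attribute: 1, word_index: 7 }];
    let right = [MiniIndex { attribute: 0, word_index: 4 }, MiniIndex { attribute: 1, word_index: 9 }];
    // only the attribute 0 positions (3, 4) are consecutive
    assert_eq!(consecutive_pairs(&left, &right).len(), 1);
}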
- let mut previous_word = None; - for i in 0..raw_matches.len() { - let a = &raw_matches[i]; - let auta = &automatons[a.query_index as usize]; - - match auta.phrase_query { - Some((0, _)) => { - let b = match raw_matches.get(i + 1) { - Some(b) => b, - None => { - postings_lists[a.postings_list].rewrite_with(SetBuf::default()); - continue; - } - }; - - if a.query_index + 1 != b.query_index { - postings_lists[a.postings_list].rewrite_with(SetBuf::default()); - continue - } - - let pla = &postings_lists[a.postings_list]; - let plb = &postings_lists[b.postings_list]; - - let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { - a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) - }); - - let mut newa = Vec::new(); - let mut newb = Vec::new(); - - for eb in iter { - if let EitherOrBoth::Both(a, b) = eb { - newa.push(*a); - newb.push(*b); - } - } - - if !newa.is_empty() { - previous_word = Some(a.query_index); - } - - postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); - postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); - }, - Some((1, _)) => { - if previous_word.take() != Some(a.query_index - 1) { - postings_lists[a.postings_list].rewrite_with(SetBuf::default()); - } - }, - Some((_, _)) => unreachable!(), - None => (), - } - } - - if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { - return None - } - - Some(RawDocument { - raw_matches, - processed_matches: Vec::new(), - processed_distances: Vec::new(), - }) - } -} - pub struct BareMatch<'tag> { pub document_id: DocumentId, pub query_index: u16, diff --git a/meilisearch-core/src/criterion/attribute.rs b/meilisearch-core/src/criterion/attribute.rs new file mode 100644 index 000000000..3dc6e4282 --- /dev/null +++ b/meilisearch-core/src/criterion/attribute.rs @@ -0,0 +1,48 @@ +use std::cmp::{self, Ordering}; + +use compact_arena::SmallArena; +use slice_group_by::GroupBy; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; +use crate::RawDocument; + +use super::{Criterion, prepare_raw_matches}; + +pub struct Attribute; + +impl Criterion for Attribute { + fn name(&self) -> &str { "attribute" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); + } + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> Ordering + { + #[inline] + fn best_attribute(matches: &[SimpleMatch]) -> u16 { + let mut best_attribute = u16::max_value(); + for group in matches.linear_group_by_key(|bm| bm.query_index) { + best_attribute = cmp::min(best_attribute, group[0].attribute); + } + best_attribute + } + + let lhs = best_attribute(&lhs.processed_matches); + let rhs = best_attribute(&rhs.processed_matches); + + lhs.cmp(&rhs) + } +} diff --git a/meilisearch-core/src/criterion/document_id.rs b/meilisearch-core/src/criterion/document_id.rs index e4a402d26..596194bca 100644 --- a/meilisearch-core/src/criterion/document_id.rs +++ b/meilisearch-core/src/criterion/document_id.rs @@ -1,16 +1,37 @@ -use crate::criterion::Criterion; -use crate::RawDocument; use std::cmp::Ordering; -#[derive(Debug, Clone, Copy)] +use compact_arena::SmallArena; + 
+use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; +use crate::RawDocument; +use super::Criterion; + pub struct DocumentId; impl Criterion for DocumentId { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - lhs.id.cmp(&rhs.id) + fn name(&self) -> &str { "stable document id" } + + fn prepare( + &self, + documents: &mut [RawDocument], + postings_lists: &mut SmallArena, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + // ... } - fn name(&self) -> &str { - "DocumentId" + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &SmallArena, + ) -> Ordering + { + let lhs = &lhs.id; + let rhs = &rhs.id; + + lhs.cmp(rhs) } } diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs index 55a19001b..d82f69462 100644 --- a/meilisearch-core/src/criterion/exact.rs +++ b/meilisearch-core/src/criterion/exact.rs @@ -1,131 +1,51 @@ -use std::cmp::Ordering; +use std::cmp::{Ordering, Reverse}; -use sdset::Set; +use compact_arena::SmallArena; use slice_group_by::GroupBy; -use crate::criterion::Criterion; -use crate::{AttrCount, RawDocument}; +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, BareMatch, QueryWordAutomaton}; +use crate::RawDocument; +use super::Criterion; -#[inline] -fn number_exact_matches( - query_index: &[u32], - attribute: &[u16], - is_exact: &[bool], - fields_counts: &Set, -) -> usize { - let mut count = 0; - let mut index = 0; - - for group in query_index.linear_group() { - let len = group.len(); - - let mut found_exact = false; - for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() { - if *is_exact { - found_exact = true; - let attr = &attribute[index + pos]; - if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) { - let AttrCount { count, .. 
} = fields_counts[pos]; - if count == 1 { - return usize::max_value(); - } - } - } - } - - count += found_exact as usize; - index += len; - } - - count -} - -#[derive(Debug, Clone, Copy)] pub struct Exact; impl Criterion for Exact { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let is_exact = lhs.is_exact(); - let attribute = lhs.attribute(); - let fields_counts = lhs.fields_counts.as_ref().unwrap(); + fn name(&self) -> &str { "exact" } - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; + fn prepare( + &self, + documents: &mut [RawDocument], + postings_lists: &mut SmallArena, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + for document in documents { + document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); + } + } - let rhs = { - let query_index = rhs.query_index(); - let is_exact = rhs.is_exact(); - let attribute = rhs.attribute(); - let fields_counts = rhs.fields_counts.as_ref().unwrap(); + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &SmallArena, + ) -> Ordering + { + #[inline] + fn sum_exact_query_words(matches: &[BareMatch]) -> usize { + let mut sum_exact_query_words = 0; - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; + for group in matches.linear_group_by_key(|bm| bm.query_index) { + sum_exact_query_words += group[0].is_exact as usize; + } + + sum_exact_query_words + } + + let lhs = sum_exact_query_words(&lhs.raw_matches); + let rhs = sum_exact_query_words(&rhs.raw_matches); lhs.cmp(&rhs).reverse() } - - fn name(&self) -> &str { - "Exact" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "soulier" - // - // doc0: "Soulier bleu" - // doc1: "souliereres rouge" - #[test] - fn easy_case() { - let doc0 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; - let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - let doc1 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[false]; - let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } - - // typing: "soulier" - // - // doc0: { 0. "soulier" } - // doc1: { 0. 
"soulier bleu et blanc" } - #[test] - fn basic() { - let doc0 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; - let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - let doc1 = { - let query_index = &[0]; - let attribute = &[0]; - let is_exact = &[true]; - let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap(); - - number_exact_matches(query_index, attribute, is_exact, fields_counts) - }; - - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } } diff --git a/meilisearch-core/src/criterion/mod.rs b/meilisearch-core/src/criterion/mod.rs index e94b1b2c7..0d54d89f2 100644 --- a/meilisearch-core/src/criterion/mod.rs +++ b/meilisearch-core/src/criterion/mod.rs @@ -1,58 +1,58 @@ -mod document_id; -mod exact; -mod number_of_words; -mod sort_by_attr; -mod sum_of_typos; -mod sum_of_words_attribute; -mod sum_of_words_position; -mod words_proximity; +use std::cmp::{self, Ordering}; +use compact_arena::SmallArena; +use sdset::SetBuf; +use slice_group_by::GroupBy; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; use crate::RawDocument; -use std::cmp::Ordering; -pub use self::{ - document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords, - sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos, - sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, - words_proximity::WordsProximity, -}; +mod typo; +mod words; +mod proximity; +mod attribute; +mod words_position; +mod exact; +mod document_id; +mod sort_by_attr; -pub trait Criterion: Send + Sync { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; +pub use self::typo::Typo; +pub use self::words::Words; +pub use self::proximity::Proximity; +pub use self::attribute::Attribute; +pub use self::words_position::WordsPosition; +pub use self::exact::Exact; +pub use self::document_id::DocumentId; +pub use self::sort_by_attr::SortByAttr; +pub trait Criterion { fn name(&self) -> &str; + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ); + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> Ordering; + #[inline] - fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { - self.evaluate(lhs, rhs) == Ordering::Equal - } -} - -impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - (**self).evaluate(lhs, rhs) - } - - fn name(&self) -> &str { - (**self).name() - } - - fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { - (**self).eq(lhs, rhs) - } -} - -impl Criterion for Box { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - (**self).evaluate(lhs, rhs) - } - - fn name(&self) -> &str { - (**self).name() - } - - fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { - (**self).eq(lhs, rhs) + fn eq<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> bool + { + self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal } } @@ -103,11 +103,11 @@ pub struct Criteria<'a> { 
impl<'a> Default for Criteria<'a> { fn default() -> Self { CriteriaBuilder::with_capacity(7) - .add(SumOfTypos) - .add(NumberOfWords) - .add(WordsProximity) - .add(SumOfWordsAttribute) - .add(SumOfWordsPosition) + .add(Typo) + .add(Words) + .add(Proximity) + .add(Attribute) + .add(WordsPosition) .add(Exact) .add(DocumentId) .build() @@ -119,3 +119,165 @@ impl<'a> AsRef<[Box]> for Criteria<'a> { &self.inner } } + +fn prepare_query_distances<'a, 'tag, 'txn>( + documents: &mut [RawDocument<'a, 'tag>], + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, +) { + for document in documents { + if !document.processed_distances.is_empty() { continue } + + let mut processed = Vec::new(); + for m in document.raw_matches.iter() { + if postings_lists[m.postings_list].is_empty() { continue } + + let range = query_enhancer.replacement(m.query_index as u32); + let new_len = cmp::max(range.end as usize, processed.len()); + processed.resize(new_len, None); + + for index in range { + let index = index as usize; + processed[index] = match processed[index] { + Some(distance) if distance > m.distance => Some(m.distance), + Some(distance) => Some(distance), + None => Some(m.distance), + }; + } + } + + document.processed_distances = processed; + } +} + +fn prepare_raw_matches<'a, 'tag, 'txn>( + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], +) { + for document in documents { + if !document.processed_matches.is_empty() { continue } + + let mut processed = Vec::new(); + for m in document.raw_matches.iter() { + let postings_list = &postings_lists[m.postings_list]; + processed.reserve(postings_list.len()); + for di in postings_list.as_ref() { + let simple_match = SimpleMatch { + query_index: m.query_index, + distance: m.distance, + attribute: di.attribute, + word_index: di.word_index, + is_exact: m.is_exact, + }; + processed.push(simple_match); + } + } + + let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); + document.processed_matches = processed.into_vec(); + } +} + +fn multiword_rewrite_matches( + matches: &mut [SimpleMatch], + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], +) -> SetBuf +{ + matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); + + let mut padded_matches = Vec::with_capacity(matches.len()); + + // let before_padding = Instant::now(); + // for each attribute of each document + for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { + // padding will only be applied + // to word indices in the same attribute + let mut padding = 0; + let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index); + + // for each match at the same position + // in this document attribute + while let Some(same_word_index) = iter.next() { + // find the biggest padding + let mut biggest = 0; + for match_ in same_word_index { + let mut replacement = query_enhancer.replacement(match_.query_index as u32); + let replacement_len = replacement.len(); + let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); + + if let Some(query_index) = replacement.next() { + let word_index = match_.word_index + padding as u16; + let query_index = query_index as u16; + let match_ = SimpleMatch { query_index, word_index, ..*match_ }; + padded_matches.push(match_); + } + + let mut found = false; + + // look ahead and if there 
already is a match
+                // corresponding to this padding word, abort the padding
+                'padding: for (x, next_group) in nexts.enumerate() {
+                    for (i, query_index) in replacement.clone().enumerate().skip(x) {
+                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                        let query_index = query_index as u16;
+                        let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
+
+                        for nmatch_ in next_group {
+                            let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
+                            let query_index = rep.next().unwrap() as u16;
+                            if query_index == padmatch.query_index {
+                                if !found {
+                                    // if we find a corresponding padding for the
+                                    // first time, we must push the preceding paddings
+                                    for (i, query_index) in replacement.clone().enumerate().take(i)
+                                    {
+                                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                                        let query_index = query_index as u16;
+                                        let match_ = SimpleMatch { query_index, word_index, ..*match_ };
+                                        padded_matches.push(match_);
+                                        biggest = biggest.max(i + 1);
+                                    }
+                                }
+
+                                padded_matches.push(padmatch);
+                                found = true;
+                                continue 'padding;
+                            }
+                        }
+                    }
+
+                    // if we do not find a corresponding padding in the
+                    // next groups, we stop here and pad what was found
+                    break;
+                }
+
+                if !found {
+                    // if no padding was found in the following matches,
+                    // we must insert the entire padding
+                    for (i, query_index) in replacement.enumerate() {
+                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                        let query_index = query_index as u16;
+                        let match_ = SimpleMatch { query_index, word_index, ..*match_ };
+                        padded_matches.push(match_);
+                    }
+
+                    biggest = biggest.max(replacement_len - 1);
+                }
+            }
+
+            padding += biggest;
+        }
+    }
+
+    // debug!("padding matches took {:.02?}", before_padding.elapsed());
+
+    // With this check we can see that the loop above takes something
+    // like 43% of the search time even when no rewrite is needed.
+ // assert_eq!(before_matches, padded_matches); + + SetBuf::from_dirty(padded_matches) +} diff --git a/meilisearch-core/src/criterion/number_of_words.rs b/meilisearch-core/src/criterion/number_of_words.rs deleted file mode 100644 index 6c1218e2f..000000000 --- a/meilisearch-core/src/criterion/number_of_words.rs +++ /dev/null @@ -1,31 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::Ordering; - -#[inline] -fn number_of_query_words(query_index: &[u32]) -> usize { - query_index.linear_group().count() -} - -#[derive(Debug, Clone, Copy)] -pub struct NumberOfWords; - -impl Criterion for NumberOfWords { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - number_of_query_words(query_index) - }; - let rhs = { - let query_index = rhs.query_index(); - number_of_query_words(query_index) - }; - - lhs.cmp(&rhs).reverse() - } - - fn name(&self) -> &str { - "NumberOfWords" - } -} diff --git a/meilisearch-core/src/criterion/proximity.rs b/meilisearch-core/src/criterion/proximity.rs new file mode 100644 index 000000000..c9c534ca8 --- /dev/null +++ b/meilisearch-core/src/criterion/proximity.rs @@ -0,0 +1,79 @@ +use std::cmp::{self, Ordering}; + +use compact_arena::SmallArena; +use slice_group_by::GroupBy; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton}; +use crate::RawDocument; + +use super::{Criterion, prepare_raw_matches}; + +pub struct Proximity; + +impl Criterion for Proximity { + fn name(&self) -> &str { "proximity" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); + } + + fn evaluate<'a, 'tag, 'txn>( + &self, + lhs: &RawDocument<'a, 'tag>, + rhs: &RawDocument<'a, 'tag>, + postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ) -> Ordering + { + const MAX_DISTANCE: u16 = 8; + + fn index_proximity(lhs: u16, rhs: u16) -> u16 { + if lhs < rhs { + cmp::min(rhs - lhs, MAX_DISTANCE) + } else { + cmp::min(lhs - rhs, MAX_DISTANCE) + 1 + } + } + + fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 { + if lhs.attribute != rhs.attribute { MAX_DISTANCE } + else { index_proximity(lhs.word_index, rhs.word_index) } + } + + fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 { + let mut min_prox = u16::max_value(); + for a in lhs { + for b in rhs { + let prox = attribute_proximity(*a, *b); + min_prox = cmp::min(min_prox, prox); + } + } + min_prox + } + + fn matches_proximity(matches: &[SimpleMatch],) -> u16 { + let mut proximity = 0; + let mut iter = matches.linear_group_by_key(|m| m.query_index); + + // iterate over groups by windows of size 2 + let mut last = iter.next(); + while let (Some(lhs), Some(rhs)) = (last, iter.next()) { + proximity += min_proximity(lhs, rhs); + last = Some(rhs); + } + + proximity + } + + let lhs = matches_proximity(&lhs.processed_matches); + let rhs = matches_proximity(&rhs.processed_matches); + + lhs.cmp(&rhs) + } +} diff --git a/meilisearch-core/src/criterion/sort_by_attr.rs b/meilisearch-core/src/criterion/sort_by_attr.rs index 89595e5a5..ea1c016da 100644 --- a/meilisearch-core/src/criterion/sort_by_attr.rs +++ b/meilisearch-core/src/criterion/sort_by_attr.rs @@ -2,9 +2,13 @@ use 
std::cmp::Ordering;
 use std::error::Error;
 use std::fmt;
+use compact_arena::SmallArena;
+use meilisearch_schema::{Schema, SchemaAttr};
+
+use crate::automaton::QueryEnhancer;
+use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
 use crate::criterion::Criterion;
 use crate::{RankedMap, RawDocument};
-use meilisearch_schema::{Schema, SchemaAttr};
 
 /// A helper struct that permits sorting documents by
 /// some of their stored attributes.
@@ -28,11 +32,11 @@
 /// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
 ///
 /// let builder = CriteriaBuilder::with_capacity(8)
-/// .add(SumOfTypos)
-/// .add(NumberOfWords)
-/// .add(WordsProximity)
-/// .add(SumOfWordsAttribute)
-/// .add(SumOfWordsPosition)
+/// .add(Typo)
+/// .add(Words)
+/// .add(Proximity)
+/// .add(Attribute)
+/// .add(WordsPosition)
 /// .add(Exact)
 /// .add(custom_ranking)
 /// .add(DocumentId);
@@ -86,8 +90,28 @@ impl<'a> SortByAttr<'a> {
     }
 }
 
-impl<'a> Criterion for SortByAttr<'a> {
-    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+impl Criterion for SortByAttr<'_> {
+    fn name(&self) -> &str {
+        "sort by attribute"
+    }
+
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
+        query_enhancer: &QueryEnhancer,
+        automatons: &[QueryWordAutomaton],
+    ) {
+        // ...
+    }
+
+    fn evaluate<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
+    ) -> Ordering
+    {
         let lhs = self.ranked_map.get(lhs.id, self.attr);
         let rhs = self.ranked_map.get(rhs.id, self.attr);
 
@@ -105,10 +129,6 @@ impl<'a> Criterion for SortByAttr<'a> {
             (None, None) => Ordering::Equal,
         }
     }
-
-    fn name(&self) -> &str {
-        "SortByAttr"
-    }
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
diff --git a/meilisearch-core/src/criterion/sum_of_typos.rs b/meilisearch-core/src/criterion/sum_of_typos.rs
deleted file mode 100644
index 5cad73b42..000000000
--- a/meilisearch-core/src/criterion/sum_of_typos.rs
+++ /dev/null
@@ -1,116 +0,0 @@
-use std::cmp::Ordering;
-
-use slice_group_by::GroupBy;
-
-use crate::criterion::Criterion;
-use crate::RawDocument;
-
-// This function is a wrong logarithmic 10 function.
-// It is safe to panic on input number higher than 3,
-// the number of typos is never bigger than that.
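// [Editor's note: an illustrative sketch, not part of the patch series.] The
// deleted sum_of_typos criterion below scores documents with a four-entry
// base-10 logarithm lookup table (the new typo criterion at the end of this
// patch carries the same comment): more matched words raise the score and
// more typos lower it. This is a standalone reproduction of that formula.
fn custom_log10(n: u8) -> f32 {
    match n {
        0 => 0.0,     // log10(1)
        1 => 0.30102, // log10(2)
        2 => 0.47712, // log10(3)
        3 => 0.60205, // log10(4)
        _ => panic!("invalid number of typos"),
    }
}

// one entry per matched query word, giving its typo count
fn typo_score(typos_per_word: &[u8]) -> usize {
    let number_words = typos_per_word.len() as f32;
    let sum_typos: f32 = typos_per_word.iter().map(|&n| custom_log10(n)).sum();
    (number_words / (sum_typos + 1.0) * 1000.0) as usize
}

fn main() {
    // two exact words (score 2000) outrank the same two words with one typo (~1537)
    assert!(typo_score(&[0, 0]) > typo_score(&[0, 1]));
}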
-#[inline] -fn custom_log10(n: u8) -> f32 { - match n { - 0 => 0.0, // log(1) - 1 => 0.30102, // log(2) - 2 => 0.47712, // log(3) - 3 => 0.60205, // log(4) - _ => panic!("invalid number"), - } -} - -#[inline] -fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize { - let mut number_words: usize = 0; - let mut sum_typos = 0.0; - let mut index = 0; - - for group in query_index.linear_group() { - sum_typos += custom_log10(distance[index]); - number_words += 1; - index += group.len(); - } - - (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize -} - -#[derive(Debug, Clone, Copy)] -pub struct SumOfTypos; - -impl Criterion for SumOfTypos { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let distance = lhs.distance(); - sum_matches_typos(query_index, distance) - }; - - let rhs = { - let query_index = rhs.query_index(); - let distance = rhs.distance(); - sum_matches_typos(query_index, distance) - }; - - lhs.cmp(&rhs).reverse() - } - - fn name(&self) -> &str { - "SumOfTypos" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "Geox CEO" - // - // doc0: "Geox SpA: CEO and Executive" - // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation" - #[test] - fn one_typo_reference() { - let query_index0 = &[0, 1]; - let distance0 = &[0, 0]; - - let query_index1 = &[0, 1]; - let distance1 = &[1, 0]; - - let doc0 = sum_matches_typos(query_index0, distance0); - let doc1 = sum_matches_typos(query_index1, distance1); - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } - - // typing: "bouton manchette" - // - // doc0: "bouton manchette" - // doc1: "bouton" - #[test] - fn no_typo() { - let query_index0 = &[0, 1]; - let distance0 = &[0, 0]; - - let query_index1 = &[0]; - let distance1 = &[0]; - - let doc0 = sum_matches_typos(query_index0, distance0); - let doc1 = sum_matches_typos(query_index1, distance1); - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } - - // typing: "bouton manchztte" - // - // doc0: "bouton manchette" - // doc1: "bouton" - #[test] - fn one_typo() { - let query_index0 = &[0, 1]; - let distance0 = &[0, 1]; - - let query_index1 = &[0]; - let distance1 = &[0]; - - let doc0 = sum_matches_typos(query_index0, distance0); - let doc1 = sum_matches_typos(query_index1, distance1); - assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); - } -} diff --git a/meilisearch-core/src/criterion/sum_of_words_attribute.rs b/meilisearch-core/src/criterion/sum_of_words_attribute.rs deleted file mode 100644 index 472d771b7..000000000 --- a/meilisearch-core/src/criterion/sum_of_words_attribute.rs +++ /dev/null @@ -1,64 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::Ordering; - -#[inline] -fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { - let mut sum_attributes = 0; - let mut index = 0; - - for group in query_index.linear_group() { - sum_attributes += attribute[index] as usize; - index += group.len(); - } - - sum_attributes -} - -#[derive(Debug, Clone, Copy)] -pub struct SumOfWordsAttribute; - -impl Criterion for SumOfWordsAttribute { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let attribute = lhs.attribute(); - sum_matches_attributes(query_index, attribute) - }; - - let rhs = { - let query_index = rhs.query_index(); - let attribute = rhs.attribute(); - sum_matches_attributes(query_index, attribute) - }; - - 
lhs.cmp(&rhs) - } - - fn name(&self) -> &str { - "SumOfWordsAttribute" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "soulier" - // - // doc0: { 0. "Soulier bleu", 1. "bla bla bla" } - // doc1: { 0. "Botte rouge", 1. "Soulier en cuir" } - #[test] - fn title_vs_description() { - let query_index0 = &[0]; - let attribute0 = &[0]; - - let query_index1 = &[0]; - let attribute1 = &[1]; - - let doc0 = sum_matches_attributes(query_index0, attribute0); - let doc1 = sum_matches_attributes(query_index1, attribute1); - assert_eq!(doc0.cmp(&doc1), Ordering::Less); - } -} diff --git a/meilisearch-core/src/criterion/sum_of_words_position.rs b/meilisearch-core/src/criterion/sum_of_words_position.rs deleted file mode 100644 index 70b8843dc..000000000 --- a/meilisearch-core/src/criterion/sum_of_words_position.rs +++ /dev/null @@ -1,64 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::Ordering; - -#[inline] -fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { - let mut sum_word_index = 0; - let mut index = 0; - - for group in query_index.linear_group() { - sum_word_index += word_index[index] as usize; - index += group.len(); - } - - sum_word_index -} - -#[derive(Debug, Clone, Copy)] -pub struct SumOfWordsPosition; - -impl Criterion for SumOfWordsPosition { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let word_index = lhs.word_index(); - sum_matches_attribute_index(query_index, word_index) - }; - - let rhs = { - let query_index = rhs.query_index(); - let word_index = rhs.word_index(); - sum_matches_attribute_index(query_index, word_index) - }; - - lhs.cmp(&rhs) - } - - fn name(&self) -> &str { - "SumOfWordsPosition" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // typing: "soulier" - // - // doc0: "Soulier bleu" - // doc1: "Botte rouge et soulier noir" - #[test] - fn easy_case() { - let query_index0 = &[0]; - let word_index0 = &[0]; - - let query_index1 = &[0]; - let word_index1 = &[3]; - - let doc0 = sum_matches_attribute_index(query_index0, word_index0); - let doc1 = sum_matches_attribute_index(query_index1, word_index1); - assert_eq!(doc0.cmp(&doc1), Ordering::Less); - } -} diff --git a/meilisearch-core/src/criterion/typo.rs b/meilisearch-core/src/criterion/typo.rs new file mode 100644 index 000000000..d7907700d --- /dev/null +++ b/meilisearch-core/src/criterion/typo.rs @@ -0,0 +1,67 @@ +use std::cmp::Ordering; + +use compact_arena::SmallArena; + +use crate::automaton::QueryEnhancer; +use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; +use crate::RawDocument; + +use super::{Criterion, prepare_query_distances}; + +pub struct Typo; + +impl Criterion for Typo { + fn name(&self) -> &str { "typo" } + + fn prepare<'a, 'tag, 'txn>( + &self, + documents: &mut [RawDocument<'a, 'tag>], + postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, + query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], + ) { + prepare_query_distances(documents, query_enhancer, automatons, postings_lists); + } + + fn evaluate( + &self, + lhs: &RawDocument, + rhs: &RawDocument, + postings_lists: &SmallArena, + ) -> Ordering + { + // This function is a wrong logarithmic 10 function. + // It is safe to panic on input number higher than 3, + // the number of typos is never bigger than that. 
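To make the score computed below concrete: with two matched query words and no typos, compute_typos yields 2.0 / 1.0 * 1000 = 2000, while one word at distance 1 yields 2.0 / 1.30102 * 1000 ≈ 1537, and the final .reverse() ranks the higher score (fewer typos) first. A standalone sketch of the same formula (illustrative, not part of the patch):

    fn score(number_words: usize, sum_typos: f32) -> usize {
        // mirrors compute_typos below: more words and fewer typos score higher
        (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
    }

    fn main() {
        assert_eq!(score(2, 0.0), 2000);     // two matched words, no typo
        assert_eq!(score(2, 0.30102), 1537); // one of the two words has one typo
    }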
+        #[inline]
+        fn custom_log10(n: u8) -> f32 {
+            match n {
+                0 => 0.0, // log(1)
+                1 => 0.30102, // log(2)
+                2 => 0.47712, // log(3)
+                3 => 0.60205, // log(4)
+                _ => panic!("invalid number"),
+            }
+        }
+
+        #[inline]
+        fn compute_typos(distances: &[Option<u8>]) -> usize {
+            let mut number_words: usize = 0;
+            let mut sum_typos = 0.0;
+
+            for distance in distances {
+                if let Some(distance) = distance {
+                    sum_typos += custom_log10(*distance);
+                    number_words += 1;
+                }
+            }
+
+            (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
+        }
+
+        let lhs = compute_typos(&lhs.processed_distances);
+        let rhs = compute_typos(&rhs.processed_distances);
+
+        lhs.cmp(&rhs).reverse()
+    }
+}
diff --git a/meilisearch-core/src/criterion/words.rs b/meilisearch-core/src/criterion/words.rs
new file mode 100644
index 000000000..fbe3d9070
--- /dev/null
+++ b/meilisearch-core/src/criterion/words.rs
@@ -0,0 +1,43 @@
+use std::cmp::Ordering;
+
+use compact_arena::SmallArena;
+
+use crate::automaton::QueryEnhancer;
+use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
+use crate::RawDocument;
+
+use super::{Criterion, prepare_query_distances};
+
+pub struct Words;
+
+impl Criterion for Words {
+    fn name(&self) -> &str { "words" }
+
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
+        query_enhancer: &QueryEnhancer,
+        automatons: &[QueryWordAutomaton],
+    ) {
+        prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
+    }
+
+    fn evaluate(
+        &self,
+        lhs: &RawDocument,
+        rhs: &RawDocument,
+        postings_lists: &SmallArena<PostingsListView>,
+    ) -> Ordering
+    {
+        #[inline]
+        fn number_of_query_words(distances: &[Option<u8>]) -> usize {
+            distances.iter().cloned().filter(Option::is_some).count()
+        }
+
+        let lhs = number_of_query_words(&lhs.processed_distances);
+        let rhs = number_of_query_words(&rhs.processed_distances);
+
+        lhs.cmp(&rhs).reverse()
+    }
+}
diff --git a/meilisearch-core/src/criterion/words_position.rs b/meilisearch-core/src/criterion/words_position.rs
new file mode 100644
index 000000000..7df3e1fbd
--- /dev/null
+++ b/meilisearch-core/src/criterion/words_position.rs
@@ -0,0 +1,48 @@
+use std::cmp::Ordering;
+
+use compact_arena::SmallArena;
+use slice_group_by::GroupBy;
+
+use crate::automaton::QueryEnhancer;
+use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton};
+use crate::RawDocument;
+
+use super::{Criterion, prepare_raw_matches};
+
+pub struct WordsPosition;
+
+impl Criterion for WordsPosition {
+    fn name(&self) -> &str { "words position" }
+
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
+        query_enhancer: &QueryEnhancer,
+        automatons: &[QueryWordAutomaton],
+    ) {
+        prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
+    }
+
+    fn evaluate<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
+    ) -> Ordering
+    {
+        #[inline]
+        fn sum_words_position(matches: &[SimpleMatch]) -> usize {
+            let mut sum_words_position = 0;
+            for group in matches.linear_group_by_key(|bm| bm.query_index) {
+                sum_words_position += group[0].word_index as usize;
+            }
+            sum_words_position
+        }
+
+        let lhs = sum_words_position(&lhs.processed_matches);
+        let rhs = sum_words_position(&rhs.processed_matches);
+
+        lhs.cmp(&rhs)
+    }
+}
diff --git a/meilisearch-core/src/criterion/words_proximity.rs
b/meilisearch-core/src/criterion/words_proximity.rs deleted file mode 100644 index 579bc7b8c..000000000 --- a/meilisearch-core/src/criterion/words_proximity.rs +++ /dev/null @@ -1,164 +0,0 @@ -use crate::criterion::Criterion; -use crate::RawDocument; -use slice_group_by::GroupBy; -use std::cmp::{self, Ordering}; - -const MAX_DISTANCE: u16 = 8; - -#[inline] -fn clone_tuple((a, b): (&T, &U)) -> (T, U) { - (a.clone(), b.clone()) -} - -fn index_proximity(lhs: u16, rhs: u16) -> u16 { - if lhs < rhs { - cmp::min(rhs - lhs, MAX_DISTANCE) - } else { - cmp::min(lhs - rhs, MAX_DISTANCE) + 1 - } -} - -fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 { - if lattr != rattr { - return MAX_DISTANCE; - } - index_proximity(lwi, rwi) -} - -fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 { - let mut min_prox = u16::max_value(); - - for a in lattr.iter().zip(lwi) { - for b in rattr.iter().zip(rwi) { - let a = clone_tuple(a); - let b = clone_tuple(b); - min_prox = cmp::min(min_prox, attribute_proximity(a, b)); - } - } - - min_prox -} - -fn matches_proximity( - query_index: &[u32], - distance: &[u8], - attribute: &[u16], - word_index: &[u16], -) -> u16 { - let mut query_index_groups = query_index.linear_group(); - let mut proximity = 0; - let mut index = 0; - - let get_attr_wi = |index: usize, group_len: usize| { - // retrieve the first distance group (with the lowest values) - let len = distance[index..index + group_len] - .linear_group() - .next() - .unwrap() - .len(); - - let rattr = &attribute[index..index + len]; - let rwi = &word_index[index..index + len]; - - (rattr, rwi) - }; - - let mut last = query_index_groups.next().map(|group| { - let attr_wi = get_attr_wi(index, group.len()); - index += group.len(); - attr_wi - }); - - // iter by windows of size 2 - while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) { - let attr_wi = get_attr_wi(index, rhs.len()); - proximity += min_proximity(lhs, attr_wi); - last = Some(attr_wi); - index += rhs.len(); - } - - proximity -} - -#[derive(Debug, Clone, Copy)] -pub struct WordsProximity; - -impl Criterion for WordsProximity { - fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { - let lhs = { - let query_index = lhs.query_index(); - let distance = lhs.distance(); - let attribute = lhs.attribute(); - let word_index = lhs.word_index(); - matches_proximity(query_index, distance, attribute, word_index) - }; - - let rhs = { - let query_index = rhs.query_index(); - let distance = rhs.distance(); - let attribute = rhs.attribute(); - let word_index = rhs.word_index(); - matches_proximity(query_index, distance, attribute, word_index) - }; - - lhs.cmp(&rhs) - } - - fn name(&self) -> &str { - "WordsProximity" - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn three_different_attributes() { - // "soup" "of the" "the day" - // - // { id: 0, attr: 0, attr_index: 0 } - // { id: 1, attr: 1, attr_index: 0 } - // { id: 2, attr: 1, attr_index: 1 } - // { id: 2, attr: 2, attr_index: 0 } - // { id: 3, attr: 3, attr_index: 1 } - - let query_index = &[0, 1, 2, 2, 3]; - let distance = &[0, 0, 0, 0, 0]; - let attribute = &[0, 1, 1, 2, 3]; - let word_index = &[0, 0, 1, 0, 1]; - - // soup -> of = 8 - // + of -> the = 1 - // + the -> day = 8 (not 1) - assert_eq!( - matches_proximity(query_index, distance, attribute, word_index), - 17 - ); - } - - #[test] - fn two_different_attributes() { - // "soup day" "soup of the day" - // - // { id: 0, attr: 0, attr_index: 0 } - // { 
id: 0, attr: 1, attr_index: 0 } - // { id: 1, attr: 1, attr_index: 1 } - // { id: 2, attr: 1, attr_index: 2 } - // { id: 3, attr: 0, attr_index: 1 } - // { id: 3, attr: 1, attr_index: 3 } - - let query_index = &[0, 0, 1, 2, 3, 3]; - let distance = &[0, 0, 0, 0, 0, 0]; - let attribute = &[0, 1, 1, 1, 0, 1]; - let word_index = &[0, 0, 1, 2, 1, 3]; - - // soup -> of = 1 - // + of -> the = 1 - // + the -> day = 1 - assert_eq!( - matches_proximity(query_index, distance, attribute, word_index), - 3 - ); - } -} diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs deleted file mode 100644 index a82dbf123..000000000 --- a/meilisearch-core/src/criterion2.rs +++ /dev/null @@ -1,514 +0,0 @@ -use std::cmp::{self, Ordering, Reverse}; -use std::borrow::Cow; -use std::sync::atomic::{self, AtomicUsize}; - -use slice_group_by::{GroupBy, GroupByMut}; -use compact_arena::SmallArena; -use sdset::{Set, SetBuf}; -use log::debug; - -use crate::{DocIndex, DocumentId}; -use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView, QueryWordAutomaton}; -use crate::automaton::QueryEnhancer; - -type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>; - -pub trait Criterion { - fn name(&self) -> &str; - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ); - - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &PostingsListsArena<'tag, 'txn>, - ) -> Ordering; - - #[inline] - fn eq<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &PostingsListsArena<'tag, 'txn>, - ) -> bool - { - self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal - } -} - -fn prepare_query_distances<'a, 'tag, 'txn>( - documents: &mut [RawDocument<'a, 'tag>], - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - postings_lists: &PostingsListsArena<'tag, 'txn>, -) { - for document in documents { - if !document.processed_distances.is_empty() { continue } - - let mut processed = Vec::new(); - for m in document.raw_matches.iter() { - if postings_lists[m.postings_list].is_empty() { continue } - - let range = query_enhancer.replacement(m.query_index as u32); - let new_len = cmp::max(range.end as usize, processed.len()); - processed.resize(new_len, None); - - for index in range { - let index = index as usize; - processed[index] = match processed[index] { - Some(distance) if distance > m.distance => Some(m.distance), - Some(distance) => Some(distance), - None => Some(m.distance), - }; - } - } - - document.processed_distances = processed; - } -} - -pub struct Typo; - -impl Criterion for Typo { - fn name(&self) -> &str { "typo" } - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - prepare_query_distances(documents, query_enhancer, automatons, postings_lists); - } - - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &PostingsListsArena, - ) -> Ordering - { - // This function is a wrong logarithmic 10 function. - // It is safe to panic on input number higher than 3, - // the number of typos is never bigger than that. 
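The processed_distances vector built by prepare_query_distances above keeps, per enhanced query index, the smallest distance seen for that word; a sketch of that merge rule (illustrative, not part of the patch):

    fn merge(slot: Option<u8>, new: u8) -> Option<u8> {
        // keep the smallest distance observed for a query-word slot
        match slot {
            Some(d) if d > new => Some(new),
            Some(d) => Some(d),
            None => Some(new),
        }
    }

    fn main() {
        assert_eq!(merge(None, 2), Some(2));
        assert_eq!(merge(Some(1), 2), Some(1));
        assert_eq!(merge(Some(3), 0), Some(0));
    }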
- #[inline] - fn custom_log10(n: u8) -> f32 { - match n { - 0 => 0.0, // log(1) - 1 => 0.30102, // log(2) - 2 => 0.47712, // log(3) - 3 => 0.60205, // log(4) - _ => panic!("invalid number"), - } - } - - #[inline] - fn compute_typos(distances: &[Option]) -> usize { - let mut number_words: usize = 0; - let mut sum_typos = 0.0; - - for distance in distances { - if let Some(distance) = distance { - sum_typos += custom_log10(*distance); - number_words += 1; - } - } - - (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize - } - - let lhs = compute_typos(&lhs.processed_distances); - let rhs = compute_typos(&rhs.processed_distances); - - lhs.cmp(&rhs).reverse() - } -} - -pub struct Words; - -impl Criterion for Words { - fn name(&self) -> &str { "words" } - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - prepare_query_distances(documents, query_enhancer, automatons, postings_lists); - } - - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &PostingsListsArena, - ) -> Ordering - { - #[inline] - fn number_of_query_words(distances: &[Option]) -> usize { - distances.iter().cloned().filter(Option::is_some).count() - } - - let lhs = number_of_query_words(&lhs.processed_distances); - let rhs = number_of_query_words(&rhs.processed_distances); - - lhs.cmp(&rhs).reverse() - } -} - -fn prepare_raw_matches<'a, 'tag, 'txn>( - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], -) { - for document in documents { - if !document.processed_matches.is_empty() { continue } - - let mut processed = Vec::new(); - for m in document.raw_matches.iter() { - let postings_list = &postings_lists[m.postings_list]; - processed.reserve(postings_list.len()); - for di in postings_list.as_ref() { - let simple_match = SimpleMatch { - query_index: m.query_index, - distance: m.distance, - attribute: di.attribute, - word_index: di.word_index, - is_exact: m.is_exact, - }; - processed.push(simple_match); - } - } - - let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); - document.processed_matches = processed.into_vec(); - } -} - -pub struct Proximity; - -impl Criterion for Proximity { - fn name(&self) -> &str { "proximity" } - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); - } - - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &PostingsListsArena<'tag, 'txn>, - ) -> Ordering - { - const MAX_DISTANCE: u16 = 8; - - fn index_proximity(lhs: u16, rhs: u16) -> u16 { - if lhs < rhs { - cmp::min(rhs - lhs, MAX_DISTANCE) - } else { - cmp::min(lhs - rhs, MAX_DISTANCE) + 1 - } - } - - fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 { - if lhs.attribute != rhs.attribute { MAX_DISTANCE } - else { index_proximity(lhs.word_index, rhs.word_index) } - } - - fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 { - let mut min_prox = u16::max_value(); - for a in lhs { - for b in rhs { - let prox = attribute_proximity(*a, *b); - min_prox = cmp::min(min_prox, prox); - } 
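A quick check of the index_proximity helper above: in-order pairs cost the raw distance, reversed pairs cost one more, and everything is capped at MAX_DISTANCE (illustrative, not part of the patch):

    const MAX_DISTANCE: u16 = 8;

    // same shape as index_proximity above
    fn index_proximity(lhs: u16, rhs: u16) -> u16 {
        if lhs < rhs {
            std::cmp::min(rhs - lhs, MAX_DISTANCE)
        } else {
            std::cmp::min(lhs - rhs, MAX_DISTANCE) + 1
        }
    }

    fn main() {
        assert_eq!(index_proximity(3, 4), 1);  // "new york" in order
        assert_eq!(index_proximity(4, 3), 2);  // reversed order costs one more
        assert_eq!(index_proximity(0, 20), 8); // capped at MAX_DISTANCE
    }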
- } - min_prox - } - - fn matches_proximity(matches: &[SimpleMatch],) -> u16 { - let mut proximity = 0; - let mut iter = matches.linear_group_by_key(|m| m.query_index); - - // iterate over groups by windows of size 2 - let mut last = iter.next(); - while let (Some(lhs), Some(rhs)) = (last, iter.next()) { - proximity += min_proximity(lhs, rhs); - last = Some(rhs); - } - - proximity - } - - let lhs = matches_proximity(&lhs.processed_matches); - let rhs = matches_proximity(&rhs.processed_matches); - - lhs.cmp(&rhs) - } -} - -pub struct Attribute; - -impl Criterion for Attribute { - fn name(&self) -> &str { "attribute" } - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); - } - - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &PostingsListsArena<'tag, 'txn>, - ) -> Ordering - { - #[inline] - fn best_attribute(matches: &[SimpleMatch]) -> u16 { - let mut best_attribute = u16::max_value(); - for group in matches.linear_group_by_key(|bm| bm.query_index) { - best_attribute = cmp::min(best_attribute, group[0].attribute); - } - best_attribute - } - - let lhs = best_attribute(&lhs.processed_matches); - let rhs = best_attribute(&rhs.processed_matches); - - lhs.cmp(&rhs) - } -} - -pub struct WordsPosition; - -impl Criterion for WordsPosition { - fn name(&self) -> &str { "words position" } - - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut PostingsListsArena<'tag, 'txn>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); - } - - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &PostingsListsArena<'tag, 'txn>, - ) -> Ordering - { - #[inline] - fn sum_words_position(matches: &[SimpleMatch]) -> usize { - let mut sum_words_position = 0; - for group in matches.linear_group_by_key(|bm| bm.query_index) { - sum_words_position += group[0].word_index as usize; - } - sum_words_position - } - - let lhs = sum_words_position(&lhs.processed_matches); - let rhs = sum_words_position(&rhs.processed_matches); - - lhs.cmp(&rhs) - } -} - -pub struct Exact; - -impl Criterion for Exact { - fn name(&self) -> &str { "exact" } - - fn prepare( - &self, - documents: &mut [RawDocument], - postings_lists: &mut PostingsListsArena, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - for document in documents { - document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); - } - } - - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &PostingsListsArena, - ) -> Ordering - { - #[inline] - fn sum_exact_query_words(matches: &[BareMatch]) -> usize { - let mut sum_exact_query_words = 0; - - for group in matches.linear_group_by_key(|bm| bm.query_index) { - sum_exact_query_words += group[0].is_exact as usize; - } - - sum_exact_query_words - } - - let lhs = sum_exact_query_words(&lhs.raw_matches); - let rhs = sum_exact_query_words(&rhs.raw_matches); - - lhs.cmp(&rhs).reverse() - } -} - -pub struct StableDocId; - -impl Criterion for StableDocId { - fn name(&self) -> &str { "stable document id" } - - fn prepare( - &self, - 
documents: &mut [RawDocument], - postings_lists: &mut PostingsListsArena, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - // ... - } - - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &PostingsListsArena, - ) -> Ordering - { - let lhs = &lhs.raw_matches[0].document_id; - let rhs = &rhs.raw_matches[0].document_id; - - lhs.cmp(rhs) - } -} - -pub fn multiword_rewrite_matches( - matches: &mut [SimpleMatch], - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], -) -> SetBuf -{ - matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); - - let mut padded_matches = Vec::with_capacity(matches.len()); - - // let before_padding = Instant::now(); - // for each attribute of each document - for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { - // padding will only be applied - // to word indices in the same attribute - let mut padding = 0; - let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index); - - // for each match at the same position - // in this document attribute - while let Some(same_word_index) = iter.next() { - // find the biggest padding - let mut biggest = 0; - for match_ in same_word_index { - let mut replacement = query_enhancer.replacement(match_.query_index as u32); - let replacement_len = replacement.len(); - let nexts = iter.remainder().linear_group_by_key(|m| m.word_index); - - if let Some(query_index) = replacement.next() { - let word_index = match_.word_index + padding as u16; - let query_index = query_index as u16; - let match_ = SimpleMatch { query_index, word_index, ..*match_ }; - padded_matches.push(match_); - } - - let mut found = false; - - // look ahead and if there already is a match - // corresponding to this padding word, abort the padding - 'padding: for (x, next_group) in nexts.enumerate() { - for (i, query_index) in replacement.clone().enumerate().skip(x) { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let query_index = query_index as u16; - let padmatch = SimpleMatch { query_index, word_index, ..*match_ }; - - for nmatch_ in next_group { - let mut rep = query_enhancer.replacement(nmatch_.query_index as u32); - let query_index = rep.next().unwrap() as u16; - if query_index == padmatch.query_index { - if !found { - // if we find a corresponding padding for the - // first time we must push preceding paddings - for (i, query_index) in replacement.clone().enumerate().take(i) - { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let query_index = query_index as u16; - let match_ = SimpleMatch { query_index, word_index, ..*match_ }; - padded_matches.push(match_); - biggest = biggest.max(i + 1); - } - } - - padded_matches.push(padmatch); - found = true; - continue 'padding; - } - } - } - - // if we do not find a corresponding padding in the - // next groups so stop here and pad what was found - break; - } - - if !found { - // if no padding was found in the following matches - // we must insert the entire padding - for (i, query_index) in replacement.enumerate() { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let query_index = query_index as u16; - let match_ = SimpleMatch { query_index, word_index, ..*match_ }; - padded_matches.push(match_); - } - - biggest = biggest.max(replacement_len - 1); - } - } - - padding += biggest; - } - } - - // debug!("padding matches took {:.02?}", before_padding.elapsed()); - - // With this check we can see that the loop above takes 
something - // like 43% of the search time even when no rewrite is needed. - // assert_eq!(before_matches, padded_matches); - - SetBuf::from_dirty(padded_matches) -} diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index 3a54168b4..01fb05372 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -20,7 +20,6 @@ mod update; // TODO replace mod bucket_sort; -mod criterion2; pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT}; pub use self::error::{Error, MResult}; @@ -31,62 +30,13 @@ pub use self::store::Index; pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount}; -#[doc(hidden)] -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct TmpMatch { - pub query_index: u32, - pub distance: u8, - pub attribute: u16, - pub word_index: u16, - pub is_exact: bool, -} - #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Document { pub id: DocumentId, pub highlights: Vec, - #[cfg(test)] - pub matches: Vec, -} - -impl Document { - #[cfg(not(test))] - fn from_raw(raw: RawDocument) -> Document { - Document { - id: raw.id, - highlights: raw.highlights, - } - } - - #[cfg(test)] - fn from_raw(raw: RawDocument) -> Document { - let len = raw.query_index().len(); - let mut matches = Vec::with_capacity(len); - - let query_index = raw.query_index(); - let distance = raw.distance(); - let attribute = raw.attribute(); - let word_index = raw.word_index(); - let is_exact = raw.is_exact(); - - for i in 0..len { - let match_ = TmpMatch { - query_index: query_index[i], - distance: distance[i], - attribute: attribute[i], - word_index: word_index[i], - is_exact: is_exact[i], - }; - matches.push(match_); - } - - Document { - id: raw.id, - matches, - highlights: raw.highlights, - } - } + // #[cfg(test)] + // pub matches: Vec, } #[cfg(test)] diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index c862ae2a2..3a9750ec0 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -1,21 +1,8 @@ -use hashbrown::HashMap; -use std::convert::TryFrom; use std::ops::Range; -use std::rc::Rc; -use std::time::{Duration, Instant}; -use std::{cmp, mem}; - -use fst::{IntoStreamer, Streamer}; -use log::debug; -use sdset::SetBuf; -use slice_group_by::{GroupBy, GroupByMut}; +use std::time::Duration; use crate::{bucket_sort::bucket_sort, database::MainT}; -use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer}; -use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; -use crate::levenshtein::prefix_damerau_levenshtein; -use crate::raw_document::{raw_documents_from, RawDocument}; -use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount}; +use crate::{criterion::Criteria, Document, DocumentId}; use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; pub struct QueryBuilder<'c, 'f, 'd> { @@ -30,292 +17,6 @@ pub struct QueryBuilder<'c, 'f, 'd> { synonyms_store: store::Synonyms, } -fn multiword_rewrite_matches( - mut matches: Vec<(DocumentId, TmpMatch)>, - query_enhancer: &QueryEnhancer, -) -> SetBuf<(DocumentId, TmpMatch)> { - let mut padded_matches = Vec::with_capacity(matches.len()); - - let before_sort = Instant::now(); - // we sort the matches by word index to make them rewritable - matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); - debug!("sorting 
dirty matches took {:.02?}", before_sort.elapsed()); - - let before_padding = Instant::now(); - // for each attribute of each document - for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { - // padding will only be applied - // to word indices in the same attribute - let mut padding = 0; - let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index); - - // for each match at the same position - // in this document attribute - while let Some(same_word_index) = iter.next() { - // find the biggest padding - let mut biggest = 0; - for (id, match_) in same_word_index { - let mut replacement = query_enhancer.replacement(match_.query_index); - let replacement_len = replacement.len(); - let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); - - if let Some(query_index) = replacement.next() { - let word_index = match_.word_index + padding as u16; - let match_ = TmpMatch { - query_index, - word_index, - ..*match_ - }; - padded_matches.push((*id, match_)); - } - - let mut found = false; - - // look ahead and if there already is a match - // corresponding to this padding word, abort the padding - 'padding: for (x, next_group) in nexts.enumerate() { - for (i, query_index) in replacement.clone().enumerate().skip(x) { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let padmatch = TmpMatch { - query_index, - word_index, - ..*match_ - }; - - for (_, nmatch_) in next_group { - let mut rep = query_enhancer.replacement(nmatch_.query_index); - let query_index = rep.next().unwrap(); - if query_index == padmatch.query_index { - if !found { - // if we find a corresponding padding for the - // first time we must push preceding paddings - for (i, query_index) in replacement.clone().enumerate().take(i) - { - let word_index = - match_.word_index + padding as u16 + (i + 1) as u16; - let match_ = TmpMatch { - query_index, - word_index, - ..*match_ - }; - padded_matches.push((*id, match_)); - biggest = biggest.max(i + 1); - } - } - - padded_matches.push((*id, padmatch)); - found = true; - continue 'padding; - } - } - } - - // if we do not find a corresponding padding in the - // next groups so stop here and pad what was found - break; - } - - if !found { - // if no padding was found in the following matches - // we must insert the entire padding - for (i, query_index) in replacement.enumerate() { - let word_index = match_.word_index + padding as u16 + (i + 1) as u16; - let match_ = TmpMatch { - query_index, - word_index, - ..*match_ - }; - padded_matches.push((*id, match_)); - } - - biggest = biggest.max(replacement_len - 1); - } - } - - padding += biggest; - } - } - - for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) { - document_matches.sort_unstable(); - } - - debug!("padding matches took {:.02?}", before_padding.elapsed()); - - // With this check we can see that the loop above takes something - // like 43% of the search time even when no rewrite is needed. - // assert_eq!(before_matches, padded_matches); - - SetBuf::new_unchecked(padded_matches) -} - -fn fetch_raw_documents( - reader: &heed::RoTxn, - automatons_groups: &[AutomatonGroup], - query_enhancer: &QueryEnhancer, - searchables: Option<&ReorderedAttrs>, - main_store: store::Main, - postings_lists_store: store::PostingsLists, -) -> MResult> { - let mut matches = Vec::new(); - let mut highlights = Vec::new(); - - let words = match main_store.words_fst(reader)? 
{ - Some(words) => words, - None => return Ok(Vec::new()), - }; - - let before_automatons_groups_loop = Instant::now(); - let mut doc_indexes_rewrite = Duration::default(); - let mut retrieve_postings_lists = Duration::default(); - let mut stream_reserve = Duration::default(); - let mut covered_area_time = Duration::default(); - let mut eval_time = Duration::default(); - - for group in automatons_groups { - let AutomatonGroup { is_phrase_query, automatons } = group; - let phrase_query_len = automatons.len(); - - let mut tmp_matches = Vec::new(); - for (id, automaton) in automatons.into_iter().enumerate() { - let Automaton { index, is_exact, query_len, query, .. } = automaton; - let dfa = automaton.dfa(); - - let before_stream_loop = Instant::now(); - let mut stream_count = 0; - - let mut stream = words.search(&dfa).into_stream(); - while let Some(input) = stream.next() { - let before_eval_time = Instant::now(); - let distance = dfa.eval(input).to_u8(); - eval_time += before_eval_time.elapsed(); - - let is_exact = *is_exact && distance == 0 && input.len() == *query_len; - - stream_count += 1; - - let before_covered_area = Instant::now(); - let covered_area = if *query_len > input.len() { - input.len() - } else { - prefix_damerau_levenshtein(query.as_bytes(), input).1 - }; - covered_area_time += before_covered_area.elapsed(); - - let before_retrieve_postings_lists = Instant::now(); - let doc_indexes = match postings_lists_store.postings_list(reader, input)? { - Some(doc_indexes) => doc_indexes, - None => continue, - }; - retrieve_postings_lists += before_retrieve_postings_lists.elapsed(); - - let before_stream_reserve = Instant::now(); - tmp_matches.reserve(doc_indexes.len()); - stream_reserve += before_stream_reserve.elapsed(); - - let before_doc_indexes_rewrite = Instant::now(); - for di in doc_indexes.as_ref() { - let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); - if let Some(attribute) = attribute { - let match_ = TmpMatch { - query_index: *index as u32, - distance, - attribute, - word_index: di.word_index, - is_exact, - }; - - let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value()); - let covered_area = cmp::min(covered_area, di.char_length); - - let highlight = Highlight { - attribute: di.attribute, - char_index: di.char_index, - char_length: covered_area, - }; - - tmp_matches.push((di.document_id, id, match_, highlight)); - } - } - doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed(); - } - debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count); - } - - if *is_phrase_query { - tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index)); - for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) { - for window in group.windows(2) { - let (ida, ia, ma, ha) = window[0]; - let (idb, ib, mb, hb) = window[1]; - - debug_assert_eq!(ida, idb); - - // if matches must follow and actually follows themselves - if ia + 1 == ib && ma.word_index + 1 == mb.word_index { - // TODO we must make it work for phrase query longer than 2 - // if the second match is the last phrase query word - if ib + 1 == phrase_query_len { - // insert first match - matches.push((ida, ma)); - highlights.push((ida, ha)); - - // insert second match - matches.push((idb, mb)); - highlights.push((idb, hb)); - } - } - } - } - } else { - let before_rerewrite = Instant::now(); - - matches.reserve(tmp_matches.len()); - highlights.reserve(tmp_matches.len()); - - for (id, _, match_, highlight) in 
tmp_matches {
-                matches.push((id, match_));
-                highlights.push((id, highlight));
-            }
-            debug!("rerewrite took {:.02?}", before_rerewrite.elapsed());
-        }
-    }
-    debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
-    debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite);
-    debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists);
-    debug!("stream reserve took {:.02?}", stream_reserve);
-    debug!("covered area took {:.02?}", covered_area_time);
-    debug!("eval value took {:.02?}", eval_time);
-
-    // {
-    //     let mut cloned = matches.clone();
-    //     let before_sort_test = Instant::now();
-    //     cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance));
-    //     debug!("sorting test took {:.02?}", before_sort_test.elapsed());
-    // }
-
-    let before_multiword_rewrite_matches = Instant::now();
-    debug!("number of matches before rewrite {}", matches.len());
-    debug!("{:?}", query_enhancer);
-    let matches = multiword_rewrite_matches(matches, &query_enhancer);
-    debug!("number of matches after rewrite {}", matches.len());
-    debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed());
-
-    let before_highlight_sorting = Instant::now();
-    let highlights = {
-        highlights.sort_unstable_by_key(|(id, _)| *id);
-        SetBuf::new_unchecked(highlights)
-    };
-    debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());
-
-    let before_raw_documents = Instant::now();
-    let raw_documents = raw_documents_from(matches, highlights);
-    debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
-    debug!("documents to worry about: {}", raw_documents.len());
-
-    Ok(raw_documents)
-}
-
 impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
     pub fn new(
         main: store::Main,
@@ -389,7 +90,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
             reader,
             query,
             range,
-            // self.criteria,
+            self.criteria,
             self.main_store,
             self.postings_lists_store,
             self.documents_fields_counts_store,
diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs
index 5ba660d11..4096eeaba 100644
--- a/meilisearch-core/src/raw_document.rs
+++ b/meilisearch-core/src/raw_document.rs
@@ -1,183 +1,89 @@
-use std::fmt;
-use std::sync::Arc;
-
+use compact_arena::SmallArena;
+use itertools::EitherOrBoth;
 use sdset::SetBuf;
-use slice_group_by::GroupBy;
-use crate::{DocumentId, Highlight, TmpMatch, AttrCount};
+use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
-#[derive(Clone)]
-pub struct RawDocument {
-    pub id: DocumentId,
-    pub matches: SharedMatches,
-    pub highlights: Vec<Highlight>,
-    pub fields_counts: Option<SetBuf<AttrCount>>,
+pub struct RawDocument<'a, 'tag> {
+    pub id: crate::DocumentId,
+    pub raw_matches: &'a mut [BareMatch<'tag>],
+    pub processed_matches: Vec<SimpleMatch>,
+    /// The list of minimum `distance` found
+    pub processed_distances: Vec<Option<u8>>,
 }
-impl RawDocument {
-    pub fn query_index(&self) -> &[u32] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe {
-            &self
-                .matches
-                .matches
-                .query_index
-                .get_unchecked(r.start..r.end)
+impl<'a, 'tag> RawDocument<'a, 'tag> {
+    pub fn new<'txn>(
+        raw_matches: &'a mut [BareMatch<'tag>],
+        automatons: &[QueryWordAutomaton],
+        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
+    ) -> Option<RawDocument<'a, 'tag>>
+    {
+        raw_matches.sort_unstable_by_key(|m| m.query_index);
+
+        let mut previous_word = None;
+        for i in 0..raw_matches.len() {
+            let a = &raw_matches[i];
+            let auta = &automatons[a.query_index as usize];
+
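The match on phrase_query just below intersects the postings of two consecutive phrase words, keeping only positions where the second word immediately follows the first in the same attribute; a quadratic sketch of the pairing rule that the linear merge_join_by implements (illustrative, not part of the patch):

    // (attribute, word_index) pairs for two consecutive phrase words, both sorted
    fn adjacent_pairs(a: &[(u16, u16)], b: &[(u16, u16)]) -> Vec<((u16, u16), (u16, u16))> {
        let mut out = Vec::new();
        for x in a {
            for y in b {
                // same attribute, second word exactly one position later
                if x.0 == y.0 && x.1 + 1 == y.1 {
                    out.push((*x, *y));
                }
            }
        }
        out
    }

    fn main() {
        // "new" at (attr 0, idx 3) and (attr 1, idx 5); "york" at (0, 4) and (1, 9)
        let kept = adjacent_pairs(&[(0, 3), (1, 5)], &[(0, 4), (1, 9)]);
        assert_eq!(kept, vec![((0, 3), (0, 4))]); // only the adjacent pair survives
    }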
+ match auta.phrase_query { + Some((0, _)) => { + let b = match raw_matches.get(i + 1) { + Some(b) => b, + None => { + postings_lists[a.postings_list].rewrite_with(SetBuf::default()); + continue; + } + }; + + if a.query_index + 1 != b.query_index { + postings_lists[a.postings_list].rewrite_with(SetBuf::default()); + continue + } + + let pla = &postings_lists[a.postings_list]; + let plb = &postings_lists[b.postings_list]; + + let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| { + a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index)) + }); + + let mut newa = Vec::new(); + let mut newb = Vec::new(); + + for eb in iter { + if let EitherOrBoth::Both(a, b) = eb { + newa.push(*a); + newb.push(*b); + } + } + + if !newa.is_empty() { + previous_word = Some(a.query_index); + } + + postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa)); + postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb)); + }, + Some((1, _)) => { + if previous_word.take() != Some(a.query_index - 1) { + postings_lists[a.postings_list].rewrite_with(SetBuf::default()); + } + }, + Some((_, _)) => unreachable!(), + None => (), + } } - } - pub fn distance(&self) -> &[u8] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } - } - - pub fn attribute(&self) -> &[u16] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } - } - - pub fn word_index(&self) -> &[u16] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { - &self - .matches - .matches - .word_index - .get_unchecked(r.start..r.end) + if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { + return None } - } - pub fn is_exact(&self) -> &[bool] { - let r = self.matches.range; - // it is safe because construction/modifications - // can only be done in this module - unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } - } -} - -impl fmt::Debug for RawDocument { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str("RawDocument {\r\n")?; - f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?; - f.write_fmt(format_args!( - "{:>15}: {:^5?},\r\n", - "query_index", - self.query_index() - ))?; - f.write_fmt(format_args!( - "{:>15}: {:^5?},\r\n", - "distance", - self.distance() - ))?; - f.write_fmt(format_args!( - "{:>15}: {:^5?},\r\n", - "attribute", - self.attribute() - ))?; - f.write_fmt(format_args!( - "{:>15}: {:^5?},\r\n", - "word_index", - self.word_index() - ))?; - f.write_fmt(format_args!( - "{:>15}: {:^5?},\r\n", - "is_exact", - self.is_exact() - ))?; - f.write_str("}")?; - Ok(()) - } -} - -pub fn raw_documents_from( - matches: SetBuf<(DocumentId, TmpMatch)>, - highlights: SetBuf<(DocumentId, Highlight)> -) -> Vec { - let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); - let mut matches2 = Matches::with_capacity(matches.len()); - - let matches = matches.linear_group_by_key(|(id, _)| *id); - let highlights = highlights.linear_group_by_key(|(id, _)| *id); - - for (mgroup, hgroup) in matches.zip(highlights) { - assert_eq!(mgroup[0].0, hgroup[0].0); - - let document_id = mgroup[0].0; - let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0); - let end = start + 
mgroup.len(); - let highlights = hgroup.iter().map(|(_, h)| *h).collect(); - let fields_counts = None; - - docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); - // TODO we could try to keep both data - // - the data oriented one and, - // - the raw one, the one that comes from the arguments of this function - // This way we would be able to only produce data oriented lazily. - // - // For example the default first criterion is `SumOfTypos` - // and just needs the `query_index` and the `distance` fields. - // It would probably be good to avoid wasting time sorting other fields of documents - // that will never ever reach the second criterion. - matches2.extend_from_slice(mgroup); - } - - let matches = Arc::new(matches2); - docs_ranges - .into_iter() - .map(|(id, range, highlights, fields_counts)| { - let matches = SharedMatches { range, matches: matches.clone() }; - RawDocument { id, matches, highlights, fields_counts } + Some(RawDocument { + id: raw_matches[0].document_id, + raw_matches, + processed_matches: Vec::new(), + processed_distances: Vec::new(), }) - .collect() -} - -#[derive(Debug, Copy, Clone)] -struct Range { - start: usize, - end: usize, -} - -#[derive(Clone)] -pub struct SharedMatches { - range: Range, - matches: Arc, -} - -#[derive(Clone)] -struct Matches { - query_index: Vec, - distance: Vec, - attribute: Vec, - word_index: Vec, - is_exact: Vec, -} - -impl Matches { - fn with_capacity(cap: usize) -> Matches { - Matches { - query_index: Vec::with_capacity(cap), - distance: Vec::with_capacity(cap), - attribute: Vec::with_capacity(cap), - word_index: Vec::with_capacity(cap), - is_exact: Vec::with_capacity(cap), - } - } - - fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) { - for (_, match_) in matches { - self.query_index.push(match_.query_index); - self.distance.push(match_.distance); - self.attribute.push(match_.attribute); - self.word_index.push(match_.word_index); - self.is_exact.push(match_.is_exact); - } } } diff --git a/meilisearch-http/src/helpers/meilisearch.rs b/meilisearch-http/src/helpers/meilisearch.rs index 8079f7168..fb995750d 100644 --- a/meilisearch-http/src/helpers/meilisearch.rs +++ b/meilisearch-http/src/helpers/meilisearch.rs @@ -310,11 +310,11 @@ impl<'a> SearchBuilder<'a> { if let Some(ranking_rules_order) = ranking_order { for rule in ranking_rules_order { match rule.as_str() { - "_sum_of_typos" => builder.push(SumOfTypos), - "_number_of_words" => builder.push(NumberOfWords), - "_word_proximity" => builder.push(WordsProximity), - "_sum_of_words_attribute" => builder.push(SumOfWordsAttribute), - "_sum_of_words_position" => builder.push(SumOfWordsPosition), + "_typo" => builder.push(Typo), + "_words" => builder.push(Words), + "_proximity" => builder.push(Proximity), + "_attribute" => builder.push(Attribute), + "_words_position" => builder.push(WordsPosition), "_exact" => builder.push(Exact), _ => { let order = match ranking_rules.get(rule.as_str()) { @@ -340,11 +340,11 @@ impl<'a> SearchBuilder<'a> { builder.push(DocumentId); return Ok(Some(builder.build())); } else { - builder.push(SumOfTypos); - builder.push(NumberOfWords); - builder.push(WordsProximity); - builder.push(SumOfWordsAttribute); - builder.push(SumOfWordsPosition); + builder.push(Typo); + builder.push(Words); + builder.push(Proximity); + builder.push(Attribute); + builder.push(WordsPosition); builder.push(Exact); for (rule, order) in ranking_rules.iter() { let custom_ranking = match order { From 86ee0cbd6e700743f3e14f4b4142403167d6d4c8 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Dec 2019 17:36:53 +0100 Subject: [PATCH 15/23] Introduce bucket_sort_with_distinct function --- meilisearch-core/src/automaton/mod.rs | 292 +------------------------- meilisearch-core/src/bucket_sort.rs | 255 +++++++++++++++++++--- meilisearch-core/src/lib.rs | 34 +++ meilisearch-core/src/query_builder.rs | 18 +- meilisearch-core/src/raw_document.rs | 2 +- 5 files changed, 272 insertions(+), 329 deletions(-) diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs index ecf99ee1c..ef9bf5324 100644 --- a/meilisearch-core/src/automaton/mod.rs +++ b/meilisearch-core/src/automaton/mod.rs @@ -1,17 +1,7 @@ mod dfa; mod query_enhancer; -use std::cmp::Reverse; -use std::{cmp, fmt, vec}; - -use fst::{IntoStreamer, Streamer}; -use levenshtein_automata::DFA; -use meilisearch_tokenizer::{is_cjk, split_query_string}; -use log::debug; - -use crate::database::MainT; -use crate::error::MResult; -use crate::store; +use meilisearch_tokenizer::is_cjk; pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa}; pub use self::query_enhancer::QueryEnhancer; @@ -19,122 +9,6 @@ pub use self::query_enhancer::QueryEnhancerBuilder; pub const NGRAMS: usize = 3; -pub struct AutomatonProducer { - automatons: Vec, -} - -impl AutomatonProducer { - pub fn new( - reader: &heed::RoTxn, - query: &str, - main_store: store::Main, - postings_list_store: store::PostingsLists, - synonyms_store: store::Synonyms, - ) -> MResult<(AutomatonProducer, QueryEnhancer)> { - let (automatons, query_enhancer) = generate_automatons( - reader, - query, - main_store, - postings_list_store, - synonyms_store, - )?; - - for (i, group) in automatons.iter().enumerate() { - debug!("all automatons: group {} automatons {:?}", i, group.automatons); - } - - Ok((AutomatonProducer { automatons }, query_enhancer)) - } - - pub fn into_iter(self) -> vec::IntoIter { - self.automatons.into_iter() - } -} - -#[derive(Debug)] -pub struct AutomatonGroup { - pub is_phrase_query: bool, - pub automatons: Vec, -} - -impl AutomatonGroup { - fn normal(automatons: Vec) -> AutomatonGroup { - AutomatonGroup { - is_phrase_query: false, - automatons, - } - } - - fn phrase_query(automatons: Vec) -> AutomatonGroup { - AutomatonGroup { - is_phrase_query: true, - automatons, - } - } -} - -pub struct Automaton { - pub index: usize, - pub ngram: usize, - pub query_len: usize, - pub is_exact: bool, - pub is_prefix: bool, - pub query: String, -} - -impl fmt::Debug for Automaton { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Automaton") - .field("index", &self.index) - .field("query", &self.query) - .field("is_prefix", &self.is_prefix) - .finish() - } -} - -impl Automaton { - pub fn dfa(&self) -> DFA { - if self.is_prefix { - build_prefix_dfa(&self.query) - } else { - build_dfa(&self.query) - } - } - - fn exact(index: usize, ngram: usize, query: &str) -> Automaton { - Automaton { - index, - ngram, - query_len: query.len(), - is_exact: true, - is_prefix: false, - query: query.to_string(), - } - } - - fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton { - Automaton { - index, - ngram, - query_len: query.len(), - is_exact: true, - is_prefix: true, - query: query.to_string(), - } - } - - fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton { - Automaton { - index, - ngram, - query_len: query.len(), - is_exact: false, - is_prefix: false, - query: query.to_string(), - } - } -} - pub fn normalize_str(string: &str) 
-> String { let mut string = string.to_lowercase(); @@ -144,167 +18,3 @@ pub fn normalize_str(string: &str) -> String { string } - -pub fn split_best_frequency<'a>( - reader: &heed::RoTxn, - word: &'a str, - postings_lists_store: store::PostingsLists, -) -> MResult> { - let chars = word.char_indices().skip(1); - let mut best = None; - - for (i, _) in chars { - let (left, right) = word.split_at(i); - - let left_freq = postings_lists_store - .postings_list(reader, left.as_ref())? - .map_or(0, |i| i.len()); - - let right_freq = postings_lists_store - .postings_list(reader, right.as_ref())? - .map_or(0, |i| i.len()); - - let min_freq = cmp::min(left_freq, right_freq); - if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { - best = Some((min_freq, left, right)); - } - } - - Ok(best.map(|(_, l, r)| (l, r))) -} - -fn generate_automatons( - reader: &heed::RoTxn, - query: &str, - main_store: store::Main, - postings_lists_store: store::PostingsLists, - synonym_store: store::Synonyms, -) -> MResult<(Vec, QueryEnhancer)> { - let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); - let synonyms = match main_store.synonyms_fst(reader)? { - Some(synonym) => synonym, - None => fst::Set::default(), - }; - - let mut automaton_index = 0; - let mut automatons = Vec::new(); - let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words); - - // We must not declare the original words to the query enhancer - // *but* we need to push them in the automatons list first - let mut original_automatons = Vec::new(); - let mut original_words = query_words.iter().peekable(); - while let Some(word) = original_words.next() { - let has_following_word = original_words.peek().is_some(); - let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); - - let automaton = if not_prefix_dfa { - Automaton::exact(automaton_index, 1, word) - } else { - Automaton::prefix_exact(automaton_index, 1, word) - }; - automaton_index += 1; - original_automatons.push(automaton); - } - - automatons.push(AutomatonGroup::normal(original_automatons)); - - for n in 1..=NGRAMS { - let mut ngrams = query_words.windows(n).enumerate().peekable(); - while let Some((query_index, ngram_slice)) = ngrams.next() { - let query_range = query_index..query_index + n; - let ngram_nb_words = ngram_slice.len(); - let ngram = ngram_slice.join(" "); - - let has_following_word = ngrams.peek().is_some(); - let not_prefix_dfa = - has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); - - // automaton of synonyms of the ngrams - let normalized = normalize_str(&ngram); - let lev = if not_prefix_dfa { - build_dfa(&normalized) - } else { - build_prefix_dfa(&normalized) - }; - - let mut stream = synonyms.search(&lev).into_stream(); - while let Some(base) = stream.next() { - // only trigger alternatives when the last word has been typed - // i.e. "new " do not but "new yo" triggers alternatives to "new york" - let base = std::str::from_utf8(base).unwrap(); - let base_nb_words = split_query_string(base).count(); - if ngram_nb_words != base_nb_words { - continue; - } - - if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? 
{ - let mut stream = synonyms.into_stream(); - while let Some(synonyms) = stream.next() { - let synonyms = std::str::from_utf8(synonyms).unwrap(); - let synonyms_words: Vec<_> = split_query_string(synonyms).collect(); - let nb_synonym_words = synonyms_words.len(); - - let real_query_index = automaton_index; - enhancer_builder.declare( - query_range.clone(), - real_query_index, - &synonyms_words, - ); - - for synonym in synonyms_words { - let automaton = if nb_synonym_words == 1 { - Automaton::exact(automaton_index, n, synonym) - } else { - Automaton::non_exact(automaton_index, n, synonym) - }; - automaton_index += 1; - automatons.push(AutomatonGroup::normal(vec![automaton])); - } - } - } - } - - if n == 1 { - if let Some((left, right)) = - split_best_frequency(reader, &normalized, postings_lists_store)? - { - let a = Automaton::exact(automaton_index, 1, left); - enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); - automaton_index += 1; - - let b = Automaton::exact(automaton_index, 1, right); - enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); - automaton_index += 1; - - automatons.push(AutomatonGroup::phrase_query(vec![a, b])); - } - } else { - // automaton of concatenation of query words - let concat = ngram_slice.concat(); - let normalized = normalize_str(&concat); - - let real_query_index = automaton_index; - enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); - - let automaton = Automaton::exact(automaton_index, n, &normalized); - automaton_index += 1; - automatons.push(AutomatonGroup::normal(vec![automaton])); - } - } - } - - // order automatons, the most important first, - // we keep the original automatons at the front. - automatons[1..].sort_by_key(|group| { - let a = group.automatons.first().unwrap(); - ( - Reverse(a.is_exact), - a.ngram, - Reverse(group.automatons.len()), - ) - }); - - Ok((automatons, enhancer_builder.build())) -} diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 0c5fbdee3..9502f2562 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -1,5 +1,5 @@ use std::ops::Deref; -use std::fmt; +use std::{cmp, fmt}; use std::borrow::Cow; use std::mem; use std::ops::Range; @@ -8,43 +8,68 @@ use std::time::{Duration, Instant}; use compact_arena::{SmallArena, Idx32, mk_arena}; use fst::{IntoStreamer, Streamer}; +use hashbrown::HashMap; use levenshtein_automata::DFA; use log::debug; use meilisearch_tokenizer::{is_cjk, split_query_string}; -use meilisearch_types::{DocIndex, Highlight}; +use meilisearch_types::DocIndex; use sdset::{Set, SetBuf}; use slice_group_by::{GroupBy, GroupByMut}; use crate::automaton::NGRAMS; -use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; -use crate::automaton::{normalize_str, split_best_frequency}; +use crate::automaton::normalize_str; +use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; use crate::criterion::Criteria; -use crate::levenshtein::prefix_damerau_levenshtein; +use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; use crate::raw_document::RawDocument; use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; use crate::{store, Document, DocumentId, MResult}; -pub fn bucket_sort<'c>( +pub fn bucket_sort<'c, FI>( reader: &heed::RoTxn, query: &str, range: Range, + filter: Option, criteria: Criteria<'c>, main_store: store::Main, postings_lists_store: store::PostingsLists, 
 documents_fields_counts_store: store::DocumentsFieldsCounts,
 synonyms_store: store::Synonyms,
 ) -> MResult<Vec<Document>>
+where
+    FI: Fn(DocumentId) -> bool,
 {
+    // We delegate the filter work to the distinct query builder,
+    // specifying a distinct rule that has no effect.
+    if filter.is_some() {
+        let distinct = |_| None;
+        let distinct_size = 1;
+        return bucket_sort_with_distinct(
+            reader,
+            query,
+            range,
+            filter,
+            distinct,
+            distinct_size,
+            criteria,
+            main_store,
+            postings_lists_store,
+            documents_fields_counts_store,
+            synonyms_store,
+        );
+    }
+
     let (automatons, query_enhancer) =
-        construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?;
+        construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
     debug!("{:?}", query_enhancer);

     let before_postings_lists_fetching = Instant::now();
     mk_arena!(arena);
-    let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
+    let mut bare_matches =
+        fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
     debug!("bare matches ({}) retrieved in {:.02?}",
         bare_matches.len(),
         before_postings_lists_fetching.elapsed(),
@@ -69,9 +94,6 @@ pub fn bucket_sort<'c>(
         before_raw_documents_building.elapsed(),
     );

-    dbg!(mem::size_of::<BareMatch>());
-    dbg!(mem::size_of::<SimpleMatch>());
-
     let mut groups = vec![raw_documents.as_mut_slice()];

     'criteria: for criterion in criteria.as_ref() {
@@ -103,31 +125,166 @@ pub fn bucket_sort<'c>(
     }

     let iter = raw_documents.into_iter().skip(range.start).take(range.len());
-    let iter = iter.map(|d| {
-        let highlights = d.raw_matches.iter().flat_map(|sm| {
-            let postings_list = &arena[sm.postings_list];
-            let input = postings_list.input();
-            let query = &automatons[sm.query_index as usize].query;
-            postings_list.iter().map(move |m| {
-                let covered_area = if query.len() > input.len() {
-                    input.len()
-                } else {
-                    prefix_damerau_levenshtein(query.as_bytes(), input).1
-                };
-                Highlight { attribute: m.attribute, char_index: m.char_index, char_length: covered_area as u16 }
-            })
-        }).collect();
-
-        Document {
-            id: d.id,
-            highlights,
-            #[cfg(test)] matches: Vec::new(),
-        }
-    });
+    let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena));

     Ok(iter.collect())
 }

+pub fn bucket_sort_with_distinct<'c, FI, FD>(
+    reader: &heed::RoTxn<MainT>,
+    query: &str,
+    range: Range<usize>,
+    filter: Option<FI>,
+    distinct: FD,
+    distinct_size: usize,
+    criteria: Criteria<'c>,
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
+    synonyms_store: store::Synonyms,
+) -> MResult<Vec<Document>>
+where
+    FI: Fn(DocumentId) -> bool,
+    FD: Fn(DocumentId) -> Option<u64>,
+{
+    let (automatons, query_enhancer) =
+        construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
+
+    let before_postings_lists_fetching = Instant::now();
+    mk_arena!(arena);
+    let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
+    debug!("bare matches ({}) retrieved in {:.02?}",
+        bare_matches.len(),
+        before_postings_lists_fetching.elapsed(),
+    );
+
+    let before_raw_documents_presort = Instant::now();
+    bare_matches.sort_unstable_by_key(|sm| sm.document_id);
+    debug!("sort by document ids took {:.02?}", before_raw_documents_presort.elapsed());
+
+    let before_raw_documents_building = Instant::now();
+    let mut prefiltered_documents = 0;
+    let mut raw_documents = Vec::new();
+    for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
+        prefiltered_documents += 1;
+        if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) {
+            raw_documents.push(raw_document);
+        }
+    }
+    debug!("creating {} (original {}) candidate documents took {:.02?}",
+        raw_documents.len(),
+        prefiltered_documents,
+        before_raw_documents_building.elapsed(),
+    );
+
+    let mut groups = vec![raw_documents.as_mut_slice()];
+    let mut key_cache = HashMap::new();
+
+    let mut filter_map = HashMap::new();
+    // these two variables inform us about the current distinct map and
+    // the raw offset of the start of the group where the
+    // range.start bound is located according to the distinct function
+    let mut distinct_map = DistinctMap::new(distinct_size);
+    let mut distinct_raw_offset = 0;
+
+    'criteria: for criterion in criteria.as_ref() {
+        let tmp_groups = mem::replace(&mut groups, Vec::new());
+        let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
+        let mut documents_seen = 0;
+
+        for mut group in tmp_groups {
+            // if this group does not overlap with the requested range,
+            // push it without sorting and splitting it
+            if documents_seen + group.len() < distinct_raw_offset {
+                documents_seen += group.len();
+                groups.push(group);
+                continue;
+            }
+
+            let before_criterion_preparation = Instant::now();
+            criterion.prepare(&mut group, &mut arena, &query_enhancer, &automatons);
+            debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
+
+            let before_criterion_sort = Instant::now();
+            group.sort_unstable_by(|a, b| criterion.evaluate(a, b, &arena));
+            debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
+
+            for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, &arena)) {
+                // we must compute the real distinct length of this sub-group
+                for document in group.iter() {
+                    let filter_accepted = match &filter {
+                        Some(filter) => {
+                            let entry = filter_map.entry(document.id);
+                            *entry.or_insert_with(|| (filter)(document.id))
+                        }
+                        None => true,
+                    };
+
+                    if filter_accepted {
+                        let entry = key_cache.entry(document.id);
+                        let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
+
+                        match key.clone() {
+                            Some(key) => buf_distinct.register(key),
+                            None => buf_distinct.register_without_key(),
+                        };
+                    }
+
+                    // the requested range end is reached: stop computing distinct
+                    if buf_distinct.len() >= range.end {
+                        break;
+                    }
+                }
+
+                documents_seen += group.len();
+                groups.push(group);
+
+                // if this sub-group does not overlap with the requested range
+                // we must update the distinct map and its start index
+                if buf_distinct.len() < range.start {
+                    buf_distinct.transfert_to_internal();
+                    distinct_raw_offset = documents_seen;
+                }
+
+                // we have sorted enough documents: if the last sorted document is past
+                // the end of the requested range, we can continue to the next criterion
+                if buf_distinct.len() >= range.end {
+                    continue 'criteria;
+                }
+            }
+        }
+    }
+
+    // once we have classified the documents related to the current
+    // automatons, we save that as the next valid result
+    let mut seen = BufferedDistinctMap::new(&mut distinct_map);
+
+    let mut documents = Vec::with_capacity(range.len());
+    for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) {
+        let filter_accepted = match &filter {
+            Some(_) => filter_map.remove(&raw_document.id).unwrap(),
+            None => true,
+        };
+
+        if filter_accepted {
+            let key = key_cache.remove(&raw_document.id).unwrap();
+            let distinct_accepted = match key {
+                Some(key) => seen.register(key),
+                None => seen.register_without_key(),
+            };
+
+            if distinct_accepted && seen.len() > range.start {
+                documents.push(Document::from_raw(raw_document, &automatons, &arena));
+                if documents.len() == range.len() {
+                    break;
+                }
+            }
+        }
+    }
+
+    Ok(documents)
+}
+
 pub struct BareMatch<'tag> {
     pub document_id: DocumentId,
     pub query_index: u16,
@@ -257,7 +414,7 @@ fn fetch_matches<'txn, 'tag>(
     postings_lists_store: store::PostingsLists,
 ) -> MResult<Vec<BareMatch<'tag>>> {
-    let mut before_words_fst = Instant::now();
+    let before_words_fst = Instant::now();
     let words = match main_store.words_fst(reader)? {
         Some(words) => words,
         None => return Ok(Vec::new()),
     };
@@ -273,7 +430,7 @@ fn fetch_matches<'txn, 'tag>(
     for (query_index, automaton) in automatons.iter().enumerate() {
         let before_dfa = Instant::now();
         let dfa = automaton.dfa();
-        let QueryWordAutomaton { query, is_exact, is_prefix, phrase_query } = automaton;
+        let QueryWordAutomaton { query, is_exact, .. } = automaton;
         dfa_time += before_dfa.elapsed();

         let mut number_of_words = 0;
@@ -381,7 +538,35 @@ impl QueryWordAutomaton {
     }
 }

-fn construct_automatons2(
+fn split_best_frequency<'a>(
+    reader: &heed::RoTxn<MainT>,
+    word: &'a str,
+    postings_lists_store: store::PostingsLists,
+) -> MResult<Option<(&'a str, &'a str)>> {
+    let chars = word.char_indices().skip(1);
+    let mut best = None;
+
+    for (i, _) in chars {
+        let (left, right) = word.split_at(i);
+
+        let left_freq = postings_lists_store
+            .postings_list(reader, left.as_ref())?
+            .map_or(0, |i| i.len());
+
+        let right_freq = postings_lists_store
+            .postings_list(reader, right.as_ref())?
+            .map_or(0, |i| i.len());
+
+        let min_freq = cmp::min(left_freq, right_freq);
+        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
+            best = Some((min_freq, left, right));
+        }
+    }
+
+    Ok(best.map(|(_, l, r)| (l, r)))
+}
+
+fn construct_automatons(
     reader: &heed::RoTxn<MainT>,
     query: &str,
     main_store: store::Main,
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index 01fb05372..fb1975a0b 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -30,6 +30,10 @@ pub use self::store::Index;
 pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
 pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount};

+use compact_arena::SmallArena;
+use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
+use crate::levenshtein::prefix_damerau_levenshtein;
+
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Document {
     pub id: DocumentId,
@@ -39,6 +43,36 @@ pub struct Document {
     pub id: DocumentId,
     pub highlights: Vec<Highlight>,

     // #[cfg(test)]
     // pub matches: Vec<TmpMatch>,
 }

+impl Document {
+    pub fn from_raw<'a, 'tag, 'txn>(
+        raw_document: RawDocument<'a, 'tag>,
+        automatons: &[QueryWordAutomaton],
+        arena: &SmallArena<'tag, PostingsListView<'txn>>,
+    ) -> Document
+    {
+        let highlights = raw_document.raw_matches.iter().flat_map(|sm| {
+            let postings_list = &arena[sm.postings_list];
+            let input = postings_list.input();
+            let query = &automatons[sm.query_index as usize].query;
+            postings_list.iter().map(move |m| {
+                let covered_area = if query.len() > input.len() {
+                    input.len()
+                } else {
+                    prefix_damerau_levenshtein(query.as_bytes(), input).1
+                };
+
+                Highlight {
+                    attribute: m.attribute,
+                    char_index: m.char_index,
+                    char_length: covered_area as u16,
+                }
+            })
+        }).collect();
+
+        Document { id: raw_document.id, highlights }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs
index 3a9750ec0..c0a12e34f 100644
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -1,7 +1,8 @@
 use std::ops::Range;
 use std::time::Duration;

-use crate::{bucket_sort::bucket_sort, database::MainT};
+use crate::database::MainT;
+use crate::bucket_sort::{bucket_sort, bucket_sort_with_distinct};
 use crate::{criterion::Criteria, Document, DocumentId};
 use crate::{reordered_attrs::ReorderedAttrs, store, MResult};

@@ -85,11 +86,24 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         range: Range<usize>,
     ) -> MResult<Vec<Document>> {
         match self.distinct {
-            Some((distinct, distinct_size)) => unimplemented!("distinct"),
+            Some((distinct, distinct_size)) => bucket_sort_with_distinct(
+                reader,
+                query,
+                range,
+                self.filter,
+                distinct,
+                distinct_size,
+                self.criteria,
+                self.main_store,
+                self.postings_lists_store,
+                self.documents_fields_counts_store,
+                self.synonyms_store,
+            ),
             None => bucket_sort(
                 reader,
                 query,
                 range,
+                self.filter,
                 self.criteria,
                 self.main_store,
                 self.postings_lists_store,
diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs
index 4096eeaba..8e511d7eb 100644
--- a/meilisearch-core/src/raw_document.rs
+++ b/meilisearch-core/src/raw_document.rs
@@ -44,7 +44,7 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
             let pla = &postings_lists[a.postings_list];
             let plb = &postings_lists[b.postings_list];

-            let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
+            let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
                 a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
             });

From d75339a27173f7d9771a4cd3fae9789bf4bc9558 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 11 Dec 2019 18:37:26 +0100
Subject: [PATCH 16/23] Prefer summing the attribute

---
 meilisearch-core/src/criterion/attribute.rs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/meilisearch-core/src/criterion/attribute.rs b/meilisearch-core/src/criterion/attribute.rs
index 3dc6e4282..cad5664c0 100644
--- a/meilisearch-core/src/criterion/attribute.rs
+++ b/meilisearch-core/src/criterion/attribute.rs
@@ -1,4 +1,4 @@
-use std::cmp::{self, Ordering};
+use std::cmp::Ordering;

 use compact_arena::SmallArena;
 use slice_group_by::GroupBy;
@@ -32,16 +32,16 @@ impl Criterion for Attribute {
     ) -> Ordering
     {
         #[inline]
-        fn best_attribute(matches: &[SimpleMatch]) -> u16 {
-            let mut best_attribute = u16::max_value();
+        fn sum_of_attribute(matches: &[SimpleMatch]) -> usize {
+            let mut sum_of_attribute = 0;
             for group in matches.linear_group_by_key(|bm| bm.query_index) {
-                best_attribute = cmp::min(best_attribute, group[0].attribute);
+                sum_of_attribute += group[0].attribute as usize;
             }
-            best_attribute
+            sum_of_attribute
         }

-        let lhs = best_attribute(&lhs.processed_matches);
-        let rhs = best_attribute(&rhs.processed_matches);
+        let lhs = sum_of_attribute(&lhs.processed_matches);
+        let rhs = sum_of_attribute(&rhs.processed_matches);

         lhs.cmp(&rhs)
     }

From d93e35cace2218263780c6cf042101393a60b41c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 12 Dec 2019 11:33:39 +0100
Subject: [PATCH 17/23] Introduce ContextMut and Context structs

---
 meilisearch-core/src/bucket_sort.rs           | 43 +++++++++++++++----
 meilisearch-core/src/criterion/attribute.rs   | 27 +++---------
 meilisearch-core/src/criterion/document_id.rs | 24 +----------
 meilisearch-core/src/criterion/exact.rs       | 24 ++---------
 meilisearch-core/src/criterion/mod.rs         | 42 +++++++++++-------
 meilisearch-core/src/criterion/proximity.rs
.../src/criterion/sort_by_attr.rs | 25 +---------- meilisearch-core/src/criterion/typo.rs | 20 +++------ meilisearch-core/src/criterion/words.rs | 26 +++-------- .../src/criterion/words_position.rs | 25 +++++------ 10 files changed, 106 insertions(+), 179 deletions(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 9502f2562..e61858d99 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -21,7 +21,7 @@ use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; use crate::automaton::normalize_str; use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; -use crate::criterion::Criteria; +use crate::criterion::{Criteria, Context, ContextMut}; use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; use crate::raw_document::RawDocument; use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; @@ -61,7 +61,7 @@ where ); } - let (automatons, query_enhancer) = + let (mut automatons, mut query_enhancer) = construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?; debug!("{:?}", query_enhancer); @@ -102,14 +102,27 @@ where for mut group in tmp_groups { let before_criterion_preparation = Instant::now(); - criterion.prepare(&mut group, &mut arena, &query_enhancer, &automatons); + + let ctx = ContextMut { + postings_lists: &mut arena, + query_enhancer: &mut query_enhancer, + automatons: &mut automatons, + }; + + criterion.prepare(ctx, &mut group); debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); + let ctx = Context { + postings_lists: &arena, + query_enhancer: &query_enhancer, + automatons: &automatons, + }; + let before_criterion_sort = Instant::now(); - group.sort_unstable_by(|a, b| criterion.evaluate(a, b, &arena)); + group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b)); debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, &arena)) { + for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) { debug!("{:?} produced a group of size {}", criterion.name(), group.len()); documents_seen += group.len(); @@ -147,7 +160,7 @@ where FI: Fn(DocumentId) -> bool, FD: Fn(DocumentId) -> Option, { - let (automatons, query_enhancer) = + let (mut automatons, mut query_enhancer) = construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?; let before_postings_lists_fetching = Instant::now(); @@ -201,15 +214,27 @@ where continue; } + let ctx = ContextMut { + postings_lists: &mut arena, + query_enhancer: &mut query_enhancer, + automatons: &mut automatons, + }; + let before_criterion_preparation = Instant::now(); - criterion.prepare(&mut group, &mut arena, &query_enhancer, &automatons); + criterion.prepare(ctx, &mut group); debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); + let ctx = Context { + postings_lists: &arena, + query_enhancer: &query_enhancer, + automatons: &automatons, + }; + let before_criterion_sort = Instant::now(); - group.sort_unstable_by(|a, b| criterion.evaluate(a, b, &arena)); + group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b)); debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed()); - for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, &arena)) { + for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) { // we must compute the real 
distinguished len of this sub-group for document in group.iter() { let filter_accepted = match &filter { diff --git a/meilisearch-core/src/criterion/attribute.rs b/meilisearch-core/src/criterion/attribute.rs index cad5664c0..4baebf66a 100644 --- a/meilisearch-core/src/criterion/attribute.rs +++ b/meilisearch-core/src/criterion/attribute.rs @@ -1,36 +1,23 @@ use std::cmp::Ordering; - -use compact_arena::SmallArena; use slice_group_by::GroupBy; - -use crate::automaton::QueryEnhancer; -use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; use crate::RawDocument; - -use super::{Criterion, prepare_raw_matches}; +use crate::bucket_sort::SimpleMatch; +use super::{Criterion, Context, ContextMut, prepare_raw_matches}; pub struct Attribute; impl Criterion for Attribute { fn name(&self) -> &str { "attribute" } - fn prepare<'a, 'tag, 'txn>( + fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], + ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], ) { - prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); + prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons); } - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, - ) -> Ordering - { + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { #[inline] fn sum_of_attribute(matches: &[SimpleMatch]) -> usize { let mut sum_of_attribute = 0; diff --git a/meilisearch-core/src/criterion/document_id.rs b/meilisearch-core/src/criterion/document_id.rs index 596194bca..f54a43779 100644 --- a/meilisearch-core/src/criterion/document_id.rs +++ b/meilisearch-core/src/criterion/document_id.rs @@ -1,34 +1,14 @@ use std::cmp::Ordering; - use compact_arena::SmallArena; - -use crate::automaton::QueryEnhancer; -use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; use crate::RawDocument; -use super::Criterion; +use super::{Criterion, Context}; pub struct DocumentId; impl Criterion for DocumentId { fn name(&self) -> &str { "stable document id" } - fn prepare( - &self, - documents: &mut [RawDocument], - postings_lists: &mut SmallArena, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - // ... 
- } - - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &SmallArena, - ) -> Ordering - { + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { let lhs = &lhs.id; let rhs = &rhs.id; diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs index d82f69462..56a81c9ee 100644 --- a/meilisearch-core/src/criterion/exact.rs +++ b/meilisearch-core/src/criterion/exact.rs @@ -1,37 +1,21 @@ use std::cmp::{Ordering, Reverse}; - -use compact_arena::SmallArena; use slice_group_by::GroupBy; - -use crate::automaton::QueryEnhancer; -use crate::bucket_sort::{PostingsListView, BareMatch, QueryWordAutomaton}; use crate::RawDocument; -use super::Criterion; +use crate::bucket_sort::BareMatch; +use super::{Criterion, Context, ContextMut}; pub struct Exact; impl Criterion for Exact { fn name(&self) -> &str { "exact" } - fn prepare( - &self, - documents: &mut [RawDocument], - postings_lists: &mut SmallArena, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { + fn prepare(&self, _ctx: ContextMut, documents: &mut [RawDocument]) { for document in documents { document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); } } - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &SmallArena, - ) -> Ordering - { + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { #[inline] fn sum_exact_query_words(matches: &[BareMatch]) -> usize { let mut sum_exact_query_words = 0; diff --git a/meilisearch-core/src/criterion/mod.rs b/meilisearch-core/src/criterion/mod.rs index 0d54d89f2..40b75cf0d 100644 --- a/meilisearch-core/src/criterion/mod.rs +++ b/meilisearch-core/src/criterion/mod.rs @@ -29,33 +29,45 @@ pub use self::sort_by_attr::SortByAttr; pub trait Criterion { fn name(&self) -> &str; - fn prepare<'a, 'tag, 'txn>( + fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ); + ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], + ) { + /* ... 
*/ + } - fn evaluate<'a, 'tag, 'txn>( + fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ctx: &Context<'p, 'tag, 'txn, 'q, 'a>, + lhs: &RawDocument<'r, 'tag>, + rhs: &RawDocument<'r, 'tag>, ) -> Ordering; #[inline] - fn eq<'a, 'tag, 'txn>( + fn eq<'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ctx: &Context<'p, 'tag, 'txn, 'q, 'a>, + lhs: &RawDocument<'r, 'tag>, + rhs: &RawDocument<'r, 'tag>, ) -> bool { - self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal + self.evaluate(ctx, lhs, rhs) == Ordering::Equal } } +pub struct ContextMut<'p, 'tag, 'txn, 'q, 'a> { + pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>, + pub query_enhancer: &'q mut QueryEnhancer, + pub automatons: &'a mut [QueryWordAutomaton], +} + +pub struct Context<'p, 'tag, 'txn, 'q, 'a> { + pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>, + pub query_enhancer: &'q QueryEnhancer, + pub automatons: &'a [QueryWordAutomaton], +} + #[derive(Default)] pub struct CriteriaBuilder<'a> { inner: Vec>, diff --git a/meilisearch-core/src/criterion/proximity.rs b/meilisearch-core/src/criterion/proximity.rs index c9c534ca8..7437fb2c9 100644 --- a/meilisearch-core/src/criterion/proximity.rs +++ b/meilisearch-core/src/criterion/proximity.rs @@ -1,38 +1,25 @@ use std::cmp::{self, Ordering}; - -use compact_arena::SmallArena; use slice_group_by::GroupBy; - -use crate::automaton::QueryEnhancer; -use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton}; +use crate::bucket_sort::{SimpleMatch}; use crate::RawDocument; +use super::{Criterion, Context, ContextMut, prepare_raw_matches}; -use super::{Criterion, prepare_raw_matches}; +const MAX_DISTANCE: u16 = 8; pub struct Proximity; impl Criterion for Proximity { fn name(&self) -> &str { "proximity" } - fn prepare<'a, 'tag, 'txn>( + fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], + ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], ) { - prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); + prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons); } - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, - ) -> Ordering - { - const MAX_DISTANCE: u16 = 8; - + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { fn index_proximity(lhs: u16, rhs: u16) -> u16 { if lhs < rhs { cmp::min(rhs - lhs, MAX_DISTANCE) diff --git a/meilisearch-core/src/criterion/sort_by_attr.rs b/meilisearch-core/src/criterion/sort_by_attr.rs index ea1c016da..3fd801550 100644 --- a/meilisearch-core/src/criterion/sort_by_attr.rs +++ b/meilisearch-core/src/criterion/sort_by_attr.rs @@ -1,14 +1,9 @@ use std::cmp::Ordering; use std::error::Error; use std::fmt; - -use compact_arena::SmallArena; use meilisearch_schema::{Schema, SchemaAttr}; - -use crate::automaton::QueryEnhancer; -use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; -use crate::criterion::Criterion; use crate::{RankedMap, RawDocument}; +use super::{Criterion, Context}; 
/// An helper struct that permit to sort documents by /// some of their stored attributes. @@ -95,23 +90,7 @@ impl Criterion for SortByAttr<'_> { "sort by attribute" } - fn prepare<'a, 'tag, 'txn>( - &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], - ) { - // ... - } - - fn evaluate<'a, 'tag, 'txn>( - &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, - ) -> Ordering - { + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { let lhs = self.ranked_map.get(lhs.id, self.attr); let rhs = self.ranked_map.get(rhs.id, self.attr); diff --git a/meilisearch-core/src/criterion/typo.rs b/meilisearch-core/src/criterion/typo.rs index d7907700d..8dcf9b578 100644 --- a/meilisearch-core/src/criterion/typo.rs +++ b/meilisearch-core/src/criterion/typo.rs @@ -6,30 +6,22 @@ use crate::automaton::QueryEnhancer; use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; use crate::RawDocument; -use super::{Criterion, prepare_query_distances}; +use super::{Criterion, Context, ContextMut, prepare_query_distances}; pub struct Typo; impl Criterion for Typo { fn name(&self) -> &str { "typo" } - fn prepare<'a, 'tag, 'txn>( + fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], + ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], ) { - prepare_query_distances(documents, query_enhancer, automatons, postings_lists); + prepare_query_distances(documents, ctx.query_enhancer, ctx.automatons, ctx.postings_lists); } - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &SmallArena, - ) -> Ordering - { + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { // This function is a wrong logarithmic 10 function. // It is safe to panic on input number higher than 3, // the number of typos is never bigger than that. 
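
The refactor running through this patch replaces per-argument plumbing with two context structs: `Context` for the read-only phase (`evaluate`/`eq`) and `ContextMut` for the preparation phase. The payoff shows up in patch 18, where new fields (`reader`, `documents_fields_counts_store`) are added to `ContextMut` without touching every `Criterion` implementor's signature again. Below is a minimal, self-contained sketch of that pattern, using illustrative names (`Ctx`, `Rank`, `ByScore`) that are not part of the MeiliSearch code:

    use std::cmp::Ordering;

    // Shared read-only state, in the role of `Context` in this patch.
    struct Ctx<'a> {
        scores: &'a [i64],
    }

    trait Rank {
        // Everything shared comes in through `ctx`, so growing `Ctx`
        // with a new field never changes this signature.
        fn evaluate(&self, ctx: &Ctx, lhs: usize, rhs: usize) -> Ordering;

        // Derived equality, mirroring the default `Criterion::eq`.
        fn eq(&self, ctx: &Ctx, lhs: usize, rhs: usize) -> bool {
            self.evaluate(ctx, lhs, rhs) == Ordering::Equal
        }
    }

    struct ByScore;

    impl Rank for ByScore {
        fn evaluate(&self, ctx: &Ctx, lhs: usize, rhs: usize) -> Ordering {
            ctx.scores[lhs].cmp(&ctx.scores[rhs])
        }
    }

    fn main() {
        let scores = [10, 3];
        let ctx = Ctx { scores: &scores };
        assert_eq!(ByScore.evaluate(&ctx, 0, 1), Ordering::Greater);
        assert!(!ByScore.eq(&ctx, 0, 1));
    }
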
diff --git a/meilisearch-core/src/criterion/words.rs b/meilisearch-core/src/criterion/words.rs index fbe3d9070..edfd3eb2f 100644 --- a/meilisearch-core/src/criterion/words.rs +++ b/meilisearch-core/src/criterion/words.rs @@ -1,35 +1,21 @@ use std::cmp::Ordering; - -use compact_arena::SmallArena; - -use crate::automaton::QueryEnhancer; -use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; use crate::RawDocument; - -use super::{Criterion, prepare_query_distances}; +use super::{Criterion, Context, ContextMut, prepare_query_distances}; pub struct Words; impl Criterion for Words { fn name(&self) -> &str { "words" } - fn prepare<'a, 'tag, 'txn>( + fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], + ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], ) { - prepare_query_distances(documents, query_enhancer, automatons, postings_lists); + prepare_query_distances(documents, ctx.query_enhancer, ctx.automatons, ctx.postings_lists); } - fn evaluate( - &self, - lhs: &RawDocument, - rhs: &RawDocument, - postings_lists: &SmallArena, - ) -> Ordering - { + fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { #[inline] fn number_of_query_words(distances: &[Option]) -> usize { distances.iter().cloned().filter(Option::is_some).count() diff --git a/meilisearch-core/src/criterion/words_position.rs b/meilisearch-core/src/criterion/words_position.rs index 7df3e1fbd..cb9ec32f5 100644 --- a/meilisearch-core/src/criterion/words_position.rs +++ b/meilisearch-core/src/criterion/words_position.rs @@ -1,34 +1,29 @@ use std::cmp::Ordering; -use compact_arena::SmallArena; use slice_group_by::GroupBy; -use crate::automaton::QueryEnhancer; -use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton}; use crate::RawDocument; - -use super::{Criterion, prepare_raw_matches}; +use crate::bucket_sort::SimpleMatch; +use super::{Criterion, Context, ContextMut, prepare_raw_matches}; pub struct WordsPosition; impl Criterion for WordsPosition { fn name(&self) -> &str { "words position" } - fn prepare<'a, 'tag, 'txn>( + fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - documents: &mut [RawDocument<'a, 'tag>], - postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, - query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], + ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], ) { - prepare_raw_matches(documents, postings_lists, query_enhancer, automatons); + prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons); } - fn evaluate<'a, 'tag, 'txn>( + fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - lhs: &RawDocument<'a, 'tag>, - rhs: &RawDocument<'a, 'tag>, - postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, + ctx: &Context<'p, 'tag, 'txn, 'q, 'a>, + lhs: &RawDocument<'r, 'tag>, + rhs: &RawDocument<'r, 'tag>, ) -> Ordering { #[inline] From 746e6e170cc99ae58ebc627aab0c752d0b40d3af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Dec 2019 11:14:12 +0100 Subject: [PATCH 18/23] Make the test pass again --- meilisearch-core/src/bucket_sort.rs | 9 +- meilisearch-core/src/criterion/attribute.rs | 12 +- meilisearch-core/src/criterion/document_id.rs | 1 - meilisearch-core/src/criterion/exact.rs | 11 +- meilisearch-core/src/criterion/mod.rs | 23 +- 
meilisearch-core/src/criterion/proximity.rs | 12 +- meilisearch-core/src/criterion/typo.rs | 18 +- meilisearch-core/src/criterion/words.rs | 12 +- .../src/criterion/words_position.rs | 22 +- meilisearch-core/src/lib.rs | 17 +- meilisearch-core/src/query_builder.rs | 404 ++++++++---------- meilisearch-core/src/raw_document.rs | 5 +- 12 files changed, 269 insertions(+), 277 deletions(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index e61858d99..91a1c7058 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -104,12 +104,14 @@ where let before_criterion_preparation = Instant::now(); let ctx = ContextMut { + reader, postings_lists: &mut arena, query_enhancer: &mut query_enhancer, automatons: &mut automatons, + documents_fields_counts_store, }; - criterion.prepare(ctx, &mut group); + criterion.prepare(ctx, &mut group)?; debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); let ctx = Context { @@ -215,13 +217,15 @@ where } let ctx = ContextMut { + reader, postings_lists: &mut arena, query_enhancer: &mut query_enhancer, automatons: &mut automatons, + documents_fields_counts_store, }; let before_criterion_preparation = Instant::now(); - criterion.prepare(ctx, &mut group); + criterion.prepare(ctx, &mut group)?; debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); let ctx = Context { @@ -329,7 +333,6 @@ impl fmt::Debug for BareMatch<'_> { } } -// TODO remove that #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct SimpleMatch { pub query_index: u16, diff --git a/meilisearch-core/src/criterion/attribute.rs b/meilisearch-core/src/criterion/attribute.rs index 4baebf66a..ca0af19f0 100644 --- a/meilisearch-core/src/criterion/attribute.rs +++ b/meilisearch-core/src/criterion/attribute.rs @@ -1,6 +1,6 @@ use std::cmp::Ordering; use slice_group_by::GroupBy; -use crate::RawDocument; +use crate::{RawDocument, MResult}; use crate::bucket_sort::SimpleMatch; use super::{Criterion, Context, ContextMut, prepare_raw_matches}; @@ -9,12 +9,14 @@ pub struct Attribute; impl Criterion for Attribute { fn name(&self) -> &str { "attribute" } - fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, documents: &mut [RawDocument<'r, 'tag>], - ) { - prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons); + ) -> MResult<()> + { + prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer); + Ok(()) } fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { diff --git a/meilisearch-core/src/criterion/document_id.rs b/meilisearch-core/src/criterion/document_id.rs index f54a43779..2795423f2 100644 --- a/meilisearch-core/src/criterion/document_id.rs +++ b/meilisearch-core/src/criterion/document_id.rs @@ -1,5 +1,4 @@ use std::cmp::Ordering; -use compact_arena::SmallArena; use crate::RawDocument; use super::{Criterion, Context}; diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs index 56a81c9ee..381fe1d28 100644 --- a/meilisearch-core/src/criterion/exact.rs +++ b/meilisearch-core/src/criterion/exact.rs @@ -1,6 +1,6 @@ use std::cmp::{Ordering, Reverse}; use slice_group_by::GroupBy; -use crate::RawDocument; +use crate::{RawDocument, MResult}; use crate::bucket_sort::BareMatch; use super::{Criterion, 
Context, ContextMut}; @@ -9,10 +9,16 @@ pub struct Exact; impl Criterion for Exact { fn name(&self) -> &str { "exact" } - fn prepare(&self, _ctx: ContextMut, documents: &mut [RawDocument]) { + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( + &self, + _ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + documents: &mut [RawDocument<'r, 'tag>], + ) -> MResult<()> + { for document in documents { document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); } + Ok(()) } fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { @@ -29,7 +35,6 @@ impl Criterion for Exact { let lhs = sum_exact_query_words(&lhs.raw_matches); let rhs = sum_exact_query_words(&rhs.raw_matches); - lhs.cmp(&rhs).reverse() } } diff --git a/meilisearch-core/src/criterion/mod.rs b/meilisearch-core/src/criterion/mod.rs index 40b75cf0d..49f94c1aa 100644 --- a/meilisearch-core/src/criterion/mod.rs +++ b/meilisearch-core/src/criterion/mod.rs @@ -4,9 +4,10 @@ use compact_arena::SmallArena; use sdset::SetBuf; use slice_group_by::GroupBy; +use crate::{store, RawDocument, MResult}; use crate::automaton::QueryEnhancer; use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton}; -use crate::RawDocument; +use crate::database::MainT; mod typo; mod words; @@ -29,12 +30,13 @@ pub use self::sort_by_attr::SortByAttr; pub trait Criterion { fn name(&self) -> &str; - fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, - documents: &mut [RawDocument<'r, 'tag>], - ) { - /* ... */ + _ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + _documents: &mut [RawDocument<'r, 'tag>], + ) -> MResult<()> + { + Ok(()) } fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>( @@ -56,10 +58,12 @@ pub trait Criterion { } } -pub struct ContextMut<'p, 'tag, 'txn, 'q, 'a> { +pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a> { + pub reader: &'h heed::RoTxn, pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>, pub query_enhancer: &'q mut QueryEnhancer, pub automatons: &'a mut [QueryWordAutomaton], + pub documents_fields_counts_store: store::DocumentsFieldsCounts, } pub struct Context<'p, 'tag, 'txn, 'q, 'a> { @@ -135,7 +139,6 @@ impl<'a> AsRef<[Box]> for Criteria<'a> { fn prepare_query_distances<'a, 'tag, 'txn>( documents: &mut [RawDocument<'a, 'tag>], query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], postings_lists: &SmallArena<'tag, PostingsListView<'txn>>, ) { for document in documents { @@ -167,7 +170,6 @@ fn prepare_raw_matches<'a, 'tag, 'txn>( documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], ) { for document in documents { if !document.processed_matches.is_empty() { continue } @@ -188,7 +190,7 @@ fn prepare_raw_matches<'a, 'tag, 'txn>( } } - let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons); + let processed = multiword_rewrite_matches(&mut processed, query_enhancer); document.processed_matches = processed.into_vec(); } } @@ -196,7 +198,6 @@ fn prepare_raw_matches<'a, 'tag, 'txn>( fn multiword_rewrite_matches( matches: &mut [SimpleMatch], query_enhancer: &QueryEnhancer, - automatons: &[QueryWordAutomaton], ) -> SetBuf { matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); diff --git a/meilisearch-core/src/criterion/proximity.rs b/meilisearch-core/src/criterion/proximity.rs index 7437fb2c9..35466c4e8 100644 --- 
a/meilisearch-core/src/criterion/proximity.rs +++ b/meilisearch-core/src/criterion/proximity.rs @@ -1,7 +1,7 @@ use std::cmp::{self, Ordering}; use slice_group_by::GroupBy; use crate::bucket_sort::{SimpleMatch}; -use crate::RawDocument; +use crate::{RawDocument, MResult}; use super::{Criterion, Context, ContextMut, prepare_raw_matches}; const MAX_DISTANCE: u16 = 8; @@ -11,12 +11,14 @@ pub struct Proximity; impl Criterion for Proximity { fn name(&self) -> &str { "proximity" } - fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, documents: &mut [RawDocument<'r, 'tag>], - ) { - prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons); + ) -> MResult<()> + { + prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer); + Ok(()) } fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { diff --git a/meilisearch-core/src/criterion/typo.rs b/meilisearch-core/src/criterion/typo.rs index 8dcf9b578..2b43c50a9 100644 --- a/meilisearch-core/src/criterion/typo.rs +++ b/meilisearch-core/src/criterion/typo.rs @@ -1,11 +1,5 @@ use std::cmp::Ordering; - -use compact_arena::SmallArena; - -use crate::automaton::QueryEnhancer; -use crate::bucket_sort::{PostingsListView, QueryWordAutomaton}; -use crate::RawDocument; - +use crate::{RawDocument, MResult}; use super::{Criterion, Context, ContextMut, prepare_query_distances}; pub struct Typo; @@ -13,12 +7,14 @@ pub struct Typo; impl Criterion for Typo { fn name(&self) -> &str { "typo" } - fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, documents: &mut [RawDocument<'r, 'tag>], - ) { - prepare_query_distances(documents, ctx.query_enhancer, ctx.automatons, ctx.postings_lists); + ) -> MResult<()> + { + prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists); + Ok(()) } fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { diff --git a/meilisearch-core/src/criterion/words.rs b/meilisearch-core/src/criterion/words.rs index edfd3eb2f..cfe7c9664 100644 --- a/meilisearch-core/src/criterion/words.rs +++ b/meilisearch-core/src/criterion/words.rs @@ -1,5 +1,5 @@ use std::cmp::Ordering; -use crate::RawDocument; +use crate::{RawDocument, MResult}; use super::{Criterion, Context, ContextMut, prepare_query_distances}; pub struct Words; @@ -7,12 +7,14 @@ pub struct Words; impl Criterion for Words { fn name(&self) -> &str { "words" } - fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>( + fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, documents: &mut [RawDocument<'r, 'tag>], - ) { - prepare_query_distances(documents, ctx.query_enhancer, ctx.automatons, ctx.postings_lists); + ) -> MResult<()> + { + prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists); + Ok(()) } fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { diff --git a/meilisearch-core/src/criterion/words_position.rs b/meilisearch-core/src/criterion/words_position.rs index cb9ec32f5..c149a063e 100644 --- a/meilisearch-core/src/criterion/words_position.rs +++ b/meilisearch-core/src/criterion/words_position.rs @@ -1,9 +1,7 @@ use std::cmp::Ordering; - use slice_group_by::GroupBy; - -use crate::RawDocument; 
 use crate::bucket_sort::SimpleMatch;
+use crate::{RawDocument, MResult};
 use super::{Criterion, Context, ContextMut, prepare_raw_matches};

 pub struct WordsPosition;

@@ -11,21 +9,17 @@ pub struct WordsPosition;
 impl Criterion for WordsPosition {
     fn name(&self) -> &str { "words position" }

-    fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
+    fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
         &self,
-        ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
+        ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
         documents: &mut [RawDocument<'r, 'tag>],
-    ) {
-        prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons);
+    ) -> MResult<()>
+    {
+        prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer);
+        Ok(())
     }

-    fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>(
-        &self,
-        ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
-        lhs: &RawDocument<'r, 'tag>,
-        rhs: &RawDocument<'r, 'tag>,
-    ) -> Ordering
-    {
+    fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
         #[inline]
         fn sum_words_position(matches: &[SimpleMatch]) -> usize {
             let mut sum_words_position = 0;
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index fb1975a0b..ad16182a1 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -34,13 +34,13 @@ use compact_arena::SmallArena;
 use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
 use crate::levenshtein::prefix_damerau_levenshtein;

-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
 pub struct Document {
     pub id: DocumentId,
     pub highlights: Vec<Highlight>,

-    // #[cfg(test)]
-    // pub matches: Vec<TmpMatch>,
+    #[cfg(test)]
+    pub matches: Vec<SimpleMatch>,
 }

 impl Document {
@@ -69,7 +69,16 @@ impl Document {
             })
         }).collect();

-        Document { id: raw_document.id, highlights }
+        #[cfg(not(test))]
+        {
+            Document { id: raw_document.id, highlights }
+        }
+
+        #[cfg(test)]
+        {
+            let matches = raw_document.processed_matches;
+            Document { id: raw_document.id, highlights, matches }
+        }
     }
 }

diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs
index c0a12e34f..066632279 100644
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -126,10 +126,11 @@ mod tests {
     use sdset::SetBuf;
     use tempfile::TempDir;

+    use crate::DocIndex;
     use crate::automaton::normalize_str;
+    use crate::bucket_sort::SimpleMatch;
     use crate::database::Database;
     use crate::store::Index;
-    use crate::DocIndex;

     fn set_from_stream<'f, I, S>(stream: I) -> Set
     where
@@ -308,9 +309,9 @@ mod tests {
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut matches = matches.into_iter();
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. }));
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. }));
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, .. }));
+            assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
+            assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. }));
+            assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, .. }));
             assert_matches!(matches.next(), None);
         });
         assert_matches!(iter.next(), None);
@@ -331,7 +332,7 @@ mod tests {
         assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
             let mut matches = matches.into_iter();
-            assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, ..
})); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -342,7 +343,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -364,7 +365,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -375,7 +376,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -408,7 +409,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -419,7 +420,7 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -446,17 +447,17 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 3, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 5, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -467,17 +468,17 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 3, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 5, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -488,17 +489,17 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 3, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 3, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 5, .. })); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 5, .. })); assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -534,21 +535,18 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY ± new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY ± york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY ± city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY ± new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY ± york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY ± city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. 
})); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -559,21 +557,18 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC ± new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC ± york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC ± city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC ± new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC ± york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC ± city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. 
})); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -604,20 +599,20 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // NY ± york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // NY ± new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // NY ± york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // NY ± new assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // york = NY - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // new = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // new = NY assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 1, .. })); // york = NY - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // new = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new = NY assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -628,14 +623,14 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, .. })); // york assert_matches!(matches.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 1, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 0, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 1, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 0, .. })); // new assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -662,13 +657,13 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -677,18 +672,18 @@ mod tests { let results = builder.query(&reader, "new york subway", 0..20).unwrap(); let mut iter = results.into_iter(); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway - assert_matches!(matches.next(), None); - }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // subway assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -727,22 +722,19 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. 
})); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -752,23 +744,20 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC - // because one-word to one-word ^^^^ - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC + // because one-word to one-word ^^^^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(iter.next(), None); // position rewritten ^ + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); } @@ -808,24 +797,21 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. 
})); // broken assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -836,24 +822,21 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC // because one-word to one-word ^^^^ - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. 
})); // york = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway assert_matches!(iter.next(), None); // position rewritten ^ }); assert_matches!(iter.next(), None); @@ -897,33 +880,33 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train - assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. 
})); // NYC = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train - assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -936,39 +919,33 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. 
})); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train - assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train - assert_matches!(iter.next(), Some(TmpMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. 
}) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city - assert_matches!(iter.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground - assert_matches!(iter.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -995,16 +972,16 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. 
})); // big assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1030,32 +1007,32 @@ mod tests { let results = builder.query(&reader, "NY subway ", 0..20).unwrap(); let mut iter = results.into_iter(); - assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { - let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // story - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway - assert_matches!(matches.next(), None); - }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. 
})); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway + assert_matches!(matches.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { + let mut matches = matches.into_iter(); + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // story + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1084,13 +1061,13 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut matches = matches.into_iter(); - assert_matches!(matches.next(), Some(TmpMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC - assert_matches!(matches.next(), Some(TmpMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC - assert_matches!(matches.next(), Some(TmpMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC - assert_matches!(matches.next(), Some(TmpMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // long - assert_matches!(matches.next(), Some(TmpMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = underground - assert_matches!(matches.next(), Some(TmpMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // subway = train - assert_matches!(matches.next(), Some(TmpMatch { query_index: 6, word_index: 6, is_exact: true, .. })); // cool + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // long + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = underground + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // subway = train + assert_matches!(matches.next(), Some(SimpleMatch { query_index: 6, word_index: 6, is_exact: true, .. })); // cool assert_matches!(matches.next(), None); }); assert_matches!(iter.next(), None); @@ -1115,13 +1092,13 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. 
})); assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1132,13 +1109,13 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1149,13 +1126,12 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, .. })); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, .. })); assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // iphone - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // téléphone + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, distance: 1, word_index: 0, is_exact: false, .. })); // iphone | telephone assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1177,10 +1153,10 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone - assert_matches!(iter.next(), Some(TmpMatch { query_index: 1, word_index: 0, distance: 1, .. })); // phone - assert_matches!(iter.next(), Some(TmpMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 0, distance: 1, .. })); // phone + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1205,8 +1181,8 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. 
})); // search - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); @@ -1237,14 +1213,14 @@ mod tests { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search - assert_matches!(iter.next(), Some(TmpMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs index 8e511d7eb..1a11b293a 100644 --- a/meilisearch-core/src/raw_document.rs +++ b/meilisearch-core/src/raw_document.rs @@ -1,7 +1,6 @@ use compact_arena::SmallArena; use itertools::EitherOrBoth; use sdset::SetBuf; - use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView}; pub struct RawDocument<'a, 'tag> { @@ -10,6 +9,9 @@ pub struct RawDocument<'a, 'tag> { pub processed_matches: Vec, /// The list of minimum `distance` found pub processed_distances: Vec>, + /// Does this document contain a field + /// with one word that matches exactly + pub contains_one_word_field: bool, } impl<'a, 'tag> RawDocument<'a, 'tag> { @@ -84,6 +86,7 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { raw_matches, processed_matches: Vec::new(), processed_distances: Vec::new(), + contains_one_word_field: false, }) } } From 7d677508652b293c9f49d7dd9be751e25bf5ee3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Dec 2019 11:33:22 +0100 Subject: [PATCH 19/23] Reintroduce exactness for one word document field --- meilisearch-core/src/criterion/exact.rs | 50 ++++++++++++++++++++++--- meilisearch-core/src/query_builder.rs | 46 ++++++++++++++++++++--- 2 files changed, 85 insertions(+), 11 deletions(-) diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs index 381fe1d28..a1b54caeb 100644 --- a/meilisearch-core/src/criterion/exact.rs +++ b/meilisearch-core/src/criterion/exact.rs @@ -1,4 +1,6 @@ use std::cmp::{Ordering, Reverse}; +use std::collections::hash_map::{HashMap, Entry}; +use meilisearch_schema::SchemaAttr; use slice_group_by::GroupBy; use crate::{RawDocument, MResult}; use crate::bucket_sort::BareMatch; @@ -11,13 +13,44 @@ impl Criterion for Exact { fn
prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>( &self, - _ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, + ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>, documents: &mut [RawDocument<'r, 'tag>], ) -> MResult<()> { - for document in documents { - document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); + let store = ctx.documents_fields_counts_store; + let reader = ctx.reader; + + 'documents: for doc in documents { + doc.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); + + // mark the document if we find a "one word field" that matches + let mut fields_counts = HashMap::new(); + for group in doc.raw_matches.linear_group_by_key(|bm| bm.query_index) { + for group in group.linear_group_by_key(|bm| bm.is_exact) { + if !group[0].is_exact { break } + + for bm in group { + for di in ctx.postings_lists[bm.postings_list].as_ref() { + + let attr = SchemaAttr(di.attribute); + let count = match fields_counts.entry(attr) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let count = store.document_field_count(reader, doc.id, attr)?; + *entry.insert(count) + }, + }; + + if count == Some(1) { + doc.contains_one_word_field = true; + continue 'documents + } + } + } + } + } } + Ok(()) } @@ -33,8 +66,13 @@ impl Criterion for Exact { sum_exact_query_words } - let lhs = sum_exact_query_words(&lhs.raw_matches); - let rhs = sum_exact_query_words(&rhs.raw_matches); - lhs.cmp(&rhs).reverse() + // does it contain a "one word field" + lhs.contains_one_word_field.cmp(&rhs.contains_one_word_field).reverse() + // if not, which document contains the more exact words + .then_with(|| { + let lhs = sum_exact_query_words(&lhs.raw_matches); + let rhs = sum_exact_query_words(&rhs.raw_matches); + lhs.cmp(&rhs).reverse() + }) } } diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 066632279..20e9ba917 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -284,11 +284,7 @@ mod tests { writer.commit().unwrap(); - TempDatabase { - database, - index, - _tempdir: tempdir, - } + TempDatabase { database, index, _tempdir: tempdir } } } @@ -1162,6 +1158,46 @@ mod tests { assert_matches!(iter.next(), None); } + #[test] + fn exact_field_count_one_word() { + let store = TempDatabase::from_iter(vec![ + ("searchengine", &[doc_index(0, 0)][..]), + ("searchengine", &[doc_index(1, 0)][..]), + ("blue", &[doc_index(1, 1)][..]), + ("searchangine", &[doc_index(2, 0)][..]), + ("searchengine", &[doc_index(3, 0)][..]), + ]); + + let db = &store.database; + let reader = db.main_read_txn().unwrap(); + + let builder = store.query_builder(); + let results = builder.query(&reader, "searchengine", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(3), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, ..
}) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 1, .. })); // searchengine + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + #[test] fn simple_phrase_query_splitting() { let store = TempDatabase::from_iter(vec![ From 4be23efe6642e25ef95f3a50f93055d64cea39b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Dec 2019 11:46:53 +0100 Subject: [PATCH 20/23] Remove the AttrCount type Could probably be reintroduced later --- meilisearch-core/src/lib.rs | 2 +- meilisearch-types/src/lib.rs | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index ad16182a1..a4dedbb20 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -28,7 +28,7 @@ pub use self::ranked_map::RankedMap; pub use self::raw_document::RawDocument; pub use self::store::Index; pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; -pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount}; +pub use meilisearch_types::{DocIndex, DocumentId, Highlight}; use compact_arena::SmallArena; use crate::bucket_sort::{QueryWordAutomaton, PostingsListView}; diff --git a/meilisearch-types/src/lib.rs b/meilisearch-types/src/lib.rs index 3419c61fd..c02281a5f 100644 --- a/meilisearch-types/src/lib.rs +++ b/meilisearch-types/src/lib.rs @@ -63,11 +63,3 @@ pub struct Highlight { /// without needing to run the tokenizer again. 
pub char_length: u16, } - -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))] -#[repr(C)] -pub struct AttrCount { - pub attr: u16, - pub count: u16, -} From 48e8778881263b6172f97c9c5409abb893d794f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Dec 2019 11:49:56 +0100 Subject: [PATCH 21/23] Clean up the module declarations --- meilisearch-core/src/lib.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index a4dedbb20..e9c744189 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -3,7 +3,7 @@ extern crate assert_matches; mod automaton; -pub mod criterion; +mod bucket_sort; mod database; mod distinct_map; mod error; @@ -12,14 +12,12 @@ mod number; mod query_builder; mod ranked_map; mod raw_document; -pub mod raw_indexer; mod reordered_attrs; +mod update; +pub mod criterion; +pub mod raw_indexer; pub mod serde; pub mod store; -mod update; - -// TODO replace -mod bucket_sort; pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT}; pub use self::error::{Error, MResult}; From a4dd033ccf6e3d542f2aa7cc520b5ab49fec30b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Dec 2019 12:38:54 +0100 Subject: [PATCH 22/23] Rename raw_matches to bare_matches --- meilisearch-core/src/bucket_sort.rs | 8 ++++---- meilisearch-core/src/criterion/attribute.rs | 4 ++-- meilisearch-core/src/criterion/exact.rs | 8 ++++---- meilisearch-core/src/criterion/mod.rs | 6 +++--- meilisearch-core/src/criterion/proximity.rs | 4 ++-- .../src/criterion/words_position.rs | 4 ++-- meilisearch-core/src/lib.rs | 2 +- meilisearch-core/src/raw_document.rs | 18 +++++++++--------- 8 files changed, 27 insertions(+), 27 deletions(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 91a1c7058..2234c6529 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -82,9 +82,9 @@ where let before_raw_documents_building = Instant::now(); let mut prefiltered_documents = 0; let mut raw_documents = Vec::new(); - for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { + for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { prefiltered_documents += 1; - if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) { + if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena) { raw_documents.push(raw_document); } } @@ -180,9 +180,9 @@ where let before_raw_documents_building = Instant::now(); let mut prefiltered_documents = 0; let mut raw_documents = Vec::new(); - for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { + for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { prefiltered_documents += 1; - if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) { + if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena) { raw_documents.push(raw_document); } } diff --git a/meilisearch-core/src/criterion/attribute.rs b/meilisearch-core/src/criterion/attribute.rs index ca0af19f0..cf9efb41b 100644 --- a/meilisearch-core/src/criterion/attribute.rs +++ b/meilisearch-core/src/criterion/attribute.rs @@ -2,7 +2,7 @@ use std::cmp::Ordering; use slice_group_by::GroupBy; use crate::{RawDocument, MResult}; use
crate::bucket_sort::SimpleMatch; -use super::{Criterion, Context, ContextMut, prepare_raw_matches}; +use super::{Criterion, Context, ContextMut, prepare_bare_matches}; pub struct Attribute; @@ -15,7 +15,7 @@ impl Criterion for Attribute { documents: &mut [RawDocument<'r, 'tag>], ) -> MResult<()> { - prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer); + prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); Ok(()) } diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs index a1b54caeb..5425d2cc9 100644 --- a/meilisearch-core/src/criterion/exact.rs +++ b/meilisearch-core/src/criterion/exact.rs @@ -21,11 +21,11 @@ impl Criterion for Exact { let reader = ctx.reader; 'documents: for doc in documents { - doc.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); + doc.bare_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); // mark the document if we find a "one word field" that matches let mut fields_counts = HashMap::new(); - for group in doc.raw_matches.linear_group_by_key(|bm| bm.query_index) { + for group in doc.bare_matches.linear_group_by_key(|bm| bm.query_index) { for group in group.linear_group_by_key(|bm| bm.is_exact) { if !group[0].is_exact { break } @@ -70,8 +70,8 @@ impl Criterion for Exact { lhs.contains_one_word_field.cmp(&rhs.contains_one_word_field).reverse() // if not, which document contains the more exact words .then_with(|| { - let lhs = sum_exact_query_words(&lhs.raw_matches); - let rhs = sum_exact_query_words(&rhs.raw_matches); + let lhs = sum_exact_query_words(&lhs.bare_matches); + let rhs = sum_exact_query_words(&rhs.bare_matches); lhs.cmp(&rhs).reverse() }) } } diff --git a/meilisearch-core/src/criterion/mod.rs b/meilisearch-core/src/criterion/mod.rs index 49f94c1aa..8d6c8b1f6 100644 --- a/meilisearch-core/src/criterion/mod.rs +++ b/meilisearch-core/src/criterion/mod.rs @@ -145,7 +145,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>( if !document.processed_distances.is_empty() { continue } let mut processed = Vec::new(); - for m in document.raw_matches.iter() { + for m in document.bare_matches.iter() { if postings_lists[m.postings_list].is_empty() { continue } let range = query_enhancer.replacement(m.query_index as u32); @@ -166,7 +166,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>( } } -fn prepare_raw_matches<'a, 'tag, 'txn>( +fn prepare_bare_matches<'a, 'tag, 'txn>( documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, query_enhancer: &QueryEnhancer, @@ -175,7 +175,7 @@ if !document.processed_matches.is_empty() { continue } let mut processed = Vec::new(); - for m in document.raw_matches.iter() { + for m in document.bare_matches.iter() { let postings_list = &postings_lists[m.postings_list]; processed.reserve(postings_list.len()); for di in postings_list.as_ref() { diff --git a/meilisearch-core/src/criterion/proximity.rs b/meilisearch-core/src/criterion/proximity.rs index 35466c4e8..2f3698bae 100644 --- a/meilisearch-core/src/criterion/proximity.rs +++ b/meilisearch-core/src/criterion/proximity.rs @@ -2,7 +2,7 @@ use std::cmp::{self, Ordering}; use slice_group_by::GroupBy; use crate::bucket_sort::{SimpleMatch}; use crate::{RawDocument, MResult}; -use super::{Criterion, Context, ContextMut, prepare_raw_matches}; +use super::{Criterion, Context, ContextMut, prepare_bare_matches}; const MAX_DISTANCE: u16 = 8; @@ -17,7 +17,7 @@ impl Criterion for Proximity {
documents: &mut [RawDocument<'r, 'tag>], ) -> MResult<()> { - prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer); + prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); Ok(()) } diff --git a/meilisearch-core/src/criterion/words_position.rs b/meilisearch-core/src/criterion/words_position.rs index c149a063e..387f0d635 100644 --- a/meilisearch-core/src/criterion/words_position.rs +++ b/meilisearch-core/src/criterion/words_position.rs @@ -2,7 +2,7 @@ use std::cmp::Ordering; use slice_group_by::GroupBy; use crate::bucket_sort::SimpleMatch; use crate::{RawDocument, MResult}; -use super::{Criterion, Context, ContextMut, prepare_raw_matches}; +use super::{Criterion, Context, ContextMut, prepare_bare_matches}; pub struct WordsPosition; @@ -15,7 +15,7 @@ impl Criterion for WordsPosition { documents: &mut [RawDocument<'r, 'tag>], ) -> MResult<()> { - prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer); + prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer); Ok(()) } diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index e9c744189..478870504 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -48,7 +48,7 @@ impl Document { arena: &SmallArena<'tag, PostingsListView<'txn>>, ) -> Document { - let highlights = raw_document.raw_matches.iter().flat_map(|sm| { + let highlights = raw_document.bare_matches.iter().flat_map(|sm| { let postings_list = &arena[sm.postings_list]; let input = postings_list.input(); let query = &automatons[sm.query_index as usize].query; diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs index 1a11b293a..de486d906 100644 --- a/meilisearch-core/src/raw_document.rs +++ b/meilisearch-core/src/raw_document.rs @@ -5,7 +5,7 @@ use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsLis pub struct RawDocument<'a, 'tag> { pub id: crate::DocumentId, - pub raw_matches: &'a mut [BareMatch<'tag>], + pub bare_matches: &'a mut [BareMatch<'tag>], pub processed_matches: Vec, /// The list of minimum `distance` found pub processed_distances: Vec>, @@ -16,21 +16,21 @@ pub struct RawDocument<'a, 'tag> { impl<'a, 'tag> RawDocument<'a, 'tag> { pub fn new<'txn>( - raw_matches: &'a mut [BareMatch<'tag>], + bare_matches: &'a mut [BareMatch<'tag>], automatons: &[QueryWordAutomaton], postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>, ) -> Option> { - raw_matches.sort_unstable_by_key(|m| m.query_index); + bare_matches.sort_unstable_by_key(|m| m.query_index); let mut previous_word = None; - for i in 0..raw_matches.len() { - let a = &raw_matches[i]; + for i in 0..bare_matches.len() { + let a = &bare_matches[i]; let auta = &automatons[a.query_index as usize]; match auta.phrase_query { Some((0, _)) => { - let b = match raw_matches.get(i + 1) { + let b = match bare_matches.get(i + 1) { Some(b) => b, None => { postings_lists[a.postings_list].rewrite_with(SetBuf::default()); @@ -77,13 +77,13 @@ impl<'a, 'tag> RawDocument<'a, 'tag> { } } - if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { + if bare_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) { return None } Some(RawDocument { - id: raw_matches[0].document_id, - raw_matches, + id: bare_matches[0].document_id, + bare_matches, processed_matches: Vec::new(), processed_distances: Vec::new(), contains_one_word_field: false, From 40c0b14d1cff83e48e4f8eb460a31964d61937ed Mon Sep 17 00:00:00 2001 From: 
Date: Fri, 13 Dec 2019 13:22:54 +0100
Subject: [PATCH 23/23] Reintroduce searchable attributes and reordering

---
 meilisearch-core/src/bucket_sort.rs     |  11 ++-
 meilisearch-core/src/lib.rs             | 100 ++++++++++++++++++------
 meilisearch-core/src/query_builder.rs   |  85 +++++++++++++++++++-
 meilisearch-core/src/raw_document.rs    |  19 +++++
 meilisearch-core/src/reordered_attrs.rs |  26 +++---
 5 files changed, 198 insertions(+), 43 deletions(-)

diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index 2234c6529..5a819962e 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -33,6 +33,7 @@ pub fn bucket_sort<'c, FI>(
     range: Range<usize>,
     filter: Option<FI>,
     criteria: Criteria<'c>,
+    searchable_attrs: Option<ReorderedAttrs>,
     main_store: store::Main,
     postings_lists_store: store::PostingsLists,
     documents_fields_counts_store: store::DocumentsFieldsCounts,
@@ -54,6 +55,7 @@ where
             distinct,
             distinct_size,
             criteria,
+            searchable_attrs,
             main_store,
             postings_lists_store,
             documents_fields_counts_store,
@@ -84,7 +86,7 @@ where
     let mut raw_documents = Vec::new();
     for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
         prefiltered_documents += 1;
-        if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena) {
+        if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) {
             raw_documents.push(raw_document);
         }
     }
@@ -140,7 +142,7 @@ where
     }
 
     let iter = raw_documents.into_iter().skip(range.start).take(range.len());
-    let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena));
+    let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena, searchable_attrs.as_ref()));
 
     Ok(iter.collect())
 }
@@ -153,6 +155,7 @@ pub fn bucket_sort_with_distinct<'c, FI, FD>(
     distinct: FD,
     distinct_size: usize,
     criteria: Criteria<'c>,
+    searchable_attrs: Option<ReorderedAttrs>,
     main_store: store::Main,
     postings_lists_store: store::PostingsLists,
     documents_fields_counts_store: store::DocumentsFieldsCounts,
@@ -182,7 +185,7 @@ where
     let mut raw_documents = Vec::new();
     for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
         prefiltered_documents += 1;
-        if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena) {
+        if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) {
             raw_documents.push(raw_document);
         }
     }
@@ -303,7 +306,7 @@ where
         };
 
         if distinct_accepted && seen.len() > range.start {
-            documents.push(Document::from_raw(raw_document, &automatons, &arena));
+            documents.push(Document::from_raw(raw_document, &automatons, &arena, searchable_attrs.as_ref()));
             if documents.len() == range.len() {
                 break;
             }
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index 478870504..ea36abd42 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -31,6 +31,7 @@ pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
 use compact_arena::SmallArena;
 use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
 use crate::levenshtein::prefix_damerau_levenshtein;
+use crate::reordered_attrs::ReorderedAttrs;
 
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
 pub struct Document {
@@ -41,42 +42,91 @@ pub struct Document {
     pub matches: Vec<SimpleMatch>,
 }
 
+fn highlights_from_raw_document<'a, 'tag, 'txn>(
+    raw_document: &RawDocument<'a, 'tag>,
+    automatons: &[QueryWordAutomaton],
+    arena: &SmallArena<'tag, PostingsListView<'txn>>,
+    searchable_attrs: Option<&ReorderedAttrs>,
+) -> Vec<Highlight>
+{
+    let mut highlights = Vec::new();
+
+    for bm in raw_document.bare_matches.iter() {
+        let postings_list = &arena[bm.postings_list];
+        let input = postings_list.input();
+        let query = &automatons[bm.query_index as usize].query;
+
+        for di in postings_list.iter() {
+            let covered_area = if query.len() > input.len() {
+                input.len()
+            } else {
+                prefix_damerau_levenshtein(query.as_bytes(), input).1
+            };
+
+            let attribute = searchable_attrs
+                .and_then(|sa| sa.reverse(di.attribute))
+                .unwrap_or(di.attribute);
+
+            let highlight = Highlight {
+                attribute,
+                char_index: di.char_index,
+                char_length: covered_area as u16,
+            };
+
+            highlights.push(highlight);
+        }
+    }
+
+    highlights
+}
+
 impl Document {
+    #[cfg(not(test))]
     pub fn from_raw<'a, 'tag, 'txn>(
         raw_document: RawDocument<'a, 'tag>,
         automatons: &[QueryWordAutomaton],
         arena: &SmallArena<'tag, PostingsListView<'txn>>,
+        searchable_attrs: Option<&ReorderedAttrs>,
     ) -> Document
     {
-        let highlights = raw_document.bare_matches.iter().flat_map(|sm| {
-            let postings_list = &arena[sm.postings_list];
-            let input = postings_list.input();
-            let query = &automatons[sm.query_index as usize].query;
-            postings_list.iter().map(move |m| {
-                let covered_area = if query.len() > input.len() {
-                    input.len()
-                } else {
-                    prefix_damerau_levenshtein(query.as_bytes(), input).1
-                };
+        let highlights = highlights_from_raw_document(
+            &raw_document,
+            automatons,
+            arena,
+            searchable_attrs,
+        );
 
-                Highlight {
-                    attribute: m.attribute,
-                    char_index: m.char_index,
-                    char_length: covered_area as u16,
-                }
-            })
-        }).collect();
+        Document { id: raw_document.id, highlights }
+    }
 
-        #[cfg(not(test))]
-        {
-            Document { id: raw_document.id, highlights }
+    #[cfg(test)]
+    pub fn from_raw<'a, 'tag, 'txn>(
+        raw_document: RawDocument<'a, 'tag>,
+        automatons: &[QueryWordAutomaton],
+        arena: &SmallArena<'tag, PostingsListView<'txn>>,
+        searchable_attrs: Option<&ReorderedAttrs>,
+    ) -> Document
+    {
+        use crate::bucket_sort::SimpleMatch;
+
+        let highlights = highlights_from_raw_document(
+            &raw_document,
+            automatons,
+            arena,
+            searchable_attrs,
+        );
+
+        let mut matches = Vec::new();
+        for sm in raw_document.processed_matches {
+            let attribute = searchable_attrs
+                .and_then(|sa| sa.reverse(sm.attribute))
+                .unwrap_or(sm.attribute);
+
+            matches.push(SimpleMatch { attribute, ..sm });
         }
+        matches.sort_unstable();
 
-        #[cfg(test)]
-        {
-            let matches = raw_document.processed_matches;
-            Document { id: raw_document.id, highlights, matches }
-        }
+        Document { id: raw_document.id, highlights, matches }
     }
 }
diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs
index 20e9ba917..e46858241 100644
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -73,9 +73,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
     }
 
     pub fn add_searchable_attribute(&mut self, attribute: u16) {
-        let reorders = self
-            .searchable_attrs
-            .get_or_insert_with(ReorderedAttrs::new);
+        let reorders = self.searchable_attrs.get_or_insert_with(ReorderedAttrs::new);
         reorders.insert_attribute(attribute);
     }
 
@@ -94,6 +92,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
                 distinct,
                 distinct_size,
                 self.criteria,
+                self.searchable_attrs,
                 self.main_store,
                 self.postings_lists_store,
                 self.documents_fields_counts_store,
@@ -105,6 +104,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
                 range,
                 self.filter,
                 self.criteria,
+                self.searchable_attrs,
                 self.main_store,
                 self.postings_lists_store,
                 self.documents_fields_counts_store,
@@ -181,6 +181,16 @@ mod tests {
         }
     }
 
+    const fn doc_attr_index(document_id: u64, attribute: u16, word_index: u16) -> DocIndex {
+        DocIndex {
+            document_id: DocumentId(document_id),
+            attribute,
+            word_index,
+            char_index: 0,
+            char_length: 0,
+        }
+    }
+
     pub struct TempDatabase {
         database: Database,
         index: Index,
@@ -1261,4 +1271,73 @@ mod tests {
         });
         assert_matches!(iter.next(), None);
     }
+
+    #[test]
+    fn searchable_attributes() {
+        let store = TempDatabase::from_iter(vec![
+            ("search", &[doc_attr_index(0, 0, 0)][..]),
+            ("engine", &[doc_attr_index(0, 0, 1)][..]),
+
+            ("search", &[doc_attr_index(1, 1, 0)][..]),
+            ("engine", &[doc_attr_index(1, 1, 1)][..]),
+        ]);
+
+        let db = &store.database;
+        let reader = db.main_read_txn().unwrap();
+
+        let builder = store.query_builder();
+        let results = builder.query(&reader, "search engine", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
+            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
+            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        // reorder the searchable attributes
+        let mut builder = store.query_builder();
+        builder.add_searchable_attribute(1);
+        builder.add_searchable_attribute(0);
+
+        let results = builder.query(&reader, "search engine", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
+            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
+            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+
+        // remove a searchable attribute
+        let mut builder = store.query_builder();
+        builder.add_searchable_attribute(1);
+
+        let results = builder.query(&reader, "search engine", 0..20).unwrap();
+        let mut iter = results.into_iter();
+
+        assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
+            let mut iter = matches.into_iter();
+            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
+            assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
+            assert_matches!(iter.next(), None);
+        });
+        assert_matches!(iter.next(), None);
+    }
 }
diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs
index de486d906..f047de8e8 100644
--- a/meilisearch-core/src/raw_document.rs
+++ b/meilisearch-core/src/raw_document.rs
@@ -1,7 +1,9 @@
 use compact_arena::SmallArena;
 use itertools::EitherOrBoth;
 use sdset::SetBuf;
+use crate::DocIndex;
 use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
+use crate::reordered_attrs::ReorderedAttrs;
 
 pub struct RawDocument<'a, 'tag> {
     pub id: crate::DocumentId,
@@ -19,8 +21,25 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
         bare_matches: &'a mut [BareMatch<'tag>],
         automatons: &[QueryWordAutomaton],
         postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
+        searchable_attrs: Option<&ReorderedAttrs>,
     ) -> Option<RawDocument<'a, 'tag>>
     {
+        if let Some(reordered_attrs) = searchable_attrs {
+            for bm in bare_matches.iter() {
+                let postings_list = &postings_lists[bm.postings_list];
+
+                let mut rewritten = Vec::new();
+                for di in postings_list.iter() {
+                    if let Some(attribute) = reordered_attrs.get(di.attribute) {
+                        rewritten.push(DocIndex { attribute, ..*di });
+                    }
+                }
+
+                let new_postings = SetBuf::from_dirty(rewritten);
+                postings_lists[bm.postings_list].rewrite_with(new_postings);
+            }
+        }
+
         bare_matches.sort_unstable_by_key(|m| m.query_index);
 
         let mut previous_word = None;
diff --git a/meilisearch-core/src/reordered_attrs.rs b/meilisearch-core/src/reordered_attrs.rs
index b2f9f1d6c..590cac7b2 100644
--- a/meilisearch-core/src/reordered_attrs.rs
+++ b/meilisearch-core/src/reordered_attrs.rs
@@ -1,27 +1,31 @@
+use std::cmp;
+
 #[derive(Default, Clone)]
 pub struct ReorderedAttrs {
-    count: usize,
     reorders: Vec<Option<u16>>,
+    reverse: Vec<u16>,
 }
 
 impl ReorderedAttrs {
     pub fn new() -> ReorderedAttrs {
-        ReorderedAttrs {
-            count: 0,
-            reorders: Vec::new(),
-        }
+        ReorderedAttrs { reorders: Vec::new(), reverse: Vec::new() }
     }
 
     pub fn insert_attribute(&mut self, attribute: u16) {
-        self.reorders.resize(attribute as usize + 1, None);
-        self.reorders[attribute as usize] = Some(self.count as u16);
-        self.count += 1;
+        let new_len = cmp::max(attribute as usize + 1, self.reorders.len());
+        self.reorders.resize(new_len, None);
+        self.reorders[attribute as usize] = Some(self.reverse.len() as u16);
+        self.reverse.push(attribute);
    }
 
     pub fn get(&self, attribute: u16) -> Option<u16> {
-        match self.reorders.get(attribute as usize) {
-            Some(Some(attribute)) => Some(*attribute),
-            _ => None,
+        match self.reorders.get(attribute as usize)? {
+            Some(attribute) => Some(*attribute),
+            None => None,
         }
     }
+
+    pub fn reverse(&self, attribute: u16) -> Option<u16> {
+        self.reverse.get(attribute as usize).copied()
+    }
 }
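
The two mappings maintained by the final `ReorderedAttrs` are easy to get backwards, so here is a small, self-contained Rust sketch distilled from the patch above (the `main` function and its assertions are illustrative, not part of the patch): `get` maps an original schema attribute to its rank in the user-declared order and doubles as a filter, returning `None` for undeclared attributes, while `reverse` maps a rank back to the original attribute, which is how `Document::from_raw` reports matches and highlights in schema terms.

use std::cmp;

#[derive(Default, Clone)]
struct ReorderedAttrs {
    reorders: Vec<Option<u16>>, // original attribute id -> rank in the declared order
    reverse: Vec<u16>,          // rank in the declared order -> original attribute id
}

impl ReorderedAttrs {
    fn insert_attribute(&mut self, attribute: u16) {
        let new_len = cmp::max(attribute as usize + 1, self.reorders.len());
        self.reorders.resize(new_len, None);
        self.reorders[attribute as usize] = Some(self.reverse.len() as u16);
        self.reverse.push(attribute);
    }

    fn get(&self, attribute: u16) -> Option<u16> {
        match self.reorders.get(attribute as usize)? {
            Some(attribute) => Some(*attribute),
            None => None,
        }
    }

    fn reverse(&self, attribute: u16) -> Option<u16> {
        self.reverse.get(attribute as usize).copied()
    }
}

fn main() {
    // Declare attribute 1 first, then attribute 0, mirroring the
    // `searchable_attributes` test above.
    let mut attrs = ReorderedAttrs::default();
    attrs.insert_attribute(1);
    attrs.insert_attribute(0);

    assert_eq!(attrs.get(1), Some(0)); // attribute 1 is ranked first
    assert_eq!(attrs.get(0), Some(1)); // attribute 0 is ranked second
    assert_eq!(attrs.get(2), None);    // undeclared attributes are filtered out

    assert_eq!(attrs.reverse(0), Some(1)); // rank 0 maps back to attribute 1
    assert_eq!(attrs.reverse(1), Some(0)); // rank 1 maps back to attribute 0
}

Keeping the `reverse` vector alongside `reorders` trades a little memory for constant-time lookups in both directions, and the highlight path exercises the reverse direction once per match.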
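
One subtlety in the new `RawDocument::new` deserves a note: `DocIndex` values order by attribute first, so remapping attributes can invalidate the sort order of a postings list, and dropping undeclared attributes shrinks it; this is presumably why the patch rebuilds each list through `SetBuf::from_dirty` (which sorts and deduplicates) rather than mutating entries in place. Below is a minimal sketch of that remap-and-rebuild step, using a hypothetical two-field stand-in for the real `DocIndex` (which also carries word and character positions) and a plain `Vec` plus sort/dedup in place of sdset's `SetBuf`.

// Simplified stand-in for the real DocIndex: ordering is attribute-first,
// so remapping attributes can break an existing sorted order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DocIndex {
    attribute: u16,
    word_index: u16,
}

// Keep only declared attributes, remap them to their new rank, then
// restore the sorted-set invariant, as SetBuf::from_dirty does for
// the real postings lists.
fn rewrite_postings(postings: &[DocIndex], get: impl Fn(u16) -> Option<u16>) -> Vec<DocIndex> {
    let mut rewritten: Vec<DocIndex> = postings
        .iter()
        .filter_map(|di| get(di.attribute).map(|attribute| DocIndex { attribute, ..*di }))
        .collect();
    rewritten.sort_unstable();
    rewritten.dedup();
    rewritten
}

fn main() {
    // Attribute 1 is declared first (rank 0), attribute 0 second (rank 1);
    // attribute 2 is not searchable at all.
    let get = |attr: u16| match attr {
        1 => Some(0),
        0 => Some(1),
        _ => None,
    };

    let postings = [
        DocIndex { attribute: 0, word_index: 3 },
        DocIndex { attribute: 1, word_index: 7 },
        DocIndex { attribute: 2, word_index: 1 },
    ];

    let rewritten = rewrite_postings(&postings, get);

    // Attribute 1's entry now sorts first under its new rank 0,
    // and attribute 2's entry is gone.
    assert_eq!(rewritten, vec![
        DocIndex { attribute: 0, word_index: 7 },
        DocIndex { attribute: 1, word_index: 3 },
    ]);
}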