From ef6a4db18246e26d0fe13ac84b8f9074cafc9869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 29 Nov 2019 16:31:47 +0100 Subject: [PATCH] Before improving fields AttrCount Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs ugly-test: Make the fields_count fetching lazy Just before running the exactness criterion --- meilisearch-core/src/automaton/mod.rs | 11 ++++- meilisearch-core/src/criterion/exact.rs | 21 ++++---- meilisearch-core/src/query_builder.rs | 64 ++++++++++++++----------- meilisearch-core/src/raw_document.rs | 43 +++++++++-------- 4 files changed, 79 insertions(+), 60 deletions(-) diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs index a803eee8e..782049942 100644 --- a/meilisearch-core/src/automaton/mod.rs +++ b/meilisearch-core/src/automaton/mod.rs @@ -2,7 +2,7 @@ mod dfa; mod query_enhancer; use std::cmp::Reverse; -use std::{cmp, vec}; +use std::{cmp, fmt, vec}; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::DFA; @@ -68,7 +68,6 @@ impl AutomatonGroup { } } -#[derive(Debug)] pub struct Automaton { pub index: usize, pub ngram: usize, @@ -78,6 +77,14 @@ pub struct Automaton { pub query: String, } +impl fmt::Debug for Automaton { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Automaton") + .field("query", &self.query) + .finish() + } +} + impl Automaton { pub fn dfa(&self) -> DFA { if self.is_prefix { diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs index c3e7aba9c..55a19001b 100644 --- a/meilisearch-core/src/criterion/exact.rs +++ b/meilisearch-core/src/criterion/exact.rs @@ -1,18 +1,17 @@ use std::cmp::Ordering; -use meilisearch_schema::SchemaAttr; use sdset::Set; use slice_group_by::GroupBy; use crate::criterion::Criterion; -use crate::RawDocument; +use crate::{AttrCount, RawDocument}; #[inline] fn number_exact_matches( query_index: &[u32], attribute: &[u16], is_exact: &[bool], - fields_counts: &Set<(SchemaAttr, u16)>, + fields_counts: &Set, ) -> usize { let mut count = 0; let mut index = 0; @@ -25,8 +24,8 @@ fn number_exact_matches( if *is_exact { found_exact = true; let attr = &attribute[index + pos]; - if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) { - let (_, count) = fields_counts[pos]; + if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) { + let AttrCount { count, .. } = fields_counts[pos]; if count == 1 { return usize::max_value(); } @@ -50,7 +49,7 @@ impl Criterion for Exact { let query_index = lhs.query_index(); let is_exact = lhs.is_exact(); let attribute = lhs.attribute(); - let fields_counts = &lhs.fields_counts; + let fields_counts = lhs.fields_counts.as_ref().unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; @@ -59,7 +58,7 @@ impl Criterion for Exact { let query_index = rhs.query_index(); let is_exact = rhs.is_exact(); let attribute = rhs.attribute(); - let fields_counts = &rhs.fields_counts; + let fields_counts = rhs.fields_counts.as_ref().unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; @@ -86,7 +85,7 @@ mod tests { let query_index = &[0]; let attribute = &[0]; let is_exact = &[true]; - let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); + let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; @@ -95,7 +94,7 @@ mod tests { let query_index = &[0]; let attribute = &[0]; let is_exact = &[false]; - let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); + let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; @@ -113,7 +112,7 @@ mod tests { let query_index = &[0]; let attribute = &[0]; let is_exact = &[true]; - let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap(); + let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; @@ -122,7 +121,7 @@ mod tests { let query_index = &[0]; let attribute = &[0]; let is_exact = &[true]; - let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap(); + let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap(); number_exact_matches(query_index, attribute, is_exact, fields_counts) }; diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs index 87b4e9021..489b0db43 100644 --- a/meilisearch-core/src/query_builder.rs +++ b/meilisearch-core/src/query_builder.rs @@ -6,6 +6,7 @@ use std::time::{Duration, Instant}; use std::{cmp, mem}; use fst::{IntoStreamer, Streamer}; +use log::debug; use sdset::SetBuf; use slice_group_by::{GroupBy, GroupByMut}; @@ -14,7 +15,7 @@ use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhanc use crate::distinct_map::{BufferedDistinctMap, DistinctMap}; use crate::levenshtein::prefix_damerau_levenshtein; use crate::raw_document::{raw_documents_from, RawDocument}; -use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch}; +use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount}; use crate::{reordered_attrs::ReorderedAttrs, store, MResult}; pub struct QueryBuilder<'c, 'f, 'd> { @@ -146,27 +147,18 @@ fn fetch_raw_documents( searchables: Option<&ReorderedAttrs>, main_store: store::Main, postings_lists_store: store::PostingsLists, - documents_fields_counts_store: store::DocumentsFieldsCounts, ) -> MResult> { let mut matches = Vec::new(); let mut highlights = Vec::new(); + let before_automatons_groups_loop = Instant::now(); for group in automatons_groups { - let AutomatonGroup { - is_phrase_query, - automatons, - } = group; + let AutomatonGroup { is_phrase_query, automatons } = group; let phrase_query_len = automatons.len(); let mut tmp_matches = Vec::new(); for (id, automaton) in automatons.into_iter().enumerate() { - let Automaton { - index, - is_exact, - query_len, - query, - .. - } = automaton; + let Automaton { index, is_exact, query_len, query, .. } = automaton; let dfa = automaton.dfa(); let words = match main_store.words_fst(reader)? { @@ -250,26 +242,26 @@ fn fetch_raw_documents( } } } + debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed()); + let before_multiword_rewrite_matches = Instant::now(); let matches = multiword_rewrite_matches(matches, &query_enhancer); + debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed()); + + let before_highlight_sorting = Instant::now(); let highlights = { highlights.sort_unstable_by_key(|(id, _)| *id); SetBuf::new_unchecked(highlights) }; + debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed()); - let fields_counts = { - let mut fields_counts = Vec::new(); - for group in matches.linear_group_by_key(|(id, ..)| *id) { - let id = group[0].0; - for result in documents_fields_counts_store.document_fields_counts(reader, id)? { - let (attr, count) = result?; - fields_counts.push((id, attr, count)); - } - } - SetBuf::new(fields_counts).unwrap() - }; - Ok(raw_documents_from(matches, highlights, fields_counts)) + let before_raw_documents = Instant::now(); + let raw_documents = raw_documents_from(matches, highlights); + debug!("raw_documents took {:.02?}", before_raw_documents.elapsed()); + debug!("documents to worry about: {}", raw_documents.len()); + + Ok(raw_documents) } impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> { @@ -434,6 +426,11 @@ where for auts in automaton_producer { automatons.push(auts); + for (i, group) in automatons.iter().enumerate() { + debug!("group {} automatons {:?}", i, group.automatons); + } + + let before_fetch_raw_documents = Instant::now(); // we must retrieve the documents associated // with the current automatons let mut raw_documents = fetch_raw_documents( @@ -443,8 +440,8 @@ where searchable_attrs.as_ref(), main_store, postings_lists_store, - documents_fields_counts_store, )?; + debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed()); // stop processing when time is running out if let Some(timeout) = timeout { @@ -468,6 +465,20 @@ where continue; } + // we must pull the fields counts of these documents + // TODO it would be great to had a "dependency" thing for each criterion + // and make it so that we can be lazy on pulling/computing some data. + if criterion.name() == "Exact" { + for document in group.iter_mut() { + let mut fields_counts = Vec::new(); + for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? { + let (attr, count) = result?; + fields_counts.push(AttrCount { attr: attr.0, count }); + } + document.fields_counts = Some(SetBuf::new(fields_counts).unwrap()); + } + } + group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { @@ -561,7 +572,6 @@ where searchable_attrs.as_ref(), main_store, postings_lists_store, - documents_fields_counts_store, )?; // stop processing when time is running out diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs index 1ecb9322d..a37531131 100644 --- a/meilisearch-core/src/raw_document.rs +++ b/meilisearch-core/src/raw_document.rs @@ -1,18 +1,18 @@ use std::fmt; use std::sync::Arc; -use meilisearch_schema::SchemaAttr; use sdset::SetBuf; use slice_group_by::GroupBy; +use log::debug; -use crate::{DocumentId, Highlight, TmpMatch}; +use crate::{DocumentId, Highlight, TmpMatch, AttrCount}; #[derive(Clone)] pub struct RawDocument { pub id: DocumentId, pub matches: SharedMatches, pub highlights: Vec, - pub fields_counts: SetBuf<(SchemaAttr, u16)>, + pub fields_counts: Option>, } impl RawDocument { @@ -100,44 +100,47 @@ impl fmt::Debug for RawDocument { pub fn raw_documents_from( matches: SetBuf<(DocumentId, TmpMatch)>, - highlights: SetBuf<(DocumentId, Highlight)>, - fields_counts: SetBuf<(DocumentId, SchemaAttr, u16)>, + highlights: SetBuf<(DocumentId, Highlight)> ) -> Vec { let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); let mut matches2 = Matches::with_capacity(matches.len()); let matches = matches.linear_group_by_key(|(id, _)| *id); let highlights = highlights.linear_group_by_key(|(id, _)| *id); - let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id); - for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) { - debug_assert_eq!(mgroup[0].0, hgroup[0].0); - debug_assert_eq!(mgroup[0].0, fgroup[0].0); + let mut loops_count = 0; + + for (mgroup, hgroup) in matches.zip(highlights) { + loops_count += 1; + assert_eq!(mgroup[0].0, hgroup[0].0); let document_id = mgroup[0].0; let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0); let end = start + mgroup.len(); let highlights = hgroup.iter().map(|(_, h)| *h).collect(); - let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap(); + let fields_counts = None; docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); + // TODO we could try to keep both data + // - the data oriented one and the raw one, + // - the one that comes from the arguments of this function + // This way we would be able to only produce data oriented lazily. + // + // For example the default first criterion is `SumOfTypos` + // and just needs the `query_index` and the `distance` fields. + // It would probably be good to avoid wasting time sorting other fields of documents + // that will never ever reach the second criterion. matches2.extend_from_slice(mgroup); } + debug!("loops_counts number is {}", loops_count); + let matches = Arc::new(matches2); docs_ranges .into_iter() .map(|(id, range, highlights, fields_counts)| { - let matches = SharedMatches { - range, - matches: matches.clone(), - }; - RawDocument { - id, - matches, - highlights, - fields_counts, - } + let matches = SharedMatches { range, matches: matches.clone() }; + RawDocument { id, matches, highlights, fields_counts } }) .collect() }