From ef6a4db18246e26d0fe13ac84b8f9074cafc9869 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Fri, 29 Nov 2019 16:31:47 +0100
Subject: [PATCH] Before improving fields AttrCount

Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs

ugly-test: Make the fields_count fetching lazy

Just before running the exactness criterion
---
 meilisearch-core/src/automaton/mod.rs   | 11 ++++-
 meilisearch-core/src/criterion/exact.rs | 21 ++++----
 meilisearch-core/src/query_builder.rs   | 64 ++++++++++++++-----------
 meilisearch-core/src/raw_document.rs    | 43 +++++++++--------
 4 files changed, 79 insertions(+), 60 deletions(-)
diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs
index a803eee8e..782049942 100644
--- a/meilisearch-core/src/automaton/mod.rs
+++ b/meilisearch-core/src/automaton/mod.rs
@@ -2,7 +2,7 @@ mod dfa;
 mod query_enhancer;
 
 use std::cmp::Reverse;
-use std::{cmp, vec};
+use std::{cmp, fmt, vec};
 
 use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::DFA;
@@ -68,7 +68,6 @@ impl AutomatonGroup {
     }
 }
 
-#[derive(Debug)]
 pub struct Automaton {
     pub index: usize,
     pub ngram: usize,
@@ -78,6 +77,14 @@ pub struct Automaton {
     pub query: String,
 }
 
+impl fmt::Debug for Automaton {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Automaton")
+            .field("query", &self.query)
+            .finish()
+    }
+}
+
 impl Automaton {
     pub fn dfa(&self) -> DFA {
         if self.is_prefix {
diff --git a/meilisearch-core/src/criterion/exact.rs b/meilisearch-core/src/criterion/exact.rs
index c3e7aba9c..55a19001b 100644
--- a/meilisearch-core/src/criterion/exact.rs
+++ b/meilisearch-core/src/criterion/exact.rs
@@ -1,18 +1,17 @@
 use std::cmp::Ordering;
 
-use meilisearch_schema::SchemaAttr;
 use sdset::Set;
 use slice_group_by::GroupBy;
 
 use crate::criterion::Criterion;
-use crate::RawDocument;
+use crate::{AttrCount, RawDocument};
 
 #[inline]
 fn number_exact_matches(
     query_index: &[u32],
     attribute: &[u16],
     is_exact: &[bool],
-    fields_counts: &Set<(SchemaAttr, u16)>,
+    fields_counts: &Set<AttrCount>,
 ) -> usize {
     let mut count = 0;
     let mut index = 0;
@@ -25,8 +24,8 @@ fn number_exact_matches(
             if *is_exact {
                 found_exact = true;
                 let attr = &attribute[index + pos];
-                if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
-                    let (_, count) = fields_counts[pos];
+                if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) {
+                    let AttrCount { count, .. } = fields_counts[pos];
                     if count == 1 {
                         return usize::max_value();
                     }
@@ -50,7 +49,7 @@ impl Criterion for Exact {
             let query_index = lhs.query_index();
             let is_exact = lhs.is_exact();
             let attribute = lhs.attribute();
-            let fields_counts = &lhs.fields_counts;
+            let fields_counts = lhs.fields_counts.as_ref().unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
@@ -59,7 +58,7 @@ impl Criterion for Exact {
             let query_index = rhs.query_index();
             let is_exact = rhs.is_exact();
             let attribute = rhs.attribute();
-            let fields_counts = &rhs.fields_counts;
+            let fields_counts = rhs.fields_counts.as_ref().unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
@@ -86,7 +85,7 @@ mod tests {
             let query_index = &[0];
             let attribute = &[0];
             let is_exact = &[true];
-            let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
+            let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
@@ -95,7 +94,7 @@ mod tests {
             let query_index = &[0];
             let attribute = &[0];
             let is_exact = &[false];
-            let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
+            let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
@@ -113,7 +112,7 @@ mod tests {
             let query_index = &[0];
             let attribute = &[0];
             let is_exact = &[true];
-            let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
+            let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
@@ -122,7 +121,7 @@ mod tests {
             let query_index = &[0];
             let attribute = &[0];
             let is_exact = &[true];
-            let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
+            let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap();
 
             number_exact_matches(query_index, attribute, is_exact, fields_counts)
         };
diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs
index 87b4e9021..489b0db43 100644
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -6,6 +6,7 @@ use std::time::{Duration, Instant};
 use std::{cmp, mem};
 
 use fst::{IntoStreamer, Streamer};
+use log::debug;
 use sdset::SetBuf;
 use slice_group_by::{GroupBy, GroupByMut};
 
@@ -14,7 +15,7 @@ use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhanc
 use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
 use crate::levenshtein::prefix_damerau_levenshtein;
 use crate::raw_document::{raw_documents_from, RawDocument};
-use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
+use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount};
 use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
 
 pub struct QueryBuilder<'c, 'f, 'd> {
@@ -146,27 +147,18 @@ fn fetch_raw_documents(
     searchables: Option<&ReorderedAttrs>,
     main_store: store::Main,
     postings_lists_store: store::PostingsLists,
-    documents_fields_counts_store: store::DocumentsFieldsCounts,
 ) -> MResult<Vec<RawDocument>> {
     let mut matches = Vec::new();
     let mut highlights = Vec::new();
 
+    let before_automatons_groups_loop = Instant::now();
     for group in automatons_groups {
-        let AutomatonGroup {
-            is_phrase_query,
-            automatons,
-        } = group;
+        let AutomatonGroup { is_phrase_query, automatons } = group;
         let phrase_query_len = automatons.len();
 
         let mut tmp_matches = Vec::new();
         for (id, automaton) in automatons.into_iter().enumerate() {
-            let Automaton {
-                index,
-                is_exact,
-                query_len,
-                query,
-                ..
-            } = automaton;
+            let Automaton { index, is_exact, query_len, query, .. } = automaton;
             let dfa = automaton.dfa();
 
             let words = match main_store.words_fst(reader)? {
@@ -250,26 +242,26 @@ fn fetch_raw_documents(
             }
         }
     }
+    debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
 
+    let before_multiword_rewrite_matches = Instant::now();
     let matches = multiword_rewrite_matches(matches, &query_enhancer);
+    debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed());
+
+    let before_highlight_sorting = Instant::now();
     let highlights = {
         highlights.sort_unstable_by_key(|(id, _)| *id);
         SetBuf::new_unchecked(highlights)
     };
+    debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());
 
-    let fields_counts = {
-        let mut fields_counts = Vec::new();
-        for group in matches.linear_group_by_key(|(id, ..)| *id) {
-            let id = group[0].0;
-            for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
-                let (attr, count) = result?;
-                fields_counts.push((id, attr, count));
-            }
-        }
-        SetBuf::new(fields_counts).unwrap()
-    };
 
-    Ok(raw_documents_from(matches, highlights, fields_counts))
+    let before_raw_documents = Instant::now();
+    let raw_documents = raw_documents_from(matches, highlights);
+    debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
+    debug!("documents to worry about: {}", raw_documents.len());
+
+    Ok(raw_documents)
 }
 
 impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
@@ -434,6 +426,11 @@ where
     for auts in automaton_producer {
         automatons.push(auts);
 
+        for (i, group) in automatons.iter().enumerate() {
+            debug!("group {} automatons {:?}", i, group.automatons);
+        }
+
+        let before_fetch_raw_documents = Instant::now();
         // we must retrieve the documents associated
         // with the current automatons
         let mut raw_documents = fetch_raw_documents(
@@ -443,8 +440,8 @@ where
             searchable_attrs.as_ref(),
             main_store,
             postings_lists_store,
-            documents_fields_counts_store,
         )?;
+        debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed());
 
         // stop processing when time is running out
         if let Some(timeout) = timeout {
@@ -468,6 +465,20 @@ where
                     continue;
                 }
 
+                // we must pull the fields counts of these documents
+                // TODO it would be great to had a "dependency" thing for each criterion
+                //      and make it so that we can be lazy on pulling/computing some data.
+                if criterion.name() == "Exact" {
+                    for document in group.iter_mut() {
+                        let mut fields_counts = Vec::new();
+                        for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? {
+                            let (attr, count) = result?;
+                            fields_counts.push(AttrCount { attr: attr.0, count });
+                        }
+                        document.fields_counts = Some(SetBuf::new(fields_counts).unwrap());
+                    }
+                }
+
                 group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
 
                 for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
@@ -561,7 +572,6 @@ where
             searchable_attrs.as_ref(),
             main_store,
             postings_lists_store,
-            documents_fields_counts_store,
         )?;
 
         // stop processing when time is running out
diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs
index 1ecb9322d..a37531131 100644
--- a/meilisearch-core/src/raw_document.rs
+++ b/meilisearch-core/src/raw_document.rs
@@ -1,18 +1,18 @@
 use std::fmt;
 use std::sync::Arc;
 
-use meilisearch_schema::SchemaAttr;
 use sdset::SetBuf;
 use slice_group_by::GroupBy;
+use log::debug;
 
-use crate::{DocumentId, Highlight, TmpMatch};
+use crate::{DocumentId, Highlight, TmpMatch, AttrCount};
 
 #[derive(Clone)]
 pub struct RawDocument {
     pub id: DocumentId,
     pub matches: SharedMatches,
     pub highlights: Vec<Highlight>,
-    pub fields_counts: SetBuf<(SchemaAttr, u16)>,
+    pub fields_counts: Option<SetBuf<AttrCount>>,
 }
 
 impl RawDocument {
@@ -100,44 +100,47 @@ impl fmt::Debug for RawDocument {
 
 pub fn raw_documents_from(
     matches: SetBuf<(DocumentId, TmpMatch)>,
-    highlights: SetBuf<(DocumentId, Highlight)>,
-    fields_counts: SetBuf<(DocumentId, SchemaAttr, u16)>,
+    highlights: SetBuf<(DocumentId, Highlight)>
 ) -> Vec<RawDocument> {
     let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
     let mut matches2 = Matches::with_capacity(matches.len());
 
     let matches = matches.linear_group_by_key(|(id, _)| *id);
     let highlights = highlights.linear_group_by_key(|(id, _)| *id);
-    let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
 
-    for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
-        debug_assert_eq!(mgroup[0].0, hgroup[0].0);
-        debug_assert_eq!(mgroup[0].0, fgroup[0].0);
+    let mut loops_count = 0;
+
+    for (mgroup, hgroup) in matches.zip(highlights) {
+        loops_count += 1;
+        assert_eq!(mgroup[0].0, hgroup[0].0);
 
         let document_id = mgroup[0].0;
         let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
         let end = start + mgroup.len();
         let highlights = hgroup.iter().map(|(_, h)| *h).collect();
-        let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
+        let fields_counts = None;
 
         docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
+        // TODO we could try to keep both data
+        //  - the data oriented one and the raw one,
+        //  - the one that comes from the arguments of this function
+        // This way we would be able to only produce data oriented lazily.
+        //
+        // For example the default first criterion is `SumOfTypos`
+        // and just needs the `query_index` and the `distance` fields.
+        // It would probably be good to avoid wasting time sorting other fields of documents
+        // that will never ever reach the second criterion.
         matches2.extend_from_slice(mgroup);
     }
 
+    debug!("loops_counts number is {}", loops_count);
+
     let matches = Arc::new(matches2);
     docs_ranges
         .into_iter()
         .map(|(id, range, highlights, fields_counts)| {
-            let matches = SharedMatches {
-                range,
-                matches: matches.clone(),
-            };
-            RawDocument {
-                id,
-                matches,
-                highlights,
-                fields_counts,
-            }
+            let matches = SharedMatches { range, matches: matches.clone() };
+            RawDocument { id, matches, highlights, fields_counts }
         })
         .collect()
 }