From 9c03bb3428d574a15e3770ddc18d81da477dfb91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <renault.cle@gmail.com>
Date: Mon, 9 Dec 2019 15:30:14 +0100
Subject: [PATCH] First probably working phrase query doc filtering

---
 Cargo.lock                          |  1 +
 meilisearch-core/Cargo.toml         |  1 +
 meilisearch-core/src/bucket_sort.rs | 71 ++++++++++++++++++++++++++---
 meilisearch-core/src/criterion2.rs  | 50 +++++++++++++++-----
 4 files changed, 106 insertions(+), 17 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 8034a4add..ad3f3494b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -952,6 +952,7 @@ dependencies = [
  "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
  "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml
index 62da7cfb8..3455f755d 100644
--- a/meilisearch-core/Cargo.toml
+++ b/meilisearch-core/Cargo.toml
@@ -17,6 +17,7 @@ env_logger = "0.7.0"
 fst = { version = "0.3.5", default-features = false }
 hashbrown = { version = "0.6.0", features = ["serde"] }
 heed = "0.6.1"
+itertools = "0.8.2" # provides merge_join_by, used for phrase query document filtering
 levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
 log = "0.4.8"
 meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" }
diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index 303e94e50..ae0fdf63f 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -59,11 +59,9 @@ pub fn bucket_sort<'c>(
     let before_raw_documents_building = Instant::now();
     let mut raw_documents = Vec::new();
     for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
-        raw_documents.push(RawDocument {
-            raw_matches,
-            processed_matches: Vec::new(),
-            processed_distances: Vec::new(),
-        });
+        if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &arena) {
+            raw_documents.push(raw_document);
+        }
     }
     debug!("creating {} candidates documents took {:.02?}",
         raw_documents.len(),
@@ -149,6 +147,57 @@ pub struct RawDocument<'a, 'tag> {
     pub processed_distances: Vec<Option<u8>>,
 }
 
+impl<'a, 'tag> RawDocument<'a, 'tag> {
+    /// Sorts `raw_matches` by `query_index` and wraps them in a `RawDocument`, or returns
+    /// `None` when a phrase query is not satisfied: both words must be matched by consecutive
+    /// query indexes and appear side by side (same attribute, adjacent word indexes).
+    fn new<'txn>(
+        raw_matches: &'a mut [BareMatch<'tag>],
+        automatons: &[QueryWordAutomaton],
+        postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
+    ) -> Option<RawDocument<'a, 'tag>> {
+        raw_matches.sort_unstable_by_key(|m| m.query_index);
+
+        let mut previous_word = None;
+        for i in 0..raw_matches.len() {
+            let a = &raw_matches[i];
+            let auta = &automatons[a.query_index as usize];
+
+            match auta.phrase_query {
+                Some((0, _)) => {
+                    previous_word = Some(a.query_index);
+                    let b = raw_matches.get(i + 1)?;
+                    // Checked arithmetic: a `u16::MAX` query index must not overflow.
+                    if a.query_index.checked_add(1) != Some(b.query_index) {
+                        return None;
+                    }
+                    let pla = &postings_lists[a.postings_list];
+                    let plb = &postings_lists[b.postings_list];
+                    // `Both` is yielded when a position of `a` directly precedes one of `b`.
+                    let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
+                        a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
+                    });
+                    if !iter.any(|eb| eb.is_both()) { return None }
+                },
+                Some((1, _)) => {
+                    // A query index of 0 has no predecessor: reject instead of underflowing.
+                    if previous_word.take() != Some(a.query_index.checked_sub(1)?) {
+                        return None;
+                    }
+                },
+                Some((_, _)) => unreachable!(),
+                None => (),
+            }
+        }
+
+        Some(RawDocument {
+            raw_matches,
+            processed_matches: Vec::new(),
+            processed_distances: Vec::new(),
+        })
+    }
+}
+
 pub struct BareMatch<'tag> {
     pub document_id: DocumentId,
     pub query_index: u16,
@@ -186,6 +235,15 @@ pub struct PostingsListView<'txn> {
     len: usize,
 }
 
+impl fmt::Debug for PostingsListView<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("PostingsListView")
+            .field("input", &String::from_utf8_lossy(&self.input)) // lossy: Debug must not panic on invalid UTF-8
+            .field("postings_list", &self.as_ref())
+            .finish()
+    }
+}
+
 impl<'txn> PostingsListView<'txn> {
     pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
         let len = postings_list.len();
@@ -275,6 +333,7 @@ fn fetch_matches<'txn, 'tag>(
                 let input = Rc::from(input);
                 let postings_list = Rc::new(postings_list);
                 let postings_list_view = PostingsListView::new(input, postings_list);
+
                 let mut offset = 0;
                 for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
 
@@ -442,7 +501,7 @@ fn construct_automatons2(
                 }
             }
 
-            if false && n == 1 {
+            if true && n == 1 {
                 if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
                     let mut left_automaton = QueryWordAutomaton::exact(left);
                     left_automaton.phrase_query = Some((0, 2));
diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs
index 4c40b9969..3bfbe76ea 100644
--- a/meilisearch-core/src/criterion2.rs
+++ b/meilisearch-core/src/criterion2.rs
@@ -43,16 +43,42 @@ pub trait Criterion {
     }
 }
 
-fn prepare_query_distances(
-    documents: &mut [RawDocument],
+fn prepare_query_distances<'a, 'tag, 'txn>(
+    documents: &mut [RawDocument<'a, 'tag>],
     query_enhancer: &QueryEnhancer,
     automatons: &[QueryWordAutomaton],
+    postings_lists: &PostingsListsArena<'tag, 'txn>,
 ) {
     for document in documents {
         if !document.processed_distances.is_empty() { continue }
 
+        // debug!("{:?}", document.raw_matches[0].document_id);
+
         let mut processed = Vec::new();
-        for m in document.raw_matches.iter() {
+        let mut raw_matches = document.raw_matches.iter().peekable();
+        while let Some(m) = raw_matches.next() {
+
+            // let automaton = &automatons[m.query_index as usize];
+
+            // debug!("{:?} {:?}", m, automaton);
+            // debug!("{:?}", &postings_lists[m.postings_list]);
+
+            // match automaton.phrase_query {
+            //     Some((0, len)) => {
+            //         match raw_matches.peek() {
+            //             Some(BareMatch { query_index, .. }) => {
+            //                 if *query_index != m.query_index + 1 {
+            //                     raw_matches.next();
+            //                     continue
+            //                 }
+            //             },
+            //             None => continue,
+            //         }
+            //     },
+            //     Some((_, _)) => continue,
+            //     None => (),
+            // }
+
             // FIXME we really need to take splitted words into account
             //       those must be seen at the same level as the non-splitteds
             // if automatons[m.query_index as usize].phrase_query.is_some() {
@@ -73,6 +99,8 @@ fn prepare_query_distances(
             }
         }
 
+        // debug!("{:?}", processed);
+
         document.processed_distances = processed;
     }
 }
@@ -82,14 +110,14 @@ pub struct Typo;
 impl Criterion for Typo {
     fn name(&self) -> &str { "typo" }
 
-    fn prepare(
+    fn prepare<'a, 'tag, 'txn>(
         &self,
-        documents: &mut [RawDocument],
-        postings_lists: &mut PostingsListsArena,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
         query_enhancer: &QueryEnhancer,
         automatons: &[QueryWordAutomaton],
     ) {
-        prepare_query_distances(documents, query_enhancer, automatons);
+        prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
     }
 
     fn evaluate(
@@ -140,14 +168,14 @@ pub struct Words;
 impl Criterion for Words {
     fn name(&self) -> &str { "words" }
 
-    fn prepare(
+    fn prepare<'a, 'tag, 'txn>(
         &self,
-        documents: &mut [RawDocument],
-        postings_lists: &mut PostingsListsArena,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
         query_enhancer: &QueryEnhancer,
         automatons: &[QueryWordAutomaton],
     ) {
-        prepare_query_distances(documents, query_enhancer, automatons);
+        prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
     }
 
     fn evaluate(