From 0f4c0beffd0a25437c2e58cfb2f7686ab17da87f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Mar 2021 11:48:55 +0100 Subject: [PATCH 01/45] Introduce the Attribute criterion --- Cargo.lock | 7 ++ milli/Cargo.toml | 1 + milli/src/search/criteria/attribute.rs | 133 +++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 4 + 4 files changed, 145 insertions(+) create mode 100644 milli/src/search/criteria/attribute.rs diff --git a/Cargo.lock b/Cargo.lock index bbe86a2a7..065be362f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,12 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "big_s" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8" + [[package]] name = "bincode" version = "1.3.1" @@ -1251,6 +1257,7 @@ name = "milli" version = "0.1.1" dependencies = [ "anyhow", + "big_s", "bstr", "byteorder", "chrono", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b198131c1..eefdfa7d5 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -52,6 +52,7 @@ logging_timer = "1.0.0" tinytemplate = "=1.1.0" [dev-dependencies] +big_s = "1.0.2" criterion = "0.3.4" maplit = "1.0.2" rand = "0.8.3" diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs new file mode 100644 index 000000000..9c31740b1 --- /dev/null +++ b/milli/src/search/criteria/attribute.rs @@ -0,0 +1,133 @@ +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::criteria::Query; +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; +use super::{Criterion, CriterionResult, Context}; + +pub struct Attribute<'t> { + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + bucket_candidates: RoaringBitmap, + parent: Option>, +} + +impl<'t> Attribute<'t> { + pub fn 
initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> Self + { + Attribute { + ctx, + query_tree, + candidates, + bucket_candidates: RoaringBitmap::new(), + parent: None, + } + } + + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Attribute { + ctx, + query_tree: None, + candidates: None, + bucket_candidates: RoaringBitmap::new(), + parent: Some(parent), + } + } +} + +impl<'t> Criterion for Attribute<'t> { + #[logging_timer::time("Attribute::{}")] + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + todo!("Attribute") + } +} + +// TODO can we keep refs of Query +fn explode_query_tree(query_tree: &Operation) -> Vec> { + use crate::search::criteria::Operation::{And, Or, Consecutive}; + + fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec> { + match tail.split_first() { + Some((thead, tail)) => { + let tail = and_recurse(thead, tail); + let mut out = Vec::new(); + for array in recurse(head) { + for tail_array in &tail { + let mut array = array.clone(); + array.extend(tail_array.iter().cloned()); + out.push(array); + } + } + out + }, + None => recurse(head), + } + } + + fn recurse(op: &Operation) -> Vec> { + match op { + And(ops) | Consecutive(ops) => { + ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) + }, + Or(_, ops) => ops.into_iter().map(recurse).flatten().collect(), + Operation::Query(query) => vec![vec![query.clone()]], + } + } + + recurse(query_tree) +} + +#[cfg(test)] +mod tests { + use big_s::S; + + use crate::search::criteria::QueryKind; + use super::*; + + #[test] + fn simple_explode_query_tree() { + let query_tree = Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), + ]), + Operation::And(vec![ + 
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), + ]), + ]), + ]), + ]); + + let expected = vec![ + vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }], + vec![ + Query { prefix: false, kind: QueryKind::exact(S("manythe")) }, + Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + ], + vec![ + Query { prefix: false, kind: QueryKind::exact(S("many")) }, + Query { prefix: false, kind: QueryKind::exact(S("thefish")) }, + ], + vec![ + Query { prefix: false, kind: QueryKind::exact(S("many")) }, + Query { prefix: false, kind: QueryKind::exact(S("the")) }, + Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + ], + ]; + + let result = explode_query_tree(&query_tree); + assert_eq!(expected, result); + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 22f081871..8d9c21f6e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -12,12 +12,14 @@ use self::typo::Typo; use self::words::Words; use self::asc_desc::AscDesc; use self::proximity::Proximity; +use self::attribute::Attribute; use self::fetcher::Fetcher; mod typo; mod words; mod asc_desc; mod proximity; +mod attribute; pub mod fetcher; pub trait Criterion { @@ -139,6 +141,7 @@ impl<'t> CriteriaBuilder<'t> { Name::Typo => Box::new(Typo::new(self, father)), Name::Words => Box::new(Words::new(self, father)), Name::Proximity => Box::new(Proximity::new(self, father)), + Name::Attribute => Box::new(Attribute::new(self, father)), Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?), Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?), 
_otherwise => father, @@ -147,6 +150,7 @@ impl<'t> CriteriaBuilder<'t> { Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), + Name::Attribute => Box::new(Attribute::initial(self, query_tree.take(), facet_candidates.take())), Name::Asc(field) => { Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) }, From 4ff67ec2ee16d9b02362c85ab582dad9898b4a66 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 11 Mar 2021 17:31:02 +0100 Subject: [PATCH 02/45] Implement attribute criterion for small amounts of candidates --- milli/src/search/criteria/attribute.rs | 164 +++++++++++++++++++++++-- 1 file changed, 157 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 9c31740b1..7f8b5c622 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,10 +1,13 @@ -use log::debug; +use std::collections::{BTreeMap, HashMap, btree_map}; +use std::mem::take; + use roaring::RoaringBitmap; +use crate::{search::build_dfa}; use crate::search::criteria::Query; -use crate::search::query_tree::Operation; +use crate::search::query_tree::{Operation, QueryKind}; use crate::search::WordDerivationsCache; -use super::{Criterion, CriterionResult, Context}; +use super::{Criterion, CriterionResult, Context, resolve_query_tree}; pub struct Attribute<'t> { ctx: &'t dyn Context, @@ -12,6 +15,8 @@ pub struct Attribute<'t> { candidates: Option, bucket_candidates: RoaringBitmap, parent: Option>, + flattened_query_tree: Option>>, + current_buckets: Option>, } impl<'t> Attribute<'t> { @@ -27,6 +32,8 @@ impl<'t> Attribute<'t> { candidates, bucket_candidates: RoaringBitmap::new(), parent: None, + flattened_query_tree: None, 
+ current_buckets: None, } } @@ -37,6 +44,8 @@ impl<'t> Attribute<'t> { candidates: None, bucket_candidates: RoaringBitmap::new(), parent: Some(parent), + flattened_query_tree: None, + current_buckets: None, } } } @@ -44,12 +53,153 @@ impl<'t> Attribute<'t> { impl<'t> Criterion for Attribute<'t> { #[logging_timer::time("Attribute::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { - todo!("Attribute") + loop { + match (&self.query_tree, &mut self.candidates) { + (_, Some(candidates)) if candidates.is_empty() => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + (Some(qt), Some(candidates)) => { + let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| flatten_query_tree(&qt)); + let current_buckets = if let Some(current_buckets) = self.current_buckets.as_mut() { + current_buckets + } else { + let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + self.current_buckets.get_or_insert(new_buckets.into_iter()) + }; + + let found_candidates = if let Some((_score, candidates)) = current_buckets.next() { + candidates + } else { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }; + candidates.difference_with(&found_candidates); + + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => found_candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: Some(found_candidates), + bucket_candidates: bucket_candidates, + })); + }, + (Some(qt), None) => { + let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; + self.bucket_candidates.union_with(&query_tree_candidates); + self.candidates = 
Some(query_tree_candidates); + }, + (None, Some(_)) => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + (None, None) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next(wdcache)? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree; + self.candidates = candidates; + self.bucket_candidates.union_with(&bucket_candidates); + self.flattened_query_tree = None; + self.current_buckets = None; + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } } } +fn linear_compute_candidates( + ctx: &dyn Context, + branches: &Vec>, + allowed_candidates: &RoaringBitmap, +) -> anyhow::Result> +{ + fn compute_candidate_rank(branches: &Vec>, words_positions: HashMap) -> u64 { + let mut min_rank = u64::max_value(); + for branch in branches { + let mut branch_rank = 0; + for Query { prefix, kind } in branch { + // find the best position of the current word in the document. + let position = match kind { + QueryKind::Exact { word, .. } => { + if *prefix { + word_derivations(word, true, 0, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + } else { + words_positions.get(word) + .map(|positions| positions.iter().next()) + .flatten() + } + }, + QueryKind::Tolerant { typo, word } => { + word_derivations(word, *prefix, *typo, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + }, + }; + + // if a position is found, we add it to the branch score, + // otherwise the branch is considered as unfindable in this document and we break. 
+ if let Some(position) = position { + branch_rank += position as u64; + } else { + branch_rank = u64::max_value(); + break; + } + } + min_rank = min_rank.min(branch_rank); + } + + min_rank + } + + fn word_derivations<'a>( + word: &str, + is_prefix: bool, + max_typo: u8, + words_positions: &'a HashMap, + ) -> impl Iterator + { + let dfa = build_dfa(word, max_typo, is_prefix); + words_positions.iter().filter_map(move |(document_word, positions)| { + use levenshtein_automata::Distance; + match dfa.eval(document_word) { + Distance::Exact(_) => Some(positions), + Distance::AtLeast(_) => None, + } + }) + } + + let mut candidates = BTreeMap::new(); + for docid in allowed_candidates { + let words_positions = ctx.docid_words_positions(docid)?; + let rank = compute_candidate_rank(branches, words_positions); + candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid); + } + + Ok(candidates) +} + // TODO can we keep refs of Query -fn explode_query_tree(query_tree: &Operation) -> Vec> { +fn flatten_query_tree(query_tree: &Operation) -> Vec> { use crate::search::criteria::Operation::{And, Or, Consecutive}; fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec> { @@ -91,7 +241,7 @@ mod tests { use super::*; #[test] - fn simple_explode_query_tree() { + fn simple_flatten_query_tree() { let query_tree = Operation::Or(false, vec![ Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), Operation::And(vec![ @@ -127,7 +277,7 @@ mod tests { ], ]; - let result = explode_query_tree(&query_tree); + let result = flatten_query_tree(&query_tree); assert_eq!(expected, result); } } From 75e7b1e3dadb46c0761e07a14b3a70dfe6e3c01d Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Mar 2021 13:49:55 +0100 Subject: [PATCH 03/45] Implement test Context methods --- milli/src/search/criteria/mod.rs | 99 +++++++++++++++++++------------- 1 file changed, 59 insertions(+), 40 deletions(-) diff --git a/milli/src/search/criteria/mod.rs 
b/milli/src/search/criteria/mod.rs index 8d9c21f6e..1d7026d71 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -366,6 +366,7 @@ pub mod test { word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + docid_words: HashMap>, } impl<'a> Context for TestContext<'a> { @@ -399,8 +400,17 @@ pub mod test { self.word_prefix_docids.contains_key(&word.to_string()) } - fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result> { - todo!() + fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + if let Some(docid_words) = self.docid_words.get(&docid) { + Ok(docid_words + .iter() + .enumerate() + .map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))) + .collect() + ) + } else { + Ok(HashMap::new()) + } } } @@ -435,50 +445,58 @@ pub mod test { s("morning") => random_postings(rng, 125), }; + let mut docid_words = HashMap::new(); + for (word, docids) in word_docids.iter() { + for docid in docids { + let words = docid_words.entry(docid).or_insert(vec![]); + words.push(word.clone()); + } + } + let word_prefix_docids = hashmap!{ s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], }; - let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")]; - let hello_world_split = (hello_world.len() / 2) as usize; - let hello_world_1 = hello_world.iter().take(hello_world_split).collect(); - let hello_world_2 = hello_world.iter().skip(hello_world_split).collect(); - - let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")]; - let hello_word_split = (hello_word.len() / 2) as usize; - let hello_word_4 = hello_word.iter().take(hello_word_split).collect(); - let hello_word_6 = 
hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect(); - let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect(); - let word_pair_proximity_docids = hashmap!{ - (s("good"), s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")], - (s("hello"), s("world"), 1) => hello_world_1, - (s("hello"), s("world"), 4) => hello_world_2, - (s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")], - (s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], - (s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), - (s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], - (s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), - (s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")], - (s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")], - (s("hello"), s("word"), 4) => hello_word_4, - (s("hello"), s("word"), 6) => hello_word_6, - (s("hello"), s("word"), 7) => hello_word_7, - (s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")], - (s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], - (s("this"), s("ngrams"), 1) => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")], - (s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], - }; - - let word_prefix_pair_proximity_docids = hashmap!{ - (s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 
1)).unwrap().clone(), - (s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(), - (s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(), - (s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(), - (s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(), - (s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(), - }; + let mut word_pair_proximity_docids = HashMap::new(); + let mut word_prefix_pair_proximity_docids = HashMap::new(); + for (lword, lcandidates) in &word_docids { + for (rword, rcandidates) in &word_docids { + if lword == rword { continue } + let candidates = lcandidates & rcandidates; + for candidate in candidates { + if let Some(docid_words) = docid_words.get(&candidate) { + let lposition = docid_words.iter().position(|w| w == lword).unwrap(); + let rposition = docid_words.iter().position(|w| w == rword).unwrap(); + let key = if lposition < rposition { + (s(lword), s(rword), (rposition - lposition) as i32) + } else { + (s(lword), s(rword), (lposition - rposition + 1) as i32) + }; + let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + docids.push(candidate); + } + } + } + for (pword, pcandidates) in &word_prefix_docids { + if lword.starts_with(pword) { continue } + let candidates = lcandidates & pcandidates; + for candidate in candidates { + if let Some(docid_words) = docid_words.get(&candidate) { + let lposition = docid_words.iter().position(|w| w == lword).unwrap(); + let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); + let key = if lposition < rposition { + (s(lword), 
s(pword), (rposition - lposition) as i32) + } else { + (s(lword), s(pword), (lposition - rposition + 1) as i32) + }; + let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + docids.push(candidate); + } + } + } + } let mut keys = word_docids.keys().collect::>(); keys.sort_unstable(); @@ -490,6 +508,7 @@ pub mod test { word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + docid_words, } } } From b0a417f342de6afe8678c628b5e3be9c30f9c302 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Mar 2021 17:24:35 +0100 Subject: [PATCH 04/45] Introduce the word_level_position_docids Index database --- infos/src/main.rs | 1 + milli/src/heed_codec/mod.rs | 2 + .../heed_codec/str_level_position_codec.rs | 42 +++++++++++++++++++ milli/src/index.rs | 8 +++- milli/src/lib.rs | 2 +- milli/src/update/clear_documents.rs | 1 + milli/src/update/delete_documents.rs | 1 + 7 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 milli/src/heed_codec/str_level_position_codec.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index cc1727a68..356a5417c 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -319,6 +319,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, documents, diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index a070c66eb..cc73cdc65 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -2,6 +2,7 @@ mod beu32_str_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; +mod str_level_position_codec; mod str_str_u8_codec; pub mod facet; @@ -9,4 +10,5 @@ pub use self::beu32_str_codec::BEU32StrCodec; pub use self::obkv_codec::ObkvCodec; pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, 
RoaringBitmapCodec}; pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; +pub use self::str_level_position_codec::StrLevelPositionCodec; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs new file mode 100644 index 000000000..c421c04b5 --- /dev/null +++ b/milli/src/heed_codec/str_level_position_codec.rs @@ -0,0 +1,42 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::mem::size_of; +use std::str; + +pub struct StrLevelPositionCodec; + +impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { + type DItem = (&'a str, u8, u32, u32); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let footer_len = size_of::() + size_of::() * 2; + + if bytes.len() < footer_len { return None } + + let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let word = str::from_utf8(word).ok()?; + + let (level, bytes) = bytes.split_first()?; + let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?; + let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?; + + Some((word, *level, left, right)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { + type EItem = (&'a str, u8, u32, u32); + + fn bytes_encode((word, level, left, right): &Self::EItem) -> Option> { + let left = left.to_be_bytes(); + let right = right.to_be_bytes(); + + let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len()); + bytes.extend_from_slice(word.as_bytes()); + bytes.push(*level); + bytes.extend_from_slice(&left[..]); + bytes.extend_from_slice(&right[..]); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 045eabc3c..0659b207a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -12,7 +12,7 @@ use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, use crate::{BEU32, DocumentId, ExternalDocumentsIds, 
FieldId}; use crate::{ BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, - ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec, + ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, }; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; @@ -52,6 +52,8 @@ pub struct Index { pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, + /// Maps the word, level and position range with the docids that corresponds to it. + pub word_level_position_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. pub facet_field_id_value_docids: Database, /// Maps the document id, the facet field id and the globally ordered value. @@ -62,7 +64,7 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(9); + options.max_dbs(10); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -71,6 +73,7 @@ impl Index { let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; + let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; @@ -94,6 +97,7 @@ impl Index { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, 
field_id_docid_facet_values, documents, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index fe9bd828b..de5c6511e 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,7 @@ use serde_json::{Map, Value}; pub use self::criterion::{Criterion, default_criteria}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; -pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; +pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 2c24d9c07..250e4b13a 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -28,6 +28,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 8a2ba9bbf..b60b7bac2 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -88,6 +88,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, From 9242f2f1d451807e45f29462c0126c992d8950af Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Mar 2021 17:23:46 +0100 Subject: [PATCH 05/45] Store the first word positions levels --- .../update/index_documents/merge_function.rs | 4 + milli/src/update/index_documents/mod.rs | 42 +++- 
milli/src/update/index_documents/store.rs | 58 +++++- milli/src/update/mod.rs | 2 + milli/src/update/words_level_positions.rs | 184 ++++++++++++++++++ 5 files changed, 284 insertions(+), 6 deletions(-) create mode 100644 milli/src/update/words_level_positions.rs diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 6f24fcad9..54f994fc0 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -52,6 +52,10 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) - cbo_roaring_bitmap_merge(values) } +pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) +} + pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { cbo_roaring_bitmap_merge(values) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 52949c13c..8fc35b654 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -18,11 +18,12 @@ use rayon::prelude::*; use serde::{Serialize, Deserialize}; use crate::index::Index; -use crate::update::{Facets, WordsPrefixes, UpdateIndexingStep}; +use crate::update::{Facets, WordsLevelPositions, WordsPrefixes, UpdateIndexingStep}; use self::store::{Store, Readers}; pub use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, + docid_word_positions_merge, documents_merge, + word_level_position_docids_merge, facet_field_value_docids_merge, field_id_docid_facet_values_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -402,6 +403,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { enum DatabaseType { Main, WordDocids, + WordLevel0PositionDocids, FacetLevel0ValuesDocids, } @@ 
-467,6 +469,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut word_docids_readers = Vec::with_capacity(readers.len()); let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); + let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len()); let mut documents_readers = Vec::with_capacity(readers.len()); @@ -476,6 +479,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { word_docids, docid_word_positions, words_pairs_proximities_docids, + word_level_position_docids, facet_field_value_docids, field_id_docid_facet_values, documents @@ -484,6 +488,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { word_docids_readers.push(word_docids); docid_word_positions_readers.push(docid_word_positions); words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); + word_level_position_docids_readers.push(word_level_position_docids); facet_field_value_docids_readers.push(facet_field_value_docids); field_id_docid_facet_values_readers.push(field_id_docid_facet_values); documents_readers.push(documents); @@ -514,6 +519,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { facet_field_value_docids_readers, facet_field_value_docids_merge, ), + ( + DatabaseType::WordLevel0PositionDocids, + word_level_position_docids_readers, + word_level_position_docids_merge, + ), ] .into_par_iter() .for_each(|(dbtype, readers, merge)| { @@ -569,7 +579,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; - let total_databases = 7; + let total_databases = 8; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, @@ -661,7 +671,7 @@ impl<'t, 'u, 'i, 'a> 
IndexDocuments<'t, 'u, 'i, 'a> { )?; }, DatabaseType::FacetLevel0ValuesDocids => { - debug!("Writing the facet values docids into LMDB on disk..."); + debug!("Writing the facet level 0 values docids into LMDB on disk..."); let db = *self.index.facet_field_id_value_docids.as_polymorph(); write_into_lmdb_database( self.wtxn, @@ -671,6 +681,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; }, + DatabaseType::WordLevel0PositionDocids => { + debug!("Writing the word level 0 positions docids into LMDB on disk..."); + let db = *self.index.word_level_position_docids.as_polymorph(); + write_into_lmdb_database( + self.wtxn, + db, + content, + word_level_position_docids_merge, + write_method, + )?; + } } database_count += 1; @@ -693,6 +714,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the words positions update operation. + let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + if let Some(value) = self.facet_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.facet_min_level_size { + builder.min_level_size(value); + } + builder.execute()?; + // Run the words prefixes update operation. 
let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 0bd83b692..358552768 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -29,7 +29,8 @@ use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - facet_field_value_docids_merge, field_id_docid_facet_values_merge, + word_level_position_docids_merge, facet_field_value_docids_merge, + field_id_docid_facet_values_merge, }; const LMDB_MAX_KEY_LENGTH: usize = 511; @@ -43,6 +44,7 @@ pub struct Readers { pub word_docids: Reader, pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, + pub word_level_position_docids: Reader, pub facet_field_value_docids: Reader, pub field_id_docid_facet_values: Reader, pub documents: Reader, @@ -69,6 +71,7 @@ pub struct Store<'s, A> { main_sorter: Sorter, word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, + word_level_position_docids_sorter: Sorter, facet_field_value_docids_sorter: Sorter, field_id_docid_facet_values_sorter: Sorter, // MTBL writers @@ -94,7 +97,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { ) -> anyhow::Result { // We divide the max memory by the number of sorter the Store have. 
- let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4)); + let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); let main_sorter = create_sorter( @@ -121,6 +124,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_nb_chunks, max_memory, ); + let word_level_position_docids_sorter = create_sorter( + word_level_position_docids_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + max_nb_chunks, + max_memory, + ); let facet_field_value_docids_sorter = create_sorter( facet_field_value_docids_merge, chunk_compression_type, @@ -172,6 +183,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { main_sorter, word_docids_sorter, words_pairs_proximities_docids_sorter, + word_level_position_docids_sorter, facet_field_value_docids_sorter, field_id_docid_facet_values_sorter, // MTBL writers @@ -290,6 +302,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { self.documents_writer.insert(document_id.to_be_bytes(), record)?; Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; + Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?; words_positions.clear(); @@ -360,6 +373,42 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } + fn write_word_position_docids( + writer: &mut Sorter, + document_id: DocumentId, + words_positions: &HashMap>, + ) -> anyhow::Result<()> + { + let mut key_buffer = Vec::new(); + let mut data_buffer = Vec::new(); + + for (word, positions) in words_positions { + key_buffer.clear(); + key_buffer.extend_from_slice(word.as_bytes()); + key_buffer.push(0); // level 0 + + for position in positions { + key_buffer.truncate(word.len()); + let position_bytes = position.to_be_bytes(); + key_buffer.extend_from_slice(position_bytes.as_bytes()); + key_buffer.extend_from_slice(position_bytes.as_bytes()); + + data_buffer.clear(); + let positions = 
RoaringBitmap::from_iter(Some(document_id)); + // We serialize the positions into a buffer. + CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer) + .with_context(|| "could not serialize positions")?; + + // that we write under the generated key into MTBL + if lmdb_key_valid_size(&key_buffer) { + writer.insert(&key_buffer, &data_buffer)?; + } + } + } + + Ok(()) + } + fn write_facet_field_value_docids( sorter: &mut Sorter, iter: I, @@ -561,6 +610,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; + let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; + let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; @@ -570,6 +622,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; + let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; @@ -580,6 +633,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { word_docids, docid_word_positions, words_pairs_proximities_docids, + word_level_position_docids, facet_field_value_docids, 
field_id_docid_facet_values, documents, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index c2df94468..1fc4890fb 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,6 +6,7 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc pub use self::settings::{Setting, Settings}; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; +pub use self::words_level_positions::WordsLevelPositions; pub use self::words_prefixes::WordsPrefixes; mod available_documents_ids; @@ -16,5 +17,6 @@ mod index_documents; mod settings; mod update_builder; mod update_step; +mod words_level_positions; mod words_prefixes; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs new file mode 100644 index 000000000..983f82657 --- /dev/null +++ b/milli/src/update/words_level_positions.rs @@ -0,0 +1,184 @@ +use std::cmp; +use std::fs::File; +use std::num::NonZeroUsize; + +use grenad::{CompressionType, Reader, Writer, FileFuse}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesEncode, Error}; +use log::debug; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; +use crate::Index; +use crate::update::index_documents::WriteMethod; +use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; + +pub struct WordsLevelPositions<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + _update_id: u64, +} + +impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> WordsLevelPositions<'t, 'u, 'i> + { + WordsLevelPositions { 
+ wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + level_group_size: NonZeroUsize::new(4).unwrap(), + min_level_size: NonZeroUsize::new(5).unwrap(), + _update_id: update_id, + } + } + + pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { + self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); + self + } + + pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { + self.min_level_size = value; + self + } + + pub fn execute(self) -> anyhow::Result<()> { + debug!("Computing and writing the word levels positions docids into LMDB on disk..."); + + clear_non_zero_levels_positions(self.wtxn, self.index.word_level_position_docids)?; + + let entries = compute_positions_levels( + self.wtxn, + self.index.word_level_position_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + )?; + + write_into_lmdb_database( + self.wtxn, + *self.index.facet_field_id_value_docids.as_polymorph(), + entries, + |_, _| anyhow::bail!("invalid facet level merging"), + WriteMethod::GetMergePut, + )?; + + Ok(()) + } +} + +fn clear_non_zero_levels_positions( + wtxn: &mut heed::RwTxn, + db: heed::Database, +) -> heed::Result<()> +{ + let mut iter = db.iter_mut(wtxn)?.lazily_decode_data(); + while let Some(result) = iter.next() { + let ((_, level, _, _), _) = result?; + if level != 0 { + iter.del_current()?; + } + } + Ok(()) +} + +/// Generates all the words positions levels (including the level zero). +fn compute_positions_levels( + rtxn: &heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + shrink_size: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, +) -> anyhow::Result> +{ + // let first_level_size = db.prefix_iter(rtxn, &[field_id])? 
+ // .remap_types::() + // .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + // // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // // therefore we write the facet levels entries into a grenad file before transfering them. + // let mut writer = tempfile::tempfile().and_then(|file| { + // create_writer(compression_type, compression_level, file) + // })?; + + // let level_0_range = { + // let left = (field_id, 0, T::min_value(), T::min_value()); + // let right = (field_id, 0, T::max_value(), T::max_value()); + // left..=right + // }; + + // // Groups sizes are always a power of the original level_group_size and therefore a group + // // always maps groups of the previous level and never splits previous levels groups in half. + // let group_size_iter = (1u8..) + // .map(|l| (l, level_group_size.get().pow(l as u32))) + // .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + + // for (level, group_size) in group_size_iter { + // let mut left = T::zero(); + // let mut right = T::zero(); + // let mut group_docids = RoaringBitmap::new(); + + // let db = db.remap_key_type::(); + // for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + // let ((_field_id, _level, value, _right), docids) = result?; + + // if i == 0 { + // left = value; + // } else if i % group_size == 0 { + // // we found the first bound of the next group, we must store the left + // // and right bounds associated with the docids. + // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + + // // We save the left bound for the new group and also reset the docids. + // group_docids = RoaringBitmap::new(); + // left = value; + // } + + // // The right bound is always the bound we run through. 
+ // group_docids.union_with(&docids); + // right = value; + // } + + // if !group_docids.is_empty() { + // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + // } + // } + + // writer_into_reader(writer, shrink_size) + + todo!() +} + +fn write_entry( + writer: &mut Writer, + field_id: u8, + level: u8, + left: T, + right: T, + ids: &RoaringBitmap, +) -> anyhow::Result<()> +where + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, +{ + let key = (field_id, level, left, right); + let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) +} From c765f277a3328be0bae4e9ae173de2fa61f23962 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 14:34:21 +0100 Subject: [PATCH 06/45] Introduce the WordsLevelPositions update --- milli/src/update/words_level_positions.rs | 117 +++++++++++----------- 1 file changed, 61 insertions(+), 56 deletions(-) diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 983f82657..0a7bc484d 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -3,12 +3,11 @@ use std::fs::File; use std::num::NonZeroUsize; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::{ByteSlice, DecodeIgnore}; +use heed::types::DecodeIgnore; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; use crate::Index; use crate::update::index_documents::WriteMethod; @@ -69,12 +68,16 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.min_level_size, )?; + // The previously computed entries also defines the level 0 entries + // so we can clear the database and append all of these entries. 
+ self.index.word_level_position_docids.clear(self.wtxn)?; + write_into_lmdb_database( self.wtxn, *self.index.facet_field_id_value_docids.as_polymorph(), entries, |_, _| anyhow::bail!("invalid facet level merging"), - WriteMethod::GetMergePut, + WriteMethod::Append, )?; Ok(()) @@ -107,77 +110,79 @@ fn compute_positions_levels( min_level_size: NonZeroUsize, ) -> anyhow::Result> { - // let first_level_size = db.prefix_iter(rtxn, &[field_id])? - // .remap_types::() - // .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // therefore we write the facet levels entries into a grenad file before transfering them. + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(compression_type, compression_level, file) + })?; - // // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // // therefore we write the facet levels entries into a grenad file before transfering them. - // let mut writer = tempfile::tempfile().and_then(|file| { - // create_writer(compression_type, compression_level, file) - // })?; + for result in db.iter(rtxn)? { + let ((word, level, left, right), docids) = result?; - // let level_0_range = { - // let left = (field_id, 0, T::min_value(), T::min_value()); - // let right = (field_id, 0, T::max_value(), T::max_value()); - // left..=right - // }; + let first_level_size = db.remap_data_type::() + .prefix_iter(rtxn, &(word, level, u32::min_value(), u32::min_value()))? + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - // // Groups sizes are always a power of the original level_group_size and therefore a group - // // always maps groups of the previous level and never splits previous levels groups in half. - // let group_size_iter = (1u8..) 
- // .map(|l| (l, level_group_size.get().pow(l as u32))) - // .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + let level_0_range = { + let left = (word, 0, u32::min_value(), u32::min_value()); + let right = (word, 0, u32::max_value(), u32::max_value()); + left..=right + }; - // for (level, group_size) in group_size_iter { - // let mut left = T::zero(); - // let mut right = T::zero(); - // let mut group_docids = RoaringBitmap::new(); + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) + .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - // let db = db.remap_key_type::(); - // for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { - // let ((_field_id, _level, value, _right), docids) = result?; + // As specified in the documentation, we also write the level 0 entries. + write_level_entry(&mut writer, word, level, left, right, &docids)?; - // if i == 0 { - // left = value; - // } else if i % group_size == 0 { - // // we found the first bound of the next group, we must store the left - // // and right bounds associated with the docids. - // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + for (level, group_size) in group_size_iter { + let mut left = 0; + let mut right = 0; + let mut group_docids = RoaringBitmap::new(); - // // We save the left bound for the new group and also reset the docids. - // group_docids = RoaringBitmap::new(); - // left = value; - // } + for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + let ((_field_id, _level, value, _right), docids) = result?; - // // The right bound is always the bound we run through. 
- // group_docids.union_with(&docids); - // right = value; - // } + if i == 0 { + left = value; + } else if i % group_size == 0 { + // we found the first bound of the next group, we must store the left + // and right bounds associated with the docids. + write_level_entry(&mut writer, word, level, left, right, &group_docids)?; - // if !group_docids.is_empty() { - // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; - // } - // } + // We save the left bound for the new group and also reset the docids. + group_docids = RoaringBitmap::new(); + left = value; + } - // writer_into_reader(writer, shrink_size) + // The right bound is always the bound we run through. + group_docids.union_with(&docids); + right = value; + } - todo!() + if !group_docids.is_empty() { + write_level_entry(&mut writer, word, level, left, right, &group_docids)?; + } + } + } + + writer_into_reader(writer, shrink_size) } -fn write_entry( +fn write_level_entry( writer: &mut Writer, - field_id: u8, + word: &str, level: u8, - left: T, - right: T, + left: u32, + right: u32, ids: &RoaringBitmap, ) -> anyhow::Result<()> -where - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { - let key = (field_id, level, left, right); - let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?; + let key = (word, level, left, right); + let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) From 3a25137ee42d1f6d98db6f9e569baae40cb1949f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 13:55:24 +0100 Subject: [PATCH 07/45] Expose and use the WordsLevelPositions update --- milli/src/update/index_documents/mod.rs | 17 +++++++++++++++++ milli/src/update/update_builder.rs | 20 +++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 
8fc35b654..e7143bde0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -263,6 +263,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: Option, words_prefix_threshold: Option, max_prefix_length: Option, + words_positions_level_group_size: Option, + words_positions_min_level_size: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, @@ -290,6 +292,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: None, words_prefix_threshold: None, max_prefix_length: None, + words_positions_level_group_size: None, + words_positions_min_level_size: None, update_method: IndexDocumentsMethod::ReplaceDocuments, update_format: UpdateFormat::Json, autogenerate_docids: true, @@ -740,6 +744,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the words level positions update operation. + let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + if let Some(value) = self.words_positions_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.words_positions_min_level_size { + builder.min_level_size(value); + } + builder.execute()?; + debug_assert_eq!(database_count, total_databases); info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index c966f72d2..9a4fb850e 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -2,7 +2,10 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets, WordsPrefixes}; +use super::{ + 
ClearDocuments, DeleteDocuments, IndexDocuments, Settings, + Facets, WordsPrefixes, WordsLevelPositions, +}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -150,4 +153,19 @@ impl<'a> UpdateBuilder<'a> { builder } + + pub fn words_level_positions<'t, 'u, 'i>( + self, + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordsLevelPositions<'t, 'u, 'i> + { + let mut builder = WordsLevelPositions::new(wtxn, index, self.update_id); + + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + + builder + } } From e8cc7f9cee818ecc18fff3c65f7b7566fb75a836 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 14:32:00 +0100 Subject: [PATCH 08/45] Expose a route in the http-ui to update the WordsLevelPositions --- http-ui/src/main.rs | 46 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index b091985f3..dbf7aadce 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -229,6 +229,7 @@ enum UpdateMeta { Settings(Settings), Facets(Facets), WordsPrefixes(WordsPrefixes), + WordsLevelPositions(WordsLevelPositions), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -281,6 +282,22 @@ struct WordsPrefixes { max_prefix_length: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +struct WordsLevelPositions { + level_group_size: Option, + min_level_size: Option, +} + +// Any value that is present is considered Some value, including null. 
+fn deserialize_some<'de, T, D>(deserializer: D) -> Result, D::Error> +where T: Deserialize<'de>, + D: Deserializer<'de> +{ + Deserialize::deserialize(deserializer).map(Some) +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); @@ -493,6 +510,21 @@ async fn main() -> anyhow::Result<()> { Ok(()) => wtxn.commit().map_err(Into::into), Err(e) => Err(e) } + }, + UpdateMeta::WordsLevelPositions(levels) => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.words_level_positions(&mut wtxn, &index_cloned); + if let Some(value) = levels.level_group_size { + builder.level_group_size(value); + } + if let Some(value) = levels.min_level_size { + builder.min_level_size(value); + } + match builder.execute() { + Ok(()) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()) + } } }; @@ -923,6 +955,19 @@ async fn main() -> anyhow::Result<()> { warp::reply() }); + let update_store_cloned = update_store.clone(); + let update_status_sender_cloned = update_status_sender.clone(); + let change_words_level_positions_route = warp::filters::method::post() + .and(warp::path!("words-level-positions")) + .and(warp::body::json()) + .map(move |levels: WordsLevelPositions| { + let meta = UpdateMeta::WordsLevelPositions(levels); + let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); + let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); + eprintln!("update {} registered", update_id); + warp::reply() + }); + let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); let abort_update_id_route = warp::filters::method::delete() @@ -998,6 +1043,7 @@ async fn main() -> anyhow::Result<()> { .or(change_settings_route) .or(change_facet_levels_route) .or(change_words_prefixes_route) + .or(change_words_level_positions_route) .or(update_ws_route); let addr = 
SocketAddr::from_str(&opt.http_listen_addr)?; From 6b1b42b928685f468507f0c2fccd8ff6a2925e99 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 14:22:01 +0100 Subject: [PATCH 09/45] Introduce an infos wordsLevelPositionsDocids subcommand --- infos/src/main.rs | 61 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 356a5417c..e4d59c641 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -19,9 +19,10 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids"; const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; -const FACET_FIELD_ID_VALUE_DOCIDS_NAME: &str = "facet-field-id-value-docids"; -const FIELD_ID_DOCID_FACET_VALUES_NAME: &str = "field-id-docid-facet-values"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; +const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; +const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; +const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; const DOCUMENTS_DB_NAME: &str = "documents"; const ALL_DATABASE_NAMES: &[&str] = &[ @@ -31,8 +32,9 @@ const ALL_DATABASE_NAMES: &[&str] = &[ DOCID_WORD_POSITIONS_DB_NAME, WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, - FACET_FIELD_ID_VALUE_DOCIDS_NAME, - FIELD_ID_DOCID_FACET_VALUES_NAME, + WORD_LEVEL_POSITION_DOCIDS_DB_NAME, + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, + FIELD_ID_DOCID_FACET_VALUES_DB_NAME, DOCUMENTS_DB_NAME, ]; @@ -114,6 +116,16 @@ enum Command { field_name: String, }, + /// Outputs a CSV with the documents ids along with the word level positions where it appears. + WordsLevelPositionsDocids { + /// Display the whole documents ids in details. 
+ #[structopt(long)] + full_display: bool, + + /// The field name in the document. + words: Vec, + }, + /// Outputs a CSV with the documents ids, words and the positions where this word appears. DocidsWordsPositions { /// Display the whole positions in detail. @@ -221,6 +233,9 @@ fn main() -> anyhow::Result<()> { FacetValuesDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, field_name) }, + WordsLevelPositionsDocids { full_display, words } => { + words_level_positions_docids(&index, &rtxn, !full_display, words) + }, DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, @@ -525,6 +540,40 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam Ok(wtr.flush()?) } +fn words_level_positions_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + words: Vec, +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["word", "level", "position_range", "documents_count", "documents_ids"])?; + + for word in words.iter().map(AsRef::as_ref) { + let range = { + let left = (word, 0, u32::min_value(), u32::min_value()); + let right = (word, u8::max_value(), u32::max_value(), u32::max_value()); + left..=right + }; + for result in index.word_level_position_docids.range(rtxn, &range)? { + let ((word, level, left, right), docids) = result?; + let level = level.to_string(); + let count = docids.len().to_string(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + let position_range = format!("{:?}", left..=right); + wtr.write_record(&[word, &level, &position_range, &count, &docids])?; + } + } + + Ok(wtr.flush()?) 
+} + fn docids_words_positions( index: &Index, rtxn: &heed::RoTxn, @@ -730,8 +779,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), - FACET_FIELD_ID_VALUE_DOCIDS_NAME => index.facet_field_id_value_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_VALUES_NAME => index.field_id_docid_facet_values.as_polymorph(), + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => index.facet_field_id_value_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_VALUES_DB_NAME => index.field_id_docid_facet_values.as_polymorph(), DOCUMENTS_DB_NAME => index.documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; From 3069bf4f4a3ad50a89a1573b49dec92c61107678 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 15:40:38 +0100 Subject: [PATCH 10/45] Fix and improve the words-level-positions computation --- infos/src/main.rs | 6 ++-- milli/src/update/index_documents/store.rs | 2 +- milli/src/update/words_level_positions.rs | 42 ++++++++--------------- 3 files changed, 20 insertions(+), 30 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index e4d59c641..c219c5758 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -558,7 +558,9 @@ fn words_level_positions_docids( left..=right }; for result in index.word_level_position_docids.range(rtxn, &range)? 
{ - let ((word, level, left, right), docids) = result?; + let ((w, level, left, right), docids) = result?; + if word != w { break } + let level = level.to_string(); let count = docids.len().to_string(); let docids = if debug { @@ -567,7 +569,7 @@ fn words_level_positions_docids( format!("{:?}", docids.iter().collect::>()) }; let position_range = format!("{:?}", left..=right); - wtr.write_record(&[word, &level, &position_range, &count, &docids])?; + wtr.write_record(&[w, &level, &position_range, &count, &docids])?; } } diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 358552768..0f97476d9 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -388,7 +388,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { key_buffer.push(0); // level 0 for position in positions { - key_buffer.truncate(word.len()); + key_buffer.truncate(word.len() + 1); let position_bytes = position.to_be_bytes(); key_buffer.extend_from_slice(position_bytes.as_bytes()); key_buffer.extend_from_slice(position_bytes.as_bytes()); diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 0a7bc484d..77cec246a 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::num::NonZeroUsize; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::DecodeIgnore; +use heed::types::{DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; @@ -56,10 +56,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { pub fn execute(self) -> anyhow::Result<()> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); - clear_non_zero_levels_positions(self.wtxn, self.index.word_level_position_docids)?; - let entries = compute_positions_levels( self.wtxn, + self.index.word_docids.remap_data_type::(), 
self.index.word_level_position_docids, self.chunk_compression_type, self.chunk_compression_level, @@ -74,7 +73,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { write_into_lmdb_database( self.wtxn, - *self.index.facet_field_id_value_docids.as_polymorph(), + *self.index.word_level_position_docids.as_polymorph(), entries, |_, _| anyhow::bail!("invalid facet level merging"), WriteMethod::Append, @@ -84,25 +83,11 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { } } -fn clear_non_zero_levels_positions( - wtxn: &mut heed::RwTxn, - db: heed::Database, -) -> heed::Result<()> -{ - let mut iter = db.iter_mut(wtxn)?.lazily_decode_data(); - while let Some(result) = iter.next() { - let ((_, level, _, _), _) = result?; - if level != 0 { - iter.del_current()?; - } - } - Ok(()) -} - -/// Generates all the words positions levels (including the level zero). +/// Generates all the words positions levels based on the levels zero (including the level zero). fn compute_positions_levels( rtxn: &heed::RoTxn, - db: heed::Database, + words_db: heed::Database, + words_positions_db: heed::Database, compression_type: CompressionType, compression_level: Option, shrink_size: Option, @@ -116,11 +101,11 @@ fn compute_positions_levels( create_writer(compression_type, compression_level, file) })?; - for result in db.iter(rtxn)? { - let ((word, level, left, right), docids) = result?; + for result in words_db.iter(rtxn)? { + let (word, ()) = result?; - let first_level_size = db.remap_data_type::() - .prefix_iter(rtxn, &(word, level, u32::min_value(), u32::min_value()))? + let first_level_size = words_positions_db.remap_data_type::() + .prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))? .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; let level_0_range = { @@ -136,14 +121,17 @@ fn compute_positions_levels( .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); // As specified in the documentation, we also write the level 0 entries. 
- write_level_entry(&mut writer, word, level, left, right, &docids)?; + for result in words_positions_db.range(rtxn, &level_0_range)? { + let ((word, level, left, right), docids) = result?; + write_level_entry(&mut writer, word, level, left, right, &docids)?; + } for (level, group_size) in group_size_iter { let mut left = 0; let mut right = 0; let mut group_docids = RoaringBitmap::new(); - for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { let ((_field_id, _level, value, _right), docids) = result?; if i == 0 { From f7138284066887cf3bc610b35b48c8f2393bb448 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 15:47:41 +0100 Subject: [PATCH 11/45] Implement the clear and delete documents for the word-level-positions database --- milli/src/update/clear_documents.rs | 1 + milli/src/update/delete_documents.rs | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 250e4b13a..6d7dd72b8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -56,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; + word_level_position_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; field_id_docid_facet_values.clear(self.wtxn)?; documents.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index b60b7bac2..f9303d339 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -330,6 +330,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // We delete the documents ids that are under the word level position docids. 
+ let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; + } + } + + drop(iter); + Ok(self.documents_ids.len()) } } From 8bd4f5d93ec5e212197a8e662a37431bfdf0c865 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 16:09:18 +0100 Subject: [PATCH 12/45] Compute the biggest values of the words_level_positions_docids --- infos/src/main.rs | 18 +++++++++++++++--- milli/src/update/words_level_positions.rs | 10 +++++----- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index c219c5758..2c11d3783 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -346,6 +346,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let docid_word_positions_name = "docid_word_positions"; let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; + let word_level_position_docids_name = "word_level_position_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let documents_name = "documents"; @@ -402,6 +403,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in word_level_position_docids.remap_data_type::().iter(rtxn)? 
{ + let ((word, level, left, right), value) = result?; + let key = format!("{} {} {:?}", word, level, left..=right); + heap.push(Reverse((value.len(), key, word_level_position_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { @@ -549,7 +557,7 @@ fn words_level_positions_docids( { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["word", "level", "position_range", "documents_count", "documents_ids"])?; + wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; for word in words.iter().map(AsRef::as_ref) { let range = { @@ -561,14 +569,18 @@ fn words_level_positions_docids( let ((w, level, left, right), docids) = result?; if word != w { break } - let level = level.to_string(); let count = docids.len().to_string(); let docids = if debug { format!("{:?}", docids) } else { format!("{:?}", docids.iter().collect::>()) }; - let position_range = format!("{:?}", left..=right); + let position_range = if level == 0 { + format!("{:?}", left) + } else { + format!("{:?}", left..=right) + }; + let level = level.to_string(); wtr.write_record(&[w, &level, &position_range, &count, &docids])?; } } diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 77cec246a..a7be248b6 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -104,16 +104,16 @@ fn compute_positions_levels( for result in words_db.iter(rtxn)? { let (word, ()) = result?; - let first_level_size = words_positions_db.remap_data_type::() - .prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))? 
- .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_range = { let left = (word, 0, u32::min_value(), u32::min_value()); let right = (word, 0, u32::max_value(), u32::max_value()); left..=right }; + let first_level_size = words_positions_db.remap_data_type::() + .range(rtxn, &level_0_range)? + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. let group_size_iter = (1u8..) @@ -132,7 +132,7 @@ fn compute_positions_levels( let mut group_docids = RoaringBitmap::new(); for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { - let ((_field_id, _level, value, _right), docids) = result?; + let ((_word, _level, value, _right), docids) = result?; if i == 0 { left = value; From bd1a371c62cf7d1fb79b29c1b5ccde30d63aa0ca Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Mar 2021 15:41:44 +0100 Subject: [PATCH 13/45] Compute the WordsLevelPositions only once --- milli/src/update/index_documents/mod.rs | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e7143bde0..3a41a52ae 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -718,19 +718,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; - // Run the words positions update operation. 
- let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - if let Some(value) = self.facet_level_group_size { - builder.level_group_size(value); - } - if let Some(value) = self.facet_min_level_size { - builder.min_level_size(value); - } - builder.execute()?; - // Run the words prefixes update operation. let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; From 89ee2cf576858398ee160a0ed54d6494aedcecfc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Mar 2021 17:20:16 +0100 Subject: [PATCH 14/45] Introduce the TreeLevel struct --- infos/src/main.rs | 9 ++-- .../heed_codec/str_level_position_codec.rs | 13 +++-- milli/src/lib.rs | 2 + milli/src/tree_level.rs | 47 +++++++++++++++++++ milli/src/update/words_level_positions.rs | 11 +++-- 5 files changed, 67 insertions(+), 15 deletions(-) create mode 100644 milli/src/tree_level.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index 2c11d3783..0e6403d7b 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -5,7 +5,7 @@ use std::{str, io, fmt}; use anyhow::Context; use byte_unit::Byte; use heed::EnvOpenOptions; -use milli::Index; +use milli::{Index, TreeLevel}; use structopt::StructOpt; use Command::*; @@ -561,13 +561,12 @@ fn words_level_positions_docids( for word in words.iter().map(AsRef::as_ref) { let range = { - let left = (word, 0, u32::min_value(), u32::min_value()); - let right = (word, u8::max_value(), u32::max_value(), u32::max_value()); + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); left..=right }; for result in index.word_level_position_docids.range(rtxn, &range)? 
{ let ((w, level, left, right), docids) = result?; - if word != w { break } let count = docids.len().to_string(); let docids = if debug { @@ -575,7 +574,7 @@ fn words_level_positions_docids( } else { format!("{:?}", docids.iter().collect::>()) }; - let position_range = if level == 0 { + let position_range = if level == TreeLevel::min_value() { format!("{:?}", left) } else { format!("{:?}", left..=right) diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs index c421c04b5..810e91940 100644 --- a/milli/src/heed_codec/str_level_position_codec.rs +++ b/milli/src/heed_codec/str_level_position_codec.rs @@ -1,12 +1,14 @@ use std::borrow::Cow; -use std::convert::TryInto; +use std::convert::{TryFrom, TryInto}; use std::mem::size_of; use std::str; +use crate::TreeLevel; + pub struct StrLevelPositionCodec; impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { - type DItem = (&'a str, u8, u32, u32); + type DItem = (&'a str, TreeLevel, u32, u32); fn bytes_decode(bytes: &'a [u8]) -> Option { let footer_len = size_of::() + size_of::() * 2; @@ -19,13 +21,14 @@ impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { let (level, bytes) = bytes.split_first()?; let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?; let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?; + let level = TreeLevel::try_from(*level).ok()?; - Some((word, *level, left, right)) + Some((word, level, left, right)) } } impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { - type EItem = (&'a str, u8, u32, u32); + type EItem = (&'a str, TreeLevel, u32, u32); fn bytes_encode((word, level, left, right): &Self::EItem) -> Option> { let left = left.to_be_bytes(); @@ -33,7 +36,7 @@ impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len()); bytes.extend_from_slice(word.as_bytes()); - bytes.push(*level); + bytes.push((*level).into()); 
bytes.extend_from_slice(&left[..]); bytes.extend_from_slice(&right[..]); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index de5c6511e..03169bce7 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -9,6 +9,7 @@ pub mod facet; pub mod heed_codec; pub mod index; pub mod proximity; +pub mod tree_level; pub mod update; use std::borrow::Cow; @@ -27,6 +28,7 @@ pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringB pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords}; +pub use self::tree_level::TreeLevel; pub use self::update_store::UpdateStore; pub type FastMap4 = HashMap>; diff --git a/milli/src/tree_level.rs b/milli/src/tree_level.rs new file mode 100644 index 000000000..7ce2904e2 --- /dev/null +++ b/milli/src/tree_level.rs @@ -0,0 +1,47 @@ +use std::convert::TryFrom; +use std::fmt; + +/// This is just before the lowest printable character (space, sp, 32) +const MAX_VALUE: u8 = 31; + +#[derive(Debug, Copy, Clone)] +pub enum Error { + LevelTooHigh(u8), +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(transparent)] +pub struct TreeLevel(u8); + +impl TreeLevel { + pub const fn max_value() -> TreeLevel { + TreeLevel(MAX_VALUE) + } + + pub const fn min_value() -> TreeLevel { + TreeLevel(0) + } +} + +impl Into for TreeLevel { + fn into(self) -> u8 { + self.0 + } +} + +impl TryFrom for TreeLevel { + type Error = Error; + + fn try_from(value: u8) -> Result { + match value { + 0..=MAX_VALUE => Ok(TreeLevel(value)), + _ => Err(Error::LevelTooHigh(value)), + } + } +} + +impl fmt::Display for TreeLevel { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index a7be248b6..4286fc780 100644 --- 
a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,4 +1,5 @@ use std::cmp; +use std::convert::TryFrom; use std::fs::File; use std::num::NonZeroUsize; @@ -9,9 +10,9 @@ use log::debug; use roaring::RoaringBitmap; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; -use crate::Index; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::{Index, TreeLevel}; pub struct WordsLevelPositions<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -105,8 +106,8 @@ fn compute_positions_levels( let (word, ()) = result?; let level_0_range = { - let left = (word, 0, u32::min_value(), u32::min_value()); - let right = (word, 0, u32::max_value(), u32::max_value()); + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); left..=right }; @@ -117,7 +118,7 @@ fn compute_positions_levels( // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) + .map(|l| (TreeLevel::try_from(l).unwrap(), level_group_size.get().pow(l as u32))) .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); // As specified in the documentation, we also write the level 0 entries. 
@@ -163,7 +164,7 @@ fn compute_positions_levels( fn write_level_entry( writer: &mut Writer, word: &str, - level: u8, + level: TreeLevel, left: u32, right: u32, ids: &RoaringBitmap, From 658f316511faf6f87d4d7733236887e80d5eef79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 23 Mar 2021 15:25:46 +0100 Subject: [PATCH 15/45] Introduce the Initial Criterion --- milli/Cargo.toml | 3 - milli/src/search/criteria/asc_desc.rs | 161 +++++++------------------ milli/src/search/criteria/attribute.rs | 88 ++++++-------- milli/src/search/criteria/fetcher.rs | 135 --------------------- milli/src/search/criteria/final.rs | 57 +++++++++ milli/src/search/criteria/initial.rs | 28 +++++ milli/src/search/criteria/mod.rs | 65 ++++------ milli/src/search/criteria/proximity.rs | 151 ++++++++--------------- milli/src/search/criteria/typo.rs | 66 ++++------ milli/src/search/criteria/words.rs | 56 +++------ milli/src/search/mod.rs | 9 +- 11 files changed, 286 insertions(+), 533 deletions(-) delete mode 100644 milli/src/search/criteria/fetcher.rs create mode 100644 milli/src/search/criteria/final.rs create mode 100644 milli/src/search/criteria/initial.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index eefdfa7d5..ef9c64b7b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -57,9 +57,6 @@ criterion = "0.3.4" maplit = "1.0.2" rand = "0.8.3" -[build-dependencies] -fst = "0.4.5" - [features] default = [] diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 1dc186720..d2841d449 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -31,32 +31,10 @@ pub struct AscDesc<'t> { candidates: Box> + 't>, bucket_candidates: RoaringBitmap, faceted_candidates: RoaringBitmap, - parent: Option>, + parent: Box, } impl<'t> AscDesc<'t> { - pub fn initial_asc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ) -> 
anyhow::Result - { - Self::initial(index, rtxn, query_tree, candidates, field_name, true) - } - - pub fn initial_desc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ) -> anyhow::Result - { - Self::initial(index, rtxn, query_tree, candidates, field_name, false) - } - pub fn asc( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -77,47 +55,6 @@ impl<'t> AscDesc<'t> { Self::new(index, rtxn, parent, field_name, false) } - fn initial( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ascending: bool, - ) -> anyhow::Result - { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; - let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; - - let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; - let candidates = match &query_tree { - Some(qt) => { - let context = CriteriaBuilder::new(rtxn, index)?; - let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), &mut WordDerivationsCache::new())?; - if let Some(candidates) = candidates { - qt_candidates.intersect_with(&candidates); - } - qt_candidates - }, - None => candidates.unwrap_or(faceted_candidates.clone()), - }; - - Ok(AscDesc { - index, - rtxn, - field_name, - field_id, - facet_type, - ascending, - query_tree, - candidates: facet_ordered(index, rtxn, field_id, facet_type, ascending, candidates)?, - faceted_candidates, - bucket_candidates: RoaringBitmap::new(), - parent: None, - }) - } - fn new( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -141,7 +78,7 @@ impl<'t> AscDesc<'t> { candidates: Box::new(std::iter::empty()), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, }) } } @@ -156,64 +93,56 @@ impl<'t> Criterion for AscDesc<'t> { match self.candidates.next().transpose()? 
{ None => { - let query_tree = self.query_tree.take(); - let bucket_candidates = take(&mut self.bucket_candidates); - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; - let candidates = match (&self.query_tree, candidates) { - (_, Some(mut candidates)) => { - candidates.intersect_with(&self.faceted_candidates); - candidates - }, - (Some(qt), None) => { - let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; - candidates.intersect_with(&self.faceted_candidates); - candidates - }, - (None, None) => take(&mut self.faceted_candidates), - }; - if bucket_candidates.is_empty() { - self.bucket_candidates.union_with(&candidates); - } else { - self.bucket_candidates.union_with(&bucket_candidates); - } - self.candidates = facet_ordered( - self.index, - self.rtxn, - self.field_id, - self.facet_type, - self.ascending, - candidates, - )?; + match self.parent.next(wdcache)? 
{ + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates_is_some = candidates.is_some(); + self.query_tree = query_tree; + let candidates = match (&self.query_tree, candidates) { + (_, Some(mut candidates)) => { + candidates.intersect_with(&self.faceted_candidates); + candidates }, - None => return Ok(None), - } - }, - None => if query_tree.is_none() && bucket_candidates.is_empty() { - return Ok(None) - }, - } + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; + candidates.intersect_with(&self.faceted_candidates); + candidates + }, + (None, None) => take(&mut self.faceted_candidates), + }; - return Ok(Some(CriterionResult { - query_tree, - candidates: Some(RoaringBitmap::new()), - bucket_candidates, - })); + // If our parent returns candidates it means that the bucket + // candidates were already computed before and we can use them. + // + // If not, we must use the just computed candidates as our bucket + // candidates. 
+ if candidates_is_some { + self.bucket_candidates.union_with(&bucket_candidates); + } else { + self.bucket_candidates.union_with(&candidates); + } + + if candidates.is_empty() { + continue; + } + + self.candidates = facet_ordered( + self.index, + self.rtxn, + self.field_id, + self.facet_type, + self.ascending, + candidates, + )?; + }, + None => return Ok(None), + } }, Some(candidates) => { - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 7f8b5c622..6398c7d87 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -14,36 +14,19 @@ pub struct Attribute<'t> { query_tree: Option, candidates: Option, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, flattened_query_tree: Option>>, current_buckets: Option>, } impl<'t> Attribute<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Attribute { - ctx, - query_tree, - candidates, - bucket_candidates: RoaringBitmap::new(), - parent: None, - flattened_query_tree: None, - current_buckets: None, - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Attribute { ctx, query_tree: None, candidates: None, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, flattened_query_tree: None, current_buckets: None, } @@ -63,34 +46,35 @@ impl<'t> Criterion for Attribute<'t> { })); }, (Some(qt), Some(candidates)) => { - let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| flatten_query_tree(&qt)); - let current_buckets = if let Some(current_buckets) = self.current_buckets.as_mut() { - current_buckets - } else 
{ - let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; - self.current_buckets.get_or_insert(new_buckets.into_iter()) + let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| { + flatten_query_tree(&qt) + }); + + let current_buckets = match self.current_buckets.as_mut() { + Some(current_buckets) => current_buckets, + None => { + let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + self.current_buckets.get_or_insert(new_buckets.into_iter()) + }, }; - let found_candidates = if let Some((_score, candidates)) = current_buckets.next() { - candidates - } else { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); + let found_candidates = match current_buckets.next() { + Some((_score, candidates)) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, }; + candidates.difference_with(&found_candidates); - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => found_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(found_candidates), - bucket_candidates: bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (Some(qt), None) => { @@ -106,18 +90,20 @@ impl<'t> Criterion for Attribute<'t> { })); }, (None, None) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? 
{ - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; - self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); - self.flattened_query_tree = None; - self.current_buckets = None; - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree; + self.candidates = candidates; + self.bucket_candidates.union_with(&bucket_candidates); + self.flattened_query_tree = None; + self.current_buckets = None; }, None => return Ok(None), } diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs deleted file mode 100644 index fa204bdf2..000000000 --- a/milli/src/search/criteria/fetcher.rs +++ /dev/null @@ -1,135 +0,0 @@ -use std::collections::HashMap; -use std::mem::take; - -use log::debug; -use roaring::RoaringBitmap; - -use crate::search::query_tree::Operation; -use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; - -/// The result of a call to the fetcher. -#[derive(Debug, Clone, PartialEq)] -pub struct FetcherResult { - /// The query tree corresponding to the current bucket of the last criterion. - pub query_tree: Option, - /// The candidates of the current bucket of the last criterion. - pub candidates: RoaringBitmap, - /// Candidates that comes from the current bucket of the initial criterion. 
- pub bucket_candidates: RoaringBitmap, -} - -pub struct Fetcher<'t> { - ctx: &'t dyn Context, - query_tree: Option, - candidates: Candidates, - parent: Option>, - should_get_documents_ids: bool, - wdcache: WordDerivationsCache, -} - -impl<'t> Fetcher<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Fetcher { - ctx, - query_tree, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - parent: None, - should_get_documents_ids: true, - wdcache: WordDerivationsCache::new(), - } - } - - pub fn new( - ctx: &'t dyn Context, - parent: Box, - ) -> Self - { - Fetcher { - ctx, - query_tree: None, - candidates: Candidates::default(), - parent: Some(parent), - should_get_documents_ids: true, - wdcache: WordDerivationsCache::new(), - } - } - - #[logging_timer::time("Fetcher::{}")] - pub fn next(&mut self) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; - loop { - debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", - self.should_get_documents_ids, self.candidates, - ); - - let should_get_documents_ids = take(&mut self.should_get_documents_ids); - match &mut self.candidates { - Allowed(_) => { - let candidates = take(&mut self.candidates).into_inner(); - let candidates = match &self.query_tree { - Some(qt) if should_get_documents_ids => { - let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?; - docids.intersect_with(&candidates); - docids - }, - _ => candidates, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.take(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - Forbidden(_) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(&mut self.wdcache)? 
{ - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, - (None, None) => RoaringBitmap::new(), - }; - - return Ok(Some(FetcherResult { query_tree, candidates, bucket_candidates })) - }, - None => if should_get_documents_ids { - let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.clone(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - } - }, - None => if should_get_documents_ids { - let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.clone(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - } - return Ok(None); - }, - } - } - } -} diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs new file mode 100644 index 000000000..fe224ef94 --- /dev/null +++ b/milli/src/search/criteria/final.rs @@ -0,0 +1,57 @@ +use std::collections::HashMap; + +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; +use super::{resolve_query_tree, Criterion, CriterionResult, Context}; + +/// The result of a call to the fetcher. +#[derive(Debug, Clone, PartialEq)] +pub struct FinalResult { + /// The query tree corresponding to the current bucket of the last criterion. + pub query_tree: Option, + /// The candidates of the current bucket of the last criterion. 
+ pub candidates: RoaringBitmap, + /// Candidates that comes from the current bucket of the initial criterion. + pub bucket_candidates: RoaringBitmap, +} + +pub struct Final<'t> { + ctx: &'t dyn Context, + parent: Box, + wdcache: WordDerivationsCache, +} + +impl<'t> Final<'t> { + pub fn new(ctx: &'t dyn Context, parent: Box) -> Final<'t> { + Final { ctx, parent, wdcache: WordDerivationsCache::new() } + } + + #[logging_timer::time("Final::{}")] + pub fn next(&mut self) -> anyhow::Result> { + loop { + debug!("Final iteration"); + + match self.parent.next(&mut self.wdcache)? { + Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => { + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, + (None, None) => self.ctx.documents_ids()?, + }; + + bucket_candidates.union_with(&candidates); + + return Ok(Some(FinalResult { + query_tree, + candidates, + bucket_candidates, + })); + }, + None => return Ok(None), + } + } + } +} diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs new file mode 100644 index 000000000..d4b9e1379 --- /dev/null +++ b/milli/src/search/criteria/initial.rs @@ -0,0 +1,28 @@ +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; + +use super::{Criterion, CriterionResult}; + +pub struct Initial { + answer: Option +} + +impl Initial { + pub fn new(query_tree: Option, mut candidates: Option) -> Initial { + let answer = CriterionResult { + query_tree, + candidates: candidates.clone(), + bucket_candidates: candidates.take().unwrap_or_default(), + }; + Initial { answer: Some(answer) } + } +} + +impl Criterion for Initial { + #[logging_timer::time("Initial::{}")] + fn next(&mut self, _: &mut WordDerivationsCache) -> anyhow::Result> { + Ok(self.answer.take()) + } +} diff --git a/milli/src/search/criteria/mod.rs 
b/milli/src/search/criteria/mod.rs index 1d7026d71..5e75be6ce 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -8,19 +8,21 @@ use crate::search::{word_derivations, WordDerivationsCache}; use crate::{Index, DocumentId}; use super::query_tree::{Operation, Query, QueryKind}; +use self::asc_desc::AscDesc; +use self::attribute::Attribute; +use self::r#final::Final; +use self::initial::Initial; +use self::proximity::Proximity; use self::typo::Typo; use self::words::Words; -use self::asc_desc::AscDesc; -use self::proximity::Proximity; -use self::attribute::Attribute; -use self::fetcher::Fetcher; +mod asc_desc; +mod attribute; +mod initial; +mod proximity; mod typo; mod words; -mod asc_desc; -mod proximity; -mod attribute; -pub mod fetcher; +pub mod r#final; pub trait Criterion { fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result>; @@ -61,6 +63,7 @@ impl Default for Candidates { Self::Forbidden(RoaringBitmap::new()) } } + pub trait Context { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; @@ -128,44 +131,26 @@ impl<'t> CriteriaBuilder<'t> { pub fn build( &'t self, - mut query_tree: Option, - mut facet_candidates: Option, - ) -> anyhow::Result> + query_tree: Option, + facet_candidates: Option, + ) -> anyhow::Result> { use crate::criterion::Criterion as Name; - let mut criterion = None as Option>; + let mut criterion = Box::new(Initial::new(query_tree, facet_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? 
{ - criterion = Some(match criterion.take() { - Some(father) => match name { - Name::Typo => Box::new(Typo::new(self, father)), - Name::Words => Box::new(Words::new(self, father)), - Name::Proximity => Box::new(Proximity::new(self, father)), - Name::Attribute => Box::new(Attribute::new(self, father)), - Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?), - Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?), - _otherwise => father, - }, - None => match name { - Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), - Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), - Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), - Name::Attribute => Box::new(Attribute::initial(self, query_tree.take(), facet_candidates.take())), - Name::Asc(field) => { - Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) - }, - Name::Desc(field) => { - Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) 
- }, - _otherwise => continue, - }, - }); + criterion = match name { + Name::Typo => Box::new(Typo::new(self, criterion)), + Name::Words => Box::new(Words::new(self, criterion)), + Name::Proximity => Box::new(Proximity::new(self, criterion)), + Name::Attribute => Box::new(Attribute::new(self, criterion)), + Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), + Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), + _otherwise => criterion, + }; } - match criterion { - Some(criterion) => Ok(Fetcher::new(self, criterion)), - None => Ok(Fetcher::initial(self, query_tree, facet_candidates)), - } + Ok(Final::new(self, criterion)) } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index decd4c338..dc1daafb2 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -8,48 +8,29 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; +use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; pub struct Proximity<'t> { ctx: &'t dyn Context, - query_tree: Option<(usize, Operation)>, + /// ((max_proximity, query_tree), allowed_candidates) + state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, proximity: u8, - candidates: Candidates, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: Cache, plane_sweep_cache: Option>, } impl<'t> Proximity<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Proximity { - ctx, - 
query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), - proximity: 0, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: Cache::new(), - plane_sweep_cache: None, - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Proximity { ctx, - query_tree: None, + state: None, proximity: 0, - candidates: Candidates::default(), bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent: parent, candidates_cache: Cache::new(), plane_sweep_cache: None, } @@ -59,27 +40,20 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; loop { - debug!("Proximity at iteration {} (max {:?}) ({:?})", + debug!("Proximity at iteration {} (max prox {:?}) ({:?})", self.proximity, - self.query_tree.as_ref().map(|(mp, _)| mp), - self.candidates, + self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)), + self.state.as_ref().map(|(_, cd)| cd), ); - match (&mut self.query_tree, &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: Some(take(&mut self.candidates).into_inner()), - bucket_candidates: take(&mut self.bucket_candidates), - })); + match &mut self.state { + Some((_, candidates)) if candidates.is_empty() => { + self.state = None; // reset state }, - (Some((max_prox, query_tree)), Allowed(candidates)) => { + Some((Some((max_prox, query_tree)), candidates)) => { if self.proximity as usize > *max_prox { - // reset state to (None, Forbidden(_)) - self.query_tree = None; - self.candidates = Candidates::default(); + self.state = None; // reset state } else { let mut new_candidates = if candidates.len() <= 1000 { if let Some(cache) = 
self.plane_sweep_cache.as_mut() { @@ -89,9 +63,7 @@ impl<'t> Criterion for Proximity<'t> { candidates }, None => { - // reset state to (None, Forbidden(_)) - self.query_tree = None; - self.candidates = Candidates::default(); + self.state = None; // reset state continue }, } @@ -120,79 +92,54 @@ impl<'t> Criterion for Proximity<'t> { candidates.difference_with(&new_candidates); self.proximity += 1; - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), candidates: Some(new_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); } }, - (Some((max_prox, query_tree)), Forbidden(candidates)) => { - if self.proximity as usize > *max_prox { - self.query_tree = None; - self.candidates = Candidates::default(); - } else { - let mut new_candidates = resolve_candidates( - self.ctx, - &query_tree, - self.proximity, - &mut self.candidates_cache, - wdcache, - )?; - - new_candidates.difference_with(&candidates); - candidates.union_with(&new_candidates); - self.proximity += 1; - - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - - return Ok(Some(CriterionResult { - query_tree: Some(query_tree.clone()), - candidates: Some(new_candidates), - bucket_candidates, - })); - } - }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); + Some((None, candidates)) => { + let candidates = take(candidates); + self.state = None; // reset state return Ok(Some(CriterionResult { query_tree: None, candidates: Some(candidates.clone()), bucket_candidates: candidates, })); }, - (None, Forbidden(_)) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? 
{ - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, - (None, None) => RoaringBitmap::new(), - }; + None => { + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates_is_some = candidates.is_some(); + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, + (None, None) => RoaringBitmap::new(), + }; - if bucket_candidates.is_empty() { - self.bucket_candidates.union_with(&candidates); - } else { - self.bucket_candidates.union_with(&bucket_candidates); - } - - self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); - self.proximity = 0; - self.candidates = Candidates::Allowed(candidates); - self.plane_sweep_cache = None; - }, - None => return Ok(None), + // If our parent returns candidates it means that the bucket + // candidates were already computed before and we can use them. + // + // If not, we must use the just computed candidates as our bucket + // candidates. 
+ if candidates_is_some { + self.bucket_candidates.union_with(&bucket_candidates); + } else { + self.bucket_candidates.union_with(&candidates); } + + let query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); + self.state = Some((query_tree, candidates)); + self.proximity = 0; + self.plane_sweep_cache = None; }, None => return Ok(None), } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 3877f53ed..40b06afc4 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -14,28 +14,11 @@ pub struct Typo<'t> { number_typos: u8, candidates: Candidates, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } impl<'t> Typo<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Typo { - ctx, - query_tree: query_tree.map(|op| (maximum_typo(&op), op)), - number_typos: 0, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: HashMap::new(), - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Typo { ctx, @@ -43,7 +26,7 @@ impl<'t> Typo<'t> { number_typos: 0, candidates: Candidates::default(), bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, candidates_cache: HashMap::new(), } } @@ -90,15 +73,10 @@ impl<'t> Criterion for Typo<'t> { candidates.difference_with(&new_candidates); self.number_typos += 1; - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: Some(new_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); } }, @@ -145,17 +123,19 @@ impl<'t> Criterion for Typo<'t> { })); }, (None, Forbidden(_)) => { - match self.parent.as_mut() { 
- Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); - self.number_typos = 0; - self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); - self.bucket_candidates.union_with(&bucket_candidates); - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); + self.number_typos = 0; + self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } @@ -334,8 +314,8 @@ fn resolve_candidates<'t>( #[cfg(test)] mod test { - use super::*; + use super::super::initial::Initial; use super::super::test::TestContext; #[test] @@ -345,7 +325,8 @@ mod test { let facet_candidates = None; let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, query_tree, facet_candidates); + let parent = Initial::new(query_tree, facet_candidates); + let mut criteria = Typo::new(&context, Box::new(parent)); assert!(criteria.next(&mut wdcache).unwrap().is_none()); } @@ -364,7 +345,8 @@ mod test { let facet_candidates = None; let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates); + let parent = Initial::new(Some(query_tree), facet_candidates); + let mut criteria = Typo::new(&context, Box::new(parent)); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() @@ -413,7 +395,8 @@ mod test { let facet_candidates = 
context.word_docids("earth").unwrap().unwrap(); let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())); + let parent = Initial::new(query_tree, Some(facet_candidates.clone())); + let mut criteria = Typo::new(&context, Box::new(parent)); let expected = CriterionResult { query_tree: None, @@ -442,7 +425,8 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())); + let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone())); + let mut criteria = Typo::new(&context, Box::new(parent)); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 0aa3b483a..5bb9d8d90 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -12,34 +12,18 @@ pub struct Words<'t> { query_trees: Vec, candidates: Option, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } impl<'t> Words<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Words { - ctx, - query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), - candidates, - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: HashMap::default(), - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Words { ctx, query_trees: Vec::default(), candidates: None, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, candidates_cache: HashMap::default(), } } @@ -65,27 +49,17 @@ impl<'t> Criterion for Words<'t> { found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); - let 
bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => found_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(qt), candidates: Some(found_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (Some(qt), None) => { - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => RoaringBitmap::new(), - }; - return Ok(Some(CriterionResult { query_tree: Some(qt), candidates: None, - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (None, Some(_)) => { @@ -97,16 +71,18 @@ impl<'t> Criterion for Words<'t> { })); }, (None, None) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); - self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? 
{ + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); + self.candidates = candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 174fff35c..4f0bde422 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -13,9 +13,8 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct}; - -use crate::search::criteria::fetcher::{Fetcher, FetcherResult}; -use crate::{DocumentId, Index}; +use crate::search::criteria::r#final::{Final, FinalResult}; +use crate::{Index, DocumentId}; pub use self::facet::{ FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator, @@ -162,14 +161,14 @@ impl<'a> Search<'a> { &self, mut distinct: impl for<'c> Distinct<'c>, matching_words: MatchingWords, - mut criteria: Fetcher, + mut criteria: Final, ) -> anyhow::Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); let mut excluded_documents = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); - while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { + while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next()? 
{ debug!("Number of candidates found {}", candidates.len()); let excluded = take(&mut excluded_documents); From 7aa5753ed282afd2df90f1fae07beb2a1b8eeb68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 24 Mar 2021 15:06:54 +0100 Subject: [PATCH 16/45] Make the attribute positions range bounds to be fixed --- http-ui/src/main.rs | 6 +-- milli/src/update/index_documents/mod.rs | 6 +-- milli/src/update/words_level_positions.rs | 47 +++++++++++++++-------- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index dbf7aadce..c85bd9b15 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; use std::net::SocketAddr; -use std::num::NonZeroUsize; +use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -286,8 +286,8 @@ struct WordsPrefixes { #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] struct WordsLevelPositions { - level_group_size: Option, - min_level_size: Option, + level_group_size: Option, + min_level_size: Option, } // Any value that is present is considered Some value, including null. 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3a41a52ae..7a2196481 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; -use std::num::NonZeroUsize; +use std::num::{NonZeroU32, NonZeroUsize}; use std::sync::mpsc::sync_channel; use std::time::Instant; @@ -263,8 +263,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: Option, words_prefix_threshold: Option, max_prefix_length: Option, - words_positions_level_group_size: Option, - words_positions_min_level_size: Option, + words_positions_level_group_size: Option, + words_positions_min_level_size: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 4286fc780..eb8d3bb3c 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,7 +1,7 @@ use std::cmp; use std::convert::TryFrom; use std::fs::File; -use std::num::NonZeroUsize; +use std::num::NonZeroU32; use grenad::{CompressionType, Reader, Writer, FileFuse}; use heed::types::{DecodeIgnore, Str}; @@ -20,8 +20,8 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, _update_id: u64, } @@ -38,18 +38,18 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { chunk_compression_type: CompressionType::None, chunk_compression_level: None, chunk_fusing_shrink_size: None, - level_group_size: NonZeroUsize::new(4).unwrap(), - min_level_size: NonZeroUsize::new(5).unwrap(), + 
level_group_size: NonZeroU32::new(4).unwrap(), + min_level_size: NonZeroU32::new(5).unwrap(), _update_id: update_id, } } - pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); + pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { + self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); self } - pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { + pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { self.min_level_size = value; self } @@ -84,6 +84,20 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { } } +/// Returns the next number after or equal to `x` that is divisible by `d`. +fn next_divisible(x: u32, d: u32) -> u32 { + (x.saturating_sub(1) | (d - 1)) + 1 +} + +/// Returns the previous number after or equal to `x` that is divisible by `d`, +/// saturates on zero. +fn previous_divisible(x: u32, d: u32) -> u32 { + match x.checked_sub(d - 1) { + Some(0) | None => 0, + Some(x) => next_divisible(x, d), + } +} + /// Generates all the words positions levels based on the levels zero (including the level zero). fn compute_positions_levels( rtxn: &heed::RoTxn, @@ -92,8 +106,8 @@ fn compute_positions_levels( compression_type: CompressionType, compression_level: Option, shrink_size: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, ) -> anyhow::Result> { // It is forbidden to keep a cursor and write in a database at the same time with LMDB @@ -113,7 +127,7 @@ fn compute_positions_levels( let first_level_size = words_positions_db.remap_data_type::() .range(rtxn, &level_0_range)? 
- .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. @@ -136,20 +150,23 @@ fn compute_positions_levels( let ((_word, _level, value, _right), docids) = result?; if i == 0 { - left = value; - } else if i % group_size == 0 { + left = previous_divisible(value, group_size); + right = left + (group_size - 1); + } + + if value > right { // we found the first bound of the next group, we must store the left // and right bounds associated with the docids. write_level_entry(&mut writer, word, level, left, right, &group_docids)?; // We save the left bound for the new group and also reset the docids. group_docids = RoaringBitmap::new(); - left = value; + left = previous_divisible(value, group_size); + right = left + (group_size - 1); } // The right bound is always the bound we run through. 
group_docids.union_with(&docids); - right = value; } if !group_docids.is_empty() { From 0ad9499b935db85347323f14b2144c1c6a45d924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 24 Mar 2021 15:37:03 +0100 Subject: [PATCH 17/45] Fix an indexing bug in the words level positions --- milli/src/update/words_level_positions.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index eb8d3bb3c..70bc89860 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -121,7 +121,7 @@ fn compute_positions_levels( let level_0_range = { let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); - let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + let right = (word, TreeLevel::min_value(), u32::max_value(), u32::max_value()); left..=right }; From ab92c814c3247b03bf2447ef5a346688a80cef95 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 24 Mar 2021 18:20:13 +0100 Subject: [PATCH 18/45] Fix attributes score --- milli/src/search/criteria/attribute.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 6398c7d87..160807847 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -122,7 +122,8 @@ fn linear_compute_candidates( fn compute_candidate_rank(branches: &Vec>, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { - let mut branch_rank = 0; + let branch_len = branch.len(); + let mut branch_rank = Vec::with_capacity(branch_len); for Query { prefix, kind } in branch { // find the best position of the current word in the document. 
let position = match kind { @@ -145,13 +146,21 @@ fn linear_compute_candidates( // if a position is found, we add it to the branch score, // otherwise the branch is considered as unfindable in this document and we break. if let Some(position) = position { - branch_rank += position as u64; + branch_rank.push(position as u64); } else { - branch_rank = u64::max_value(); + branch_rank.clear(); break; } } - min_rank = min_rank.min(branch_rank); + + if !branch_rank.is_empty() { + branch_rank.sort_unstable(); + // because several words in same query can't match all a the position 0, + // we substract the word index to the position. + let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); + // here we do the means of the words of the branch + min_rank = min_rank.min(branch_rank / branch_len as u64); + } } min_rank From e65bad16ccd273625a624d44bde06e01eaf08bdb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Mar 2021 11:10:12 +0100 Subject: [PATCH 19/45] Compute the words prefixes at the end of an update --- http-ui/src/main.rs | 68 ------ infos/src/main.rs | 1 + milli/src/index.rs | 6 +- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 16 ++ .../update/index_documents/merge_function.rs | 4 + milli/src/update/index_documents/mod.rs | 37 +++- milli/src/update/mod.rs | 9 +- milli/src/update/update_builder.rs | 35 +--- milli/src/update/word_prefix_docids.rs | 75 +++++++ .../word_prefix_pair_proximity_docids.rs | 89 ++++++++ milli/src/update/words_level_positions.rs | 90 ++++++-- milli/src/update/words_prefixes.rs | 196 ------------------ milli/src/update/words_prefixes_fst.rs | 104 ++++++++++ 14 files changed, 409 insertions(+), 323 deletions(-) create mode 100644 milli/src/update/word_prefix_docids.rs create mode 100644 milli/src/update/word_prefix_pair_proximity_docids.rs delete mode 100644 milli/src/update/words_prefixes.rs create mode 100644 milli/src/update/words_prefixes_fst.rs diff --git 
a/http-ui/src/main.rs b/http-ui/src/main.rs index c85bd9b15..00618f58a 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -228,8 +228,6 @@ enum UpdateMeta { ClearDocuments, Settings(Settings), Facets(Facets), - WordsPrefixes(WordsPrefixes), - WordsLevelPositions(WordsLevelPositions), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -290,14 +288,6 @@ struct WordsLevelPositions { min_level_size: Option, } -// Any value that is present is considered Some value, including null. -fn deserialize_some<'de, T, D>(deserializer: D) -> Result, D::Error> -where T: Deserialize<'de>, - D: Deserializer<'de> -{ - Deserialize::deserialize(deserializer).map(Some) -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); @@ -496,36 +486,6 @@ async fn main() -> anyhow::Result<()> { Err(e) => Err(e) } } - UpdateMeta::WordsPrefixes(settings) => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.words_prefixes(&mut wtxn, &index_cloned); - if let Some(value) = settings.threshold { - builder.threshold(value); - } - if let Some(value) = settings.max_prefix_length { - builder.max_prefix_length(value); - } - match builder.execute() { - Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) - } - }, - UpdateMeta::WordsLevelPositions(levels) => { - // We must use the write transaction of the update here. 
- let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.words_level_positions(&mut wtxn, &index_cloned); - if let Some(value) = levels.level_group_size { - builder.level_group_size(value); - } - if let Some(value) = levels.min_level_size { - builder.min_level_size(value); - } - match builder.execute() { - Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()) - } - } }; let meta = match result { @@ -942,32 +902,6 @@ async fn main() -> anyhow::Result<()> { warp::reply() }); - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let change_words_prefixes_route = warp::filters::method::post() - .and(warp::path!("words-prefixes")) - .and(warp::body::json()) - .map(move |settings: WordsPrefixes| { - let meta = UpdateMeta::WordsPrefixes(settings); - let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); - let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - warp::reply() - }); - - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let change_words_level_positions_route = warp::filters::method::post() - .and(warp::path!("words-level-positions")) - .and(warp::body::json()) - .map(move |levels: WordsLevelPositions| { - let meta = UpdateMeta::WordsLevelPositions(levels); - let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); - let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - warp::reply() - }); - let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); let abort_update_id_route = warp::filters::method::delete() @@ -1042,8 +976,6 @@ async fn main() -> anyhow::Result<()> { .or(clearing_route) .or(change_settings_route) .or(change_facet_levels_route) - 
.or(change_words_prefixes_route) - .or(change_words_level_positions_route) .or(update_ws_route); let addr = SocketAddr::from_str(&opt.http_listen_addr)?; diff --git a/infos/src/main.rs b/infos/src/main.rs index 0e6403d7b..e730a8b43 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -338,6 +338,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho facet_field_id_value_docids, field_id_docid_facet_values: _, documents, + .. } = index; let main_name = "main"; diff --git a/milli/src/index.rs b/milli/src/index.rs index 0659b207a..ba7747250 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -54,6 +54,8 @@ pub struct Index { pub word_prefix_pair_proximity_docids: Database, /// Maps the word, level and position range with the docids that corresponds to it. pub word_level_position_docids: Database, + /// Maps the level positions of a word prefix with all the docids where this prefix appears. + pub word_prefix_level_position_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. pub facet_field_id_value_docids: Database, /// Maps the document id, the facet field id and the globally ordered value. 
@@ -64,7 +66,7 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(10); + options.max_dbs(11); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -74,6 +76,7 @@ impl Index { let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; + let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; @@ -98,6 +101,7 @@ impl Index { word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 6d7dd72b8..f89c2d00c 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -29,6 +29,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -57,6 +58,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; word_level_position_docids.clear(self.wtxn)?; + word_prefix_level_position_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; field_id_docid_facet_values.clear(self.wtxn)?; 
documents.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index f9303d339..4c5f8d61a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -89,6 +89,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -345,6 +346,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // We delete the documents ids that are under the word prefix level position docids. + let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; + } + } + + drop(iter); + Ok(self.documents_ids.len()) } } diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 54f994fc0..a6d008513 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -52,6 +52,10 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) - cbo_roaring_bitmap_merge(values) } +pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) +} + pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { cbo_roaring_bitmap_merge(values) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7a2196481..8ebdf1634 100644 --- a/milli/src/update/index_documents/mod.rs +++ 
b/milli/src/update/index_documents/mod.rs @@ -3,6 +3,7 @@ use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::num::{NonZeroU32, NonZeroUsize}; +use std::str; use std::sync::mpsc::sync_channel; use std::time::Instant; @@ -13,18 +14,21 @@ use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionTy use heed::types::ByteSlice; use log::{debug, info, error}; use memmap::Mmap; -use rayon::ThreadPool; use rayon::prelude::*; +use rayon::ThreadPool; use serde::{Serialize, Deserialize}; use crate::index::Index; -use crate::update::{Facets, WordsLevelPositions, WordsPrefixes, UpdateIndexingStep}; +use crate::update::{ + Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep, + WordPrefixPairProximityDocids, +}; use self::store::{Store, Readers}; pub use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, docid_word_positions_merge, documents_merge, - word_level_position_docids_merge, facet_field_value_docids_merge, - field_id_docid_facet_values_merge, + word_level_position_docids_merge, word_prefix_level_positions_docids_merge, + facet_field_value_docids_merge, field_id_docid_facet_values_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -719,10 +723,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { builder.execute()?; // Run the words prefixes update operation. 
- let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id); if let Some(value) = self.words_prefix_threshold { builder.threshold(value); } @@ -731,8 +732,26 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the word prefix docids update operation. + let mut builder = WordPrefixDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + builder.max_nb_chunks = self.max_nb_chunks; + builder.max_memory = self.max_memory; + builder.execute()?; + + // Run the word prefix pair proximity docids update operation. + let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + builder.max_nb_chunks = self.max_nb_chunks; + builder.max_memory = self.max_memory; + builder.execute()?; + // Run the words level positions update operation. 
- let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); + let mut builder = WordsLevelPositions::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 1fc4890fb..203937e2f 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,8 +6,10 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc pub use self::settings::{Setting, Settings}; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; +pub use self::word_prefix_docids::WordPrefixDocids; +pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; pub use self::words_level_positions::WordsLevelPositions; -pub use self::words_prefixes::WordsPrefixes; +pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; @@ -17,6 +19,7 @@ mod index_documents; mod settings; mod update_builder; mod update_step; +mod word_prefix_docids; +mod word_prefix_pair_proximity_docids; mod words_level_positions; -mod words_prefixes; - +mod words_prefixes_fst; diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 9a4fb850e..8d6eb034d 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -2,10 +2,7 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ - ClearDocuments, DeleteDocuments, IndexDocuments, Settings, - Facets, WordsPrefixes, WordsLevelPositions, -}; +use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -138,34 +135,4 @@ impl<'a> UpdateBuilder<'a> { builder } - - pub fn words_prefixes<'t, 'u, 'i>( - self, - wtxn: &'t mut 
heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordsPrefixes<'t, 'u, 'i> - { - let mut builder = WordsPrefixes::new(wtxn, index, self.update_id); - - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - - builder - } - - pub fn words_level_positions<'t, 'u, 'i>( - self, - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordsLevelPositions<'t, 'u, 'i> - { - let mut builder = WordsLevelPositions::new(wtxn, index, self.update_id); - - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - - builder - } } diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs new file mode 100644 index 000000000..58c984212 --- /dev/null +++ b/milli/src/update/word_prefix_docids.rs @@ -0,0 +1,75 @@ +use std::str; + +use crate::Index; +use fst::Streamer; +use grenad::CompressionType; +use heed::types::ByteSlice; + +use crate::update::index_documents::WriteMethod; +use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database}; + +pub struct WordPrefixDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, +} + +impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> { + WordPrefixDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, + } + } + + pub fn execute(self) -> 
anyhow::Result<()> { + // Clear the word prefix docids database. + self.index.word_prefix_docids.clear(self.wtxn)?; + + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // It is forbidden to keep a mutable reference into the database + // and write into it at the same time, therefore we write into another file. + let mut prefix_docids_sorter = create_sorter( + word_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We iterate over all the prefixes and retrieve the corresponding docids. + let mut prefix_stream = prefix_fst.stream(); + while let Some(bytes) = prefix_stream.next() { + let prefix = str::from_utf8(bytes)?; + let db = self.index.word_docids.remap_data_type::(); + for result in db.prefix_iter(self.wtxn, prefix)? { + let (_word, data) = result?; + prefix_docids_sorter.insert(prefix, data)?; + } + } + + drop(prefix_fst); + + // We finally write the word prefix docids into the LMDB database. 
+ sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_docids.as_polymorph(), + prefix_docids_sorter, + word_docids_merge, + WriteMethod::Append, + )?; + + Ok(()) + } +} diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs new file mode 100644 index 000000000..c972efc4f --- /dev/null +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -0,0 +1,89 @@ +use std::str; + +use fst::automaton::{Automaton, Str}; +use fst::{Streamer, IntoStreamer}; +use grenad::CompressionType; +use heed::BytesEncode; +use heed::types::ByteSlice; +use log::debug; + +use crate::Index; +use crate::heed_codec::StrStrU8Codec; +use crate::update::index_documents::{ + WriteMethod, create_sorter, sorter_into_lmdb_database, + words_pairs_proximities_docids_merge, +}; + +pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, +} + +impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> + { + WordPrefixPairProximityDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, + } + } + + pub fn execute(self) -> anyhow::Result<()> { + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; + + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // Here we create a sorter akin to the previous one. 
+ let mut word_prefix_pair_proximity_docids_sorter = create_sorter( + words_pairs_proximities_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We insert all the word pairs corresponding to the word-prefix pairs + // where the prefixes appears in the prefix FST previously constructed. + let db = self.index.word_pair_proximity_docids.remap_data_type::(); + for result in db.iter(self.wtxn)? { + let ((word1, word2, prox), data) = result?; + let automaton = Str::new(word2).starts_with(); + let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); + while let Some(prefix) = matching_prefixes.next() { + let prefix = str::from_utf8(prefix)?; + let pair = (word1, prefix, prox); + let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); + word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; + } + } + + drop(prefix_fst); + + // We finally write the word prefix pair proximity docids into the LMDB database. 
+ sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + word_prefix_pair_proximity_docids_sorter, + words_pairs_proximities_docids_merge, + WriteMethod::Append, + )?; + + Ok(()) + } +} diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 70bc89860..1b772c37d 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,17 +1,22 @@ -use std::cmp; +use std::{cmp, str}; use std::convert::TryFrom; use std::fs::File; use std::num::NonZeroU32; +use fst::automaton::{self, Automaton}; +use fst::{Streamer, IntoStreamer}; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::{DecodeIgnore, Str}; +use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::update::index_documents::{ + create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, + word_prefix_level_positions_docids_merge, sorter_into_lmdb_database +}; use crate::{Index, TreeLevel}; pub struct WordsLevelPositions<'t, 'u, 'i> { @@ -20,27 +25,24 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, level_group_size: NonZeroU32, min_level_size: NonZeroU32, - _update_id: u64, } impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - update_id: u64, - ) -> WordsLevelPositions<'t, 'u, 'i> - { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> 
WordsLevelPositions<'t, 'u, 'i> { WordsLevelPositions { wtxn, index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, level_group_size: NonZeroU32::new(4).unwrap(), min_level_size: NonZeroU32::new(5).unwrap(), - _update_id: update_id, } } @@ -76,7 +78,71 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_level_position_docids.as_polymorph(), entries, - |_, _| anyhow::bail!("invalid facet level merging"), + |_, _| anyhow::bail!("invalid word level position merging"), + WriteMethod::Append, + )?; + + // We compute the word prefix level positions database. + self.index.word_prefix_level_position_docids.clear(self.wtxn)?; + + let mut word_prefix_level_positions_docids_sorter = create_sorter( + word_prefix_level_positions_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We insert the word prefix level positions where the level is equal to 0 and + // corresponds to the word-prefix level positions where the prefixes appears + // in the prefix FST previously constructed. + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + let db = self.index.word_level_position_docids.remap_data_type::(); + for result in db.iter(self.wtxn)? 
{ + let ((word, level, left, right), data) = result?; + if level == TreeLevel::min_value() { + let automaton = automaton::Str::new(word).starts_with(); + let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); + while let Some(prefix) = matching_prefixes.next() { + let prefix = str::from_utf8(prefix)?; + let key = (prefix, level, left, right); + let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap(); + word_prefix_level_positions_docids_sorter.insert(bytes, data)?; + } + } + } + + // We finally write all the word prefix level positions docids with + // a level equal to 0 into the LMDB database. + sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_level_position_docids.as_polymorph(), + word_prefix_level_positions_docids_sorter, + word_prefix_level_positions_docids_merge, + WriteMethod::Append, + )?; + + let entries = compute_positions_levels( + self.wtxn, + self.index.word_prefix_docids.remap_data_type::(), + self.index.word_prefix_level_position_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + )?; + + // The previously computed entries also defines the level 0 entries + // so we can clear the database and append all of these entries. 
+ self.index.word_prefix_level_position_docids.clear(self.wtxn)?; + + write_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_level_position_docids.as_polymorph(), + entries, + |_, _| anyhow::bail!("invalid word prefix level position merging"), WriteMethod::Append, )?; diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs deleted file mode 100644 index f2fe526a2..000000000 --- a/milli/src/update/words_prefixes.rs +++ /dev/null @@ -1,196 +0,0 @@ -use std::iter::FromIterator; -use std::str; - -use chrono::Utc; -use fst::automaton::Str; -use fst::{Automaton, Streamer, IntoStreamer}; -use grenad::CompressionType; -use heed::BytesEncode; -use heed::types::ByteSlice; - -use crate::heed_codec::StrStrU8Codec; -use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database}; -use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge}; -use crate::{Index, SmallString32}; - -pub struct WordsPrefixes<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, - threshold: f64, - max_prefix_length: usize, - _update_id: u64, -} - -impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - update_id: u64, - ) -> WordsPrefixes<'t, 'u, 'i> - { - WordsPrefixes { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - chunk_fusing_shrink_size: None, - max_nb_chunks: None, - max_memory: None, - threshold: 0.1 / 100.0, // .01% - max_prefix_length: 4, - _update_id: update_id, - } - } - - /// Set the ratio of concerned words required to make a prefix be part of the words prefixes - /// database. 
If a word prefix is supposed to match more than this number of words in the - /// dictionnary, therefore this prefix is added to the words prefixes datastructures. - /// - /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped - /// to these bounds otherwise. - pub fn threshold(&mut self, value: f64) -> &mut Self { - self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] - self - } - - /// Set the maximum length of prefixes in bytes. - /// - /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped - /// to these bounds, otherwise. - pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value.min(25).max(1); // clamp [1, 25] - self - } - - pub fn execute(self) -> anyhow::Result<()> { - self.index.set_updated_at(self.wtxn, &Utc::now())?; - // Clear the words prefixes datastructures. - self.index.word_prefix_docids.clear(self.wtxn)?; - self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; - - let words_fst = self.index.words_fst(&self.wtxn)?; - let number_of_words = words_fst.len(); - let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; - - // It is forbidden to keep a mutable reference into the database - // and write into it at the same time, therefore we write into another file. - let mut prefix_docids_sorter = create_sorter( - word_docids_merge, - self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.max_nb_chunks, - self.max_memory, - ); - - let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); - for n in 1..=self.max_prefix_length { - - let mut current_prefix = SmallString32::new(); - let mut current_prefix_count = 0; - let mut builder = fst::SetBuilder::memory(); - - let mut stream = words_fst.stream(); - while let Some(bytes) = stream.next() { - // We try to get the first n bytes out of this string but we only want - // to split at valid characters bounds. 
If we try to split in the middle of - // a character we ignore this word and go to the next one. - let word = str::from_utf8(bytes)?; - let prefix = match word.get(..n) { - Some(prefix) => prefix, - None => continue, - }; - - // This is the first iteration of the loop, - // or the current word doesn't starts with the current prefix. - if current_prefix_count == 0 || prefix != current_prefix.as_str() { - current_prefix = SmallString32::from(prefix); - current_prefix_count = 0; - } - - current_prefix_count += 1; - - // There is enough words corresponding to this prefix to add it to the cache. - if current_prefix_count == min_number_of_words { - builder.insert(prefix)?; - } - } - - // We construct the final set for prefixes of size n. - prefix_fsts.push(builder.into_set()); - } - - // We merge all of the previously computed prefixes into on final set. - let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); - let mut builder = fst::SetBuilder::memory(); - builder.extend_stream(op.r#union())?; - let prefix_fst = builder.into_set(); - - // We iterate over all the prefixes and retrieve the corresponding docids. - let mut prefix_stream = prefix_fst.stream(); - while let Some(bytes) = prefix_stream.next() { - let prefix = str::from_utf8(bytes)?; - let db = self.index.word_docids.remap_data_type::(); - for result in db.prefix_iter(self.wtxn, prefix)? { - let (_word, data) = result?; - prefix_docids_sorter.insert(prefix, data)?; - } - } - - // Set the words prefixes FST in the dtabase. - self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; - - // We finally write the word prefix docids into the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_docids.as_polymorph(), - prefix_docids_sorter, - word_docids_merge, - WriteMethod::Append, - )?; - - // We compute the word prefix pair proximity database. - - // Here we create a sorter akin to the previous one. 
- let mut word_prefix_pair_proximity_docids_sorter = create_sorter( - words_pairs_proximities_docids_merge, - self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.max_nb_chunks, - self.max_memory, - ); - - // We insert all the word pairs corresponding to the word-prefix pairs - // where the prefixes appears in the prefix FST previously constructed. - let db = self.index.word_pair_proximity_docids.remap_data_type::(); - for result in db.iter(self.wtxn)? { - let ((word1, word2, prox), data) = result?; - let automaton = Str::new(word2).starts_with(); - let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); - while let Some(prefix) = matching_prefixes.next() { - let prefix = str::from_utf8(prefix)?; - let pair = (word1, prefix, prox); - let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); - word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; - } - } - - // We finally write the word prefix pair proximity docids into the LMDB database. 
- sorter_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - word_prefix_pair_proximity_docids_sorter, - words_pairs_proximities_docids_merge, - WriteMethod::Append, - )?; - - Ok(()) - } -} diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs new file mode 100644 index 000000000..f53b0ee00 --- /dev/null +++ b/milli/src/update/words_prefixes_fst.rs @@ -0,0 +1,104 @@ +use std::iter::FromIterator; +use std::str; + +use fst::Streamer; +use crate::{Index, SmallString32}; + +pub struct WordsPrefixesFst<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + threshold: f64, + max_prefix_length: usize, + _update_id: u64, +} + +impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> WordsPrefixesFst<'t, 'u, 'i> + { + WordsPrefixesFst { + wtxn, + index, + threshold: 0.1 / 100.0, // .01% + max_prefix_length: 4, + _update_id: update_id, + } + } + + /// Set the ratio of concerned words required to make a prefix be part of the words prefixes + /// database. If a word prefix is supposed to match more than this number of words in the + /// dictionnary, therefore this prefix is added to the words prefixes datastructures. + /// + /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped + /// to these bounds otherwise. + pub fn threshold(&mut self, value: f64) -> &mut Self { + self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] + self + } + + /// Set the maximum length of prefixes in bytes. + /// + /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped + /// to these bounds, otherwise. 
+ pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value.min(25).max(1); // clamp [1, 25] + self + } + + pub fn execute(self) -> anyhow::Result<()> { + let words_fst = self.index.words_fst(&self.wtxn)?; + let number_of_words = words_fst.len(); + let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; + + let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); + for n in 1..=self.max_prefix_length { + + let mut current_prefix = SmallString32::new(); + let mut current_prefix_count = 0; + let mut builder = fst::SetBuilder::memory(); + + let mut stream = words_fst.stream(); + while let Some(bytes) = stream.next() { + // We try to get the first n bytes out of this string but we only want + // to split at valid characters bounds. If we try to split in the middle of + // a character we ignore this word and go to the next one. + let word = str::from_utf8(bytes)?; + let prefix = match word.get(..n) { + Some(prefix) => prefix, + None => continue, + }; + + // This is the first iteration of the loop, + // or the current word doesn't starts with the current prefix. + if current_prefix_count == 0 || prefix != current_prefix.as_str() { + current_prefix = SmallString32::from(prefix); + current_prefix_count = 0; + } + + current_prefix_count += 1; + + // There is enough words corresponding to this prefix to add it to the cache. + if current_prefix_count == min_number_of_words { + builder.insert(prefix)?; + } + } + + // We construct the final set for prefixes of size n. + prefix_fsts.push(builder.into_set()); + } + + // We merge all of the previously computed prefixes into on final set. + let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); + let mut builder = fst::SetBuilder::memory(); + builder.extend_stream(op.r#union())?; + let prefix_fst = builder.into_set(); + + // Set the words prefixes FST in the dtabase. 
+ self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; + + Ok(()) + } +} From 1aad66bdaafcf29428f30c3cf7463c0635396a7e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Mar 2021 11:17:32 +0100 Subject: [PATCH 20/45] Compute stats about the word prefix level positions database in the infos crate --- infos/src/main.rs | 101 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index e730a8b43..81b753084 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -21,6 +21,7 @@ const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; +const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; const DOCUMENTS_DB_NAME: &str = "documents"; @@ -33,6 +34,7 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_LEVEL_POSITION_DOCIDS_DB_NAME, + WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, FIELD_ID_DOCID_FACET_VALUES_DB_NAME, DOCUMENTS_DB_NAME, @@ -122,10 +124,21 @@ enum Command { #[structopt(long)] full_display: bool, - /// The field name in the document. + /// Words appearing in the documents. words: Vec, }, + /// Outputs a CSV with the documents ids along with + /// the word prefix level positions where it appears. + WordPrefixesLevelPositionsDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// Prefixes of words appearing in the documents. 
+ prefixes: Vec, + }, + /// Outputs a CSV with the documents ids, words and the positions where this word appears. DocidsWordsPositions { /// Display the whole positions in detail. @@ -236,6 +249,9 @@ fn main() -> anyhow::Result<()> { WordsLevelPositionsDocids { full_display, words } => { words_level_positions_docids(&index, &rtxn, !full_display, words) }, + WordPrefixesLevelPositionsDocids { full_display, prefixes } => { + word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) + }, DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, @@ -335,6 +351,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, documents, @@ -348,6 +365,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let word_level_position_docids_name = "word_level_position_docids"; + let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let documents_name = "documents"; @@ -411,6 +429,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in word_prefix_level_position_docids.remap_data_type::().iter(rtxn)? 
{ + let ((word, level, left, right), value) = result?; + let key = format!("{} {} {:?}", word, level, left..=right); + heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { @@ -588,6 +613,45 @@ fn words_level_positions_docids( Ok(wtr.flush()?) } +fn word_prefixes_level_positions_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + prefixes: Vec, +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?; + + for word in prefixes.iter().map(AsRef::as_ref) { + let range = { + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + left..=right + }; + for result in index.word_prefix_level_position_docids.range(rtxn, &range)? { + let ((w, level, left, right), docids) = result?; + + let count = docids.len().to_string(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + let position_range = if level == TreeLevel::min_value() { + format!("{:?}", left) + } else { + format!("{:?}", left..=right) + }; + let level = level.to_string(); + wtr.write_record(&[w, &level, &position_range, &count, &docids])?; + } + } + + Ok(wtr.flush()?) 
+} + fn docids_words_positions( index: &Index, rtxn: &heed::RoTxn, @@ -779,6 +843,21 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> anyhow::Result<()> { use heed::types::ByteSlice; + let Index { + env: _, + main, + word_docids, + word_prefix_docids, + docid_word_positions, + word_pair_proximity_docids, + word_prefix_pair_proximity_docids, + word_level_position_docids, + word_prefix_level_position_docids, + facet_field_id_value_docids, + field_id_docid_facet_values, + documents, + } = index; + let names = if names.is_empty() { ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect() } else { @@ -787,15 +866,17 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a for name in names { let database = match name.as_str() { - MAIN_DB_NAME => &index.main, - WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(), - WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(), - DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), - FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => index.facet_field_id_value_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_VALUES_DB_NAME => index.field_id_docid_facet_values.as_polymorph(), - DOCUMENTS_DB_NAME => index.documents.as_polymorph(), + MAIN_DB_NAME => &main, + WORD_PREFIX_DOCIDS_DB_NAME => word_prefix_docids.as_polymorph(), + WORD_DOCIDS_DB_NAME => word_docids.as_polymorph(), + DOCID_WORD_POSITIONS_DB_NAME => docid_word_positions.as_polymorph(), + WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_pair_proximity_docids.as_polymorph(), + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), + WORD_LEVEL_POSITION_DOCIDS_DB_NAME => 
word_level_position_docids.as_polymorph(), + WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(), + DOCUMENTS_DB_NAME => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; From 7ff4a2a708d4d08a25fd800348316ee361108c5d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Mar 2021 23:45:06 +0100 Subject: [PATCH 21/45] Display the number of entries in the infos crate --- infos/src/main.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/infos/src/main.rs b/infos/src/main.rs index 81b753084..5a12a9d4d 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -882,16 +882,19 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a let mut key_size: u64 = 0; let mut val_size: u64 = 0; + let mut number_entries: u64 = 0; for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? 
{ let (k, v) = result?; key_size += k.len() as u64; val_size += v.len() as u64; + number_entries += 1; } println!("The {} database weigh:", name); println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true)); println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true)); println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true)); + println!("\tnumber of entries: {}", number_entries); } Ok(()) From 361193099fdeedb8b4b6fb5bf450bc9baa07f5cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 29 Mar 2021 16:25:14 +0200 Subject: [PATCH 22/45] Reduce the amount of branches when query tree flattened --- milli/src/search/criteria/attribute.rs | 83 +++++++++++++++----------- 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 160807847..31c11e7bb 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,3 +1,4 @@ +use std::cmp; use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; @@ -15,7 +16,7 @@ pub struct Attribute<'t> { candidates: Option, bucket_candidates: RoaringBitmap, parent: Box, - flattened_query_tree: Option>>, + flattened_query_tree: Option>>>, current_buckets: Option>, } @@ -115,33 +116,43 @@ impl<'t> Criterion for Attribute<'t> { fn linear_compute_candidates( ctx: &dyn Context, - branches: &Vec>, + branches: &Vec>>, allowed_candidates: &RoaringBitmap, ) -> anyhow::Result> { - fn compute_candidate_rank(branches: &Vec>, words_positions: HashMap) -> u64 { + fn compute_candidate_rank(branches: &Vec>>, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { + let branch_len = branch.len(); let mut branch_rank = Vec::with_capacity(branch_len); - for Query { prefix, kind } in branch { - // find the best position of the current word in the document. 
- let position = match kind { - QueryKind::Exact { word, .. } => { - if *prefix { - word_derivations(word, true, 0, &words_positions) - .flat_map(|positions| positions.iter().next()).min() - } else { - words_positions.get(word) - .map(|positions| positions.iter().next()) - .flatten() - } - }, - QueryKind::Tolerant { typo, word } => { - word_derivations(word, *prefix, *typo, &words_positions) - .flat_map(|positions| positions.iter().next()).min() - }, - }; + for derivates in branch { + let mut position = None; + for Query { prefix, kind } in derivates { + // find the best position of the current word in the document. + let current_position = match kind { + QueryKind::Exact { word, .. } => { + if *prefix { + word_derivations(word, true, 0, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + } else { + words_positions.get(word) + .map(|positions| positions.iter().next()) + .flatten() + } + }, + QueryKind::Tolerant { typo, word } => { + word_derivations(word, *prefix, *typo, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + }, + }; + + match (position, current_position) { + (Some(p), Some(cp)) => position = Some(cmp::min(p, cp)), + (None, Some(cp)) => position = Some(cp), + _ => (), + } + } // if a position is found, we add it to the branch score, // otherwise the branch is considered as unfindable in this document and we break. 
@@ -194,10 +205,10 @@ fn linear_compute_candidates( } // TODO can we keep refs of Query -fn flatten_query_tree(query_tree: &Operation) -> Vec> { +fn flatten_query_tree(query_tree: &Operation) -> Vec>> { use crate::search::criteria::Operation::{And, Or, Consecutive}; - fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec> { + fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec>> { match tail.split_first() { Some((thead, tail)) => { let tail = and_recurse(thead, tail); @@ -215,13 +226,17 @@ fn flatten_query_tree(query_tree: &Operation) -> Vec> { } } - fn recurse(op: &Operation) -> Vec> { + fn recurse(op: &Operation) -> Vec>> { match op { And(ops) | Consecutive(ops) => { ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) }, - Or(_, ops) => ops.into_iter().map(recurse).flatten().collect(), - Operation::Query(query) => vec![vec![query.clone()]], + Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { + vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] + } else { + ops.into_iter().map(recurse).flatten().collect() + }, + Operation::Query(query) => vec![vec![vec![query.clone()]]], } } @@ -256,19 +271,19 @@ mod tests { ]); let expected = vec![ - vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }], + vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], vec![ - Query { prefix: false, kind: QueryKind::exact(S("manythe")) }, - Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + vec![Query { prefix: false, kind: QueryKind::exact(S("manythe")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], ], vec![ - Query { prefix: false, kind: QueryKind::exact(S("many")) }, - Query { prefix: false, kind: QueryKind::exact(S("thefish")) }, + vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("thefish")) }], ], vec![ - Query { prefix: false, kind: QueryKind::exact(S("many")) }, - Query { 
prefix: false, kind: QueryKind::exact(S("the")) }, - Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("the")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], ], ]; From 59f58c15f7cedf16acdbb3a89bae17247ffcc3ab Mon Sep 17 00:00:00 2001 From: many Date: Wed, 31 Mar 2021 19:23:02 +0200 Subject: [PATCH 23/45] Implement attribute criterion * Implement WordLevelIterator * Implement QueryLevelIterator * Implement set algorithm based on iterators Not tested + Some TODO to fix --- milli/src/search/criteria/attribute.rs | 354 +++++++++++++++++++++++-- milli/src/search/criteria/final.rs | 4 +- milli/src/search/criteria/mod.rs | 52 +++- milli/src/search/criteria/proximity.rs | 4 +- milli/src/search/criteria/typo.rs | 4 +- milli/src/search/criteria/words.rs | 4 +- milli/src/tree_level.rs | 4 + 7 files changed, 394 insertions(+), 32 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 31c11e7bb..af336c21f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,17 +1,17 @@ -use std::cmp; +use std::{cmp::{self, Ordering}, collections::BinaryHeap}; use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; use roaring::RoaringBitmap; -use crate::{search::build_dfa}; +use crate::{TreeLevel, search::build_dfa}; use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; use crate::search::WordDerivationsCache; use super::{Criterion, CriterionResult, Context, resolve_query_tree}; pub struct Attribute<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, query_tree: Option, candidates: Option, bucket_candidates: RoaringBitmap, @@ -21,7 +21,7 @@ pub struct Attribute<'t> { } impl<'t> Attribute<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t 
dyn Context<'t>, parent: Box) -> Self { Attribute { ctx, query_tree: None, @@ -51,23 +51,27 @@ impl<'t> Criterion for Attribute<'t> { flatten_query_tree(&qt) }); - let current_buckets = match self.current_buckets.as_mut() { - Some(current_buckets) => current_buckets, - None => { - let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; - self.current_buckets.get_or_insert(new_buckets.into_iter()) - }, - }; + let found_candidates = if candidates.len() < 1000 { + let current_buckets = match self.current_buckets.as_mut() { + Some(current_buckets) => current_buckets, + None => { + let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + self.current_buckets.get_or_insert(new_buckets.into_iter()) + }, + }; - let found_candidates = match current_buckets.next() { - Some((_score, candidates)) => candidates, - None => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, + match current_buckets.next() { + Some((_score, candidates)) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + } + } else { + set_compute_candidates(self.ctx, flattened_query_tree, candidates)? 
}; candidates.difference_with(&found_candidates); @@ -114,6 +118,316 @@ impl<'t> Criterion for Attribute<'t> { } } +struct WordLevelIterator<'t, 'q> { + inner: Box> + 't>, + level: TreeLevel, + interval_size: u32, + word: &'q str, + in_prefix_cache: bool, + inner_next: Option<(u32, u32, RoaringBitmap)>, + current_interval: Option<(u32, u32)>, +} + +impl<'t, 'q> WordLevelIterator<'t, 'q> { + fn new(ctx: &'t dyn Context<'t>, query: &'q Query) -> heed::Result> { + // TODO make it typo/prefix tolerant + let word = query.kind.word(); + let in_prefix_cache = query.prefix && ctx.in_prefix_cache(word); + match ctx.word_position_last_level(word, in_prefix_cache)? { + Some(level) => { + let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); + let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) + }, + None => Ok(None), + } + } + + fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel) -> heed::Result { + let level = level.min(&self.level).clone(); + let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); + let word = self.word; + let in_prefix_cache = self.in_prefix_cache; + // TODO try to dig starting from the current interval + // let left = self.current_interval.map(|(left, _)| left); + let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + + Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) + } + + fn next(&mut self) -> heed::Result> { + fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left } + + let inner_next = match self.inner_next.take() { + Some(inner_next) => Some(inner_next), + None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)), + }; + + match inner_next { + Some((left, right, docids)) => { + match self.current_interval { + 
Some((last_left, last_right)) if !is_next_interval(last_right, left) => { + let blank_left = last_left + self.interval_size; + let blank_right = last_right + self.interval_size; + self.current_interval = Some((blank_left, blank_right)); + self.inner_next = Some((left, right, docids)); + Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) + }, + _ => { + self.current_interval = Some((left, right)); + Ok(Some((left, right, docids))) + } + } + }, + None => Ok(None), + } + } +} + +struct QueryLevelIterator<'t, 'q> { + previous: Option>>, + inner: Vec>, + level: TreeLevel, + accumulator: Vec>, + previous_accumulator: Vec>, +} + +impl<'t, 'q> QueryLevelIterator<'t, 'q> { + fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec) -> heed::Result> { + let mut inner = Vec::with_capacity(queries.len()); + for query in queries { + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, query)? { + inner.push(word_level_iterator); + } + } + + let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone()); + match highest { + Some(level) => Ok(Some(Self { + previous: None, + inner, + level, + accumulator: vec![], + previous_accumulator: vec![], + })), + None => Ok(None), + } + } + + fn previous(&mut self, previous: QueryLevelIterator<'t, 'q>) -> &Self { + self.previous = Some(Box::new(previous)); + self + } + + fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result { + let (level, previous) = match &self.previous { + Some(previous) => { + let previous = previous.dig(ctx)?; + (previous.level.min(self.level), Some(Box::new(previous))) + }, + None => (self.level.saturating_sub(1), None), + }; + + let mut inner = Vec::with_capacity(self.inner.len()); + for word_level_iterator in self.inner.iter() { + inner.push(word_level_iterator.dig(ctx, &level)?); + } + + Ok(Self {previous, inner, level, accumulator: vec![], previous_accumulator: vec![]}) + } + + + + fn inner_next(&mut self, level: TreeLevel) -> heed::Result> { + let mut accumulated: Option<(u32, 
u32, RoaringBitmap)> = None; + let u8_level = Into::::into(level); + let interval_size = 4u32.pow(u8_level as u32); + for wli in self.inner.iter_mut() { + let wli_u8_level = Into::::into(wli.level.clone()); + let accumulated_count = 4u32.pow((u8_level - wli_u8_level) as u32); + for _ in 0..accumulated_count { + if let Some((next_left, _, next_docids)) = wli.next()? { + accumulated = accumulated.take().map( + |(acc_left, acc_right, mut acc_docids)| { + acc_docids.union_with(&next_docids); + (acc_left, acc_right, acc_docids) + } + ).or_else(|| Some((next_left, next_left + interval_size, next_docids))); + } + } + } + + Ok(accumulated) + } + + fn next(&mut self) -> heed::Result<(TreeLevel, Option<(u32, u32, RoaringBitmap)>)> { + let previous_result = match self.previous.as_mut() { + Some(previous) => { + Some(previous.next()?) + }, + None => None, + }; + + match previous_result { + Some((previous_level, previous_next)) => { + let inner_next = self.inner_next(previous_level)?; + self.accumulator.push(inner_next); + self.previous_accumulator.push(previous_next); + // TODO @many clean firsts intervals of both accumulators when both RoaringBitmap are empty, + // WARNING the cleaned intervals count needs to be kept to skip at the end + let mut merged_interval = None; + for current in self.accumulator.iter().rev().zip(self.previous_accumulator.iter()) { + if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { + let (_, _, merged_docids) = merged_interval.get_or_insert_with(|| (left_a + left_b, right_a + right_b, RoaringBitmap::new())); + merged_docids.union_with(&(a & b)); + } + } + Ok((previous_level, merged_interval)) + }, + None => { + let level = self.level.clone(); + let next_interval = self.inner_next(level.clone())?; + self.accumulator = vec![next_interval.clone()]; + Ok((level, next_interval)) + } + } + } +} + +struct Branch<'t, 'q> { + query_level_iterator: QueryLevelIterator<'t, 'q>, + last_result: Option<(u32, u32, RoaringBitmap)>, + 
tree_level: TreeLevel, + branch_size: u32, +} + +impl<'t, 'q> Branch<'t, 'q> { + fn cmp(&self, other: &Self) -> Ordering { + fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((1..branch_size).sum()) / branch_size } + match (&self.last_result, &other.last_result) { + (Some((s_left, _, _)), Some((o_left, _, _))) => { + // we compute a rank from the left interval. + let self_rank = compute_rank(*s_left, self.branch_size); + let other_rank = compute_rank(*o_left, other.branch_size); + let left_cmp = self_rank.cmp(&other_rank).reverse(); + // on level: higher is better, + // we want to reduce highest levels first. + let level_cmp = self.tree_level.cmp(&other.tree_level); + + left_cmp.then(level_cmp) + }, + (Some(_), None) => Ordering::Greater, + (None, Some(_)) => Ordering::Less, + (None, None) => Ordering::Equal, + } + } +} + +impl<'t, 'q> Ord for Branch<'t, 'q> { + fn cmp(&self, other: &Self) -> Ordering { + self.cmp(other) + } +} + +impl<'t, 'q> PartialOrd for Branch<'t, 'q> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'t, 'q> PartialEq for Branch<'t, 'q> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl<'t, 'q> Eq for Branch<'t, 'q> {} + +fn initialize_query_level_iterators<'t, 'q>( + ctx: &'t dyn Context<'t>, + branches: &'q Vec>>, +) -> heed::Result>> { + + let mut positions = BinaryHeap::with_capacity(branches.len()); + for branch in branches { + let mut branch_positions = Vec::with_capacity(branch.len()); + for query in branch { + match QueryLevelIterator::new(ctx, query)? { + Some(qli) => branch_positions.push(qli), + None => { + // the branch seems to be invalid, so we skip it. + branch_positions.clear(); + break; + }, + } + } + // QueryLevelIterators need to be sorted by level and folded in descending order. 
+ branch_positions.sort_unstable_by_key(|qli| qli.level); + let folded_query_level_iterators = branch_positions + .into_iter() + .rev() + .fold(None, |fold: Option, qli| match fold { + Some(mut fold) => { + fold.previous(qli); + Some(fold) + }, + None => Some(qli), + }); + + if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { + let (tree_level, last_result) = folded_query_level_iterators.next()?; + let branch = Branch { + last_result, + tree_level, + query_level_iterator: folded_query_level_iterators, + branch_size: branch.len() as u32, + }; + positions.push(branch); + } + } + + Ok(positions) +} + +fn set_compute_candidates<'t>( + ctx: &'t dyn Context<'t>, + branches: &Vec>>, + allowed_candidates: &RoaringBitmap, +) -> anyhow::Result +{ + let mut branches_heap = initialize_query_level_iterators(ctx, branches)?; + let lowest_level = TreeLevel::min_value(); + + while let Some(mut branch) = branches_heap.peek_mut() { + let is_lowest_level = branch.tree_level == lowest_level; + match branch.last_result.as_mut() { + Some((_, _, candidates)) => { + candidates.intersect_with(&allowed_candidates); + if candidates.len() > 0 && is_lowest_level { + // we have candidates, but we can't dig deeper, return candidates. + return Ok(std::mem::take(candidates)); + } else if candidates.len() > 0 { + // we have candidates, lets dig deeper in levels. + let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; + let (tree_level, last_result) = query_level_iterator.next()?; + branch.query_level_iterator = query_level_iterator; + branch.tree_level = tree_level; + branch.last_result = last_result; + } else { + // we don't have candidates, get next interval. + let (_, last_result) = branch.query_level_iterator.next()?; + branch.last_result = last_result; + } + }, + // None = no candidates to find. + None => return Ok(RoaringBitmap::new()), + } + } + + // we made all iterations without finding anything. 
+ Ok(RoaringBitmap::new()) +} + fn linear_compute_candidates( ctx: &dyn Context, branches: &Vec>>, diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index fe224ef94..d3c394467 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -19,13 +19,13 @@ pub struct FinalResult { } pub struct Final<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, parent: Box, wdcache: WordDerivationsCache, } impl<'t> Final<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Final<'t> { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Final<'t> { Final { ctx, parent, wdcache: WordDerivationsCache::new() } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 5e75be6ce..b972a0b2c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -4,7 +4,7 @@ use std::borrow::Cow; use anyhow::bail; use roaring::RoaringBitmap; -use crate::search::{word_derivations, WordDerivationsCache}; +use crate::{TreeLevel, search::{word_derivations, WordDerivationsCache}}; use crate::{Index, DocumentId}; use super::query_tree::{Operation, Query, QueryKind}; @@ -64,7 +64,7 @@ impl Default for Candidates { } } -pub trait Context { +pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; @@ -73,6 +73,8 @@ pub trait Context { fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; + fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; + fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; } pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -81,7 +83,7 @@ pub struct CriteriaBuilder<'t> { words_prefixes_fst: 
fst::Set>, } -impl<'a> Context for CriteriaBuilder<'a> { +impl<'c> Context<'c> for CriteriaBuilder<'c> { fn documents_ids(&self) -> heed::Result { self.index.documents_ids(self.rtxn) } @@ -120,6 +122,40 @@ impl<'a> Context for CriteriaBuilder<'a> { } Ok(words_positions) } + + fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>> { + let range = { + let left = left.unwrap_or(u32::min_value()); + let right = right.unwrap_or(u32::max_value()); + let left = (word, level, left, left); + let right = (word, level, right, right); + left..=right + }; + let db = match in_prefix_cache { + true => self.index.word_prefix_level_position_docids, + false => self.index.word_level_position_docids, + }; + + Ok(Box::new(db.range(self.rtxn, &range)?)) + } + + fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result> { + let range = { + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + left..=right + }; + let db = match in_prefix_cache { + true => self.index.word_prefix_level_position_docids, + false => self.index.word_level_position_docids, + }; + let last_level = db + .remap_data_type::() + .range(self.rtxn, &range)?.last().transpose()? 
+ .map(|((_, level, _, _), _)| level); + + Ok(last_level) + } } impl<'t> CriteriaBuilder<'t> { @@ -354,7 +390,7 @@ pub mod test { docid_words: HashMap>, } - impl<'a> Context for TestContext<'a> { + impl<'c> Context<'c> for TestContext<'c> { fn documents_ids(&self) -> heed::Result { Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) } @@ -397,6 +433,14 @@ pub mod test { Ok(HashMap::new()) } } + + fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option, _right: Option) -> heed::Result> + 'c>> { + todo!() + } + + fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result> { + todo!() + } } impl<'a> Default for TestContext<'a> { diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index dc1daafb2..ca412bf28 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -13,7 +13,7 @@ use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proxim type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; pub struct Proximity<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, /// ((max_proximity, query_tree), allowed_candidates) state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, proximity: u8, @@ -24,7 +24,7 @@ pub struct Proximity<'t> { } impl<'t> Proximity<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Proximity { ctx, state: None, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 40b06afc4..bf58fa258 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -9,7 +9,7 @@ use crate::search::{word_derivations, WordDerivationsCache}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; pub struct Typo<'t> { - ctx: &'t dyn Context, + ctx: 
&'t dyn Context<'t>, query_tree: Option<(usize, Operation)>, number_typos: u8, candidates: Candidates, @@ -19,7 +19,7 @@ pub struct Typo<'t> { } impl<'t> Typo<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Typo { ctx, query_tree: None, diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 5bb9d8d90..047b3c5f0 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -8,7 +8,7 @@ use crate::search::query_tree::Operation; use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache}; pub struct Words<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, query_trees: Vec, candidates: Option, bucket_candidates: RoaringBitmap, @@ -17,7 +17,7 @@ pub struct Words<'t> { } impl<'t> Words<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Words { ctx, query_trees: Vec::default(), diff --git a/milli/src/tree_level.rs b/milli/src/tree_level.rs index 7ce2904e2..b69316cf6 100644 --- a/milli/src/tree_level.rs +++ b/milli/src/tree_level.rs @@ -21,6 +21,10 @@ impl TreeLevel { pub const fn min_value() -> TreeLevel { TreeLevel(0) } + + pub fn saturating_sub(&self, lhs: u8) -> TreeLevel { + TreeLevel(self.0.saturating_sub(lhs)) + } } impl Into for TreeLevel { From 1eee0029a8d3633f42a045d654c94048cd9f4e40 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 1 Apr 2021 14:42:23 +0200 Subject: [PATCH 24/45] Make attribute criterion typo/prefix tolerant --- milli/src/search/criteria/attribute.rs | 57 ++++++++++++++++++-------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index af336c21f..87f9d4dde 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,4 +1,4 @@ -use std::{cmp::{self, Ordering}, 
collections::BinaryHeap}; +use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use crate::{TreeLevel, search::build_dfa}; use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; -use crate::search::WordDerivationsCache; +use crate::search::{word_derivations, WordDerivationsCache}; use super::{Criterion, CriterionResult, Context, resolve_query_tree}; pub struct Attribute<'t> { @@ -71,7 +71,7 @@ impl<'t> Criterion for Attribute<'t> { }, } } else { - set_compute_candidates(self.ctx, flattened_query_tree, candidates)? + set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? }; candidates.difference_with(&found_candidates); @@ -122,21 +122,18 @@ struct WordLevelIterator<'t, 'q> { inner: Box> + 't>, level: TreeLevel, interval_size: u32, - word: &'q str, + word: Cow<'q, str>, in_prefix_cache: bool, inner_next: Option<(u32, u32, RoaringBitmap)>, current_interval: Option<(u32, u32)>, } impl<'t, 'q> WordLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, query: &'q Query) -> heed::Result> { - // TODO make it typo/prefix tolerant - let word = query.kind.word(); - let in_prefix_cache = query.prefix && ctx.in_prefix_cache(word); - match ctx.word_position_last_level(word, in_prefix_cache)? { + fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result> { + match ctx.word_position_last_level(&word, in_prefix_cache)? 
{ Some(level) => { let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); - let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) }, None => Ok(None), @@ -146,11 +143,11 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel) -> heed::Result { let level = level.min(&self.level).clone(); let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); - let word = self.word; + let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; // TODO try to dig starting from the current interval // let left = self.current_interval.map(|(left, _)| left); - let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) } @@ -193,11 +190,33 @@ struct QueryLevelIterator<'t, 'q> { } impl<'t, 'q> QueryLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec) -> heed::Result> { + fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { let mut inner = Vec::with_capacity(queries.len()); for query in queries { - if let Some(word_level_iterator) = WordLevelIterator::new(ctx, query)? { - inner.push(word_level_iterator); + match &query.kind { + QueryKind::Exact { word, .. } => { + if !query.prefix || ctx.in_prefix_cache(&word) { + let word = Cow::Borrowed(query.kind.word()); + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? { + inner.push(word_level_iterator); + } + } else { + for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? 
{ + let word = Cow::Owned(word.to_owned()); + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { + inner.push(word_level_iterator); + } + } + } + }, + QueryKind::Tolerant { typo, word } => { + for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { + let word = Cow::Owned(word.to_owned()); + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { + inner.push(word_level_iterator); + } + } + } } } @@ -346,13 +365,14 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {} fn initialize_query_level_iterators<'t, 'q>( ctx: &'t dyn Context<'t>, branches: &'q Vec>>, -) -> heed::Result>> { + wdcache: &mut WordDerivationsCache, +) -> anyhow::Result>> { let mut positions = BinaryHeap::with_capacity(branches.len()); for branch in branches { let mut branch_positions = Vec::with_capacity(branch.len()); for query in branch { - match QueryLevelIterator::new(ctx, query)? { + match QueryLevelIterator::new(ctx, query, wdcache)? { Some(qli) => branch_positions.push(qli), None => { // the branch seems to be invalid, so we skip it. 
@@ -393,9 +413,10 @@ fn set_compute_candidates<'t>( ctx: &'t dyn Context<'t>, branches: &Vec>>, allowed_candidates: &RoaringBitmap, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { - let mut branches_heap = initialize_query_level_iterators(ctx, branches)?; + let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); while let Some(mut branch) = branches_heap.peek_mut() { From b3e2280bb93fa4806229c8ee4188c4903b654887 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 1 Apr 2021 19:02:13 +0200 Subject: [PATCH 25/45] Debug attribute criterion * debug folding when initializing iterators --- milli/src/search/criteria/attribute.rs | 28 ++++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 87f9d4dde..d96ec493f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -51,7 +51,7 @@ impl<'t> Criterion for Attribute<'t> { flatten_query_tree(&qt) }); - let found_candidates = if candidates.len() < 1000 { + let found_candidates = if candidates.len() < 1_000 { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { @@ -322,10 +322,10 @@ struct Branch<'t, 'q> { impl<'t, 'q> Branch<'t, 'q> { fn cmp(&self, other: &Self) -> Ordering { - fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((1..branch_size).sum()) / branch_size } + fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((0..branch_size).sum()) / branch_size } match (&self.last_result, &other.last_result) { (Some((s_left, _, _)), Some((o_left, _, _))) => { - // we compute a rank form the left interval. + // we compute a rank from the left interval. 
let self_rank = compute_rank(*s_left, self.branch_size); let other_rank = compute_rank(*o_left, other.branch_size); let left_cmp = self_rank.cmp(&other_rank).reverse(); @@ -371,8 +371,8 @@ fn initialize_query_level_iterators<'t, 'q>( let mut positions = BinaryHeap::with_capacity(branches.len()); for branch in branches { let mut branch_positions = Vec::with_capacity(branch.len()); - for query in branch { - match QueryLevelIterator::new(ctx, query, wdcache)? { + for queries in branch { + match QueryLevelIterator::new(ctx, queries, wdcache)? { Some(qli) => branch_positions.push(qli), None => { // the branch seems to be invalid, so we skip it. @@ -386,10 +386,10 @@ fn initialize_query_level_iterators<'t, 'q>( let folded_query_level_iterators = branch_positions .into_iter() .rev() - .fold(None, |fold: Option, qli| match fold { - Some(mut fold) => { - fold.previous(qli); - Some(fold) + .fold(None, |fold: Option, mut qli| match fold { + Some(fold) => { + qli.previous(fold); + Some(qli) }, None => Some(qli), }); @@ -418,6 +418,7 @@ fn set_compute_candidates<'t>( { let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); + let mut final_candidates = RoaringBitmap::new(); while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; @@ -426,7 +427,8 @@ fn set_compute_candidates<'t>( candidates.intersect_with(&allowed_candidates); if candidates.len() > 0 && is_lowest_level { // we have candidates, but we can't dig deeper, return candidates. - return Ok(std::mem::take(candidates)); + final_candidates = std::mem::take(candidates); + break; } else if candidates.len() > 0 { // we have candidates, lets dig deeper in levels. let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; @@ -441,12 +443,12 @@ fn set_compute_candidates<'t>( } }, // None = no candidates to find. 
- None => return Ok(RoaringBitmap::new()), + None => break, } + } - // we made all iterations without finding anything. - Ok(RoaringBitmap::new()) + Ok(final_candidates) } fn linear_compute_candidates( From 17c8c6f945bdffebdf6a935160566f9e6deaa8be Mon Sep 17 00:00:00 2001 From: many Date: Tue, 6 Apr 2021 15:03:41 +0200 Subject: [PATCH 26/45] Make set algorithm return None when nothing can be returned --- milli/src/search/criteria/attribute.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index d96ec493f..12c6b36b8 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -71,7 +71,18 @@ impl<'t> Criterion for Attribute<'t> { }, } } else { - set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? + let found_candidates = set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)?; + + match found_candidates { + Some(candidates) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + } }; candidates.difference_with(&found_candidates); @@ -414,11 +425,11 @@ fn set_compute_candidates<'t>( branches: &Vec>>, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> anyhow::Result> { let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); - let mut final_candidates = RoaringBitmap::new(); + let mut final_candidates = None; while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; @@ -427,7 +438,7 @@ fn set_compute_candidates<'t>( candidates.intersect_with(&allowed_candidates); if candidates.len() > 0 && is_lowest_level { // we have candidates, but we can't dig 
deeper, return candidates. - final_candidates = std::mem::take(candidates); + final_candidates = Some(std::mem::take(candidates)); break; } else if candidates.len() > 0 { // we have candidates, lets dig deeper in levels. From 0efa011e0965fa6e6a1da630d6a3c1cead9ba0e4 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 12 Apr 2021 11:19:25 +0200 Subject: [PATCH 27/45] Make a small code clean-up --- milli/src/search/criteria/attribute.rs | 90 ++++++++++++++------------ 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 12c6b36b8..af3e08af1 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -326,30 +326,25 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { struct Branch<'t, 'q> { query_level_iterator: QueryLevelIterator<'t, 'q>, - last_result: Option<(u32, u32, RoaringBitmap)>, + last_result: (u32, u32, RoaringBitmap), tree_level: TreeLevel, branch_size: u32, } impl<'t, 'q> Branch<'t, 'q> { fn cmp(&self, other: &Self) -> Ordering { - fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((0..branch_size).sum()) / branch_size } - match (&self.last_result, &other.last_result) { - (Some((s_left, _, _)), Some((o_left, _, _))) => { - // we compute a rank from the left interval. - let self_rank = compute_rank(*s_left, self.branch_size); - let other_rank = compute_rank(*o_left, other.branch_size); - let left_cmp = self_rank.cmp(&other_rank).reverse(); - // on level: higher is better, - // we want to reduce highest levels first. - let level_cmp = self.tree_level.cmp(&other.tree_level); + let compute_rank = |left: u32, branch_size: u32| left.saturating_sub((0..branch_size).sum()) / branch_size; + let (s_left, _, _) = self.last_result; + let (o_left, _, _) = other.last_result; + // we compute a rank from the left interval. 
+ let self_rank = compute_rank(s_left, self.branch_size); + let other_rank = compute_rank(o_left, other.branch_size); + let left_cmp = self_rank.cmp(&other_rank).reverse(); + // on level: higher is better, + // we want to reduce highest levels first. + let level_cmp = self.tree_level.cmp(&other.tree_level); - left_cmp.then(level_cmp) - }, - (Some(_), None) => Ordering::Greater, - (None, Some(_)) => Ordering::Less, - (None, None) => Ordering::Equal, - } + left_cmp.then(level_cmp) } } @@ -407,13 +402,15 @@ fn initialize_query_level_iterators<'t, 'q>( if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { let (tree_level, last_result) = folded_query_level_iterators.next()?; - let branch = Branch { - last_result, - tree_level, - query_level_iterator: folded_query_level_iterators, - branch_size: branch.len() as u32, - }; - positions.push(branch); + if let Some(last_result) = last_result { + let branch = Branch { + last_result, + tree_level, + query_level_iterator: folded_query_level_iterators, + branch_size: branch.len() as u32, + }; + positions.push(branch); + } } } @@ -433,28 +430,35 @@ fn set_compute_candidates<'t>( while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; - match branch.last_result.as_mut() { - Some((_, _, candidates)) => { - candidates.intersect_with(&allowed_candidates); - if candidates.len() > 0 && is_lowest_level { - // we have candidates, but we can't dig deeper, return candidates. - final_candidates = Some(std::mem::take(candidates)); - break; - } else if candidates.len() > 0 { - // we have candidates, lets dig deeper in levels. - let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; - let (tree_level, last_result) = query_level_iterator.next()?; + let (_, _, candidates) = &mut branch.last_result; + candidates.intersect_with(&allowed_candidates); + if candidates.is_empty() { + // we don't have candidates, get next interval. 
+ match branch.query_level_iterator.next()? { + (_, Some(last_result)) => { + branch.last_result = last_result; + }, + // TODO clean up this + (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, + } + + } + else if is_lowest_level { + // we have candidates, but we can't dig deeper, return candidates. + final_candidates = Some(take(candidates)); + break; + } else { + // we have candidates, lets dig deeper in levels. + let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; + match query_level_iterator.next()? { + (tree_level, Some(last_result)) => { branch.query_level_iterator = query_level_iterator; branch.tree_level = tree_level; branch.last_result = last_result; - } else { - // we don't have candidates, get next interval. - let (_, last_result) = branch.query_level_iterator.next()?; - branch.last_result = last_result; - } - }, - // None = no candidates to find. - None => break, + }, + // TODO clean up this + (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, + } } } From 2b036449be4f2c4a1ca15d5b4d1cfed3a6828e07 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 13 Apr 2021 15:06:12 +0200 Subject: [PATCH 28/45] Fix the return of equal candidates in different pages --- milli/src/search/criteria/attribute.rs | 79 +++++++++++++++++--------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index af3e08af1..8d150730f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,5 +1,6 @@ use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; use std::collections::{BTreeMap, HashMap, btree_map}; +use std::collections::binary_heap::PeekMut; use std::mem::take; use roaring::RoaringBitmap; @@ -332,13 +333,26 @@ struct Branch<'t, 'q> { } impl<'t, 'q> Branch<'t, 'q> { - fn cmp(&self, other: &Self) -> Ordering { - let compute_rank = 
|left: u32, branch_size: u32| left.saturating_sub((0..branch_size).sum()) / branch_size; - let (s_left, _, _) = self.last_result; - let (o_left, _, _) = other.last_result; + fn next(&mut self) -> heed::Result { + match self.query_level_iterator.next()? { + (tree_level, Some(last_result)) => { + self.last_result = last_result; + self.tree_level = tree_level; + Ok(true) + }, + (_, None) => Ok(false), + } + } + + fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. - let self_rank = compute_rank(s_left, self.branch_size); - let other_rank = compute_rank(o_left, other.branch_size); + let (left, _, _) = self.last_result; + left.saturating_sub((0..self.branch_size).sum()) * 60 / self.branch_size + } + + fn cmp(&self, other: &Self) -> Ordering { + let self_rank = self.compute_rank(); + let other_rank = other.compute_rank(); let left_cmp = self_rank.cmp(&other_rank).reverse(); // on level: higher is better, // we want to reduce highest levels first. @@ -426,44 +440,53 @@ fn set_compute_candidates<'t>( { let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); - let mut final_candidates = None; + let mut final_candidates: Option<(u32, RoaringBitmap)> = None; while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; + let branch_rank = branch.compute_rank(); let (_, _, candidates) = &mut branch.last_result; candidates.intersect_with(&allowed_candidates); if candidates.is_empty() { // we don't have candidates, get next interval. - match branch.query_level_iterator.next()? { - (_, Some(last_result)) => { - branch.last_result = last_result; - }, - // TODO clean up this - (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, - } - + if !branch.next()? { PeekMut::pop(branch); } } else if is_lowest_level { // we have candidates, but we can't dig deeper, return candidates. 
- final_candidates = Some(take(candidates)); - break; + final_candidates = match final_candidates.take() { + Some((best_rank, mut best_candidates)) => { + // if current is worst than best we break to return + // candidates that correspond to the best rank + if branch_rank > best_rank { + final_candidates = Some((best_rank, best_candidates)); + break; + // else we add current candidates to best candidates + // and we fetch the next page + } else { + best_candidates.union_with(candidates); + if !branch.next()? { PeekMut::pop(branch); } + Some((best_rank, best_candidates)) + } + }, + // we take current candidates as best candidates + // and we fetch the next page + None => { + let candidates = take(candidates); + if !branch.next()? { PeekMut::pop(branch); } + Some((branch_rank, candidates)) + }, + }; } else { // we have candidates, lets dig deeper in levels. - let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; - match query_level_iterator.next()? { - (tree_level, Some(last_result)) => { - branch.query_level_iterator = query_level_iterator; - branch.tree_level = tree_level; - branch.last_result = last_result; - }, - // TODO clean up this - (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, - } + branch.query_level_iterator = branch.query_level_iterator.dig(ctx)?; + if !branch.next()? 
{ PeekMut::pop(branch); } } } - Ok(final_candidates) + Ok(final_candidates.map(|(_rank, candidates)| { + candidates + })) } fn linear_compute_candidates( From f8537900168265841993c4eb0bbd3fcc539a76b4 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 13 Apr 2021 18:25:38 +0200 Subject: [PATCH 29/45] Use the LCM of 10 first numbers to compute attribute rank --- milli/src/search/criteria/attribute.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 8d150730f..5ab60c58d 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -11,6 +11,10 @@ use crate::search::query_tree::{Operation, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; use super::{Criterion, CriterionResult, Context, resolve_query_tree}; +/// To be able to divide integers by the number of words in the query +/// we want to find a multiplier that allow us to divide by any number between 1 and 10. +/// We Choosed the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). +const LCM_10_FIRST_NUMBERS: u32 = 2520; pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, @@ -347,7 +351,7 @@ impl<'t, 'q> Branch<'t, 'q> { fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. let (left, _, _) = self.last_result; - left.saturating_sub((0..self.branch_size).sum()) * 60 / self.branch_size + left.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size } fn cmp(&self, other: &Self) -> Ordering { @@ -545,7 +549,7 @@ fn linear_compute_candidates( // we substract the word index to the position. 
let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); // here we do the means of the words of the branch - min_rank = min_rank.min(branch_rank / branch_len as u64); + min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); } } From 716c8e22b0bb82a65f2d9320af8d3d68ffc9a79f Mon Sep 17 00:00:00 2001 From: many Date: Thu, 15 Apr 2021 10:44:27 +0200 Subject: [PATCH 30/45] Add style and comments --- milli/src/search/criteria/attribute.rs | 51 +++++++++++++++----------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 5ab60c58d..2672169de 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -15,6 +15,7 @@ use super::{Criterion, CriterionResult, Context, resolve_query_tree}; /// we want to find a multiplier that allow us to divide by any number between 1 and 10. /// We Choosed the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). const LCM_10_FIRST_NUMBERS: u32 = 2520; + pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, @@ -134,6 +135,9 @@ impl<'t> Criterion for Attribute<'t> { } } +/// WordLevelIterator is an pseudo-Iterator over intervals of word-position for one word, +/// it will begin at the first non-empty interval and will return every interval without +/// jumping over empty intervals. struct WordLevelIterator<'t, 'q> { inner: Box> + 't>, level: TreeLevel, @@ -197,12 +201,14 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { } } +/// QueryLevelIterator is an pseudo-Iterator for a Query, +/// It contains WordLevelIterators and is chainned with other QueryLevelIterator. 
struct QueryLevelIterator<'t, 'q> { - previous: Option>>, + parent: Option>>, inner: Vec>, level: TreeLevel, accumulator: Vec>, - previous_accumulator: Vec>, + parent_accumulator: Vec>, } impl<'t, 'q> QueryLevelIterator<'t, 'q> { @@ -239,26 +245,27 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone()); match highest { Some(level) => Ok(Some(Self { - previous: None, + parent: None, inner, level, accumulator: vec![], - previous_accumulator: vec![], + parent_accumulator: vec![], })), None => Ok(None), } } - fn previous(&mut self, previous: QueryLevelIterator<'t, 'q>) -> &Self { - self.previous = Some(Box::new(previous)); + fn parent(&mut self, parent: QueryLevelIterator<'t, 'q>) -> &Self { + self.parent = Some(Box::new(parent)); self } + /// create a new QueryLevelIterator with a lower level than the current one. fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result { - let (level, previous) = match &self.previous { - Some(previous) => { - let previous = previous.dig(ctx)?; - (previous.level.min(self.level), Some(Box::new(previous))) + let (level, parent) = match &self.parent { + Some(parent) => { + let parent = parent.dig(ctx)?; + (parent.level.min(self.level), Some(Box::new(parent))) }, None => (self.level.saturating_sub(1), None), }; @@ -268,7 +275,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { inner.push(word_level_iterator.dig(ctx, &level)?); } - Ok(Self {previous, inner, level, accumulator: vec![], previous_accumulator: vec![]}) + Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![]}) } @@ -295,29 +302,31 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { Ok(accumulated) } + /// return the next meta-interval created from inner WordLevelIterators, + /// and from eventual chainned QueryLevelIterator. 
fn next(&mut self) -> heed::Result<(TreeLevel, Option<(u32, u32, RoaringBitmap)>)> { - let previous_result = match self.previous.as_mut() { - Some(previous) => { - Some(previous.next()?) + let parent_result = match self.parent.as_mut() { + Some(parent) => { + Some(parent.next()?) }, None => None, }; - match previous_result { - Some((previous_level, previous_next)) => { - let inner_next = self.inner_next(previous_level)?; + match parent_result { + Some((parent_level, parent_next)) => { + let inner_next = self.inner_next(parent_level)?; self.accumulator.push(inner_next); - self.previous_accumulator.push(previous_next); + self.parent_accumulator.push(parent_next); // TODO @many clean firsts intervals of both accumulators when both RoaringBitmap are empty, // WARNING the cleaned intervals count needs to be kept to skip at the end let mut merged_interval = None; - for current in self.accumulator.iter().rev().zip(self.previous_accumulator.iter()) { + for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()) { if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { let (_, _, merged_docids) = merged_interval.get_or_insert_with(|| (left_a + left_b, right_a + right_b, RoaringBitmap::new())); merged_docids.union_with(&(a & b)); } } - Ok((previous_level, merged_interval)) + Ok((parent_level, merged_interval)) }, None => { let level = self.level.clone(); @@ -412,7 +421,7 @@ fn initialize_query_level_iterators<'t, 'q>( .rev() .fold(None, |fold: Option, mut qli| match fold { Some(fold) => { - qli.previous(fold); + qli.parent(fold); Some(qli) }, None => Some(qli), From e77291a6f3065824779d630c9fd4449869b338ee Mon Sep 17 00:00:00 2001 From: many Date: Thu, 15 Apr 2021 12:22:44 +0200 Subject: [PATCH 31/45] Optimize Atrribute criterion on big requests --- milli/src/search/criteria/attribute.rs | 160 +++++++++++++++---------- 1 file changed, 97 insertions(+), 63 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs 
b/milli/src/search/criteria/attribute.rs index 2672169de..745d8cdb0 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -101,7 +101,7 @@ impl<'t> Criterion for Attribute<'t> { }, (Some(qt), None) => { let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; - self.bucket_candidates.union_with(&query_tree_candidates); + self.bucket_candidates |= &query_tree_candidates; self.candidates = Some(query_tree_candidates); }, (None, Some(_)) => { @@ -123,7 +123,7 @@ impl<'t> Criterion for Attribute<'t> { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); + self.bucket_candidates |= bucket_candidates; self.flattened_query_tree = None; self.current_buckets = None; }, @@ -160,14 +160,12 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { } } - fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel) -> heed::Result { + fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option) -> heed::Result { let level = level.min(&self.level).clone(); let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; - // TODO try to dig starting from the current interval - // let left = self.current_interval.map(|(left, _)| left); - let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; + let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) } @@ -209,6 +207,7 @@ struct QueryLevelIterator<'t, 'q> { level: TreeLevel, accumulator: Vec>, parent_accumulator: Vec>, + interval_to_skip: usize, } impl<'t, 'q> QueryLevelIterator<'t, 'q> { @@ -250,6 +249,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { level, accumulator: 
vec![], parent_accumulator: vec![], + interval_to_skip: 0, })), None => Ok(None), } @@ -270,16 +270,15 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { None => (self.level.saturating_sub(1), None), }; + let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten(); let mut inner = Vec::with_capacity(self.inner.len()); for word_level_iterator in self.inner.iter() { - inner.push(word_level_iterator.dig(ctx, &level)?); + inner.push(word_level_iterator.dig(ctx, &level, left_interval)?); } - Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![]}) + Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0}) } - - fn inner_next(&mut self, level: TreeLevel) -> heed::Result> { let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None; let u8_level = Into::::into(level); @@ -289,12 +288,13 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { let accumulated_count = 4u32.pow((u8_level - wli_u8_level) as u32); for _ in 0..accumulated_count { if let Some((next_left, _, next_docids)) = wli.next()? { - accumulated = accumulated.take().map( - |(acc_left, acc_right, mut acc_docids)| { - acc_docids.union_with(&next_docids); - (acc_left, acc_right, acc_docids) - } - ).or_else(|| Some((next_left, next_left + interval_size, next_docids))); + accumulated = match accumulated.take(){ + Some((acc_left, acc_right, mut acc_docids)) => { + acc_docids |= next_docids; + Some((acc_left, acc_right, acc_docids)) + }, + None => Some((next_left, next_left + interval_size, next_docids)), + }; } } } @@ -304,35 +304,59 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { /// return the next meta-interval created from inner WordLevelIterators, /// and from eventual chainned QueryLevelIterator. 
- fn next(&mut self) -> heed::Result<(TreeLevel, Option<(u32, u32, RoaringBitmap)>)> { + fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result> { let parent_result = match self.parent.as_mut() { Some(parent) => { - Some(parent.next()?) + Some(parent.next(allowed_candidates, tree_level)?) }, None => None, }; match parent_result { - Some((parent_level, parent_next)) => { - let inner_next = self.inner_next(parent_level)?; + Some(parent_next) => { + let inner_next = self.inner_next(tree_level)?; + self.interval_to_skip += self.accumulator.iter().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip).take_while(|current| { + match current { + (Some((_, _, inner)), Some((_, _, parent))) => { + inner.is_disjoint(allowed_candidates) && parent.is_empty() + }, + (Some((_, _, inner)), None) => { + inner.is_disjoint(allowed_candidates) + }, + (None, Some((_, _, parent))) => { + parent.is_empty() + }, + (None, None) => true, + } + }).count(); self.accumulator.push(inner_next); self.parent_accumulator.push(parent_next); - // TODO @many clean firsts intervals of both accumulators when both RoaringBitmap are empty, - // WARNING the cleaned intervals count needs to be kept to skip at the end - let mut merged_interval = None; - for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()) { + let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; + + for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) { if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { - let (_, _, merged_docids) = merged_interval.get_or_insert_with(|| (left_a + left_b, right_a + right_b, RoaringBitmap::new())); - merged_docids.union_with(&(a & b)); + match merged_interval.as_mut() { + Some((_, _, merged_docids)) => *merged_docids |= a & b, + None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)), + } } } - Ok((parent_level, 
merged_interval)) + Ok(merged_interval) }, None => { - let level = self.level.clone(); - let next_interval = self.inner_next(level.clone())?; - self.accumulator = vec![next_interval.clone()]; - Ok((level, next_interval)) + let level = self.level; + match self.inner_next(level)? { + Some((left, right, mut candidates)) => { + self.accumulator = vec![Some((left, right, RoaringBitmap::new()))]; + candidates &= allowed_candidates; + Ok(Some((left, right, candidates))) + + }, + None => { + self.accumulator = vec![None]; + Ok(None) + }, + } } } } @@ -346,17 +370,31 @@ struct Branch<'t, 'q> { } impl<'t, 'q> Branch<'t, 'q> { - fn next(&mut self) -> heed::Result { - match self.query_level_iterator.next()? { - (tree_level, Some(last_result)) => { + fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result { + let tree_level = self.query_level_iterator.level; + match self.query_level_iterator.next(allowed_candidates, tree_level)? { + Some(last_result) => { self.last_result = last_result; self.tree_level = tree_level; Ok(true) }, - (_, None) => Ok(false), + None => Ok(false), } } + fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> { + self.query_level_iterator = self.query_level_iterator.dig(ctx)?; + Ok(()) + } + + fn lazy_next(&mut self) { + let u8_level = Into::::into(self.tree_level.clone()); + let interval_size = 4u32.pow(u8_level as u32); + let (left, right, _) = self.last_result; + + self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); + } + fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. let (left, _, _) = self.last_result; @@ -367,11 +405,11 @@ impl<'t, 'q> Branch<'t, 'q> { let self_rank = self.compute_rank(); let other_rank = other.compute_rank(); let left_cmp = self_rank.cmp(&other_rank).reverse(); - // on level: higher is better, - // we want to reduce highest levels first. 
- let level_cmp = self.tree_level.cmp(&other.tree_level); + // on level: lower is better, + // we want to dig faster into levels on interesting branches. + let level_cmp = self.tree_level.cmp(&other.tree_level).reverse(); - left_cmp.then(level_cmp) + left_cmp.then(level_cmp).then(self.last_result.2.len().cmp(&other.last_result.2.len())) } } @@ -398,6 +436,7 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {} fn initialize_query_level_iterators<'t, 'q>( ctx: &'t dyn Context<'t>, branches: &'q Vec>>, + allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result>> { @@ -418,7 +457,6 @@ fn initialize_query_level_iterators<'t, 'q>( branch_positions.sort_unstable_by_key(|qli| qli.level); let folded_query_level_iterators = branch_positions .into_iter() - .rev() .fold(None, |fold: Option, mut qli| match fold { Some(fold) => { qli.parent(fold); @@ -428,7 +466,8 @@ fn initialize_query_level_iterators<'t, 'q>( }); if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { - let (tree_level, last_result) = folded_query_level_iterators.next()?; + let tree_level = folded_query_level_iterators.level; + let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; if let Some(last_result) = last_result { let branch = Branch { last_result, @@ -451,48 +490,43 @@ fn set_compute_candidates<'t>( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { - let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; + let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; let lowest_level = TreeLevel::min_value(); let mut final_candidates: Option<(u32, RoaringBitmap)> = None; + let mut allowed_candidates = allowed_candidates.clone(); while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; let branch_rank = branch.compute_rank(); - let (_, _, candidates) = &mut branch.last_result; - 
candidates.intersect_with(&allowed_candidates); + // if current is worst than best we break to return + // candidates that correspond to the best rank + if let Some((best_rank, _)) = final_candidates { if branch_rank > best_rank { break; } } + let _left = branch.last_result.0; + let candidates = take(&mut branch.last_result.2); if candidates.is_empty() { // we don't have candidates, get next interval. - if !branch.next()? { PeekMut::pop(branch); } + if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } } else if is_lowest_level { - // we have candidates, but we can't dig deeper, return candidates. + // we have candidates, but we can't dig deeper. + allowed_candidates -= &candidates; final_candidates = match final_candidates.take() { + // we add current candidates to best candidates Some((best_rank, mut best_candidates)) => { - // if current is worst than best we break to return - // candidates that correspond to the best rank - if branch_rank > best_rank { - final_candidates = Some((best_rank, best_candidates)); - break; - // else we add current candidates to best candidates - // and we fetch the next page - } else { - best_candidates.union_with(candidates); - if !branch.next()? { PeekMut::pop(branch); } - Some((best_rank, best_candidates)) - } + best_candidates |= candidates; + branch.lazy_next(); + Some((best_rank, best_candidates)) }, // we take current candidates as best candidates - // and we fetch the next page None => { - let candidates = take(candidates); - if !branch.next()? { PeekMut::pop(branch); } + branch.lazy_next(); Some((branch_rank, candidates)) }, }; } else { // we have candidates, lets dig deeper in levels. - branch.query_level_iterator = branch.query_level_iterator.dig(ctx)?; - if !branch.next()? { PeekMut::pop(branch); } + branch.dig(ctx)?; + if !branch.next(&allowed_candidates)? 
{ PeekMut::pop(branch); } } } From 71740805a7c2f45eff9db63f5ae4e4705352d189 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 21 Apr 2021 11:44:29 +0200 Subject: [PATCH 32/45] Fix forgotten typo tests --- Cargo.lock | 14 ++++++++++++-- milli/Cargo.toml | 2 +- milli/src/search/criteria/typo.rs | 5 +++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 065be362f..0e42f60f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1005,7 +1005,7 @@ dependencies = [ "heed", "jemallocator", "milli", - "roaring", + "roaring 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json", "stderrlog", "structopt", @@ -1287,7 +1287,7 @@ dependencies = [ "rand 0.8.3", "rayon", "regex", - "roaring", + "roaring 0.6.5 (git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops)", "serde", "serde_json", "slice-group-by", @@ -1973,6 +1973,16 @@ dependencies = [ "retain_mut", ] +[[package]] +name = "roaring" +version = "0.6.5" +source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops#6689f8c9dd2efdbfde4442d4d803e87169780593" +dependencies = [ + "bytemuck", + "byteorder", + "retain_mut", +] + [[package]] name = "rustc_version" version = "0.2.3" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ef9c64b7b..b54c0d768 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -27,7 +27,7 @@ once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" regex = "1.4.3" -roaring = "0.6.5" +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "optimize-ops" } serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } slice-group-by = "0.2.6" diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index bf58fa258..5a3c93ac8 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -328,6 +328,7 @@ mod test { let parent = Initial::new(query_tree, facet_candidates); let mut 
criteria = Typo::new(&context, Box::new(parent)); + assert!(criteria.next(&mut wdcache).unwrap().unwrap().candidates.is_none()); assert!(criteria.next(&mut wdcache).unwrap().is_none()); } @@ -440,7 +441,7 @@ mod test { ]), ])), candidates: Some(&candidates_1 & &facet_candidates), - bucket_candidates: candidates_1 & &facet_candidates, + bucket_candidates: facet_candidates.clone(), }; assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); @@ -462,7 +463,7 @@ mod test { ]), ])), candidates: Some(&candidates_2 & &facet_candidates), - bucket_candidates: candidates_2 & &facet_candidates, + bucket_candidates: RoaringBitmap::new(), }; assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); From 0d7d3ce802d4e1ef5226bb90d1bc65f140fdf104 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 21 Apr 2021 11:53:07 +0200 Subject: [PATCH 33/45] Update roaring package --- Cargo.lock | 18 ++++-------------- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e42f60f4..6a30891ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1005,7 +1005,7 @@ dependencies = [ "heed", "jemallocator", "milli", - "roaring 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", + "roaring", "serde_json", "stderrlog", "structopt", @@ -1287,7 +1287,7 @@ dependencies = [ "rand 0.8.3", "rayon", "regex", - "roaring 0.6.5 (git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops)", + "roaring", "serde", "serde_json", "slice-group-by", @@ -1964,19 +1964,9 @@ checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1" [[package]] name = "roaring" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6744a4a918e91359ad1d356a91e2e943a86d9fb9ae77f715d617032ea2af88f" -dependencies = [ - "bytemuck", - "byteorder", - "retain_mut", -] - -[[package]] -name = "roaring" -version = "0.6.5" -source = 
"git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops#6689f8c9dd2efdbfde4442d4d803e87169780593" +checksum = "a4b2e7ab0bbb2d144558ae3f4761a0db06d21463b45756fc64c3393cdba3d447" dependencies = [ "bytemuck", "byteorder", diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 59cfbd661..8b5867fde 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -11,7 +11,7 @@ csv = "1.1.5" heed = "0.10.6" jemallocator = "0.3.2" milli = { path = "../milli" } -roaring = "0.6.5" +roaring = "0.6.6" serde_json = "1.0.62" stderrlog = "0.5.1" structopt = { version = "0.3.21", default-features = false } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b54c0d768..8b359a09b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -27,7 +27,7 @@ once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" regex = "1.4.3" -roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "optimize-ops" } +roaring = "0.6.6" serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } slice-group-by = "0.2.6" From 0daa0e170ac5f81d517aca2384ec4fcb237fe76e Mon Sep 17 00:00:00 2001 From: Many Date: Mon, 26 Apr 2021 11:30:42 +0200 Subject: [PATCH 34/45] Fix PR comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- infos/src/main.rs | 3 +-- milli/src/search/criteria/attribute.rs | 2 +- milli/src/search/criteria/final.rs | 6 +----- milli/src/search/criteria/mod.rs | 10 +++++++++- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 5a12a9d4d..902394af8 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -354,8 +354,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, - documents, - .. 
+ documents } = index; let main_name = "main"; diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 745d8cdb0..18a18816c 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -13,7 +13,7 @@ use super::{Criterion, CriterionResult, Context, resolve_query_tree}; /// To be able to divide integers by the number of words in the query /// we want to find a multiplier that allow us to divide by any number between 1 and 10. -/// We Choosed the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). +/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). const LCM_10_FIRST_NUMBERS: u32 = 2520; pub struct Attribute<'t> { diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index d3c394467..f8bc43204 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -44,11 +44,7 @@ impl<'t> Final<'t> { bucket_candidates.union_with(&candidates); - return Ok(Some(FinalResult { - query_tree, - candidates, - bucket_candidates, - })); + return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); }, None => return Ok(None), } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index b972a0b2c..d3eac94fd 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -123,7 +123,15 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { Ok(words_positions) } - fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>> { + fn word_position_iterator( + &self, + word: &str, + level: TreeLevel, + in_prefix_cache: bool, + left: Option, + right: Option + ) -> heed::Result> + 'c>> + { let range = { let left = left.unwrap_or(u32::min_value()); let right = 
right.unwrap_or(u32::max_value()); From 47d780b8ce43fad2631efb473e25b5ea12992476 Mon Sep 17 00:00:00 2001 From: Many Date: Mon, 26 Apr 2021 14:51:52 +0200 Subject: [PATCH 35/45] Update milli/src/search/criteria/mod.rs Co-authored-by: Irevoire --- milli/src/search/criteria/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d3eac94fd..01af1ffbd 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -130,7 +130,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { in_prefix_cache: bool, left: Option, right: Option - ) -> heed::Result> + 'c>> + ) -> heed::Result> + 'c>> { let range = { let left = left.unwrap_or(u32::min_value()); From 0e4e6dfada834728644df9a5ab5afb8698a6c85f Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:29:52 +0200 Subject: [PATCH 36/45] Update milli/src/search/criteria/proximity.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/proximity.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index ca412bf28..4c73d7459 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -30,7 +30,7 @@ impl<'t> Proximity<'t> { state: None, proximity: 0, bucket_candidates: RoaringBitmap::new(), - parent: parent, + parent, candidates_cache: Cache::new(), plane_sweep_cache: None, } From 498c2b298c795810726c18f8c95ef16ce490c02d Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:30:02 +0200 Subject: [PATCH 37/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 18a18816c..820085c31 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -57,7 +57,7 @@ impl<'t> Criterion for Attribute<'t> { flatten_query_tree(&qt) }); - let found_candidates = if candidates.len() < 1_000 { + let found_candidates = if candidates.len() < 1000 { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { From b3d6c6a9a0e8447daefa92b022c03fe39d5b08e3 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:31:13 +0200 Subject: [PATCH 38/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 820085c31..31725e221 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -77,9 +77,7 @@ impl<'t> Criterion for Attribute<'t> { }, } } else { - let found_candidates = set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)?; - - match found_candidates { + match set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? 
{ Some(candidates) => candidates, None => { return Ok(Some(CriterionResult { From e92d13767667f7ee5c4bfca6fca339d8c26c5e85 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:31:42 +0200 Subject: [PATCH 39/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 31725e221..a1a31247b 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -453,7 +453,7 @@ fn initialize_query_level_iterators<'t, 'q>( } // QueryLevelIterator need to be sorted by level and folded in descending order. branch_positions.sort_unstable_by_key(|qli| qli.level); - let folded_query_level_iterators = branch_positions + let folded_query_level_iterators = branch_positions .into_iter() .fold(None, |fold: Option, mut qli| match fold { Some(fold) => { From c862b1bc6be8c364a474104139f90272ccd1787f Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:32:10 +0200 Subject: [PATCH 40/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index a1a31247b..c7d10e431 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -465,7 +465,7 @@ fn initialize_query_level_iterators<'t, 'q>( if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { let tree_level = folded_query_level_iterators.level; - let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; 
+ let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; if let Some(last_result) = last_result { let branch = Branch { last_result, From 3b1358b62f539ec6f74a74deb40850dbff6ba34f Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:32:19 +0200 Subject: [PATCH 41/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index c7d10e431..8f2e34ca9 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -554,7 +554,7 @@ fn linear_compute_candidates( QueryKind::Exact { word, .. } => { if *prefix { word_derivations(word, true, 0, &words_positions) - .flat_map(|positions| positions.iter().next()).min() + .flat_map(|positions| positions.iter().next()).min() } else { words_positions.get(word) .map(|positions| positions.iter().next()) From 329bd4a1bbe4ddfe408fb33611f7d7d8e6d91661 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:39:03 +0200 Subject: [PATCH 42/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 8f2e34ca9..62e992fad 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -304,9 +304,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { /// and from eventual chainned QueryLevelIterator. 
fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result> { let parent_result = match self.parent.as_mut() { - Some(parent) => { - Some(parent.next(allowed_candidates, tree_level)?) - }, + Some(parent) => Some(parent.next(allowed_candidates, tree_level)?), None => None, }; From 3794ffc9529d89bf6e965ad0c99eb477a5881bc6 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:39:23 +0200 Subject: [PATCH 43/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 62e992fad..3d7132e77 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -496,7 +496,9 @@ fn set_compute_candidates<'t>( let branch_rank = branch.compute_rank(); // if current is worst than best we break to return // candidates that correspond to the best rank - if let Some((best_rank, _)) = final_candidates { if branch_rank > best_rank { break; } } + if let Some((best_rank, _)) = final_candidates { + if branch_rank > best_rank { break } + } let _left = branch.last_result.0; let candidates = take(&mut branch.last_result.2); if candidates.is_empty() { From 0add4d735c95ed8bcddeb2e2afa853bab7dcf62e Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:40:34 +0200 Subject: [PATCH 44/45] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 3d7132e77..e1069b5f5 100644 --- 
a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -529,9 +529,7 @@ fn set_compute_candidates<'t>( } - Ok(final_candidates.map(|(_rank, candidates)| { - candidates - })) + Ok(final_candidates.map(|(_rank, candidates)| candidates)) } fn linear_compute_candidates( From 3b7e6afb55e76749bb69b11e5ab15d488bc8924f Mon Sep 17 00:00:00 2001 From: many Date: Wed, 28 Apr 2021 13:53:27 +0200 Subject: [PATCH 45/45] Make some refacto and add documentation --- milli/src/search/criteria/attribute.rs | 64 ++++++++++++++++++-------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index e1069b5f5..bbbc0de1a 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -16,6 +16,10 @@ use super::{Criterion, CriterionResult, Context, resolve_query_tree}; /// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). const LCM_10_FIRST_NUMBERS: u32 = 2520; +/// To compute the interval size of a level, +/// we use 4 as the exponentiation base and the level as the exponent. +const LEVEL_EXPONENTIATION_BASE: u32 = 4; + pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, @@ -150,7 +154,7 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result> { match ctx.word_position_last_level(&word, in_prefix_cache)? 
{ Some(level) => { - let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level.clone()) as u32); let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) }, @@ -160,7 +164,7 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option) -> heed::Result { let level = level.min(&self.level).clone(); - let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level.clone()) as u32); let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; @@ -280,10 +284,10 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { fn inner_next(&mut self, level: TreeLevel) -> heed::Result> { let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None; let u8_level = Into::::into(level); - let interval_size = 4u32.pow(u8_level as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); for wli in self.inner.iter_mut() { let wli_u8_level = Into::::into(wli.level.clone()); - let accumulated_count = 4u32.pow((u8_level - wli_u8_level) as u32); + let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); for _ in 0..accumulated_count { if let Some((next_left, _, next_docids)) = wli.next()? 
{
+                    accumulated = match accumulated.take(){
@@ -311,20 +315,12 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
         match parent_result {
             Some(parent_next) => {
                 let inner_next = self.inner_next(tree_level)?;
-                self.interval_to_skip += self.accumulator.iter().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip).take_while(|current| {
-                    match current {
-                        (Some((_, _, inner)), Some((_, _, parent))) => {
-                            inner.is_disjoint(allowed_candidates) && parent.is_empty()
-                        },
-                        (Some((_, _, inner)), None) => {
-                            inner.is_disjoint(allowed_candidates)
-                        },
-                        (None, Some((_, _, parent))) => {
-                            parent.is_empty()
-                        },
-                        (None, None) => true,
-                    }
-                }).count();
+                self.interval_to_skip += interval_to_skip(
+                    &self.parent_accumulator,
+                    &self.accumulator,
+                    self.interval_to_skip,
+                    allowed_candidates
+                );
                 self.accumulator.push(inner_next);
                 self.parent_accumulator.push(parent_next);
                 let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None;
@@ -358,6 +354,29 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> {
     }
 }
 
+/// Count the number of intervals that can be skipped when we make the cross-intersections
+/// in order to compute the next meta-interval.
+/// A pair of intervals is skipped when both intervals don't contain any allowed docids.
+fn interval_to_skip(
+    parent_accumulator: &[Option<(u32, u32, RoaringBitmap)>],
+    current_accumulator: &[Option<(u32, u32, RoaringBitmap)>],
+    already_skiped: usize,
+    allowed_candidates: &RoaringBitmap,
+) -> usize {
+    parent_accumulator.into_iter()
+        .zip(current_accumulator.into_iter())
+        .skip(already_skiped)
+        .take_while(|(parent, current)| {
+            let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty());
+            let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates));
+            skip_parent && skip_current
+        })
+        .count()
+
+}
+
+/// A Branch represents a possible alternative of the original query and is built with the Query Tree,
+/// This branch allows us to iterate over meta-intervals of positions and to dig into them if they contain interesting candidates.
 struct Branch<'t, 'q> {
     query_level_iterator: QueryLevelIterator<'t, 'q>,
     last_result: (u32, u32, RoaringBitmap),
@@ -366,6 +385,8 @@ struct Branch<'t, 'q> {
 }
 
 impl<'t, 'q> Branch<'t, 'q> {
+    /// return the next meta-interval of the branch,
+    /// and update inner interval in order to be ranked by the BinaryHeap.
     fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result {
         let tree_level = self.query_level_iterator.level;
         match self.query_level_iterator.next(allowed_candidates, tree_level)? {
@@ -378,19 +399,24 @@ impl<'t, 'q> Branch<'t, 'q> {
         }
     }
 
+    /// make the current Branch iterate over smaller intervals.
     fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> {
         self.query_level_iterator = self.query_level_iterator.dig(ctx)?;
         Ok(())
     }
 
+    /// because the next() method could be time consuming,
+    /// update inner interval in order to be ranked by the binary_heap without computing it,
+    /// the next() method should be called when the real interval is needed.
fn lazy_next(&mut self) { let u8_level = Into::::into(self.tree_level.clone()); - let interval_size = 4u32.pow(u8_level as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); let (left, right, _) = self.last_result; self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); } + /// return the score of the current inner interval. fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. let (left, _, _) = self.last_result;