From 39e0d9fc4adfd6379bd91b183e843d111a89ae61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 2 Oct 2019 17:34:32 +0200 Subject: [PATCH] Introduce a basically working rkv based MeiliDB --- .gitignore | 4 + Cargo.toml | 26 ++ src/automaton/dfa.rs | 51 +++ src/automaton/mod.rs | 202 ++++++++++++ src/automaton/query_enhancer.rs | 398 ++++++++++++++++++++++++ src/criterion/document_id.rs | 16 + src/criterion/exact.rs | 65 ++++ src/criterion/mod.rs | 120 +++++++ src/criterion/number_of_words.rs | 31 ++ src/criterion/sum_of_typos.rs | 116 +++++++ src/criterion/sum_of_words_attribute.rs | 64 ++++ src/criterion/sum_of_words_position.rs | 64 ++++ src/criterion/words_proximity.rs | 155 +++++++++ src/lib.rs | 135 ++++++++ src/main.rs | 52 ++++ src/query_builder.rs | 275 ++++++++++++++++ src/raw_document.rs | 141 +++++++++ src/raw_indexer.rs | 208 +++++++++++++ src/reordered_attrs.rs | 24 ++ src/store/mod.rs | 26 ++ src/store/synonyms.rs | 23 ++ src/store/words.rs | 91 ++++++ 22 files changed, 2287 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 src/automaton/dfa.rs create mode 100644 src/automaton/mod.rs create mode 100644 src/automaton/query_enhancer.rs create mode 100644 src/criterion/document_id.rs create mode 100644 src/criterion/exact.rs create mode 100644 src/criterion/mod.rs create mode 100644 src/criterion/number_of_words.rs create mode 100644 src/criterion/sum_of_typos.rs create mode 100644 src/criterion/sum_of_words_attribute.rs create mode 100644 src/criterion/sum_of_words_position.rs create mode 100644 src/criterion/words_proximity.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/query_builder.rs create mode 100644 src/raw_document.rs create mode 100644 src/raw_indexer.rs create mode 100644 src/reordered_attrs.rs create mode 100644 src/store/mod.rs create mode 100644 src/store/synonyms.rs create mode 100644 src/store/words.rs diff --git a/.gitignore 
b/.gitignore new file mode 100644 index 000000000..6e03cb642 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/target +**/*.rs.bk +Cargo.lock +/*.rkv diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 000000000..c04eb5170 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "new-meilidb" +version = "0.1.0" +authors = ["Clément Renault "] +edition = "2018" + +[dependencies] +byteorder = "1.3.2" +deunicode = "1.0.0" +once_cell = "1.2.0" +rkv = "0.10.2" +sdset = "0.3.2" +slice-group-by = "0.2.6" +zerocopy = "0.2.8" + +meilidb-schema = { path = "../MeiliDB/meilidb-schema", version = "0.1.0" } +meilidb-tokenizer = { path = "../MeiliDB/meilidb-tokenizer", version = "0.1.0" } + +[dependencies.levenshtein_automata] +git = "https://github.com/Kerollmops/levenshtein-automata.git" +branch = "arc-byte-slice" +features = ["fst_automaton"] + +[dependencies.fst] +git = "https://github.com/Kerollmops/fst.git" +branch = "arc-byte-slice" diff --git a/src/automaton/dfa.rs b/src/automaton/dfa.rs new file mode 100644 index 000000000..015fdd877 --- /dev/null +++ b/src/automaton/dfa.rs @@ -0,0 +1,51 @@ +use once_cell::sync::OnceCell; +use levenshtein_automata::{ + LevenshteinAutomatonBuilder as LevBuilder, + DFA, +}; + +static LEVDIST0: OnceCell = OnceCell::new(); +static LEVDIST1: OnceCell = OnceCell::new(); +static LEVDIST2: OnceCell = OnceCell::new(); + +#[derive(Copy, Clone)] +enum PrefixSetting { + Prefix, + NoPrefix, +} + +fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA { + use PrefixSetting::{Prefix, NoPrefix}; + + match query.len() { + 0 ..= 4 => { + let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, false)); + match setting { + Prefix => builder.build_prefix_dfa(query), + NoPrefix => builder.build_dfa(query), + } + }, + 5 ..= 8 => { + let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, false)); + match setting { + Prefix => builder.build_prefix_dfa(query), + NoPrefix => builder.build_dfa(query), + } + }, + _ => { + let 
builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, false)); + match setting { + Prefix => builder.build_prefix_dfa(query), + NoPrefix => builder.build_dfa(query), + } + }, + } +} + +pub fn build_prefix_dfa(query: &str) -> DFA { + build_dfa_with_setting(query, PrefixSetting::Prefix) +} + +pub fn build_dfa(query: &str) -> DFA { + build_dfa_with_setting(query, PrefixSetting::NoPrefix) +} diff --git a/src/automaton/mod.rs b/src/automaton/mod.rs new file mode 100644 index 000000000..8c5c68d98 --- /dev/null +++ b/src/automaton/mod.rs @@ -0,0 +1,202 @@ +mod dfa; +mod query_enhancer; + +use std::cmp::Reverse; +use std::vec; + +use fst::{IntoStreamer, Streamer}; +use levenshtein_automata::DFA; +use meilidb_tokenizer::{split_query_string, is_cjk}; + +use crate::store; + +use self::dfa::{build_dfa, build_prefix_dfa}; +use self::query_enhancer::QueryEnhancerBuilder; +pub use self::query_enhancer::QueryEnhancer; + +const NGRAMS: usize = 3; + +pub struct AutomatonProducer { + automatons: Vec>, +} + +impl AutomatonProducer { + pub fn new( + reader: &rkv::Reader, + query: &str, + synonyms_store: store::Synonyms, + ) -> (AutomatonProducer, QueryEnhancer) + { + let (automatons, query_enhancer) = generate_automatons(reader, query, synonyms_store).unwrap(); + (AutomatonProducer { automatons }, query_enhancer) + } + + pub fn into_iter(self) -> vec::IntoIter> { + self.automatons.into_iter() + } +} + +pub struct Automaton { + pub index: usize, + pub ngram: usize, + pub query_len: usize, + pub is_exact: bool, + pub is_prefix: bool, + pub query: String, +} + +impl Automaton { + pub fn dfa(&self) -> DFA { + if self.is_prefix { + build_prefix_dfa(&self.query) + } else { + build_dfa(&self.query) + } + } + + fn exact(index: usize, ngram: usize, query: &str) -> Automaton { + Automaton { + index, + ngram, + query_len: query.len(), + is_exact: true, + is_prefix: false, + query: query.to_string(), + } + } + + fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton { + Automaton { + 
index, + ngram, + query_len: query.len(), + is_exact: true, + is_prefix: true, + query: query.to_string(), + } + } + + fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton { + Automaton { + index, + ngram, + query_len: query.len(), + is_exact: false, + is_prefix: false, + query: query.to_string(), + } + } +} + +pub fn normalize_str(string: &str) -> String { + let mut string = string.to_lowercase(); + + if !string.contains(is_cjk) { + string = deunicode::deunicode_with_tofu(&string, ""); + } + + string +} + +fn generate_automatons( + reader: &rkv::Reader, + query: &str, + synonym_store: store::Synonyms, +) -> Result<(Vec>, QueryEnhancer), rkv::StoreError> +{ + let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); + let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); + let synonyms = synonym_store.synonyms_fst(reader)?; + + let mut automatons = Vec::new(); + let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words); + + // We must not declare the original words to the query enhancer + // *but* we need to push them in the automatons list first + let mut original_automatons = Vec::new(); + let mut original_words = query_words.iter().peekable(); + while let Some(word) = original_words.next() { + + let has_following_word = original_words.peek().is_some(); + let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); + + let automaton = if not_prefix_dfa { + Automaton::exact(automatons.len(), 1, word) + } else { + Automaton::prefix_exact(automatons.len(), 1, word) + }; + original_automatons.push(automaton); + } + + automatons.push(original_automatons); + + for n in 1..=NGRAMS { + let mut ngrams = query_words.windows(n).enumerate().peekable(); + while let Some((query_index, ngram_slice)) = ngrams.next() { + + let query_range = query_index..query_index + n; + let ngram_nb_words = ngram_slice.len(); + let ngram = ngram_slice.join(" "); + + let 
has_following_word = ngrams.peek().is_some(); + let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); + + // automaton of synonyms of the ngrams + let normalized = normalize_str(&ngram); + let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }; + + let mut stream = synonyms.search(&lev).into_stream(); + while let Some(base) = stream.next() { + + // only trigger alternatives when the last word has been typed + // i.e. "new " do not but "new yo" triggers alternatives to "new york" + let base = std::str::from_utf8(base).unwrap(); + let base_nb_words = split_query_string(base).count(); + if ngram_nb_words != base_nb_words { continue } + + if let Some(synonyms) = synonym_store.alternatives_to(reader, base.as_bytes())? { + + let mut stream = synonyms.into_stream(); + while let Some(synonyms) = stream.next() { + let synonyms = std::str::from_utf8(synonyms).unwrap(); + let synonyms_words: Vec<_> = split_query_string(synonyms).collect(); + let nb_synonym_words = synonyms_words.len(); + + let real_query_index = automatons.len(); + enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words); + + for synonym in synonyms_words { + let automaton = if nb_synonym_words == 1 { + Automaton::exact(automatons.len(), n, synonym) + } else { + Automaton::non_exact(automatons.len(), n, synonym) + }; + automatons.push(vec![automaton]); + } + } + } + } + + if n != 1 { + // automaton of concatenation of query words + let concat = ngram_slice.concat(); + let normalized = normalize_str(&concat); + + let real_query_index = automatons.len(); + enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); + + let automaton = Automaton::exact(automatons.len(), n, &normalized); + automatons.push(vec![automaton]); + } + } + } + + // order automatons, the most important first, + // we keep the original automatons at the front. 
+ automatons[1..].sort_unstable_by_key(|a| { + let a = a.first().unwrap(); + (Reverse(a.is_exact), Reverse(a.ngram)) + }); + + Ok((automatons, enhancer_builder.build())) +} diff --git a/src/automaton/query_enhancer.rs b/src/automaton/query_enhancer.rs new file mode 100644 index 000000000..165c1b094 --- /dev/null +++ b/src/automaton/query_enhancer.rs @@ -0,0 +1,398 @@ +use std::ops::Range; +use std::cmp::Ordering::{Less, Greater, Equal}; + +/// Return `true` if the specified range can accept the given replacements words. +/// Returns `false` if the replacements words are already present in the original query +/// or if there is fewer replacement words than the range to replace. +// +// +// ## Ignored because already present in original +// +// new york city subway +// -------- ^^^^ +// / \ +// [new york city] +// +// +// ## Ignored because smaller than the original +// +// new york city subway +// ------------- +// \ / +// [new york] +// +// +// ## Accepted because bigger than the original +// +// NYC subway +// --- +// / \ +// / \ +// / \ +// / \ +// / \ +// [new york city] +// +fn rewrite_range_with(query: &[S], range: Range, words: &[T]) -> bool +where S: AsRef, + T: AsRef, +{ + if words.len() <= range.len() { + // there is fewer or equal replacement words + // than there is already in the replaced range + return false + } + + // retrieve the part to rewrite but with the length + // of the replacement part + let original = query.iter().skip(range.start).take(words.len()); + + // check if the original query doesn't already contain + // the replacement words + !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref)) +} + +type Origin = usize; +type RealLength = usize; + +struct FakeIntervalTree { + intervals: Vec<(Range, (Origin, RealLength))>, +} + +impl FakeIntervalTree { + fn new(mut intervals: Vec<(Range, (Origin, RealLength))>) -> FakeIntervalTree { + intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end)); + FakeIntervalTree { intervals } + } + + fn 
query(&self, point: usize) -> Option<(Range, (Origin, RealLength))> { + let element = self.intervals.binary_search_by(|(r, _)| { + if point >= r.start { + if point < r.end { Equal } else { Less } + } else { Greater } + }); + + let n = match element { Ok(n) => n, Err(n) => n }; + + match self.intervals.get(n) { + Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)), + _otherwise => None, + } + } +} + +pub struct QueryEnhancerBuilder<'a, S> { + query: &'a [S], + origins: Vec, + real_to_origin: Vec<(Range, (Origin, RealLength))>, +} + +impl> QueryEnhancerBuilder<'_, S> { + pub fn new(query: &[S]) -> QueryEnhancerBuilder { + // we initialize origins query indices based on their positions + let origins: Vec<_> = (0..query.len() + 1).collect(); + let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect(); + + QueryEnhancerBuilder { query, origins, real_to_origin } + } + + /// Update the final real to origin query indices mapping. + /// + /// `range` is the original words range that this `replacement` words replace + /// and `real` is the first real query index of these replacement words. + pub fn declare(&mut self, range: Range, real: usize, replacement: &[T]) + where T: AsRef, + { + // check if the range of original words + // can be rewritten with the replacement words + if rewrite_range_with(self.query, range.clone(), replacement) { + + // this range can be replaced so we need to + // modify the origins accordingly + let offset = replacement.len() - range.len(); + + let previous_padding = self.origins[range.end - 1]; + let current_offset = (self.origins[range.end] - 1) - previous_padding; + let diff = offset.saturating_sub(current_offset); + self.origins[range.end] += diff; + + for r in &mut self.origins[range.end + 1..] 
{ + *r += diff; + } + } + + // we need to store the real number and origins relations + // this way it will be possible to know by how many + // we need to pad real query indices + let real_range = real..real + replacement.len().max(range.len()); + let real_length = replacement.len(); + self.real_to_origin.push((real_range, (range.start, real_length))); + } + + pub fn build(self) -> QueryEnhancer { + QueryEnhancer { + origins: self.origins, + real_to_origin: FakeIntervalTree::new(self.real_to_origin), + } + } +} + +pub struct QueryEnhancer { + origins: Vec, + real_to_origin: FakeIntervalTree, +} + +impl QueryEnhancer { + /// Returns the query indices to use to replace this real query index. + pub fn replacement(&self, real: u32) -> Range { + let real = real as usize; + + // query the fake interval tree with the real query index + let (range, (origin, real_length)) = + self.real_to_origin + .query(real) + .expect("real has never been declared"); + + // if `real` is the end bound of the range + if (range.start + real_length - 1) == real { + let mut count = range.len(); + let mut new_origin = origin; + for (i, slice) in self.origins[new_origin..].windows(2).enumerate() { + let len = slice[1] - slice[0]; + count = count.saturating_sub(len); + if count == 0 { new_origin = origin + i; break } + } + + let n = real - range.start; + let start = self.origins[origin]; + let end = self.origins[new_origin + 1]; + let remaining = (end - start) - n; + + Range { start: (start + n) as u32, end: (start + n + remaining) as u32 } + + } else { + // just return the origin along with + // the real position of the word + let n = real as usize - range.start; + let origin = self.origins[origin]; + + Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn original_unmodified() { + let query = ["new", "york", "city", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // new york 
= new york city + builder.declare(0..2, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // new + assert_eq!(enhancer.replacement(1), 1..2); // york + assert_eq!(enhancer.replacement(2), 2..3); // city + assert_eq!(enhancer.replacement(3), 3..4); // subway + assert_eq!(enhancer.replacement(4), 0..1); // new + assert_eq!(enhancer.replacement(5), 1..2); // york + assert_eq!(enhancer.replacement(6), 2..3); // city + } + + #[test] + fn simple_growing() { + let query = ["new", "york", "subway"]; + // 0 1 2 + let mut builder = QueryEnhancerBuilder::new(&query); + + // new york = new york city + builder.declare(0..2, 3, &["new", "york", "city"]); + // ^ 3 4 5 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // new + assert_eq!(enhancer.replacement(1), 1..3); // york + assert_eq!(enhancer.replacement(2), 3..4); // subway + assert_eq!(enhancer.replacement(3), 0..1); // new + assert_eq!(enhancer.replacement(4), 1..2); // york + assert_eq!(enhancer.replacement(5), 2..3); // city + } + + #[test] + fn same_place_growings() { + let query = ["NY", "subway"]; + // 0 1 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NY = new york + builder.declare(0..1, 2, &["new", "york"]); + // ^ 2 3 + + // NY = new york city + builder.declare(0..1, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + // NY = NYC + builder.declare(0..1, 7, &["NYC"]); + // ^ 7 + + // NY = new york city + builder.declare(0..1, 8, &["new", "york", "city"]); + // ^ 8 9 10 + + // subway = underground train + builder.declare(1..2, 11, &["underground", "train"]); + // ^ 11 12 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..3); // NY + assert_eq!(enhancer.replacement(1), 3..5); // subway + assert_eq!(enhancer.replacement(2), 0..1); // new + assert_eq!(enhancer.replacement(3), 1..3); // york + assert_eq!(enhancer.replacement(4), 0..1); // new + 
assert_eq!(enhancer.replacement(5), 1..2); // york + assert_eq!(enhancer.replacement(6), 2..3); // city + assert_eq!(enhancer.replacement(7), 0..3); // NYC + assert_eq!(enhancer.replacement(8), 0..1); // new + assert_eq!(enhancer.replacement(9), 1..2); // york + assert_eq!(enhancer.replacement(10), 2..3); // city + assert_eq!(enhancer.replacement(11), 3..4); // underground + assert_eq!(enhancer.replacement(12), 4..5); // train + } + + #[test] + fn bigger_growing() { + let query = ["NYC", "subway"]; + // 0 1 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(0..1, 2, &["new", "york", "city"]); + // ^ 2 3 4 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..3); // NYC + assert_eq!(enhancer.replacement(1), 3..4); // subway + assert_eq!(enhancer.replacement(2), 0..1); // new + assert_eq!(enhancer.replacement(3), 1..2); // york + assert_eq!(enhancer.replacement(4), 2..3); // city + } + + #[test] + fn middle_query_growing() { + let query = ["great", "awesome", "NYC", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(2..3, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // great + assert_eq!(enhancer.replacement(1), 1..2); // awesome + assert_eq!(enhancer.replacement(2), 2..5); // NYC + assert_eq!(enhancer.replacement(3), 5..6); // subway + assert_eq!(enhancer.replacement(4), 2..3); // new + assert_eq!(enhancer.replacement(5), 3..4); // york + assert_eq!(enhancer.replacement(6), 4..5); // city + } + + #[test] + fn end_query_growing() { + let query = ["NYC", "subway"]; + // 0 1 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(1..2, 2, &["underground", "train"]); + // ^ 2 3 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // NYC + assert_eq!(enhancer.replacement(1), 
1..3); // subway + assert_eq!(enhancer.replacement(2), 1..2); // underground + assert_eq!(enhancer.replacement(3), 2..3); // train + } + + #[test] + fn multiple_growings() { + let query = ["great", "awesome", "NYC", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(2..3, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + // subway = underground train + builder.declare(3..4, 7, &["underground", "train"]); + // ^ 7 8 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // great + assert_eq!(enhancer.replacement(1), 1..2); // awesome + assert_eq!(enhancer.replacement(2), 2..5); // NYC + assert_eq!(enhancer.replacement(3), 5..7); // subway + assert_eq!(enhancer.replacement(4), 2..3); // new + assert_eq!(enhancer.replacement(5), 3..4); // york + assert_eq!(enhancer.replacement(6), 4..5); // city + assert_eq!(enhancer.replacement(7), 5..6); // underground + assert_eq!(enhancer.replacement(8), 6..7); // train + } + + #[test] + fn multiple_probable_growings() { + let query = ["great", "awesome", "NYC", "subway"]; + // 0 1 2 3 + let mut builder = QueryEnhancerBuilder::new(&query); + + // NYC = new york city + builder.declare(2..3, 4, &["new", "york", "city"]); + // ^ 4 5 6 + + // subway = underground train + builder.declare(3..4, 7, &["underground", "train"]); + // ^ 7 8 + + // great awesome = good + builder.declare(0..2, 9, &["good"]); + // ^ 9 + + // awesome NYC = NY + builder.declare(1..3, 10, &["NY"]); + // ^^ 10 + + // NYC subway = metro + builder.declare(2..4, 11, &["metro"]); + // ^^ 11 + + let enhancer = builder.build(); + + assert_eq!(enhancer.replacement(0), 0..1); // great + assert_eq!(enhancer.replacement(1), 1..2); // awesome + assert_eq!(enhancer.replacement(2), 2..5); // NYC + assert_eq!(enhancer.replacement(3), 5..7); // subway + assert_eq!(enhancer.replacement(4), 2..3); // new + assert_eq!(enhancer.replacement(5), 3..4); // york + 
assert_eq!(enhancer.replacement(6), 4..5); // city + assert_eq!(enhancer.replacement(7), 5..6); // underground + assert_eq!(enhancer.replacement(8), 6..7); // train + assert_eq!(enhancer.replacement(9), 0..2); // good + assert_eq!(enhancer.replacement(10), 1..5); // NY + assert_eq!(enhancer.replacement(11), 2..5); // metro + } +} diff --git a/src/criterion/document_id.rs b/src/criterion/document_id.rs new file mode 100644 index 000000000..34d0bd7f5 --- /dev/null +++ b/src/criterion/document_id.rs @@ -0,0 +1,16 @@ +use std::cmp::Ordering; +use crate::criterion::Criterion; +use crate::RawDocument; + +#[derive(Debug, Clone, Copy)] +pub struct DocumentId; + +impl Criterion for DocumentId { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + lhs.id.cmp(&rhs.id) + } + + fn name(&self) -> &'static str { + "DocumentId" + } +} diff --git a/src/criterion/exact.rs b/src/criterion/exact.rs new file mode 100644 index 000000000..bde3ca733 --- /dev/null +++ b/src/criterion/exact.rs @@ -0,0 +1,65 @@ +use std::cmp::Ordering; +use slice_group_by::GroupBy; +use crate::criterion::Criterion; +use crate::RawDocument; + +#[inline] +fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { + let mut count = 0; + let mut index = 0; + + for group in query_index.linear_group() { + let len = group.len(); + count += is_exact[index..index + len].contains(&true) as usize; + index += len; + } + + count +} + +#[derive(Debug, Clone, Copy)] +pub struct Exact; + +impl Criterion for Exact { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let is_exact = lhs.is_exact(); + number_exact_matches(query_index, is_exact) + }; + + let rhs = { + let query_index = rhs.query_index(); + let is_exact = rhs.is_exact(); + number_exact_matches(query_index, is_exact) + }; + + lhs.cmp(&rhs).reverse() + } + + fn name(&self) -> &'static str { + "Exact" + } +} + +#[cfg(test)] +mod tests { + use super::*; 
+ + // typing: "soulier" + // + // doc0: "Soulier bleu" + // doc1: "souliereres rouge" + #[test] + fn easy_case() { + let query_index0 = &[0]; + let is_exact0 = &[true]; + + let query_index1 = &[0]; + let is_exact1 = &[false]; + + let doc0 = number_exact_matches(query_index0, is_exact0); + let doc1 = number_exact_matches(query_index1, is_exact1); + assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + } +} diff --git a/src/criterion/mod.rs b/src/criterion/mod.rs new file mode 100644 index 000000000..6ce42007c --- /dev/null +++ b/src/criterion/mod.rs @@ -0,0 +1,120 @@ +mod sum_of_typos; +mod number_of_words; +mod words_proximity; +mod sum_of_words_attribute; +mod sum_of_words_position; +mod exact; +mod document_id; + +use std::cmp::Ordering; +use crate::RawDocument; + +pub use self::{ + sum_of_typos::SumOfTypos, + number_of_words::NumberOfWords, + words_proximity::WordsProximity, + sum_of_words_attribute::SumOfWordsAttribute, + sum_of_words_position::SumOfWordsPosition, + exact::Exact, + document_id::DocumentId, +}; + +pub trait Criterion: Send + Sync { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; + + fn name(&self) -> &'static str; + + #[inline] + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + self.evaluate(lhs, rhs) == Ordering::Equal + } +} + +impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + (**self).evaluate(lhs, rhs) + } + + fn name(&self) -> &'static str { + (**self).name() + } + + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + (**self).eq(lhs, rhs) + } +} + +impl Criterion for Box { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + (**self).evaluate(lhs, rhs) + } + + fn name(&self) -> &'static str { + (**self).name() + } + + fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { + (**self).eq(lhs, rhs) + } +} + +#[derive(Default)] +pub struct CriteriaBuilder<'a> { + inner: 
Vec> +} + +impl<'a> CriteriaBuilder<'a> +{ + pub fn new() -> CriteriaBuilder<'a> { + CriteriaBuilder { inner: Vec::new() } + } + + pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> { + CriteriaBuilder { inner: Vec::with_capacity(capacity) } + } + + pub fn reserve(&mut self, additional: usize) { + self.inner.reserve(additional) + } + + pub fn add(mut self, criterion: C) -> CriteriaBuilder<'a> + where C: Criterion, + { + self.push(criterion); + self + } + + pub fn push(&mut self, criterion: C) + where C: Criterion, + { + self.inner.push(Box::new(criterion)); + } + + pub fn build(self) -> Criteria<'a> { + Criteria { inner: self.inner } + } +} + +pub struct Criteria<'a> { + inner: Vec>, +} + +impl<'a> Default for Criteria<'a> { + fn default() -> Self { + CriteriaBuilder::with_capacity(7) + .add(SumOfTypos) + .add(NumberOfWords) + .add(WordsProximity) + .add(SumOfWordsAttribute) + .add(SumOfWordsPosition) + .add(Exact) + .add(DocumentId) + .build() + } +} + +impl<'a> AsRef<[Box]> for Criteria<'a> { + fn as_ref(&self) -> &[Box] { + &self.inner + } +} diff --git a/src/criterion/number_of_words.rs b/src/criterion/number_of_words.rs new file mode 100644 index 000000000..43095a066 --- /dev/null +++ b/src/criterion/number_of_words.rs @@ -0,0 +1,31 @@ +use std::cmp::Ordering; +use slice_group_by::GroupBy; +use crate::criterion::Criterion; +use crate::RawDocument; + +#[inline] +fn number_of_query_words(query_index: &[u32]) -> usize { + query_index.linear_group().count() +} + +#[derive(Debug, Clone, Copy)] +pub struct NumberOfWords; + +impl Criterion for NumberOfWords { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + number_of_query_words(query_index) + }; + let rhs = { + let query_index = rhs.query_index(); + number_of_query_words(query_index) + }; + + lhs.cmp(&rhs).reverse() + } + + fn name(&self) -> &'static str { + "NumberOfWords" + } +} diff --git a/src/criterion/sum_of_typos.rs 
b/src/criterion/sum_of_typos.rs new file mode 100644 index 000000000..6736e6caa --- /dev/null +++ b/src/criterion/sum_of_typos.rs @@ -0,0 +1,116 @@ +use std::cmp::Ordering; + +use slice_group_by::GroupBy; + +use crate::criterion::Criterion; +use crate::RawDocument; + +// This function is a wrong logarithmic 10 function. +// It is safe to panic on input number higher than 3, +// the number of typos is never bigger than that. +#[inline] +fn custom_log10(n: u8) -> f32 { + match n { + 0 => 0.0, // log(1) + 1 => 0.30102, // log(2) + 2 => 0.47712, // log(3) + 3 => 0.60205, // log(4) + _ => panic!("invalid number"), + } +} + +#[inline] +fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize { + let mut number_words: usize = 0; + let mut sum_typos = 0.0; + let mut index = 0; + + for group in query_index.linear_group() { + sum_typos += custom_log10(distance[index]); + number_words += 1; + index += group.len(); + } + + (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize +} + +#[derive(Debug, Clone, Copy)] +pub struct SumOfTypos; + +impl Criterion for SumOfTypos { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let distance = lhs.distance(); + sum_matches_typos(query_index, distance) + }; + + let rhs = { + let query_index = rhs.query_index(); + let distance = rhs.distance(); + sum_matches_typos(query_index, distance) + }; + + lhs.cmp(&rhs).reverse() + } + + fn name(&self) -> &'static str { + "SumOfTypos" + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // typing: "Geox CEO" + // + // doc0: "Geox SpA: CEO and Executive" + // doc1: "Mt. 
Gox CEO Resigns From Bitcoin Foundation" + #[test] + fn one_typo_reference() { + let query_index0 = &[0, 1]; + let distance0 = &[0, 0]; + + let query_index1 = &[0, 1]; + let distance1 = &[1, 0]; + + let doc0 = sum_matches_typos(query_index0, distance0); + let doc1 = sum_matches_typos(query_index1, distance1); + assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + } + + // typing: "bouton manchette" + // + // doc0: "bouton manchette" + // doc1: "bouton" + #[test] + fn no_typo() { + let query_index0 = &[0, 1]; + let distance0 = &[0, 0]; + + let query_index1 = &[0]; + let distance1 = &[0]; + + let doc0 = sum_matches_typos(query_index0, distance0); + let doc1 = sum_matches_typos(query_index1, distance1); + assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + } + + // typing: "bouton manchztte" + // + // doc0: "bouton manchette" + // doc1: "bouton" + #[test] + fn one_typo() { + let query_index0 = &[0, 1]; + let distance0 = &[0, 1]; + + let query_index1 = &[0]; + let distance1 = &[0]; + + let doc0 = sum_matches_typos(query_index0, distance0); + let doc1 = sum_matches_typos(query_index1, distance1); + assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + } +} diff --git a/src/criterion/sum_of_words_attribute.rs b/src/criterion/sum_of_words_attribute.rs new file mode 100644 index 000000000..d5787ef3a --- /dev/null +++ b/src/criterion/sum_of_words_attribute.rs @@ -0,0 +1,64 @@ +use std::cmp::Ordering; +use slice_group_by::GroupBy; +use crate::criterion::Criterion; +use crate::RawDocument; + +#[inline] +fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { + let mut sum_attributes = 0; + let mut index = 0; + + for group in query_index.linear_group() { + sum_attributes += attribute[index] as usize; + index += group.len(); + } + + sum_attributes +} + +#[derive(Debug, Clone, Copy)] +pub struct SumOfWordsAttribute; + +impl Criterion for SumOfWordsAttribute { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + 
let query_index = lhs.query_index(); + let attribute = lhs.attribute(); + sum_matches_attributes(query_index, attribute) + }; + + let rhs = { + let query_index = rhs.query_index(); + let attribute = rhs.attribute(); + sum_matches_attributes(query_index, attribute) + }; + + lhs.cmp(&rhs) + } + + fn name(&self) -> &'static str { + "SumOfWordsAttribute" + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // typing: "soulier" + // + // doc0: { 0. "Soulier bleu", 1. "bla bla bla" } + // doc1: { 0. "Botte rouge", 1. "Soulier en cuir" } + #[test] + fn title_vs_description() { + let query_index0 = &[0]; + let attribute0 = &[0]; + + let query_index1 = &[0]; + let attribute1 = &[1]; + + let doc0 = sum_matches_attributes(query_index0, attribute0); + let doc1 = sum_matches_attributes(query_index1, attribute1); + assert_eq!(doc0.cmp(&doc1), Ordering::Less); + } +} diff --git a/src/criterion/sum_of_words_position.rs b/src/criterion/sum_of_words_position.rs new file mode 100644 index 000000000..13f26774c --- /dev/null +++ b/src/criterion/sum_of_words_position.rs @@ -0,0 +1,64 @@ +use std::cmp::Ordering; +use slice_group_by::GroupBy; +use crate::criterion::Criterion; +use crate::RawDocument; + +#[inline] +fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { + let mut sum_word_index = 0; + let mut index = 0; + + for group in query_index.linear_group() { + sum_word_index += word_index[index] as usize; + index += group.len(); + } + + sum_word_index +} + +#[derive(Debug, Clone, Copy)] +pub struct SumOfWordsPosition; + +impl Criterion for SumOfWordsPosition { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let word_index = lhs.word_index(); + sum_matches_attribute_index(query_index, word_index) + }; + + let rhs = { + let query_index = rhs.query_index(); + let word_index = rhs.word_index(); + sum_matches_attribute_index(query_index, word_index) + }; + + lhs.cmp(&rhs) + } + + 
const MAX_DISTANCE: u16 = 8;

/// Clones both members of a tuple of references.
///
/// NOTE(review): the generic parameters were lost in this extraction;
/// reconstructed as `T: Clone, U: Clone` from the `.clone()` calls below.
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
    (a.clone(), b.clone())
}

/// Proximity between two word positions inside the same attribute.
///
/// A forward distance (`lhs < rhs`) is cheaper than the same backward
/// distance, which is penalized by one; a backward distance can therefore
/// reach `MAX_DISTANCE + 1`.
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
    if lhs < rhs {
        cmp::min(rhs - lhs, MAX_DISTANCE)
    } else {
        cmp::min(lhs - rhs, MAX_DISTANCE) + 1
    }
}

/// Proximity between two `(attribute, word_index)` pairs; words living in
/// different attributes are considered as far apart as possible.
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
    if lattr != rattr { return MAX_DISTANCE }
    index_proximity(lwi, rwi)
}

/// Smallest proximity over the cartesian product of the positions of the
/// left word and the positions of the right word.
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
    let mut min_prox = u16::max_value();

    for a in lattr.iter().zip(lwi) {
        for b in rattr.iter().zip(rwi) {
            let a = clone_tuple(a);
            let b = clone_tuple(b);
            min_prox = cmp::min(min_prox, attribute_proximity(a, b));
        }
    }

    min_prox
}
= distance[index..index + group_len].linear_group().next().unwrap().len(); + + let rattr = &attribute[index..index + len]; + let rwi = &word_index[index..index + len]; + + (rattr, rwi) + }; + + let mut last = query_index_groups.next().map(|group| { + let attr_wi = get_attr_wi(index, group.len()); + index += group.len(); + attr_wi + }); + + // iter by windows of size 2 + while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) { + let attr_wi = get_attr_wi(index, rhs.len()); + proximity += min_proximity(lhs, attr_wi); + last = Some(attr_wi); + index += rhs.len(); + } + + proximity +} + +#[derive(Debug, Clone, Copy)] +pub struct WordsProximity; + +impl Criterion for WordsProximity { + fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { + let lhs = { + let query_index = lhs.query_index(); + let distance = lhs.distance(); + let attribute = lhs.attribute(); + let word_index = lhs.word_index(); + matches_proximity(query_index, distance, attribute, word_index) + }; + + let rhs = { + let query_index = rhs.query_index(); + let distance = rhs.distance(); + let attribute = rhs.attribute(); + let word_index = rhs.word_index(); + matches_proximity(query_index, distance, attribute, word_index) + }; + + lhs.cmp(&rhs) + } + + fn name(&self) -> &'static str { + "WordsProximity" + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn three_different_attributes() { + + // "soup" "of the" "the day" + // + // { id: 0, attr: 0, attr_index: 0 } + // { id: 1, attr: 1, attr_index: 0 } + // { id: 2, attr: 1, attr_index: 1 } + // { id: 2, attr: 2, attr_index: 0 } + // { id: 3, attr: 3, attr_index: 1 } + + let query_index = &[0, 1, 2, 2, 3]; + let distance = &[0, 0, 0, 0, 0]; + let attribute = &[0, 1, 1, 2, 3]; + let word_index = &[0, 0, 1, 0, 1]; + + // soup -> of = 8 + // + of -> the = 1 + // + the -> day = 8 (not 1) + assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17); + } + + #[test] + fn two_different_attributes() { 
/// Represents an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
///
/// The `AsBytes`/`FromBytes` derives together with `#[repr(C)]` allow this
/// identifier to be written to and read from the key-value store as raw bytes.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentId(pub i64);
+ pub attribute: u16, + pub word_index: u16, + + /// The position in bytes where the word was found + /// along with the length of it. + /// + /// It informs on the original word area in the text indexed + /// without needing to run the tokenizer again. + pub char_index: u16, + pub char_length: u16, +} + +/// This structure represent a matching word with informations +/// on the location of the word in the document. +/// +/// The order of the field is important because it defines +/// the way these structures are ordered between themselves. +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Highlight { + /// The attribute in the document where the word was found + /// along with the index in it. + pub attribute: u16, + + /// The position in bytes where the word was found. + /// + /// It informs on the original word area in the text indexed + /// without needing to run the tokenizer again. + pub char_index: u16, + + /// The length in bytes of the found word. + /// + /// It informs on the original word area in the text indexed + /// without needing to run the tokenizer again. 
+ pub char_length: u16, +} + +#[doc(hidden)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct TmpMatch { + pub query_index: u32, + pub distance: u8, + pub attribute: u16, + pub word_index: u16, + pub is_exact: bool, +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Document { + pub id: DocumentId, + pub highlights: Vec, + + #[cfg(test)] + pub matches: Vec, +} + +impl Document { + #[cfg(not(test))] + fn from_raw(raw: RawDocument) -> Document { + Document { id: raw.id, highlights: raw.highlights } + } + + #[cfg(test)] + fn from_raw(raw: RawDocument) -> Document { + let len = raw.query_index().len(); + let mut matches = Vec::with_capacity(len); + + let query_index = raw.query_index(); + let distance = raw.distance(); + let attribute = raw.attribute(); + let word_index = raw.word_index(); + let is_exact = raw.is_exact(); + + for i in 0..len { + let match_ = TmpMatch { + query_index: query_index[i], + distance: distance[i], + attribute: attribute[i], + word_index: word_index[i], + is_exact: is_exact[i], + }; + matches.push(match_); + } + + Document { id: raw.id, matches, highlights: raw.highlights } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::mem; + + #[test] + fn docindex_mem_size() { + assert_eq!(mem::size_of::(), 16); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 000000000..65cfce3ca --- /dev/null +++ b/src/main.rs @@ -0,0 +1,52 @@ +use rkv::{Manager, Rkv, SingleStore, Value, StoreOptions}; +use std::{fs, path::Path}; + +use meilidb_schema::SchemaAttr; +use new_meilidb::{store, QueryBuilder, DocumentId}; +use new_meilidb::raw_indexer::{RawIndexer, Indexed}; + +fn main() { + let path = Path::new("test.rkv"); + fs::create_dir_all(path).unwrap(); + + // The Manager enforces that each process opens the same environment + // at most once by caching a handle to each environment that it opens. 
+ // Use it to retrieve the handle to an opened environment—or create one + // if it hasn't already been opened: + let created_arc = Manager::singleton().write().unwrap().get_or_create(path, Rkv::new).unwrap(); + let env = created_arc.read().unwrap(); + + let (words, synonyms) = store::create(&env, "test").unwrap(); + + { + let mut writer = env.write().unwrap(); + let mut raw_indexer = RawIndexer::new(); + + let docid = DocumentId(0); + let attr = SchemaAttr(0); + let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !"; + raw_indexer.index_text(docid, attr, text); + + let Indexed { words_doc_indexes, .. } = raw_indexer.build(); + + let mut fst_builder = fst::SetBuilder::memory(); + fst_builder.extend_iter(words_doc_indexes.keys()); + let bytes = fst_builder.into_inner().unwrap(); + let fst = fst::raw::Fst::from_bytes(bytes).unwrap(); + let fst = fst::Set::from(fst); + + words.put_words_fst(&mut writer, &fst).unwrap(); + + for (word, indexes) in words_doc_indexes { + words.put_words_indexes(&mut writer, &word, &indexes).unwrap(); + } + + writer.commit().unwrap(); + } + + let reader = env.read().unwrap(); + let builder = QueryBuilder::new(words, synonyms); + let documents = builder.query(&reader, "oubli", 0..20).unwrap(); + + println!("{:?}", documents); +} diff --git a/src/query_builder.rs b/src/query_builder.rs new file mode 100644 index 000000000..9cb91b755 --- /dev/null +++ b/src/query_builder.rs @@ -0,0 +1,275 @@ +use std::time::{Instant, Duration}; +use std::ops::Range; +use std::{cmp, mem}; + +use fst::{IntoStreamer, Streamer}; +use sdset::SetBuf; +use slice_group_by::{GroupBy, GroupByMut}; + +use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer}; +use crate::raw_document::{RawDocument, raw_documents_from}; +use crate::{Document, DocumentId, Highlight, TmpMatch, criterion::Criteria}; +use crate::{store, reordered_attrs::ReorderedAttrs}; + +pub struct Automatons { + // TODO better use Vec of SmallVec + automatons: Vec>, +} + +pub struct 
QueryBuilder<'a> { + criteria: Criteria<'a>, + searchables_attrs: Option, + timeout: Duration, + words_store: store::Words, + synonyms_store: store::Synonyms, +} + +fn multiword_rewrite_matches( + mut matches: Vec<(DocumentId, TmpMatch)>, + query_enhancer: &QueryEnhancer, +) -> SetBuf<(DocumentId, TmpMatch)> +{ + let mut padded_matches = Vec::with_capacity(matches.len()); + + // we sort the matches by word index to make them rewritable + matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index)); + + let start = Instant::now(); + // for each attribute of each document + for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) { + + // padding will only be applied + // to word indices in the same attribute + let mut padding = 0; + let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index); + + // for each match at the same position + // in this document attribute + while let Some(same_word_index) = iter.next() { + + // find the biggest padding + let mut biggest = 0; + for (id, match_) in same_word_index { + + let mut replacement = query_enhancer.replacement(match_.query_index); + let replacement_len = replacement.len(); + let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index); + + if let Some(query_index) = replacement.next() { + let word_index = match_.word_index + padding as u16; + let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + padded_matches.push((*id, match_)); + } + + let mut found = false; + + // look ahead and if there already is a match + // corresponding to this padding word, abort the padding + 'padding: for (x, next_group) in nexts.enumerate() { + + for (i, query_index) in replacement.clone().enumerate().skip(x) { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let padmatch = TmpMatch { query_index, word_index, ..match_.clone() }; + + for (_, nmatch_) in next_group { + let mut rep = 
query_enhancer.replacement(nmatch_.query_index); + let query_index = rep.next().unwrap(); + if query_index == padmatch.query_index { + + if !found { + // if we find a corresponding padding for the + // first time we must push preceding paddings + for (i, query_index) in replacement.clone().enumerate().take(i) { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + padded_matches.push((*id, match_)); + biggest = biggest.max(i + 1); + } + } + + padded_matches.push((*id, padmatch)); + found = true; + continue 'padding; + } + } + } + + // if we do not find a corresponding padding in the + // next groups so stop here and pad what was found + break + } + + if !found { + // if no padding was found in the following matches + // we must insert the entire padding + for (i, query_index) in replacement.enumerate() { + let word_index = match_.word_index + padding as u16 + (i + 1) as u16; + let match_ = TmpMatch { query_index, word_index, ..match_.clone() }; + padded_matches.push((*id, match_)); + } + + biggest = biggest.max(replacement_len - 1); + } + } + + padding += biggest; + } + } + + for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) { + document_matches.sort_unstable(); + } + + SetBuf::new_unchecked(padded_matches) +} + +fn fetch_raw_documents( + reader: &rkv::Reader, + automatons: &[Automaton], + query_enhancer: &QueryEnhancer, + searchables: Option<&ReorderedAttrs>, + words_store: &store::Words, +) -> Result, rkv::StoreError> +{ + let mut matches = Vec::new(); + let mut highlights = Vec::new(); + + for automaton in automatons { + let Automaton { index, is_exact, query_len, .. 
} = automaton; + let dfa = automaton.dfa(); + + let words = words_store.words_fst(reader)?; + + let mut stream = words.search(&dfa).into_stream(); + while let Some(input) = stream.next() { + let distance = dfa.eval(input).to_u8(); + let is_exact = *is_exact && distance == 0 && input.len() == *query_len; + + let doc_indexes = match words_store.word_indexes(reader, input)? { + Some(doc_indexes) => doc_indexes, + None => continue, + }; + + matches.reserve(doc_indexes.len()); + highlights.reserve(doc_indexes.len()); + + for di in doc_indexes.as_ref() { + let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute)); + if let Some(attribute) = attribute { + let match_ = TmpMatch { + query_index: *index as u32, + distance, + attribute, + word_index: di.word_index, + is_exact, + }; + + let highlight = Highlight { + attribute: di.attribute, + char_index: di.char_index, + char_length: di.char_length, + }; + + matches.push((di.document_id, match_)); + highlights.push((di.document_id, highlight)); + } + } + } + } + + let matches = multiword_rewrite_matches(matches, &query_enhancer); + let highlights = { + highlights.sort_unstable_by_key(|(id, _)| *id); + SetBuf::new_unchecked(highlights) + }; + + Ok(raw_documents_from(matches, highlights)) +} + +impl<'a> QueryBuilder<'a> { + pub fn new(words: store::Words, synonyms: store::Synonyms) -> QueryBuilder<'a> { + QueryBuilder { + criteria: Criteria::default(), + searchables_attrs: None, + timeout: Duration::from_secs(1), + words_store: words, + synonyms_store: synonyms, + } + } + + pub fn query( + self, + reader: &rkv::Reader, + query: &str, + range: Range, + ) -> Result, rkv::StoreError> + { + let start_processing = Instant::now(); + let mut raw_documents_processed = Vec::new(); + + let (automaton_producer, query_enhancer) = AutomatonProducer::new(reader, query, self.synonyms_store); + let mut automaton_producer = automaton_producer.into_iter(); + let mut automatons = Vec::new(); + + // aggregate automatons groups 
by groups after time + while let Some(auts) = automaton_producer.next() { + automatons.extend(auts); + + // we must retrieve the documents associated + // with the current automatons + let mut raw_documents = fetch_raw_documents( + reader, + &automatons, + &query_enhancer, + self.searchables_attrs.as_ref(), + &self.words_store, + )?; + + let mut groups = vec![raw_documents.as_mut_slice()]; + + 'criteria: for criterion in self.criteria.as_ref() { + let tmp_groups = mem::replace(&mut groups, Vec::new()); + let mut documents_seen = 0; + + for group in tmp_groups { + // if this group does not overlap with the requested range, + // push it without sorting and splitting it + if documents_seen + group.len() < range.start { + documents_seen += group.len(); + groups.push(group); + continue; + } + + group.sort_unstable_by(|a, b| criterion.evaluate(a, b)); + + for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { + documents_seen += group.len(); + groups.push(group); + + // we have sort enough documents if the last document sorted is after + // the end of the requested range, we can continue to the next criterion + if documents_seen >= range.end { continue 'criteria } + } + } + } + + // once we classified the documents related to the current + // automatons we save that as the next valid result + let iter = raw_documents.into_iter().skip(range.start).take(range.len()); + raw_documents_processed.clear(); + raw_documents_processed.extend(iter); + + // stop processing after there is no time + if start_processing.elapsed() > self.timeout { break } + } + + // make real documents now that we know + // those must be returned + let documents = raw_documents_processed + .into_iter() + .map(|d| Document::from_raw(d)) + .collect(); + + Ok(documents) + } +} diff --git a/src/raw_document.rs b/src/raw_document.rs new file mode 100644 index 000000000..3567c3fd1 --- /dev/null +++ b/src/raw_document.rs @@ -0,0 +1,141 @@ +use std::sync::Arc; +use std::fmt; +use sdset::SetBuf; 
+use slice_group_by::GroupBy; +use crate::{TmpMatch, DocumentId, Highlight}; + +#[derive(Clone)] +pub struct RawDocument { + pub id: DocumentId, + pub matches: SharedMatches, + pub highlights: Vec, +} + +impl RawDocument { + fn new(id: DocumentId, matches: SharedMatches, highlights: Vec) -> RawDocument { + RawDocument { id, matches, highlights } + } + + pub fn query_index(&self) -> &[u32] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) } + } + + pub fn distance(&self) -> &[u8] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) } + } + + pub fn attribute(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) } + } + + pub fn word_index(&self) -> &[u16] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) } + } + + pub fn is_exact(&self) -> &[bool] { + let r = self.matches.range; + // it is safe because construction/modifications + // can only be done in this module + unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) } + } +} + +impl fmt::Debug for RawDocument { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("RawDocument {\r\n")?; + f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?; + 
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?; + f.write_str("}")?; + Ok(()) + } +} + +pub fn raw_documents_from( + matches: SetBuf<(DocumentId, TmpMatch)>, + highlights: SetBuf<(DocumentId, Highlight)>, +) -> Vec +{ + let mut docs_ranges: Vec<(_, Range, _)> = Vec::new(); + let mut matches2 = Matches::with_capacity(matches.len()); + + let matches = matches.linear_group_by_key(|(id, _)| *id); + let highlights = highlights.linear_group_by_key(|(id, _)| *id); + + for (mgroup, hgroup) in matches.zip(highlights) { + debug_assert_eq!(mgroup[0].0, hgroup[0].0); + + let document_id = mgroup[0].0; + let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); + let end = start + mgroup.len(); + + let highlights = hgroup.iter().map(|(_, h)| *h).collect(); + docs_ranges.push((document_id, Range { start, end }, highlights)); + + matches2.extend_from_slice(mgroup); + } + + let matches = Arc::new(matches2); + docs_ranges.into_iter().map(|(id, range, highlights)| { + let matches = SharedMatches { range, matches: matches.clone() }; + RawDocument::new(id, matches, highlights) + }).collect() +} + +#[derive(Debug, Copy, Clone)] +struct Range { + start: usize, + end: usize, +} + +#[derive(Clone)] +pub struct SharedMatches { + range: Range, + matches: Arc, +} + +#[derive(Clone)] +struct Matches { + query_index: Vec, + distance: Vec, + attribute: Vec, + word_index: Vec, + is_exact: Vec, +} + +impl Matches { + fn with_capacity(cap: usize) -> Matches { + Matches { + query_index: Vec::with_capacity(cap), + distance: Vec::with_capacity(cap), + attribute: Vec::with_capacity(cap), + word_index: Vec::with_capacity(cap), + is_exact: Vec::with_capacity(cap), + } + } + + fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) { + for (_, match_) in matches { + self.query_index.push(match_.query_index); + self.distance.push(match_.distance); + 
self.attribute.push(match_.attribute); + self.word_index.push(match_.word_index); + self.is_exact.push(match_.is_exact); + } + } +} diff --git a/src/raw_indexer.rs b/src/raw_indexer.rs new file mode 100644 index 000000000..9c0399be5 --- /dev/null +++ b/src/raw_indexer.rs @@ -0,0 +1,208 @@ +use std::collections::{BTreeMap, HashMap}; +use std::convert::TryFrom; + +use deunicode::deunicode_with_tofu; +use crate::{DocumentId, DocIndex}; +use meilidb_schema::SchemaAttr; +use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token}; +use sdset::SetBuf; + +type Word = Vec; // TODO make it be a SmallVec + +pub struct RawIndexer { + word_limit: usize, // the maximum number of indexed words + words_doc_indexes: BTreeMap>, + docs_words: HashMap>, +} + +pub struct Indexed { + pub words_doc_indexes: BTreeMap>, + pub docs_words: HashMap, +} + +impl RawIndexer { + pub fn new() -> RawIndexer { + RawIndexer::with_word_limit(1000) + } + + pub fn with_word_limit(limit: usize) -> RawIndexer { + RawIndexer { + word_limit: limit, + words_doc_indexes: BTreeMap::new(), + docs_words: HashMap::new(), + } + } + + pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) { + let lowercase_text = text.to_lowercase(); + let deunicoded = deunicode_with_tofu(&lowercase_text, ""); + + // TODO compute the deunicoded version after the cjk check + let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded { + Some(deunicoded) + } else { + None + }; + let iter = Some(lowercase_text).into_iter().chain(next); + + for text in iter { + for token in Tokenizer::new(&text) { + let must_continue = index_token( + token, + id, + attr, + self.word_limit, + &mut self.words_doc_indexes, + &mut self.docs_words, + ); + + if !must_continue { break } + } + } + } + + pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I) + where I: IntoIterator, + IT: Iterator + Clone, + { + // TODO serialize this to one call to the SeqTokenizer loop + + let 
lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect(); + let iter = lowercased.iter().map(|t| t.as_str()); + + for token in SeqTokenizer::new(iter) { + let must_continue = index_token( + token, + id, + attr, + self.word_limit, + &mut self.words_doc_indexes, + &mut self.docs_words, + ); + + if !must_continue { break } + } + + let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| { + if lowercase_text.contains(is_cjk) { return lowercase_text } + let deunicoded = deunicode_with_tofu(&lowercase_text, ""); + if lowercase_text != deunicoded { deunicoded } else { lowercase_text } + }).collect(); + let iter = deunicoded.iter().map(|t| t.as_str()); + + for token in SeqTokenizer::new(iter) { + let must_continue = index_token( + token, + id, + attr, + self.word_limit, + &mut self.words_doc_indexes, + &mut self.docs_words, + ); + + if !must_continue { break } + } + } + + pub fn build(self) -> Indexed { + let words_doc_indexes = self.words_doc_indexes + .into_iter() + .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes))) + .collect(); + + let docs_words = self.docs_words + .into_iter() + .map(|(id, mut words)| { + words.sort_unstable(); + words.dedup(); + (id, fst::Set::from_iter(words).unwrap()) + }) + .collect(); + + Indexed { words_doc_indexes, docs_words } + } +} + +fn index_token( + token: Token, + id: DocumentId, + attr: SchemaAttr, + word_limit: usize, + words_doc_indexes: &mut BTreeMap>, + docs_words: &mut HashMap>, +) -> bool +{ + if token.word_index >= word_limit { return false } + + match token_to_docindex(id, attr, token) { + Some(docindex) => { + let word = Vec::from(token.word); + words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex); + docs_words.entry(id).or_insert_with(Vec::new).push(word); + }, + None => return false, + } + + true +} + +fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option { + let word_index = u16::try_from(token.word_index).ok()?; + let char_index = 
u16::try_from(token.char_index).ok()?; + let char_length = u16::try_from(token.word.chars().count()).ok()?; + + let docindex = DocIndex { + document_id: id, + attribute: attr.0, + word_index, + char_index, + char_length, + }; + + Some(docindex) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn strange_apostrophe() { + let mut indexer = RawIndexer::new(); + + let docid = DocumentId(0); + let attr = SchemaAttr(0); + let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !"; + indexer.index_text(docid, attr, text); + + let Indexed { words_doc_indexes, .. } = indexer.build(); + + assert!(words_doc_indexes.get(&b"l"[..]).is_some()); + assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); + assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); + assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); + + // with the ugly apostrophe... + assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some()); + } + + #[test] + fn strange_apostrophe_in_sequence() { + let mut indexer = RawIndexer::new(); + + let docid = DocumentId(0); + let attr = SchemaAttr(0); + let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"]; + indexer.index_text_seq(docid, attr, text); + + let Indexed { words_doc_indexes, .. } = indexer.build(); + + assert!(words_doc_indexes.get(&b"l"[..]).is_some()); + assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); + assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); + assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); + + // with the ugly apostrophe... 
/// Maps original attribute numbers to their new ranking positions.
///
/// Attributes receive a rank in the order `insert_attribute` is called;
/// attributes that were never inserted have no rank.
#[derive(Default, Clone)]
pub struct ReorderedAttrs {
    count: usize,
    reorders: Vec<Option<u16>>,
}

impl ReorderedAttrs {
    pub fn new() -> ReorderedAttrs {
        ReorderedAttrs { count: 0, reorders: Vec::new() }
    }

    /// Assigns the next rank to `attribute`.
    ///
    /// Fix: `Vec::resize` to a *smaller* length truncates the vector, so
    /// inserting attributes out of ascending order used to erase the ranks
    /// of previously inserted higher-numbered attributes. The vector is now
    /// only ever grown.
    pub fn insert_attribute(&mut self, attribute: u16) {
        let required_len = attribute as usize + 1;
        if self.reorders.len() < required_len {
            self.reorders.resize(required_len, None);
        }
        self.reorders[attribute as usize] = Some(self.count as u16);
        self.count += 1;
    }

    /// Returns the rank of `attribute`, or `None` if it was never inserted.
    pub fn get(&self, attribute: u16) -> Option<u16> {
        match self.reorders.get(attribute as usize) {
            Some(Some(attribute)) => Some(*attribute),
            _ => None,
        }
    }
}
000000000..4cf1186cc --- /dev/null +++ b/src/store/synonyms.rs @@ -0,0 +1,23 @@ +pub struct Synonyms { + pub(crate) main: rkv::SingleStore, + pub(crate) synonyms: rkv::SingleStore, +} + +impl Synonyms { + pub fn synonyms_fst( + &self, + reader: &T, + ) -> Result + { + Ok(fst::Set::default()) + } + + pub fn alternatives_to( + &self, + reader: &T, + word: &[u8], + ) -> Result, rkv::StoreError> + { + unimplemented!() + } +} diff --git a/src/store/words.rs b/src/store/words.rs new file mode 100644 index 000000000..face8a979 --- /dev/null +++ b/src/store/words.rs @@ -0,0 +1,91 @@ +use std::borrow::Cow; +use std::sync::Arc; +use std::{mem, ptr}; +use zerocopy::{AsBytes, LayoutVerified}; + +use crate::DocIndex; +use crate::store::aligned_to; +use crate::store::WORDS_KEY; + +pub struct Words { + pub(crate) main: rkv::SingleStore, + pub(crate) words_indexes: rkv::SingleStore, +} + +impl Words { + pub fn put_words_fst( + &self, + writer: &mut rkv::Writer, + fst: &fst::Set, + ) -> Result<(), rkv::StoreError> + { + let blob = rkv::Value::Blob(fst.as_fst().as_bytes()); + self.main.put(writer, WORDS_KEY, &blob) + } + + pub fn words_fst( + &self, + reader: &T, + ) -> Result + { + match self.main.get(reader, WORDS_KEY)? { + Some(rkv::Value::Blob(bytes)) => { + let len = bytes.len(); + let bytes = Arc::from(bytes); + let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); + Ok(fst::Set::from(fst)) + }, + Some(value) => panic!("invalid type {:?}", value), + None => panic!("could not find word index"), + } + } + + pub fn put_words_indexes( + &self, + writer: &mut rkv::Writer, + word: &[u8], + words_indexes: &[DocIndex], + ) -> Result<(), rkv::StoreError> + { + let blob = rkv::Value::Blob(words_indexes.as_bytes()); + self.main.put(writer, word, &blob) + } + + pub fn word_indexes<'a, T: rkv::Readable>( + &self, + reader: &'a T, + word: &[u8], + ) -> Result>, rkv::StoreError> + { + let bytes = match self.main.get(reader, word)? 
{ + Some(rkv::Value::Blob(bytes)) => bytes, + Some(value) => panic!("invalid type {:?}", value), + None => return Ok(None), + }; + + match LayoutVerified::new_slice(bytes) { + Some(layout) => Ok(Some(Cow::Borrowed(layout.into_slice()))), + None => { + let len = bytes.len(); + let elem_size = mem::size_of::(); + + // ensure that it is the alignment that is wrong + // and the length is valid + if len % elem_size == 0 && !aligned_to(bytes, mem::align_of::()) { + let elems = len / elem_size; + let mut vec = Vec::::with_capacity(elems); + + unsafe { + let dst = vec.as_mut_ptr() as *mut u8; + ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len); + vec.set_len(elems); + } + + return Ok(Some(Cow::Owned(vec))) + } + + Ok(None) + }, + } + } +}