chore: Move index related things to the meilidb-core workspace member

2025-07-03 20:07:09 +02:00 · 2019-02-24 19:44:24 +01:00 · 2019-02-24 19:44:24 +01:00 · 14790eeae3
commit 14790eeae3
parent 3056b351fa
44 changed files with 1343 additions and 252 deletions
--- a/meilidb-core/Cargo.toml
+++ b/meilidb-core/Cargo.toml
@ -0,0 +1,21 @@
+[package]
+name = "meilidb-core"
+version = "0.1.0"
+authors = ["Kerollmops <renault.cle@gmail.com>"]
+edition = "2018"
+
+[dependencies]
+byteorder = "1.3.1"
+fst = "0.3.3"
+hashbrown = "0.1.8"
+lazy_static = "1.2.0"
+levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
+log = "0.4.6"
+rayon = "1.0.3"
+sdset = "0.3.1"
+serde = "1.0.88"
+serde_derive = "1.0.88"
+slice-group-by = "0.2.4"
+
+[features]
+i128 = ["byteorder/i128"]
--- a/meilidb-core/src/automaton.rs
+++ b/meilidb-core/src/automaton.rs
@ -0,0 +1,91 @@
+use fst::Automaton;
+use lazy_static::lazy_static;
+use levenshtein_automata::{
+    LevenshteinAutomatonBuilder as LevBuilder,
+    DFA, Distance,
+};
+
+// Levenshtein automaton builders, one per maximum edit distance (0, 1 and 2).
+// They are initialised lazily and shared by every call to
+// `build_dfa_with_setting`, which only picks one and feeds it the query.
+// NOTE(review): the second argument (`false`) is presumably the
+// `transposition_cost_one` flag of `LevenshteinAutomatonBuilder::new` — confirm.
+lazy_static! {
+    static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
+    static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
+    static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
+}
+
+/// A Levenshtein DFA bundled with the length of the query it was built from.
+pub struct DfaExt {
+    // `query.len()` of the originating query, i.e. its length in bytes.
+    query_len: usize,
+    // The automaton every `Automaton`/`AutomatonExt` method forwards to.
+    automaton: DFA,
+}
+
+// `DfaExt` behaves exactly like the wrapped `DFA`: every method below is a
+// plain forward to `self.automaton`, and the state type is the DFA's own.
+impl Automaton for DfaExt {
+    type State = <DFA as Automaton>::State;
+
+    fn start(&self) -> Self::State {
+        self.automaton.start()
+    }
+
+    fn is_match(&self, state: &Self::State) -> bool {
+        self.automaton.is_match(state)
+    }
+
+    fn can_match(&self, state: &Self::State) -> bool {
+        self.automaton.can_match(state)
+    }
+
+    fn will_always_match(&self, state: &Self::State) -> bool {
+        self.automaton.will_always_match(state)
+    }
+
+    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
+        self.automaton.accept(state, byte)
+    }
+}
+
+impl AutomatonExt for DfaExt {
+    // Forwards to the wrapped DFA's `eval` and returns its `Distance`.
+    fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
+        self.automaton.eval(s)
+    }
+
+    // Byte length of the query this automaton was built from.
+    fn query_len(&self) -> usize {
+        self.query_len
+    }
+}
+
+// Selects which builder method `build_dfa_with_setting` calls:
+// `build_prefix_dfa` for `Prefix`, `build_dfa` for `NoPrefix`.
+#[derive(Copy, Clone)]
+enum PrefixSetting {
+    Prefix,
+    NoPrefix,
+}
+
+// Builds the automaton for `query`.
+//
+// The builder is chosen from the byte length of the query: 0..=4 bytes use
+// `LEVDIST0`, 5..=8 bytes use `LEVDIST1`, anything longer uses `LEVDIST2`.
+// `setting` then decides between the prefix and the non-prefix DFA.
+fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt {
+    let query_len = query.len();
+
+    // Only the selected lazy static is dereferenced (and thus initialised).
+    let builder: &LevBuilder = match query_len {
+        0 ..= 4 => &*LEVDIST0,
+        5 ..= 8 => &*LEVDIST1,
+        _ => &*LEVDIST2,
+    };
+
+    let automaton = match setting {
+        PrefixSetting::Prefix => builder.build_prefix_dfa(query),
+        PrefixSetting::NoPrefix => builder.build_dfa(query),
+    };
+
+    DfaExt { query_len, automaton }
+}
+
+/// Builds a prefix automaton for `query`; the builder used depends on the
+/// byte length of the query (see `build_dfa_with_setting`).
+pub fn build_prefix_dfa(query: &str) -> DfaExt {
+    build_dfa_with_setting(query, PrefixSetting::Prefix)
+}
+
+/// Builds a non-prefix automaton for `query`; the builder used depends on the
+/// byte length of the query (see `build_dfa_with_setting`).
+pub fn build_dfa(query: &str) -> DfaExt {
+    build_dfa_with_setting(query, PrefixSetting::NoPrefix)
+}
+
+/// An `fst::Automaton` that can additionally evaluate a whole input at once
+/// and report the length of the query it was built from.
+pub trait AutomatonExt: Automaton {
+    /// Evaluates `s` against the automaton and returns the resulting `Distance`.
+    fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
+    /// Length of the originating query.
+    fn query_len(&self) -> usize;
+}
--- a/meilidb-core/src/criterion/document_id.rs
+++ b/meilidb-core/src/criterion/document_id.rs
@ -0,0 +1,12 @@
+use std::cmp::Ordering;
+use crate::criterion::Criterion;
+use crate::RawDocument;
+
+/// Criterion that orders documents by ascending document id.
+#[derive(Debug, Clone, Copy)]
+pub struct DocumentId;
+
+impl Criterion for DocumentId {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        // Plain `Ord` comparison of the two ids, no reversal.
+        lhs.id.cmp(&rhs.id)
+    }
+}
--- a/meilidb-core/src/criterion/exact.rs
+++ b/meilidb-core/src/criterion/exact.rs
@ -0,0 +1,39 @@
+use std::cmp::Ordering;
+use slice_group_by::GroupBy;
+use crate::criterion::Criterion;
+use crate::RawDocument;
+
+// Counts the groups of consecutive equal query indexes that contain at least
+// one exact match. `is_exact` is read in parallel with `query_index` and must
+// be at least as long, otherwise the slicing panics.
+#[inline]
+fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
+    let mut offset = 0;
+
+    query_index
+        .linear_group()
+        .filter(|group| {
+            let start = offset;
+            offset += group.len();
+            is_exact[start..offset].contains(&true)
+        })
+        .count()
+}
+
+/// Criterion that ranks first the documents with the most query-word groups
+/// containing an exact match.
+#[derive(Debug, Clone, Copy)]
+pub struct Exact;
+
+impl Criterion for Exact {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        let exact_count = |doc: &RawDocument| {
+            number_exact_matches(doc.query_index(), doc.is_exact())
+        };
+
+        let left = exact_count(lhs);
+        let right = exact_count(rhs);
+
+        // Operands are swapped so that the higher count sorts first.
+        right.cmp(&left)
+    }
+}
--- a/meilidb-core/src/criterion/mod.rs
+++ b/meilidb-core/src/criterion/mod.rs
@ -0,0 +1,112 @@
+mod sum_of_typos;
+mod number_of_words;
+mod words_proximity;
+mod sum_of_words_attribute;
+mod sum_of_words_position;
+mod exact;
+// mod sort_by_attr;
+mod document_id;
+
+use std::cmp::Ordering;
+use crate::RawDocument;
+
+pub use self::{
+    sum_of_typos::SumOfTypos,
+    number_of_words::NumberOfWords,
+    words_proximity::WordsProximity,
+    sum_of_words_attribute::SumOfWordsAttribute,
+    sum_of_words_position::SumOfWordsPosition,
+    exact::Exact,
+    // sort_by_attr::SortByAttr,
+    document_id::DocumentId,
+};
+
+/// A ranking rule: a total ordering between two raw documents.
+///
+/// Implementors must be `Send + Sync`.
+pub trait Criterion: Send + Sync {
+    /// Compares `lhs` with `rhs`; `Ordering::Less` places `lhs` first.
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
+
+    /// Returns `true` when `evaluate` considers both documents equal.
+    #[inline]
+    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
+        self.evaluate(lhs, rhs) == Ordering::Equal
+    }
+}
+
+// A shared reference to a criterion is itself a criterion; both methods
+// forward to the referenced value so an overridden `eq` is preserved.
+impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        (**self).evaluate(lhs, rhs)
+    }
+
+    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
+        (**self).eq(lhs, rhs)
+    }
+}
+
+// A boxed criterion (including `Box<dyn Criterion>`) is itself a criterion;
+// both methods forward to the boxed value so an overridden `eq` is preserved.
+impl<T: Criterion + ?Sized> Criterion for Box<T> {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        (**self).evaluate(lhs, rhs)
+    }
+
+    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
+        (**self).eq(lhs, rhs)
+    }
+}
+
+/// Accumulates boxed criteria, in insertion order, before turning them into
+/// a `Criteria` with `build`.
+#[derive(Default)]
+pub struct CriteriaBuilder<'a> {
+    inner: Vec<Box<dyn Criterion + 'a>>
+}
+
+impl<'a> CriteriaBuilder<'a>
+{
+    /// Creates an empty builder.
+    pub fn new() -> CriteriaBuilder<'a> {
+        CriteriaBuilder { inner: Vec::new() }
+    }
+
+    /// Creates an empty builder with room for `capacity` criteria.
+    pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
+        CriteriaBuilder { inner: Vec::with_capacity(capacity) }
+    }
+
+    /// Reserves room for at least `additional` more criteria.
+    pub fn reserve(&mut self, additional: usize) {
+        self.inner.reserve(additional)
+    }
+
+    /// Chaining variant of `push`: appends `criterion` and returns the builder.
+    pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
+    where C: Criterion,
+    {
+        self.push(criterion);
+        self
+    }
+
+    /// Boxes `criterion` and appends it after the ones already registered.
+    pub fn push<C: 'a>(&mut self, criterion: C)
+    where C: Criterion,
+    {
+        self.inner.push(Box::new(criterion));
+    }
+
+    /// Consumes the builder and returns the criteria in insertion order.
+    pub fn build(self) -> Criteria<'a> {
+        Criteria { inner: self.inner }
+    }
+}
+
+/// An ordered list of boxed criteria, produced by `CriteriaBuilder::build`
+/// and readable as a slice through `AsRef`.
+pub struct Criteria<'a> {
+    inner: Vec<Box<dyn Criterion + 'a>>,
+}
+
+// The default criteria list. The insertion order below is the order in
+// which the criteria end up in the slice exposed by `as_ref`; the capacity
+// of 7 matches the number of criteria added.
+impl<'a> Default for Criteria<'a> {
+    fn default() -> Self {
+        CriteriaBuilder::with_capacity(7)
+            .add(SumOfTypos)
+            .add(NumberOfWords)
+            .add(WordsProximity)
+            .add(SumOfWordsAttribute)
+            .add(SumOfWordsPosition)
+            .add(Exact)
+            .add(DocumentId)
+            .build()
+    }
+}
+
+// Exposes the criteria, in registration order, as a slice of boxed trait
+// objects. The trait object is spelled with `dyn` in the impl header as well,
+// matching the method signature and the field type.
+impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
+    fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
+        &self.inner
+    }
+}
--- a/meilidb-core/src/criterion/number_of_words.rs
+++ b/meilidb-core/src/criterion/number_of_words.rs
@ -0,0 +1,27 @@
+use std::cmp::Ordering;
+use slice_group_by::GroupBy;
+use crate::criterion::Criterion;
+use crate::RawDocument;
+
+// Number of groups of consecutive equal values in `query_index`.
+#[inline]
+fn number_of_query_words(query_index: &[u32]) -> usize {
+    query_index.linear_group().count()
+}
+
+/// Criterion that ranks first the documents whose matches cover the most
+/// query-index groups.
+#[derive(Debug, Clone, Copy)]
+pub struct NumberOfWords;
+
+impl Criterion for NumberOfWords {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        let left = number_of_query_words(lhs.query_index());
+        let right = number_of_query_words(rhs.query_index());
+
+        // Operands are swapped so that the higher count sorts first.
+        right.cmp(&left)
+    }
+}
--- a/meilidb-core/src/criterion/sort_by_attr.rs
+++ b/meilidb-core/src/criterion/sort_by_attr.rs
@ -0,0 +1,122 @@
+use std::cmp::Ordering;
+use std::error::Error;
+use std::fmt;
+
+use crate::database::schema::{Schema, SchemaAttr};
+use crate::criterion::Criterion;
+use crate::database::RankedMap;
+use crate::RawDocument;
+
+/// A helper criterion that sorts documents by the value stored in the ranked
+/// map for one of their attributes.
+///
+/// # Note
+///
+/// A document that has no entry in the ranked map for the chosen attribute is
+/// ordered *after* every document that has one, whichever direction was
+/// requested; two documents without an entry compare as equal.
+///
+/// This is not the ordering of `Option`'s [`Ord`] implementation, where
+/// `None` sorts before `Some`.
+///
+/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
+///
+/// NOTE(review): this module is currently commented out in `criterion/mod.rs`,
+/// so the import path used in the example below may be stale — confirm.
+///
+/// # Example
+///
+/// ```ignore
+/// use serde_derive::Deserialize;
+/// use meilidb::rank::criterion::*;
+///
+/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
+///
+/// let builder = CriteriaBuilder::with_capacity(8)
+///        .add(SumOfTypos)
+///        .add(NumberOfWords)
+///        .add(WordsProximity)
+///        .add(SumOfWordsAttribute)
+///        .add(SumOfWordsPosition)
+///        .add(Exact)
+///        .add(custom_ranking)
+///        .add(DocumentId);
+///
+/// let criterion = builder.build();
+///
+/// ```
+pub struct SortByAttr<'a> {
+    // Source of the per-document values, looked up by `(document id, attr)`.
+    ranked_map: &'a RankedMap,
+    // The schema attribute the documents are sorted by.
+    attr: SchemaAttr,
+    // `true` when built with `higher_is_better`: present values are compared in reverse.
+    reversed: bool,
+}
+
+impl<'a> SortByAttr<'a> {
+    /// Sorts by `attr_name` in natural order: the lower value ranks first.
+    ///
+    /// Fails when the attribute is unknown to `schema` or is not ranked.
+    pub fn lower_is_better(
+        ranked_map: &'a RankedMap,
+        schema: &Schema,
+        attr_name: &str,
+    ) -> Result<SortByAttr<'a>, SortByAttrError>
+    {
+        let reversed = false;
+        SortByAttr::new(ranked_map, schema, attr_name, reversed)
+    }
+
+    /// Sorts by `attr_name` in reverse order: the higher value ranks first.
+    ///
+    /// Fails when the attribute is unknown to `schema` or is not ranked.
+    pub fn higher_is_better(
+        ranked_map: &'a RankedMap,
+        schema: &Schema,
+        attr_name: &str,
+    ) -> Result<SortByAttr<'a>, SortByAttrError>
+    {
+        let reversed = true;
+        SortByAttr::new(ranked_map, schema, attr_name, reversed)
+    }
+
+    // Resolves `attr_name` in the schema and checks that it is ranked before
+    // building the criterion.
+    fn new(
+        ranked_map: &'a RankedMap,
+        schema: &Schema,
+        attr_name: &str,
+        reversed: bool,
+    ) -> Result<SortByAttr<'a>, SortByAttrError>
+    {
+        let attr = schema
+            .attribute(attr_name)
+            .ok_or(SortByAttrError::AttributeNotFound)?;
+
+        if schema.props(attr).is_ranked() {
+            Ok(SortByAttr { ranked_map, attr, reversed })
+        } else {
+            Err(SortByAttrError::AttributeNotRegisteredForRanking)
+        }
+    }
+}
+
+impl<'a> Criterion for SortByAttr<'a> {
+    // Compares the ranked-map values of both documents for `self.attr`.
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        let lhs = self.ranked_map.get(&(lhs.id, self.attr));
+        let rhs = self.ranked_map.get(&(rhs.id, self.attr));
+
+        match (lhs, rhs) {
+            (Some(lhs), Some(rhs)) => {
+                // Only the comparison of two present values honours `reversed`.
+                let order = lhs.cmp(&rhs);
+                if self.reversed { order.reverse() } else { order }
+            },
+            // A missing value always sorts after a present one.
+            (None,    Some(_)) => Ordering::Greater,
+            (Some(_), None)    => Ordering::Less,
+            (None,    None)    => Ordering::Equal,
+        }
+    }
+}
+
+/// Reasons why a `SortByAttr` criterion cannot be built.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum SortByAttrError {
+    /// The schema has no attribute with the requested name.
+    AttributeNotFound,
+    /// The attribute exists but its schema properties are not ranked.
+    AttributeNotRegisteredForRanking,
+}
+
+impl fmt::Display for SortByAttrError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let message = match *self {
+            SortByAttrError::AttributeNotFound => "attribute not found in the schema",
+            SortByAttrError::AttributeNotRegisteredForRanking => "attribute not registered for ranking",
+        };
+
+        f.write_str(message)
+    }
+}
+
+impl Error for SortByAttrError { }
--- a/meilidb-core/src/criterion/sum_of_typos.rs
+++ b/meilidb-core/src/criterion/sum_of_typos.rs
@ -0,0 +1,112 @@
+use std::cmp::Ordering;
+
+use slice_group_by::GroupBy;
+
+use crate::criterion::Criterion;
+use crate::RawDocument;
+
+// Shifted base-10 logarithm: returns an approximation of log10(n + 1), so
+// that zero typos weigh exactly 0.0.
+//
+// Only 0..=3 are supported; panicking on anything else is acceptable because
+// the number of typos is never bigger than that.
+#[inline]
+fn custom_log10(n: u8) -> f32 {
+    // Indexed by `n`: log(1), log(2), log(3), log(4).
+    const SHIFTED_LOG10: [f32; 4] = [0.0, 0.30102, 0.47712, 0.60205];
+
+    match SHIFTED_LOG10.get(n as usize) {
+        Some(value) => *value,
+        None => panic!("invalid number"),
+    }
+}
+
+// Scores the typos of a document: `words / (typo_weight + 1) * 1000`,
+// truncated to an integer. `words` is the number of query-index groups and
+// `typo_weight` sums `custom_log10` of the distance found at the first
+// position of each group.
+#[inline]
+fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
+    let mut offset = 0;
+
+    let (words, typo_weight) = query_index
+        .linear_group()
+        .fold((0, 0.0), |(words, typo_weight), group| {
+            let first_distance = distance[offset];
+            offset += group.len();
+            (words + 1, typo_weight + custom_log10(first_distance))
+        });
+
+    (words as f32 / (typo_weight + 1.0) * 1000.0) as usize
+}
+
+/// Criterion that ranks first the documents with the highest typo score,
+/// i.e. many matched words and few typos.
+#[derive(Debug, Clone, Copy)]
+pub struct SumOfTypos;
+
+impl Criterion for SumOfTypos {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        let typo_score = |doc: &RawDocument| {
+            sum_matches_typos(doc.query_index(), doc.distance())
+        };
+
+        let left = typo_score(lhs);
+        let right = typo_score(rhs);
+
+        // Operands are swapped so that the higher score sorts first.
+        right.cmp(&left)
+    }
+}
+
+// Each test computes the score of two documents and checks that doc0 ranks
+// before doc1 once the comparison is reversed, as `SumOfTypos::evaluate` does.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // typing: "Geox CEO"
+    //
+    // doc0: "Geox SpA: CEO and Executive"
+    // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
+    //
+    // scores: doc0 = 2000 (2 words, no typo), doc1 = 1537 (2 words, 1 typo)
+    #[test]
+    fn one_typo_reference() {
+        let query_index0 = &[0, 1];
+        let distance0 = &[0, 0];
+
+        let query_index1 = &[0, 1];
+        let distance1 = &[1, 0];
+
+        let doc0 = sum_matches_typos(query_index0, distance0);
+        let doc1 = sum_matches_typos(query_index1, distance1);
+        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
+    }
+
+    // typing: "bouton manchette"
+    //
+    // doc0: "bouton manchette"
+    // doc1: "bouton"
+    //
+    // scores: doc0 = 2000 (2 words, no typo), doc1 = 1000 (1 word, no typo)
+    #[test]
+    fn no_typo() {
+        let query_index0 = &[0, 1];
+        let distance0 = &[0, 0];
+
+        let query_index1 = &[0];
+        let distance1 = &[0];
+
+        let doc0 = sum_matches_typos(query_index0, distance0);
+        let doc1 = sum_matches_typos(query_index1, distance1);
+        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
+    }
+
+    // typing: "bouton manchztte"
+    //
+    // doc0: "bouton manchette"
+    // doc1: "bouton"
+    //
+    // scores: doc0 = 1537 (2 words, 1 typo), doc1 = 1000 (1 word, no typo)
+    #[test]
+    fn one_typo() {
+        let query_index0 = &[0, 1];
+        let distance0 = &[0, 1];
+
+        let query_index1 = &[0];
+        let distance1 = &[0];
+
+        let doc0 = sum_matches_typos(query_index0, distance0);
+        let doc1 = sum_matches_typos(query_index1, distance1);
+        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
+    }
+}
--- a/meilidb-core/src/criterion/sum_of_words_attribute.rs
+++ b/meilidb-core/src/criterion/sum_of_words_attribute.rs
@ -0,0 +1,38 @@
+use std::cmp::Ordering;
+use slice_group_by::GroupBy;
+use crate::criterion::Criterion;
+use crate::RawDocument;
+
+// Sums the attribute found at the first position of every group of
+// consecutive equal query indexes.
+#[inline]
+fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
+    let mut offset = 0;
+
+    query_index
+        .linear_group()
+        .map(|group| {
+            let first_attribute = attribute[offset] as usize;
+            offset += group.len();
+            first_attribute
+        })
+        .sum()
+}
+
+/// Criterion that ranks first the documents with the lowest sum of matched
+/// attributes.
+#[derive(Debug, Clone, Copy)]
+pub struct SumOfWordsAttribute;
+
+impl Criterion for SumOfWordsAttribute {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        let attribute_sum = |doc: &RawDocument| {
+            sum_matches_attributes(doc.query_index(), doc.attribute())
+        };
+
+        let left = attribute_sum(lhs);
+        let right = attribute_sum(rhs);
+
+        // Natural order: the lower sum sorts first.
+        left.cmp(&right)
+    }
+}
--- a/meilidb-core/src/criterion/sum_of_words_position.rs
+++ b/meilidb-core/src/criterion/sum_of_words_position.rs
@ -0,0 +1,38 @@
+use std::cmp::Ordering;
+use slice_group_by::GroupBy;
+use crate::criterion::Criterion;
+use crate::RawDocument;
+
+// Sums the word index found at the first position of every group of
+// consecutive equal query indexes.
+#[inline]
+fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
+    let mut offset = 0;
+
+    query_index
+        .linear_group()
+        .map(|group| {
+            let first_word_index = word_index[offset] as usize;
+            offset += group.len();
+            first_word_index
+        })
+        .sum()
+}
+
+/// Criterion that ranks first the documents with the lowest sum of matched
+/// word indexes.
+#[derive(Debug, Clone, Copy)]
+pub struct SumOfWordsPosition;
+
+impl Criterion for SumOfWordsPosition {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        let position_sum = |doc: &RawDocument| {
+            sum_matches_attribute_index(doc.query_index(), doc.word_index())
+        };
+
+        let left = position_sum(lhs);
+        let right = position_sum(rhs);
+
+        // Natural order: the lower sum sorts first.
+        left.cmp(&right)
+    }
+}
--- a/meilidb-core/src/criterion/words_proximity.rs
+++ b/meilidb-core/src/criterion/words_proximity.rs
@ -0,0 +1,151 @@
+use std::cmp::{self, Ordering};
+use slice_group_by::GroupBy;
+use crate::criterion::Criterion;
+use crate::RawDocument;
+
+// Cap applied to the word-index gap in `index_proximity`, and the proximity
+// returned by `attribute_proximity` for two matches in different attributes.
+const MAX_DISTANCE: u16 = 8;
+
+// Turns a pair of references into an owned pair by cloning both sides.
+#[inline]
+fn clone_tuple<T: Clone, U: Clone>(pair: (&T, &U)) -> (T, U) {
+    let (first, second) = pair;
+    (T::clone(first), U::clone(second))
+}
+
+// Proximity between two word indexes of the same attribute.
+//
+// When `lhs` comes strictly before `rhs` the gap is capped at `MAX_DISTANCE`.
+// Otherwise (same position or reversed order) the capped gap is increased by
+// one, so this branch can return `MAX_DISTANCE + 1`.
+fn index_proximity(lhs: u16, rhs: u16) -> u16 {
+    if lhs < rhs {
+        return (rhs - lhs).min(MAX_DISTANCE);
+    }
+
+    (lhs - rhs).min(MAX_DISTANCE) + 1
+}
+
+// Proximity between two `(attribute, word index)` matches: `MAX_DISTANCE`
+// when they live in different attributes, `index_proximity` otherwise.
+fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
+    if lattr == rattr {
+        index_proximity(lwi, rwi)
+    } else {
+        MAX_DISTANCE
+    }
+}
+
+// Smallest `attribute_proximity` over every pairing of a left match with a
+// right match. Each side is given as parallel `(attributes, word indexes)`
+// slices. Returns `u16::max_value()` when there is no pair to compare.
+fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
+    lattr
+        .iter()
+        .zip(lwi)
+        .flat_map(|left| {
+            rattr
+                .iter()
+                .zip(rwi)
+                .map(move |right| attribute_proximity(clone_tuple(left), clone_tuple(right)))
+        })
+        .min()
+        .unwrap_or(u16::max_value())
+}
+
+// Sums, over every pair of consecutive query-index groups, the minimum
+// proximity between the matches of the first group and those of the second.
+//
+// All four slices are read in parallel: position `i` of each describes the
+// same match. Within a group only the leading run of equal `distance` values
+// is considered. A document with fewer than two groups gets a proximity of 0.
+fn matches_proximity(
+    query_index: &[u32],
+    distance: &[u8],
+    attribute: &[u16],
+    word_index: &[u16],
+) -> u16
+{
+    let mut query_index_groups = query_index.linear_group();
+    let mut proximity = 0;
+    // Offset of the current group inside the parallel slices.
+    let mut index = 0;
+
+    let get_attr_wi = |index: usize, group_len: usize| {
+        // retrieve the first distance group (with the lowest values)
+        // NOTE(review): "lowest" assumes each group is sorted by distance — confirm.
+        // The `unwrap` cannot fail as long as `group_len` is at least 1.
+        let len = distance[index..index + group_len].linear_group().next().unwrap().len();
+
+        let rattr = &attribute[index..index + len];
+        let rwi = &word_index[index..index + len];
+
+        (rattr, rwi)
+    };
+
+    // Matches of the first group, if any; `index` then moves past that group.
+    let mut last = query_index_groups.next().map(|group| {
+        let attr_wi = get_attr_wi(index, group.len());
+        index += group.len();
+        attr_wi
+    });
+
+    // iter by windows of size 2
+    while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
+        let attr_wi = get_attr_wi(index, rhs.len());
+        proximity += min_proximity(lhs, attr_wi);
+        // The right-hand group becomes the left-hand side of the next window.
+        last = Some(attr_wi);
+        index += rhs.len();
+    }
+
+    proximity
+}
+
+/// Criterion that ranks first the documents whose matched query words are
+/// closest to each other (lowest summed proximity).
+#[derive(Debug, Clone, Copy)]
+pub struct WordsProximity;
+
+impl Criterion for WordsProximity {
+    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
+        let proximity_of = |doc: &RawDocument| {
+            matches_proximity(
+                doc.query_index(),
+                doc.distance(),
+                doc.attribute(),
+                doc.word_index(),
+            )
+        };
+
+        let left = proximity_of(lhs);
+        let right = proximity_of(rhs);
+
+        // Natural order: the lower proximity sorts first.
+        left.cmp(&right)
+    }
+}
+
+// Both tests feed `matches_proximity` four parallel slices (one entry per
+// match) and check the summed proximity of consecutive query words.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn three_different_attributes() {
+
+        // "soup" "of the" "the day"
+        //
+        // { id: 0, attr: 0, attr_index: 0 }
+        // { id: 1, attr: 1, attr_index: 0 }
+        // { id: 2, attr: 1, attr_index: 1 }
+        // { id: 2, attr: 2, attr_index: 0 }
+        // { id: 3, attr: 3, attr_index: 1 }
+
+        let query_index = &[0, 1, 2, 2, 3];
+        let distance    = &[0, 0, 0, 0, 0];
+        let attribute   = &[0, 1, 1, 2, 3];
+        let word_index  = &[0, 0, 1, 0, 1];
+
+        //   soup -> of = 8
+        // + of -> the  = 1
+        // + the -> day = 8 (not 1)
+        assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17);
+    }
+
+    #[test]
+    fn two_different_attributes() {
+
+        // "soup day" "soup of the day"
+        //
+        // { id: 0, attr: 0, attr_index: 0 }
+        // { id: 0, attr: 1, attr_index: 0 }
+        // { id: 1, attr: 1, attr_index: 1 }
+        // { id: 2, attr: 1, attr_index: 2 }
+        // { id: 3, attr: 0, attr_index: 1 }
+        // { id: 3, attr: 1, attr_index: 3 }
+
+        let query_index = &[0, 0, 1, 2, 3, 3];
+        let distance    = &[0, 0, 0, 0, 0, 0];
+        let attribute   = &[0, 1, 1, 1, 0, 1];
+        let word_index  = &[0, 0, 1, 2, 1, 3];
+
+        //   soup -> of = 1
+        // + of -> the  = 1
+        // + the -> day = 1
+        assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3);
+    }
+}
--- a/meilidb-core/src/data/doc_ids.rs
+++ b/meilidb-core/src/data/doc_ids.rs
@ -0,0 +1,61 @@
+use std::slice::from_raw_parts;
+use std::mem::size_of;
+use std::error::Error;
+
+use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
+use sdset::Set;
+
+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
+use crate::data::SharedData;
+use crate::DocumentId;
+
+use super::into_u8_slice;
+
+/// A set of document ids stored as raw bytes inside a `SharedData`.
+#[derive(Default, Clone)]
+pub struct DocIds(SharedData);
+
+impl DocIds {
+    /// Copies the in-memory bytes of `ids` into a new shared buffer.
+    pub fn new(ids: &Set<DocumentId>) -> DocIds {
+        // SAFETY (review): the ids are viewed as plain bytes for the duration
+        // of the copy only; assumes `DocumentId` has no padding bytes — confirm.
+        let bytes = unsafe { into_u8_slice(ids.as_slice()) };
+        let data = SharedData::from_bytes(bytes.to_vec());
+        DocIds(data)
+    }
+
+    /// Returns `true` when the underlying byte view is empty.
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+
+    /// The raw bytes backing the ids.
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+// Reinterprets the stored bytes as a set of `DocumentId` without copying.
+impl AsRef<Set<DocumentId>> for DocIds {
+    fn as_ref(&self) -> &Set<DocumentId> {
+        let slice = &self.0;
+        let ptr = slice.as_ptr() as *const DocumentId;
+        // Integer division: trailing bytes that do not form a whole id are ignored.
+        let len = slice.len() / size_of::<DocumentId>();
+        // SAFETY (review): relies on the bytes having been produced from
+        // `DocumentId` values and on `ptr` being suitably aligned for
+        // `DocumentId`; nothing here checks the alignment — confirm.
+        let slice = unsafe { from_raw_parts(ptr, len) };
+        // Ordering and uniqueness are not re-validated.
+        Set::new_unchecked(slice)
+    }
+}
+
+// Reads a `DocIds` from the layout produced by its `WriteToBytes` impl:
+// a little-endian `u64` byte length followed by that many bytes.
+impl FromSharedDataCursor for DocIds {
+    // Spelled with `dyn`: the bare `Box<Error>` trait-object syntax is deprecated.
+    type Error = Box<dyn Error>;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIds, Self::Error> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let data = cursor.extract(len);
+
+        Ok(DocIds(data))
+    }
+}
+
+// Serialises as a little-endian `u64` byte length followed by the raw bytes.
+impl WriteToBytes for DocIds {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        let len = self.0.len() as u64;
+        // The destination is an in-memory `Vec<u8>`.
+        bytes.write_u64::<LittleEndian>(len).unwrap();
+        bytes.extend_from_slice(&self.0);
+    }
+}
--- a/meilidb-core/src/data/doc_indexes.rs
+++ b/meilidb-core/src/data/doc_indexes.rs
@ -0,0 +1,231 @@
+use std::io::{self, Write};
+use std::slice::from_raw_parts;
+use std::mem::size_of;
+use std::ops::Index;
+
+use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
+use sdset::Set;
+
+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
+use crate::data::SharedData;
+use crate::DocIndex;
+
+use super::into_u8_slice;
+
+// Half-open `[start, end)` range of positions inside the `DocIndex` slice
+// (element positions, not byte offsets). `#[repr(C)]` fixes the field order
+// because ranges are written to and read back from raw bytes.
+#[derive(Debug)]
+#[repr(C)]
+struct Range {
+    start: u64,
+    end: u64,
+}
+
+/// A list of `DocIndex` sets stored as two raw byte buffers.
+#[derive(Clone, Default)]
+pub struct DocIndexes {
+    // Raw bytes of the `Range` table, one entry per stored set.
+    ranges: SharedData,
+    // Raw bytes of every `DocIndex`, all sets concatenated.
+    indexes: SharedData,
+}
+
+impl DocIndexes {
+    /// Returns the `index`-th set, or `None` when there is no such range.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the stored range does not fit inside the indexes slice.
+    pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
+        self.ranges().get(index).map(|Range { start, end }| {
+            let start = *start as usize;
+            let end = *end as usize;
+            let slice = &self.indexes()[start..end];
+            // Ordering and uniqueness are not re-validated.
+            Set::new_unchecked(slice)
+        })
+    }
+
+    // Views the `ranges` bytes as a slice of `Range`.
+    fn ranges(&self) -> &[Range] {
+        let slice = &self.ranges;
+        let ptr = slice.as_ptr() as *const Range;
+        // Integer division: trailing bytes that do not form a whole `Range` are ignored.
+        let len = slice.len() / size_of::<Range>();
+        // SAFETY (review): relies on the bytes having been produced from
+        // `Range` values and on `ptr` being aligned for `Range`; the alignment
+        // is not checked here — confirm.
+        unsafe { from_raw_parts(ptr, len) }
+    }
+
+    // Views the `indexes` bytes as a slice of `DocIndex`.
+    fn indexes(&self) -> &[DocIndex] {
+        let slice = &self.indexes;
+        let ptr = slice.as_ptr() as *const DocIndex;
+        // Integer division: trailing bytes that do not form a whole `DocIndex` are ignored.
+        let len = slice.len() / size_of::<DocIndex>();
+        // SAFETY (review): relies on the bytes having been produced from
+        // `DocIndex` values and on `ptr` being aligned for `DocIndex`; the
+        // alignment is not checked here — confirm.
+        unsafe { from_raw_parts(ptr, len) }
+    }
+}
+
+// `docs[i]` is the panicking counterpart of `docs.get(i)`.
+impl Index<usize> for DocIndexes {
+    type Output = [DocIndex];
+
+    fn index(&self, index: usize) -> &Self::Output {
+        self.get(index).unwrap_or_else(|| {
+            panic!("index {} out of range for a maximum of {} ranges", index, self.ranges().len())
+        })
+    }
+}
+
+// Reads the layout written by `WriteToBytes`/`DocIndexesBuilder::into_inner`:
+// two sections, each a little-endian `u64` byte length followed by that many
+// bytes — first the ranges, then the indexes.
+impl FromSharedDataCursor for DocIndexes {
+    type Error = io::Error;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIndexes, Self::Error> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let ranges = cursor.extract(len);
+
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let indexes = cursor.extract(len);
+
+        Ok(DocIndexes { ranges, indexes })
+    }
+}
+
+// Serialises as two sections, each a little-endian `u64` byte length followed
+// by the raw bytes: first the ranges, then the indexes.
+impl WriteToBytes for DocIndexes {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        // `to_le_bytes` + `extend_from_slice` cannot fail, so there is no
+        // `io::Result` to discard or unwrap.
+        let ranges_len = self.ranges.len() as u64;
+        bytes.extend_from_slice(&ranges_len.to_le_bytes());
+        bytes.extend_from_slice(&self.ranges);
+
+        let indexes_len = self.indexes.len() as u64;
+        bytes.extend_from_slice(&indexes_len.to_le_bytes());
+        bytes.extend_from_slice(&self.indexes);
+    }
+}
+
+/// Collects `DocIndex` sets in memory and serialises them to `wtr` when
+/// `finish` or `into_inner` is called.
+pub struct DocIndexesBuilder<W> {
+    // One range per inserted set, in insertion order.
+    ranges: Vec<Range>,
+    // Every inserted `DocIndex`, all sets concatenated.
+    indexes: Vec<DocIndex>,
+    // Destination of the serialised bytes.
+    wtr: W,
+}
+
+impl DocIndexesBuilder<Vec<u8>> {
+    /// Creates a builder that serialises into an in-memory `Vec<u8>`.
+    pub fn memory() -> Self {
+        Self::new(Vec::new())
+    }
+}
+
+impl<W: Write> DocIndexesBuilder<W> {
+    /// Creates an empty builder that will serialise into `wtr`.
+    pub fn new(wtr: W) -> Self {
+        DocIndexesBuilder {
+            ranges: Vec::new(),
+            indexes: Vec::new(),
+            wtr: wtr,
+        }
+    }
+
+    /// Appends `indexes` as a new set, placed right after the previous one.
+    pub fn insert(&mut self, indexes: &Set<DocIndex>) {
+        let len = indexes.len() as u64;
+        // The new range starts where the last one ended (0 for the first set).
+        let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
+        let range = Range { start, end: start + len };
+        self.ranges.push(range);
+
+        self.indexes.extend_from_slice(indexes);
+    }
+
+    /// Serialises everything and drops the writer.
+    pub fn finish(self) -> io::Result<()> {
+        self.into_inner().map(drop)
+    }
+
+    /// Serialises everything and hands the writer back.
+    ///
+    /// Writes two sections, each a little-endian `u64` byte length followed by
+    /// the raw bytes: first the ranges, then the indexes.
+    pub fn into_inner(mut self) -> io::Result<W> {
+        // SAFETY (review): `Range` is `#[repr(C)]` with two `u64` fields.
+        let ranges = unsafe { into_u8_slice(&self.ranges) };
+        let len = ranges.len() as u64;
+        self.wtr.write_u64::<LittleEndian>(len)?;
+        self.wtr.write_all(ranges)?;
+
+        // SAFETY (review): assumes `DocIndex` has a stable layout without
+        // padding bytes — confirm against its definition.
+        let indexes = unsafe { into_u8_slice(&self.indexes) };
+        let len = indexes.len() as u64;
+        self.wtr.write_u64::<LittleEndian>(len)?;
+        self.wtr.write_all(indexes)?;
+
+        Ok(self.wtr)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::error::Error;
+    use crate::DocumentId;
+    use super::*;
+
+    // The three `DocIndex` values shared by both tests, in ascending
+    // document-id order.
+    fn sample_doc_indexes() -> (DocIndex, DocIndex, DocIndex) {
+        let a = DocIndex {
+            document_id: DocumentId(0),
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
+        };
+        let b = DocIndex {
+            document_id: DocumentId(1),
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
+        };
+        let c = DocIndex {
+            document_id: DocumentId(2),
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
+        };
+
+        (a, b, c)
+    }
+
+    // Sets written through the builder must be readable back, in order.
+    #[test]
+    fn builder_serialize_deserialize() -> Result<(), Box<dyn Error>> {
+        let (a, b, c) = sample_doc_indexes();
+
+        let mut builder = DocIndexesBuilder::memory();
+
+        builder.insert(Set::new(&[a])?);
+        builder.insert(Set::new(&[a, b, c])?);
+        builder.insert(Set::new(&[a, c])?);
+
+        let bytes = builder.into_inner()?;
+        let docs = DocIndexes::from_bytes(bytes)?;
+
+        assert_eq!(docs.get(0), Some(Set::new(&[a])?));
+        assert_eq!(docs.get(1), Some(Set::new(&[a, b, c])?));
+        assert_eq!(docs.get(2), Some(Set::new(&[a, c])?));
+        assert_eq!(docs.get(3), None);
+
+        Ok(())
+    }
+
+    // `write_to_bytes` must reproduce exactly the bytes the builder emitted.
+    #[test]
+    fn serialize_deserialize() -> Result<(), Box<dyn Error>> {
+        let (a, b, c) = sample_doc_indexes();
+
+        let mut builder = DocIndexesBuilder::memory();
+
+        builder.insert(Set::new(&[a])?);
+        builder.insert(Set::new(&[a, b, c])?);
+        builder.insert(Set::new(&[a, c])?);
+
+        let builder_bytes = builder.into_inner()?;
+        let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
+
+        let mut bytes = Vec::new();
+        docs.write_to_bytes(&mut bytes);
+
+        assert_eq!(builder_bytes, bytes);
+
+        Ok(())
+    }
+}
--- a/meilidb-core/src/data/mod.rs
+++ b/meilidb-core/src/data/mod.rs
@ -0,0 +1,16 @@
+mod doc_ids;
+mod doc_indexes;
+mod shared_data;
+
+use std::slice::from_raw_parts;
+use std::mem::size_of;
+
+pub use self::doc_ids::DocIds;
+pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
+pub use self::shared_data::SharedData;
+
+/// Views the memory of `slice` as raw bytes, without copying.
+///
+/// The returned slice borrows from `slice` and is
+/// `slice.len() * size_of::<T>()` bytes long.
+///
+/// # Safety
+///
+/// NOTE(review): every byte of every `T` is exposed as a `u8`, so callers
+/// should only pass types without padding bytes — confirm for each call site.
+unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
+    let ptr = slice.as_ptr() as *const u8;
+    let len = slice.len() * size_of::<T>();
+    from_raw_parts(ptr, len)
+}
--- a/meilidb-core/src/data/shared_data.rs
+++ b/meilidb-core/src/data/shared_data.rs
@ -0,0 +1,48 @@
+use std::sync::Arc;
+use std::ops::Deref;
+
+/// A cheaply clonable view over a sub-range of a reference-counted byte buffer.
+#[derive(Default, Clone)]
+pub struct SharedData {
+    /// The whole backing buffer, shared by every view derived from it.
+    pub bytes: Arc<Vec<u8>>,
+    /// Start of this view inside `bytes`.
+    pub offset: usize,
+    /// Number of bytes visible through this view.
+    pub len: usize,
+}
+
+impl SharedData {
+    /// Takes ownership of `vec` and returns a view covering all of it.
+    pub fn from_bytes(vec: Vec<u8>) -> SharedData {
+        let len = vec.len();
+        let bytes = Arc::new(vec);
+        SharedData::new(bytes, 0, len)
+    }
+
+    /// Creates a view of `len` bytes starting at `offset` inside `bytes`.
+    ///
+    /// The bounds are not checked here; an out-of-range view makes
+    /// `as_slice` panic.
+    pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
+        SharedData { bytes, offset, len }
+    }
+
+    /// The bytes visible through this view.
+    pub fn as_slice(&self) -> &[u8] {
+        &self.bytes[self.offset..self.offset + self.len]
+    }
+
+    /// Returns a sub-view of `len` bytes starting `offset` bytes into this
+    /// view, sharing the same backing buffer.
+    ///
+    /// # Panics
+    ///
+    /// Panics when `offset + len` exceeds the length of this view, including
+    /// when that sum overflows `usize`.
+    pub fn range(&self, offset: usize, len: usize) -> SharedData {
+        // `checked_add` keeps the bound check meaningful in release builds,
+        // where a plain `offset + len` would silently wrap around.
+        let in_bounds = offset
+            .checked_add(len)
+            .map_or(false, |end| end <= self.len);
+        assert!(
+            in_bounds,
+            "range of {} bytes at offset {} is out of bounds for shared data of length {}",
+            len, offset, self.len,
+        );
+
+        SharedData {
+            bytes: Arc::clone(&self.bytes),
+            offset: self.offset + offset,
+            len,
+        }
+    }
+}
+
+// Lets a `SharedData` be used wherever a `&[u8]` is expected; the target is
+// the visible view (`as_slice`), not the whole backing buffer.
+impl Deref for SharedData {
+    type Target = [u8];
+
+    fn deref(&self) -> &Self::Target {
+        self.as_slice()
+    }
+}
+
+// Same view as `Deref`: only the bytes of this view are exposed.
+impl AsRef<[u8]> for SharedData {
+    fn as_ref(&self) -> &[u8] {
+        self.as_slice()
+    }
+}
--- a/meilidb-core/src/distinct_map.rs
+++ b/meilidb-core/src/distinct_map.rs
@ -0,0 +1,104 @@
+use std::hash::Hash;
+
+use hashbrown::HashMap;
+
+/// Counts how many documents have been accepted for each distinct key.
+///
+/// `limit` is the maximum number of documents accepted for a single key and
+/// `len` is the total number of accepted documents, keyed or not.
+pub struct DistinctMap<K> {
+    inner: HashMap<K, usize>,
+    limit: usize,
+    len: usize,
+}
+
+impl<K: Hash + Eq> DistinctMap<K> {
+    /// Creates an empty map that accepts at most `limit` documents per key.
+    pub fn new(limit: usize) -> Self {
+        let inner = HashMap::new();
+        DistinctMap { inner, limit, len: 0 }
+    }
+
+    /// Returns the number of documents accepted so far.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+}
+
+/// A staging area on top of a `DistinctMap`.
+///
+/// Registrations are counted locally, taking the counts already stored in
+/// the internal map into account, and only reach the internal map when
+/// `transfert_to_internal` is called.
+pub struct BufferedDistinctMap<'a, K> {
+    internal: &'a mut DistinctMap<K>,
+    inner: HashMap<K, usize>,
+    len: usize,
+}
+
+impl<'a, K: Hash + Eq> BufferedDistinctMap<'a, K> {
+    /// Creates an empty buffer in front of `internal`.
+    pub fn new(internal: &'a mut DistinctMap<K>) -> BufferedDistinctMap<'a, K> {
+        let inner = HashMap::new();
+        BufferedDistinctMap { internal, inner, len: 0 }
+    }
+
+    /// Tries to register one more document for `key`.
+    ///
+    /// Returns `true` and counts the document when the key has been seen
+    /// fewer than `limit` times (internal and buffered counts combined),
+    /// `false` otherwise.
+    pub fn register(&mut self, key: K) -> bool {
+        let committed = self.internal.inner.get(&key).map_or(0, |count| *count);
+        let pending = self.inner.entry(key).or_insert(0);
+
+        if committed + *pending >= self.internal.limit {
+            return false;
+        }
+
+        *pending += 1;
+        self.len += 1;
+        true
+    }
+
+    /// Registers a document that has no distinct key; it is always accepted.
+    pub fn register_without_key(&mut self) -> bool {
+        self.len += 1;
+        true
+    }
+
+    /// Moves every buffered count into the internal map and empties the buffer.
+    pub fn transfert_to_internal(&mut self) {
+        for (key, count) in self.inner.drain() {
+            *self.internal.inner.entry(key).or_insert(0) += count;
+        }
+
+        self.internal.len += self.len;
+        self.len = 0;
+    }
+
+    /// Returns the number of accepted documents, internal and buffered.
+    pub fn len(&self) -> usize {
+        self.internal.len() + self.len
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Checks that a key is accepted at most `limit` times and that the
+    // accepted count reaches the internal map once it has been transferred.
+    #[test]
+    fn easy_distinct_map() {
+        let mut map = DistinctMap::new(2);
+        let mut buffered = BufferedDistinctMap::new(&mut map);
+
+        for x in &[1, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6] {
+            buffered.register(x);
+        }
+        buffered.transfert_to_internal();
+        // accepted: two of the 1s, one each of 2, 3, 4, 5 and two of the 6s
+        assert_eq!(map.len(), 8);
+
+        let mut map = DistinctMap::new(2);
+        let mut buffered = BufferedDistinctMap::new(&mut map);
+        assert_eq!(buffered.register(1), true);
+        assert_eq!(buffered.register(1), true);
+        assert_eq!(buffered.register(1), false);
+        assert_eq!(buffered.register(1), false);
+
+        assert_eq!(buffered.register(2), true);
+        assert_eq!(buffered.register(3), true);
+        assert_eq!(buffered.register(2), true);
+        assert_eq!(buffered.register(2), false);
+
+        buffered.transfert_to_internal();
+        // accepted: two 1s, two 2s and one 3
+        assert_eq!(map.len(), 5);
+    }
+}
--- a/meilidb-core/src/index.rs
+++ b/meilidb-core/src/index.rs
@ -0,0 +1,175 @@
+use std::error::Error;
+
+use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
+use fst::{map, Map, IntoStreamer, Streamer};
+use fst::raw::Fst;
+use sdset::duo::{Union, DifferenceByKey};
+use sdset::{Set, SetOperation};
+
+use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+use crate::write_to_bytes::WriteToBytes;
+use crate::data::{DocIndexes, DocIndexesBuilder};
+use crate::{DocumentId, DocIndex};
+
+/// An index made of an fst `map` whose values are positions into `indexes`,
+/// the list of `DocIndex` sets.
+#[derive(Default)]
+pub struct Index {
+    pub map: Map,
+    pub indexes: DocIndexes,
+}
+
+impl Index {
+    /// Builds a new `Index` from this one, filtering every set of
+    /// `DocIndex` against `documents` by document id.
+    ///
+    /// Keys whose filtered set is empty are not inserted in the new index.
+    ///
+    /// # Panics
+    ///
+    /// Panics if an insertion in the builder fails.
+    pub fn remove_documents(&self, documents: &Set<DocumentId>) -> Index {
+        let mut buffer = Vec::new();
+        let mut builder = IndexBuilder::new();
+        let mut stream = self.into_stream();
+
+        while let Some((key, indexes)) = stream.next() {
+            // the buffer is reused from one key to the next
+            buffer.clear();
+
+            // presumably keeps the entries whose document id is absent
+            // from `documents` (sdset difference by key)
+            let op = DifferenceByKey::new(indexes, documents, |x| x.document_id, |x| *x);
+            op.extend_vec(&mut buffer);
+
+            if !buffer.is_empty() {
+                let indexes = Set::new_unchecked(&buffer);
+                builder.insert(key, indexes).unwrap();
+            }
+        }
+
+        builder.build()
+    }
+
+    /// Builds a new `Index` that merges the entries of `self` and `other`.
+    ///
+    /// For a key found in both maps the two sets of `DocIndex` are unioned,
+    /// for a key found in only one map its set is copied as is.
+    ///
+    /// # Panics
+    ///
+    /// Panics if an insertion in the builder fails.
+    pub fn union(&self, other: &Index) -> Index {
+        let mut builder = IndexBuilder::new();
+        let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
+
+        let mut buffer = Vec::new();
+        while let Some((key, ivalues)) = stream.next() {
+            buffer.clear();
+            match ivalues {
+                // the key is present in both maps; `index` tells which map
+                // a value comes from: `self.map` was added first, so it is 0
+                [a, b] => {
+                    let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
+                    let indexes = &indexes[a.value as usize];
+                    let a = Set::new_unchecked(indexes);
+
+                    let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
+                    let indexes = &indexes[b.value as usize];
+                    let b = Set::new_unchecked(indexes);
+
+                    let op = Union::new(a, b);
+                    op.extend_vec(&mut buffer);
+                },
+                // the key is present in only one of the two maps
+                [x] => {
+                    let indexes = if x.index == 0 { &self.indexes } else { &other.indexes };
+                    let indexes = &indexes[x.value as usize];
+                    buffer.extend_from_slice(indexes)
+                },
+                // any other number of values is ignored
+                _ => continue,
+            }
+
+            if !buffer.is_empty() {
+                let indexes = Set::new_unchecked(&buffer);
+                builder.insert(key, indexes).unwrap();
+            }
+        }
+
+        builder.build()
+    }
+}
+
+impl FromSharedDataCursor for Index {
+    type Error = Box<Error>;
+
+    /// Reads an `Index` from `cursor`.
+    ///
+    /// The expected layout is: the byte length of the fst map as a
+    /// little-endian `u64`, the fst map itself, then the document indexes.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the length prefix cannot be read, when it
+    /// announces more bytes than the cursor still holds, or when the fst map
+    /// or the document indexes cannot be decoded.
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Index, Self::Error> {
+        let len = cursor.read_u64::<LittleEndian>()?;
+
+        // `extract` panics when asked for more bytes than are left, so a
+        // truncated or corrupted input is rejected here with an error instead
+        let remaining = std::io::BufRead::fill_buf(&mut *cursor)?.len();
+        if len > remaining as u64 {
+            return Err("the fst map length exceeds the remaining bytes".into());
+        }
+
+        // cannot truncate: `len` is at most `remaining`, which is a `usize`
+        let data = cursor.extract(len as usize);
+
+        let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
+        let map = Map::from(fst);
+
+        let indexes = DocIndexes::from_shared_data_cursor(cursor)?;
+
+        Ok(Index { map, indexes })
+    }
+}
+
+impl WriteToBytes for Index {
+    /// Appends the serialized index to `bytes`: the byte length of the fst
+    /// map as a little-endian `u64`, the fst map bytes, then the document
+    /// indexes.
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        let fst_bytes = self.map.as_fst().as_bytes();
+
+        // the result of the write is deliberately ignored
+        let _ = bytes.write_u64::<LittleEndian>(fst_bytes.len() as u64);
+        bytes.extend_from_slice(fst_bytes);
+
+        self.indexes.write_to_bytes(bytes);
+    }
+}
+
+/// Streams the entries of an `Index` as `(key, set of DocIndex)` pairs.
+impl<'m, 'a> IntoStreamer<'a> for &'m Index {
+    type Item = (&'a [u8], &'a Set<DocIndex>);
+    type Into = Stream<'m>;
+
+    fn into_stream(self) -> Self::Into {
+        Stream {
+            map_stream: self.map.into_stream(),
+            indexes: &self.indexes,
+        }
+    }
+}
+
+/// A stream over the entries of an `Index`.
+///
+/// It walks the fst map and resolves each value into the set of `DocIndex`
+/// stored at that position in `indexes`.
+pub struct Stream<'m> {
+    map_stream: map::Stream<'m>,
+    indexes: &'m DocIndexes,
+}
+
+impl<'m, 'a> Streamer<'a> for Stream<'m> {
+    type Item = (&'a [u8], &'a Set<DocIndex>);
+
+    /// Returns the next key along with its set of `DocIndex`,
+    /// or `None` once the map stream is exhausted.
+    fn next(&'a mut self) -> Option<Self::Item> {
+        match self.map_stream.next() {
+            Some((input, index)) => {
+                // the value stored in the map is a position in `indexes`
+                let indexes = &self.indexes[index as usize];
+                let indexes = Set::new_unchecked(indexes);
+                Some((input, indexes))
+            },
+            None => None,
+        }
+    }
+}
+
+/// Builds an `Index` in memory, one key at a time.
+///
+/// `value` is the position that the next inserted set of `DocIndex` will
+/// have; it is the value associated to the key in the fst map.
+pub struct IndexBuilder {
+    map: fst::MapBuilder<Vec<u8>>,
+    indexes: DocIndexesBuilder<Vec<u8>>,
+    value: u64,
+}
+
+impl IndexBuilder {
+    /// Creates an empty in-memory builder.
+    pub fn new() -> Self {
+        let map = fst::MapBuilder::memory();
+        let indexes = DocIndexesBuilder::memory();
+        IndexBuilder { map, indexes, value: 0 }
+    }
+
+    /// Associates `key` with the given set of `DocIndex`.
+    ///
+    /// If a key is inserted that is less than or equal to any previous key added,
+    /// then an error is returned. Similarly, if there was a problem writing
+    /// to the underlying writer, an error is returned.
+    // FIXME what if one of the writes fails but the other one succeeds?
+    pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> fst::Result<()>
+    where K: AsRef<[u8]>,
+    {
+        let position = self.value;
+
+        self.map.insert(key, position)?;
+        self.indexes.insert(indexes);
+        self.value = position + 1;
+
+        Ok(())
+    }
+
+    /// Finishes the construction and returns the `Index`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if either builder cannot be finished or if the produced bytes
+    /// cannot be read back.
+    pub fn build(self) -> Index {
+        let map_bytes = self.map.into_inner().unwrap();
+        let indexes_bytes = self.indexes.into_inner().unwrap();
+
+        let map = Map::from_bytes(map_bytes).unwrap();
+        let indexes = DocIndexes::from_bytes(indexes_bytes).unwrap();
+
+        Index { map, indexes }
+    }
+}
--- a/meilidb-core/src/lib.rs
+++ b/meilidb-core/src/lib.rs
@ -0,0 +1,297 @@
+pub mod criterion;
+pub mod data;
+mod index;
+mod automaton;
+mod query_builder;
+mod distinct_map;
+
+pub mod shared_data_cursor;
+pub mod write_to_bytes;
+
+use std::sync::Arc;
+use serde_derive::{Serialize, Deserialize};
+
+use slice_group_by::GroupBy;
+use rayon::slice::ParallelSliceMut;
+
+pub use self::index::{Index, IndexBuilder};
+pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
+
+/// Represents an internally generated unique document identifier.
+///
+/// It is used to tell the database which document you want to deserialize.
+/// Helpful for custom ranking.
+#[derive(Serialize, Deserialize)]
+#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
+pub struct DocumentId(pub u64);
+
+/// This structure represents the position of a word
+/// in a document and its attributes.
+///
+/// This is stored in the map, generated at index time,
+/// extracted and interpreted at search time.
+///
+/// The layout is `#[repr(C)]`; a test in this file checks that the
+/// structure is 24 bytes wide.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[repr(C)]
+pub struct DocIndex {
+    /// The document identifier where the word was found.
+    pub document_id: DocumentId,
+
+    /// The attribute in the document where the word was found
+    /// along with the index in it.
+    pub attribute: u16,
+    pub word_index: u16,
+
+    /// The position in bytes where the word was found
+    /// along with the length of it.
+    ///
+    /// It gives the area of the original word in the indexed text
+    /// without needing to run the tokenizer again.
+    pub char_index: u16,
+    pub char_length: u16,
+}
+
+/// This structure represents a matching word along with information
+/// on the location of the word in the document.
+///
+/// The order of the fields is important because it defines
+/// the way these structures are ordered between themselves.
+///
+/// The word itself is not stored.
+// TODO do data oriented programming ? very arrays ?
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Match {
+    /// The word index in the query sentence.
+    /// Same as the `attribute_index` but for the query words.
+    ///
+    /// Used to retrieve the automaton that matches this word.
+    pub query_index: u32,
+
+    /// The distance the word has with the query word
+    /// (i.e. the Levenshtein distance).
+    pub distance: u8,
+
+    /// The attribute in the document where the word was found
+    /// along with the index in it.
+    pub attribute: u16,
+    pub word_index: u16,
+
+    /// Whether the word that matches is an exact match or a prefix.
+    pub is_exact: bool,
+
+    /// The position in bytes where the word was found
+    /// along with the length of it.
+    ///
+    /// It gives the area of the original word in the indexed text
+    /// without needing to run the tokenizer again.
+    pub char_index: u16,
+    pub char_length: u16,
+}
+
+impl Match {
+    /// Returns the smallest possible `Match`: every field is at its minimum.
+    pub fn zero() -> Self {
+        Match {
+            query_index: 0,
+            distance: 0,
+            attribute: 0,
+            word_index: 0,
+            is_exact: false,
+            char_index: 0,
+            char_length: 0,
+        }
+    }
+
+    /// Returns the greatest possible `Match`: every field is at its maximum.
+    pub fn max() -> Self {
+        Match {
+            query_index: std::u32::MAX,
+            distance: std::u8::MAX,
+            attribute: std::u16::MAX,
+            word_index: std::u16::MAX,
+            is_exact: true,
+            char_index: std::u16::MAX,
+            char_length: std::u16::MAX,
+        }
+    }
+}
+
+/// A document along with all the matches found in it.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Document {
+    pub id: DocumentId,
+    pub matches: Vec<Match>,
+}
+
+impl Document {
+    /// Builds a `Document` by gathering the per-field columns of `raw`
+    /// back into a list of `Match`.
+    fn from_raw(raw: &RawDocument) -> Document {
+        let query_index = raw.query_index();
+        let distance = raw.distance();
+        let attribute = raw.attribute();
+        let word_index = raw.word_index();
+        let is_exact = raw.is_exact();
+        let char_index = raw.char_index();
+        let char_length = raw.char_length();
+
+        // every column above is a slice of exactly `len` elements
+        let len = raw.matches.range.len();
+        let matches = (0..len)
+            .map(|i| Match {
+                query_index: query_index[i],
+                distance: distance[i],
+                attribute: attribute[i],
+                word_index: word_index[i],
+                is_exact: is_exact[i],
+                char_index: char_index[i],
+                char_length: char_length[i],
+            })
+            .collect();
+
+        Document { id: raw.id, matches }
+    }
+}
+
+/// A document whose matches are stored as a range in shared,
+/// column-oriented storage.
+#[derive(Clone)]
+pub struct RawDocument {
+    pub id: DocumentId,
+    pub matches: SharedMatches,
+}
+
+impl RawDocument {
+    fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
+        let matches = SharedMatches { range, matches };
+        RawDocument { id, matches }
+    }
+
+    /// The `query_index` of every match of this document.
+    pub fn query_index(&self) -> &[u32] {
+        let Range { start, end } = self.matches.range;
+        // SAFETY: the range and the columns can only be constructed
+        // and modified in this module
+        unsafe { self.matches.matches.query_index.get_unchecked(start..end) }
+    }
+
+    /// The `distance` of every match of this document.
+    pub fn distance(&self) -> &[u8] {
+        let Range { start, end } = self.matches.range;
+        // SAFETY: the range and the columns can only be constructed
+        // and modified in this module
+        unsafe { self.matches.matches.distance.get_unchecked(start..end) }
+    }
+
+    /// The `attribute` of every match of this document.
+    pub fn attribute(&self) -> &[u16] {
+        let Range { start, end } = self.matches.range;
+        // SAFETY: the range and the columns can only be constructed
+        // and modified in this module
+        unsafe { self.matches.matches.attribute.get_unchecked(start..end) }
+    }
+
+    /// The `word_index` of every match of this document.
+    pub fn word_index(&self) -> &[u16] {
+        let Range { start, end } = self.matches.range;
+        // SAFETY: the range and the columns can only be constructed
+        // and modified in this module
+        unsafe { self.matches.matches.word_index.get_unchecked(start..end) }
+    }
+
+    /// The `is_exact` flag of every match of this document.
+    pub fn is_exact(&self) -> &[bool] {
+        let Range { start, end } = self.matches.range;
+        // SAFETY: the range and the columns can only be constructed
+        // and modified in this module
+        unsafe { self.matches.matches.is_exact.get_unchecked(start..end) }
+    }
+
+    /// The `char_index` of every match of this document.
+    pub fn char_index(&self) -> &[u16] {
+        let Range { start, end } = self.matches.range;
+        // SAFETY: the range and the columns can only be constructed
+        // and modified in this module
+        unsafe { self.matches.matches.char_index.get_unchecked(start..end) }
+    }
+
+    /// The `char_length` of every match of this document.
+    pub fn char_length(&self) -> &[u16] {
+        let Range { start, end } = self.matches.range;
+        // SAFETY: the range and the columns can only be constructed
+        // and modified in this module
+        unsafe { self.matches.matches.char_length.get_unchecked(start..end) }
+    }
+}
+
+/// Groups `matches` by document and returns one `RawDocument` per document.
+///
+/// The matches are sorted first, then stored in a single column-oriented
+/// `Matches` shared by every returned document; each document only keeps
+/// the range of its own matches.
+pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
+    let mut columns = Matches::with_capacity(matches.len());
+    let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
+
+    matches.par_sort_unstable();
+
+    // a group starts where the previous one ended
+    let mut start = 0;
+    for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
+        let end = start + group.len();
+        docs_ranges.push((group[0].0, Range { start, end }));
+        columns.extend_from_slice(group);
+        start = end;
+    }
+
+    let shared = Arc::new(columns);
+    docs_ranges
+        .into_iter()
+        .map(|(id, range)| RawDocument::new(id, range, shared.clone()))
+        .collect()
+}
+
+/// A half-open `start..end` range of positions in the `Matches` columns.
+#[derive(Debug, Copy, Clone)]
+struct Range {
+    start: usize,
+    end: usize,
+}
+
+impl Range {
+    /// Returns the number of positions covered by the range.
+    fn len(self) -> usize {
+        let Range { start, end } = self;
+        end - start
+    }
+}
+
+/// The matches of one document: a range in a `Matches` storage
+/// that is shared between documents.
+#[derive(Clone)]
+pub struct SharedMatches {
+    range: Range,
+    matches: Arc<Matches>,
+}
+
+/// Column-oriented storage of matches: one vector per field of `Match`,
+/// all of them having the same length.
+#[derive(Clone)]
+struct Matches {
+    query_index: Vec<u32>,
+    distance: Vec<u8>,
+    attribute: Vec<u16>,
+    word_index: Vec<u16>,
+    is_exact: Vec<bool>,
+    char_index: Vec<u16>,
+    char_length: Vec<u16>,
+}
+
+impl Matches {
+    /// Creates empty columns, each able to hold `cap` matches
+    /// without reallocating.
+    fn with_capacity(cap: usize) -> Matches {
+        Matches {
+            query_index: Vec::with_capacity(cap),
+            distance: Vec::with_capacity(cap),
+            attribute: Vec::with_capacity(cap),
+            word_index: Vec::with_capacity(cap),
+            is_exact: Vec::with_capacity(cap),
+            char_index: Vec::with_capacity(cap),
+            char_length: Vec::with_capacity(cap),
+        }
+    }
+
+    /// Appends every match of `matches` to the columns, in order.
+    /// The document ids are ignored.
+    fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
+        for &(_, found) in matches {
+            self.query_index.push(found.query_index);
+            self.distance.push(found.distance);
+            self.attribute.push(found.attribute);
+            self.word_index.push(found.word_index);
+            self.is_exact.push(found.is_exact);
+            self.char_index.push(found.char_index);
+            self.char_length.push(found.char_length);
+        }
+    }
+}
+
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::mem;
+
+    // Guards the in-memory size of `DocIndex` against accidental growth.
+    #[test]
+    fn docindex_mem_size() {
+        assert_eq!(mem::size_of::<DocIndex>(), 24);
+    }
+}
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@ -0,0 +1,357 @@
+use std::{cmp, mem};
+use std::ops::Range;
+use std::time::Instant;
+use std::hash::Hash;
+use std::rc::Rc;
+
+use rayon::slice::ParallelSliceMut;
+use slice_group_by::{GroupByMut, LinearStrGroupBy};
+use hashbrown::{HashMap, HashSet};
+use fst::Streamer;
+use log::info;
+
+use crate::automaton::{self, DfaExt, AutomatonExt};
+use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
+use crate::criterion::Criteria;
+use crate::{raw_documents_from_matches, RawDocument, Document};
+use crate::{Index, Match, DocumentId};
+
+// query splitting must move out of this crate
+/// Returns `true` when `c` belongs to one of the CJK ranges listed below.
+pub fn is_cjk(c: char) -> bool {
+    match c {
+        '\u{2e80}'..='\u{2eff}'     // CJK Radicals Supplement
+        | '\u{2f00}'..='\u{2fdf}'   // Kangxi Radicals
+        | '\u{3040}'..='\u{309f}'   // Hiragana
+        | '\u{30a0}'..='\u{30ff}'   // Katakana
+        | '\u{3100}'..='\u{312f}'   // Bopomofo
+        | '\u{3200}'..='\u{32ff}'   // Enclosed CJK Letters and Months
+        | '\u{3400}'..='\u{4dbf}'   // CJK Unified Ideographs Extension A
+        | '\u{4e00}'..='\u{9fff}'   // CJK Unified Ideographs
+        | '\u{f900}'..='\u{faff}' => true, // CJK Compatibility Ideographs
+        _ => false,
+    }
+}
+
+/// The category of a character, used to split a query into groups.
+#[derive(Debug, PartialEq, Eq)]
+enum CharCategory {
+    /// A whitespace character.
+    Space,
+    /// A character for which `is_cjk` returns `true`.
+    Cjk,
+    /// Any other character.
+    Other,
+}
+
+/// Returns the category of `c`; whitespace is tested before CJK.
+fn classify_char(c: char) -> CharCategory {
+    if c.is_whitespace() {
+        return CharCategory::Space;
+    }
+
+    match is_cjk(c) {
+        true => CharCategory::Cjk,
+        false => CharCategory::Other,
+    }
+}
+
+/// Returns `true` when `s` contains no whitespace character at all
+/// (which is the case of the empty string).
+fn is_word(s: &&str) -> bool {
+    s.chars().all(|c| !c.is_whitespace())
+}
+
+/// Tells whether two consecutive characters belong to the same group.
+///
+/// A CJK character never groups with its neighbour, so each CJK character
+/// ends up alone; otherwise characters group when they share a category.
+fn same_group_category(a: char, b: char) -> bool {
+    match (classify_char(a), classify_char(b)) {
+        (CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
+        (left, right) => left == right,
+    }
+}
+
+/// Splits `query` into lowercased words and builds one automaton per word.
+///
+/// The last word is searched as a prefix unless the query ends with a
+/// whitespace or the word is only made of CJK characters; every other
+/// word gets a non-prefix automaton.
+fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
+    let ends_with_whitespace = query.ends_with(char::is_whitespace);
+    let mut words = LinearStrGroupBy::new(query, same_group_category)
+        .filter(is_word)
+        .map(str::to_lowercase)
+        .peekable();
+
+    let mut automatons = Vec::new();
+    while let Some(word) = words.next() {
+        let is_last_word = words.peek().is_none();
+        let is_prefix = is_last_word && !ends_with_whitespace && !word.chars().all(is_cjk);
+
+        let dfa = if is_prefix {
+            automaton::build_prefix_dfa(&word)
+        } else {
+            automaton::build_dfa(&word)
+        };
+        automatons.push(dfa);
+    }
+
+    automatons
+}
+
+/// The filter type of a `QueryBuilder` created without a custom filter.
+pub type FilterFunc = fn(DocumentId) -> bool;
+
+/// Describes a search to run against an `Index`.
+pub struct QueryBuilder<'i, 'c, FI> {
+    /// The index to search in.
+    index: &'i Index,
+    /// The criteria used to sort the documents.
+    criteria: Criteria<'c>,
+    /// The attributes to keep matches from; `None` keeps all of them.
+    searchable_attrs: Option<HashSet<u16>>,
+    /// An optional function that tells whether a document must be kept.
+    filter: Option<FI>,
+}
+
+impl<'i, 'c> QueryBuilder<'i, 'c, FilterFunc> {
+    /// Creates a builder over `index` that uses the default criteria.
+    pub fn new(index: &'i Index) -> Self {
+        Self::with_criteria(index, Criteria::default())
+    }
+
+    /// Creates a builder over `index` that sorts documents with `criteria`.
+    ///
+    /// The builder has no filter and every attribute is searchable.
+    pub fn with_criteria(index: &'i Index, criteria: Criteria<'c>) -> Self {
+        QueryBuilder {
+            index,
+            criteria,
+            searchable_attrs: None,
+            filter: None,
+        }
+    }
+}
+
+impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
+{
+    /// Replaces the filter of this builder with `function`.
+    pub fn with_filter<F>(self, function: F) -> QueryBuilder<'i, 'c, F>
+    where F: Fn(DocumentId) -> bool,
+    {
+        let QueryBuilder { index, criteria, searchable_attrs, .. } = self;
+        QueryBuilder { index, criteria, searchable_attrs, filter: Some(function) }
+    }
+
+    /// Wraps this builder into one that returns at most `size` documents
+    /// for each key produced by `function`.
+    pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'i, 'c, FI, F>
+    where F: Fn(DocumentId) -> Option<K>,
+          K: Hash + Eq,
+    {
+        DistinctQueryBuilder { inner: self, function, size }
+    }
+
+    /// Restricts the search to `attribute`, in addition to the attributes
+    /// already added this way.
+    pub fn add_searchable_attribute(&mut self, attribute: u16) {
+        self.searchable_attrs
+            .get_or_insert_with(HashSet::new)
+            .insert(attribute);
+    }
+
+    /// Collects every match of `query` in the index and groups them
+    /// by document, without sorting the documents.
+    fn query_all(&self, query: &str) -> Vec<RawDocument> {
+        let automatons = split_whitespace_automatons(query);
+
+        // one search stream per query word, merged into a single union
+        let mut op_builder = fst::map::OpBuilder::new();
+        for automaton in &automatons {
+            op_builder.push(self.index.map.search(automaton));
+        }
+        let mut stream = op_builder.union();
+
+        let mut matches = Vec::new();
+
+        while let Some((input, indexed_values)) = stream.next() {
+            for iv in indexed_values {
+                // `iv.index` is the position of the automaton that matched
+                let automaton = &automatons[iv.index];
+                let distance = automaton.eval(input).to_u8();
+                let is_exact = distance == 0 && input.len() == automaton.query_len();
+
+                let doc_indexes = &self.index.indexes[iv.value as usize];
+
+                for di in doc_indexes {
+                    let is_searchable = match &self.searchable_attrs {
+                        Some(attributes) => attributes.contains(&di.attribute),
+                        None => true,
+                    };
+                    if !is_searchable { continue }
+
+                    let match_ = Match {
+                        query_index: iv.index as u32,
+                        distance,
+                        attribute: di.attribute,
+                        word_index: di.word_index,
+                        is_exact,
+                        char_index: di.char_index,
+                        char_length: di.char_length,
+                    };
+                    matches.push((di.document_id, match_));
+                }
+            }
+        }
+
+        let total_matches = matches.len();
+        let raw_documents = raw_documents_from_matches(matches);
+
+        info!("{} total documents to classify", raw_documents.len());
+        info!("{} total matches to classify", total_matches);
+
+        raw_documents
+    }
+}
+
+impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
+where FI: Fn(DocumentId) -> bool,
+{
+    /// Runs `query` and returns the documents located in `range` once
+    /// sorted by the criteria.
+    ///
+    /// Only the groups of documents that can reach the requested range are
+    /// sorted; the order of the other documents is left unspecified.
+    pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
+        // We delegate the filter work to the distinct query builder,
+        // specifying a distinct rule that has no effect.
+        if self.filter.is_some() {
+            let builder = self.with_distinct(|_| None as Option<()>, 1);
+            return builder.query(query, range);
+        }
+
+        let start = Instant::now();
+        let mut documents = self.query_all(query);
+        info!("query_all took {:.2?}", start.elapsed());
+
+        // the documents start as a single group that each criterion
+        // sorts and splits into groups of documents it considers equal
+        let mut groups = vec![documents.as_mut_slice()];
+
+        'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
+            let tmp_groups = mem::replace(&mut groups, Vec::new());
+            let mut documents_seen = 0;
+
+            for group in tmp_groups {
+                info!("criterion {}, documents group of size {}", ci, group.len());
+
+                // if this group does not overlap with the requested range,
+                // push it without sorting and splitting it
+                // NOTE(review): a group that ends exactly at `range.start` is still
+                // sorted here; `<=` looks sufficient — confirm before changing
+                if documents_seen + group.len() < range.start {
+                    documents_seen += group.len();
+                    groups.push(group);
+                    continue;
+                }
+
+                let start = Instant::now();
+                group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
+                info!("criterion {} sort took {:.2?}", ci, start.elapsed());
+
+                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
+                    documents_seen += group.len();
+                    groups.push(group);
+
+                    // we have sorted enough documents if the last document sorted is after
+                    // the end of the requested range, we can continue to the next criterion
+                    if documents_seen >= range.end { continue 'criteria }
+                }
+            }
+        }
+
+        // `range.start` may be past the last document
+        let offset = cmp::min(documents.len(), range.start);
+        let iter = documents.into_iter().skip(offset).take(range.len());
+        iter.map(|d| Document::from_raw(&d)).collect()
+    }
+}
+
+/// A `QueryBuilder` that also limits the number of documents returned
+/// for each distinct key.
+///
+/// `function` gives the key of a document, if any, and `size` is the
+/// maximum number of documents kept for a single key.
+pub struct DistinctQueryBuilder<'i, 'c, FI, FD> {
+    inner: QueryBuilder<'i, 'c, FI>,
+    function: FD,
+    size: usize,
+}
+
+impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD>
+{
+    /// Replaces the filter of the inner builder with `function`.
+    pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'i, 'c, F, FD>
+    where F: Fn(DocumentId) -> bool,
+    {
+        let DistinctQueryBuilder { inner, function: distinct, size } = self;
+        DistinctQueryBuilder {
+            inner: inner.with_filter(function),
+            function: distinct,
+            size,
+        }
+    }
+
+    /// Restricts the search to `attribute`, see the inner `QueryBuilder`.
+    pub fn add_searchable_attribute(&mut self, attribute: u16) {
+        self.inner.add_searchable_attribute(attribute);
+    }
+}
+
+impl<'i, 'c, FI, FD, K> DistinctQueryBuilder<'i, 'c, FI, FD>
+where FI: Fn(DocumentId) -> bool,
+      FD: Fn(DocumentId) -> Option<K>,
+      K: Hash + Eq,
+{
+    /// Runs `query` and returns the documents located in `range` once
+    /// sorted by the criteria, filtered, and limited to `size` documents
+    /// per distinct key.
+    ///
+    /// The bounds of `range` count accepted documents only: a document
+    /// rejected by the filter or by the distinct rule does not count.
+    /// An empty `range` returns no document.
+    pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
+        // an empty range can never be filled: without this guard the final loop
+        // below never reaches its stop condition and walks documents whose filter
+        // result and distinct key were never cached, triggering the "BUG" panics
+        if range.start >= range.end {
+            return Vec::new();
+        }
+
+        let start = Instant::now();
+        let mut documents = self.inner.query_all(query);
+        info!("query_all took {:.2?}", start.elapsed());
+
+        let mut groups = vec![documents.as_mut_slice()];
+        // the distinct key of each document seen, computed only once
+        let mut key_cache = HashMap::new();
+
+        // the filter result of each document seen, computed only once
+        let mut filter_map = HashMap::new();
+        // these two variables give the current distinct map and
+        // the raw offset of the start of the group where the
+        // range.start bound is located according to the distinct function
+        let mut distinct_map = DistinctMap::new(self.size);
+        let mut distinct_raw_offset = 0;
+
+        'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() {
+            let tmp_groups = mem::replace(&mut groups, Vec::new());
+            let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
+            let mut documents_seen = 0;
+
+            for group in tmp_groups {
+                info!("criterion {}, documents group of size {}", ci, group.len());
+
+                // if this group does not overlap with the requested range,
+                // push it without sorting and splitting it
+                if documents_seen + group.len() < distinct_raw_offset {
+                    documents_seen += group.len();
+                    groups.push(group);
+                    continue;
+                }
+
+                let start = Instant::now();
+                group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
+                info!("criterion {} sort took {:.2?}", ci, start.elapsed());
+
+                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
+                    // we must compute the real distinguished len of this sub-group
+                    for document in group.iter() {
+                        let filter_accepted = match &self.inner.filter {
+                            Some(filter) => {
+                                let entry = filter_map.entry(document.id);
+                                *entry.or_insert_with(|| (filter)(document.id))
+                            },
+                            None => true,
+                        };
+
+                        if filter_accepted {
+                            let entry = key_cache.entry(document.id);
+                            let key = entry.or_insert_with(|| (self.function)(document.id).map(Rc::new));
+
+                            match key.clone() {
+                                Some(key) => buf_distinct.register(key),
+                                None => buf_distinct.register_without_key(),
+                            };
+                        }
+
+                        // the requested range end is reached: stop computing distinct
+                        if buf_distinct.len() >= range.end { break }
+                    }
+
+                    documents_seen += group.len();
+                    groups.push(group);
+
+                    // if this sub-group does not overlap with the requested range
+                    // we must update the distinct map and its start index
+                    if buf_distinct.len() < range.start {
+                        buf_distinct.transfert_to_internal();
+                        distinct_raw_offset = documents_seen;
+                    }
+
+                    // we have sorted enough documents if the last document sorted is after
+                    // the end of the requested range, we can continue to the next criterion
+                    if buf_distinct.len() >= range.end { continue 'criteria }
+                }
+            }
+        }
+
+        let mut out_documents = Vec::with_capacity(range.len());
+        let mut seen = BufferedDistinctMap::new(&mut distinct_map);
+
+        // the documents located before `distinct_raw_offset` are already accounted
+        // for in `distinct_map`; the cached results are replayed from there
+        for document in documents.into_iter().skip(distinct_raw_offset) {
+            let filter_accepted = match &self.inner.filter {
+                Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"),
+                None => true,
+            };
+
+            if filter_accepted {
+                let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
+                let distinct_accepted = match key {
+                    Some(key) => seen.register(key),
+                    None => seen.register_without_key(),
+                };
+
+                if distinct_accepted && seen.len() > range.start {
+                    out_documents.push(Document::from_raw(&document));
+                    if out_documents.len() == range.len() { break }
+                }
+            }
+        }
+
+        out_documents
+    }
+}
--- a/meilidb-core/src/shared_data_cursor.rs
+++ b/meilidb-core/src/shared_data_cursor.rs
@ -0,0 +1,56 @@
+use std::io::{self, Read, Cursor, BufRead};
+use std::sync::Arc;
+use crate::data::SharedData;
+
+/// A reading cursor over a `SharedData`.
+///
+/// On top of the usual `Read` and `BufRead` operations it can `extract`
+/// a part of the data without copying the bytes.
+pub struct SharedDataCursor(Cursor<SharedData>);
+
+impl SharedDataCursor {
+    /// Creates a cursor positioned at the start of `bytes`.
+    pub fn from_bytes(bytes: Vec<u8>) -> SharedDataCursor {
+        let len = bytes.len();
+        SharedDataCursor::from_shared_bytes(Arc::new(bytes), 0, len)
+    }
+
+    /// Creates a cursor over the `len` bytes located at `offset` in `bytes`.
+    pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedDataCursor {
+        SharedDataCursor(Cursor::new(SharedData::new(bytes, offset, len)))
+    }
+
+    /// Returns the next `amt` bytes as a `SharedData` and moves the cursor
+    /// past them.
+    ///
+    /// # Panics
+    ///
+    /// Panics if fewer than `amt` bytes are left, see `SharedData::range`.
+    pub fn extract(&mut self, amt: usize) -> SharedData {
+        let start = self.0.position() as usize;
+        let extracted = self.0.get_ref().range(start, amt);
+        self.0.consume(amt);
+
+        extracted
+    }
+}
+
+// Reading is delegated to the inner cursor.
+impl Read for SharedDataCursor {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        self.0.read(buf)
+    }
+}
+
+// Buffered reading is delegated to the inner cursor.
+impl BufRead for SharedDataCursor {
+    fn fill_buf(&mut self) -> io::Result<&[u8]> {
+        self.0.fill_buf()
+    }
+
+    fn consume(&mut self, amt: usize) {
+        self.0.consume(amt)
+    }
+}
+
+/// Types that can be decoded from a `SharedDataCursor`.
+pub trait FromSharedDataCursor: Sized {
+    /// The error returned when the decoding fails.
+    type Error;
+
+    /// Decodes a value from the current position of `cursor`.
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error>;
+
+    /// Decodes a value from the start of `bytes`.
+    fn from_bytes(bytes: Vec<u8>) -> Result<Self, Self::Error> {
+        Self::from_shared_data_cursor(&mut SharedDataCursor::from_bytes(bytes))
+    }
+}
--- a/meilidb-core/src/write_to_bytes.rs
+++ b/meilidb-core/src/write_to_bytes.rs
@ -0,0 +1,9 @@
+/// Types that can be serialized into a vector of bytes.
+pub trait WriteToBytes {
+    /// Appends the serialized form of `self` to `bytes`.
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>);
+
+    /// Returns the serialized form of `self` in a new vector.
+    fn into_bytes(&self) -> Vec<u8> {
+        let mut buffer = Vec::new();
+        self.write_to_bytes(&mut buffer);
+        buffer
+    }
+}