Mirror of https://github.com/meilisearch/MeiliSearch (synced 2025-07-03 11:57:07 +02:00)

commit 7a668dde98 (parent 2944368897)
chore: Make the repo use examples and keep the library

33 changed files with 1456 additions and 551 deletions
src/automaton.rs (new file, 79 lines)
@@ -0,0 +1,79 @@
use std::ops::Deref;

use fst::Automaton;
use levenshtein_automata::{
    LevenshteinAutomatonBuilder as LevBuilder,
    DFA, Distance,
};

lazy_static! {
    static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
    static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
    static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}

pub struct DfaExt {
    query_len: usize,
    automaton: DFA,
}

impl Automaton for DfaExt {
    type State = <DFA as Automaton>::State;

    fn start(&self) -> Self::State {
        self.automaton.start()
    }

    fn is_match(&self, state: &Self::State) -> bool {
        self.automaton.is_match(state)
    }

    fn can_match(&self, state: &Self::State) -> bool {
        self.automaton.can_match(state)
    }

    fn will_always_match(&self, state: &Self::State) -> bool {
        self.automaton.will_always_match(state)
    }

    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
        self.automaton.accept(state, byte)
    }
}

impl AutomatonExt for DfaExt {
    fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
        self.automaton.eval(s)
    }

    fn query_len(&self) -> usize {
        self.query_len
    }
}

pub fn build(query: &str) -> DfaExt {
    let dfa = match query.len() {
        0 ..= 4 => LEVDIST0.build_prefix_dfa(query),
        5 ..= 8 => LEVDIST1.build_prefix_dfa(query),
        _ => LEVDIST2.build_prefix_dfa(query),
    };

    DfaExt { query_len: query.len(), automaton: dfa }
}

pub trait AutomatonExt: Automaton {
    fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
    fn query_len(&self) -> usize;
}

impl<T> AutomatonExt for T
where T: Deref,
      T::Target: AutomatonExt,
{
    fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
        (**self).eval(s)
    }

    fn query_len(&self) -> usize {
        (**self).query_len()
    }
}
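A minimal usage sketch of this module (not part of the commit; it assumes the library is consumed as the `raptor` crate, as main.rs does, and that an `fst::Set` of indexed words exists): `build` picks the typo tolerance from the query length, and the resulting prefix DFA can drive an fst search because `DfaExt` implements `Automaton`.

use fst::{IntoStreamer, Set, Streamer};

// Hypothetical helper: collect the words of `set` within the allowed distance of `query`.
fn fuzzy_words(set: &Set, query: &str) -> Vec<String> {
    // 0 typos for 0..=4 chars, 1 for 5..=8, 2 beyond — see `build` above.
    let dfa = raptor::automaton::build(query);
    let mut stream = set.search(dfa).into_stream();
    let mut words = Vec::new();
    while let Some(word) = stream.next() {
        words.push(String::from_utf8_lossy(word).into_owned());
    }
    words
}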
src/common_words.rs (modified)
@@ -3,18 +3,24 @@ use std::collections::HashSet;
 use std::path::Path;
 use std::fs::File;
 
-pub type CommonWords = HashSet<String>;
+#[derive(Debug)]
+pub struct CommonWords(HashSet<String>);
 
-pub fn from_file<P>(path: P) -> io::Result<CommonWords>
-    where P: AsRef<Path>,
-{
-    let file = File::open(path)?;
-    let file = BufReader::new(file);
-    let mut set = HashSet::new();
-    for line in file.lines().filter_map(|l| l.ok()) {
-        for word in line.split_whitespace() {
-            set.insert(word.to_owned());
-        }
-    }
-    Ok(set)
+impl CommonWords {
+    pub fn from_file<P>(path: P) -> io::Result<Self>
+        where P: AsRef<Path>
+    {
+        let file = File::open(path)?;
+        let file = BufReader::new(file);
+        let mut set = HashSet::new();
+        for line in file.lines().filter_map(|l| l.ok()) {
+            let word = line.trim().to_owned();
+            set.insert(word);
+        }
+        Ok(CommonWords(set))
+    }
+
+    pub fn contains(&self, word: &str) -> bool {
+        self.0.contains(word)
+    }
 }
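After this change the set is opaque, so callers go through `contains` instead of touching the `HashSet` directly. A minimal sketch (not from the commit; it assumes the `raptor` re-export shown later in src/lib.rs and a hypothetical stop-words file):

use raptor::CommonWords;

fn main() -> std::io::Result<()> {
    let common = CommonWords::from_file("stop-words.txt")?; // hypothetical path
    for word in "the quick brown fox".split_whitespace() {
        if !common.contains(word) {
            println!("indexable word: {}", word);
        }
    }
    Ok(())
}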
src/index/csv.rs (deleted, 122 lines)
@@ -1,122 +0,0 @@
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::fs::File;
use std::io;

use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions};
use raptor::{MetadataBuilder, DocIndex, Tokenizer};
use unidecode::unidecode;
use csv::ReaderBuilder;

use crate::common_words::{self, CommonWords};
use crate::index::csv_feature::CommandCsv;

#[derive(Debug, Deserialize)]
struct Product {
    #[serde(rename = "_unit_id")]
    id: u64,
    #[serde(rename = "product_title")]
    title: String,
    #[serde(rename = "product_image")]
    image: String,
    #[serde(rename = "product_description")]
    description: String,
}

#[derive(Debug)]
pub struct CsvIndexer {
    common_words: CommonWords,
    products: PathBuf,
}

impl CsvIndexer {
    pub fn from_command(command: CommandCsv) -> io::Result<CsvIndexer> {
        let common_words = common_words::from_file(command.stop_words)?;
        let products = command.products;

        Ok(CsvIndexer { common_words, products })
    }

    pub fn index(self) {
        let random_name = moby_name_gen::random_name();
        let map_file = format!("{}.map", random_name);
        let idx_file = format!("{}.idx", random_name);
        let sst_file = format!("{}.sst", random_name);

        let env_options = EnvOptions::new();
        let cf_options = ColumnFamilyOptions::new();
        let mut sst_file_writer = SstFileWriter::new(env_options, cf_options);
        sst_file_writer.open(&sst_file).expect("open the sst file");

        let map = File::create(&map_file).unwrap();
        let indexes = File::create(&idx_file).unwrap();
        let mut builder = MetadataBuilder::new(map, indexes);
        let mut fields = BTreeMap::new();

        let mut rdr = ReaderBuilder::new().from_path(&self.products).expect("reading product file");
        let mut errors = 0;

        for result in rdr.deserialize() {
            let product: Product = match result {
                Ok(product) => product,
                Err(e) => { eprintln!("{:?}", e); errors += 1; continue },
            };

            let title = Tokenizer::new(&product.title);
            let title = title.iter().filter(|&(_, w)| !self.common_words.contains(w));
            insert_document_words(&mut builder, product.id, 0, title);

            let description = Tokenizer::new(&product.description);
            let description = description.iter().filter(|&(_, w)| !self.common_words.contains(w));
            insert_document_words(&mut builder, product.id, 1, description);

            // TODO simplify this by using functions and
            // use the MetadataBuilder internal BTreeMap ?
            let key = format!("{}-title", product.id);
            let value = product.title;
            fields.insert(key, value);

            let key = format!("{}-description", product.id);
            let value = product.description;
            fields.insert(key, value);

            let key = format!("{}-image", product.id);
            let value = product.image;
            fields.insert(key, value);
        }

        for (key, value) in fields {
            sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap();
        }
        let _sst_file_info = sst_file_writer.finish().unwrap();

        builder.finish().unwrap();

        println!("Found {} erroneous lines", errors);
        println!("Successfully created {:?} dump.", random_name);
    }
}

fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_index: u64, attr: u8, words: I)
where A: io::Write,
      B: io::Write,
      I: IntoIterator<Item=(usize, &'a str)>,
{
    for (index, word) in words {
        let doc_index = DocIndex {
            document: doc_index,
            attribute: attr,
            attribute_index: index as u32,
        };
        // insert the exact representation
        let word_lower = word.to_lowercase();

        // and the unidecoded lowercased version
        let word_unidecoded = unidecode(word).to_lowercase();
        if word_lower != word_unidecoded {
            builder.insert(word_unidecoded, doc_index);
        }

        builder.insert(word_lower, doc_index);
    }
}
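The dual insertion in `insert_document_words` above is what makes accent-insensitive search work: each token is stored under its exact lowercased form and, when that differs, under an ASCII-folded form as well. A tiny standalone illustration (not from the commit; it only assumes the `unidecode` crate the indexer already depends on):

use unidecode::unidecode;

fn main() {
    let word = "Déjà";
    let word_lower = word.to_lowercase();                 // "déjà" — the exact form
    let word_unidecoded = unidecode(word).to_lowercase(); // "deja" — the ASCII fallback
    assert_ne!(word_lower, word_unidecoded);              // both keys receive the DocIndex

    // Plain ASCII words fold to themselves, so they are stored only once.
    assert_eq!(unidecode("Rust").to_lowercase(), "rust");
}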
src/index/json_lines.rs (deleted, 112 lines)
@@ -1,112 +0,0 @@
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::fs::File;
use std::io::{self, BufReader, BufRead};

use serde_json::from_str;
use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions};
use raptor::{MetadataBuilder, DocIndex, Tokenizer};
use unidecode::unidecode;

use crate::common_words::{self, CommonWords};
use crate::index::jsonlines_feature::CommandJsonLines;

#[derive(Debug, Deserialize)]
struct Product {
    title: String,
    group_id: u64,
    ft: String,
}

#[derive(Debug)]
pub struct JsonLinesIndexer {
    common_words: CommonWords,
    products: PathBuf,
}

impl JsonLinesIndexer {
    pub fn from_command(command: CommandJsonLines) -> io::Result<JsonLinesIndexer> {
        let common_words = common_words::from_file(command.stop_words)?;
        let products = command.products;

        Ok(JsonLinesIndexer { common_words, products })
    }

    pub fn index(self) {
        let data = File::open(&self.products).unwrap();
        let data = BufReader::new(data);

        // TODO add a subcommand to pack these files in a tar.xxx archive
        let random_name = moby_name_gen::random_name();
        let map_file = format!("{}.map", random_name);
        let idx_file = format!("{}.idx", random_name);
        let sst_file = format!("{}.sst", random_name);

        let env_options = EnvOptions::new();
        let cf_options = ColumnFamilyOptions::new();
        let mut sst_file_writer = SstFileWriter::new(env_options, cf_options);
        sst_file_writer.open(&sst_file).expect("open the sst file");

        let map = File::create(&map_file).unwrap();
        let indexes = File::create(&idx_file).unwrap();
        let mut builder = MetadataBuilder::new(map, indexes);
        let mut fields = BTreeMap::new();

        for line in data.lines() {
            let line = line.unwrap();

            let product: Product = from_str(&line).unwrap();

            let title = Tokenizer::new(&product.title);
            let title = title.iter().filter(|&(_, w)| !self.common_words.contains(w));
            insert_document_words(&mut builder, product.group_id, 0, title);

            let description = Tokenizer::new(&product.ft);
            let description = description.iter().filter(|&(_, w)| !self.common_words.contains(w));
            insert_document_words(&mut builder, product.group_id, 1, description);

            // TODO simplify this by using functions and
            // use the MetadataBuilder internal BTreeMap ?
            let key = format!("{}-title", product.group_id);
            let value = product.title;
            fields.insert(key, value);

            let key = format!("{}-description", product.group_id);
            let value = product.ft;
            fields.insert(key, value);
        }

        for (key, value) in fields {
            sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap();
        }
        let _sst_file_info = sst_file_writer.finish().unwrap();

        builder.finish().unwrap();

        println!("Successfully created {:?} dump.", random_name);
    }
}

fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_index: u64, attr: u8, words: I)
where A: io::Write,
      B: io::Write,
      I: IntoIterator<Item=(usize, &'a str)>,
{
    for (index, word) in words {
        let doc_index = DocIndex {
            document: doc_index,
            attribute: attr,
            attribute_index: index as u32,
        };
        // insert the exact representation
        let word_lower = word.to_lowercase();

        // and the unidecoded lowercased version
        let word_unidecoded = unidecode(word).to_lowercase();
        if word_lower != word_unidecoded {
            builder.insert(word_unidecoded, doc_index);
        }

        builder.insert(word_lower, doc_index);
    }
}
src/index/mod.rs (deleted, 71 lines)
@@ -1,71 +0,0 @@
#[cfg(feature = "index-csv")]
mod csv;

#[cfg(feature = "index-jsonlines")]
mod json_lines;

use structopt::StructOpt;

#[derive(Debug, StructOpt)]
pub enum CommandIndex {

    #[cfg(feature = "index-jsonlines")]
    /// Index files encoded as json lines.
    #[structopt(name = "json-lines")]
    JsonLines(self::jsonlines_feature::CommandJsonLines),

    #[cfg(feature = "index-csv")]
    /// Index files encoded as csv.
    #[structopt(name = "csv")]
    Csv(self::csv_feature::CommandCsv),
}

#[cfg(feature = "index-jsonlines")]
pub mod jsonlines_feature {
    use std::error;
    use std::path::PathBuf;
    use structopt::StructOpt;

    #[derive(Debug, StructOpt)]
    pub struct CommandJsonLines {
        /// The stop word file, each word must be separated by a newline.
        #[structopt(long = "stop-words", parse(from_os_str))]
        pub stop_words: PathBuf,

        /// The json lines file to index.
        #[structopt(parse(from_os_str))]
        pub products: PathBuf,
    }

    pub fn json_lines(command: CommandJsonLines) -> Result<(), Box<error::Error>> {
        use super::json_lines::JsonLinesIndexer;

        let indexer = JsonLinesIndexer::from_command(command)?;
        Ok(indexer.index())
    }
}

#[cfg(feature = "index-csv")]
pub mod csv_feature {
    use std::error;
    use std::path::PathBuf;
    use structopt::StructOpt;

    #[derive(Debug, StructOpt)]
    pub struct CommandCsv {
        /// The stop word file, each word must be separated by a newline.
        #[structopt(long = "stop-words", parse(from_os_str))]
        pub stop_words: PathBuf,

        /// The csv file to index.
        #[structopt(parse(from_os_str))]
        pub products: PathBuf,
    }

    pub fn csv(command: CommandCsv) -> Result<(), Box<error::Error>> {
        use super::csv::CsvIndexer;

        let indexer = CsvIndexer::from_command(command)?;
        Ok(indexer.index())
    }
}
src/lib.rs (new file, 100 lines)
@@ -0,0 +1,100 @@
#[macro_use] extern crate lazy_static;

pub mod rank;
pub mod metadata;
pub mod vec_read_only;
pub mod automaton;
pub mod tokenizer;
mod common_words;

pub use self::metadata::{Metadata, MetadataBuilder};
pub use self::rank::RankedStream;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;

pub type DocumentId = u64;

/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
    /// The document identifier where the word was found.
    pub document: DocumentId,

    /// The attribute identifier in the document
    /// where the word was found.
    ///
    /// This is a `u8`, therefore a document
    /// cannot have more than `2^8` attributes.
    pub attribute: u8,

    /// The index where the word was found in the attribute.
    ///
    /// Only the first 1000 words are indexed.
    pub attribute_index: u32,
}

/// This structure represents a matching word with information
/// on the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered between themselves.
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct Match {
    /// The word index in the query sentence.
    /// Same as the `attribute_index` but for the query words.
    ///
    /// Used to retrieve the automaton that matched this word.
    pub query_index: u32,

    /// The distance the word has with the query word
    /// (i.e. the Levenshtein distance).
    pub distance: u8,

    /// The attribute in which the word is located
    /// (i.e. Title is 0, Description is 1).
    ///
    /// This is a `u8`, therefore a document
    /// cannot have more than `2^8` attributes.
    pub attribute: u8,

    /// Where this word is located in the attribute string
    /// (i.e. at the start or the end of the attribute).
    ///
    /// The index in the attribute is limited to a maximum of `2^32`
    /// because we index only the first 1000 words
    /// in an attribute.
    pub attribute_index: u32,

    /// Whether the word that matched is an exact match or a prefix.
    pub is_exact: bool,
}

impl Match {
    pub fn zero() -> Self {
        Match {
            query_index: 0,
            distance: 0,
            attribute: 0,
            attribute_index: 0,
            is_exact: false,
        }
    }

    pub fn max() -> Self {
        Match {
            query_index: u32::max_value(),
            distance: u8::max_value(),
            attribute: u8::max_value(),
            attribute_index: u32::max_value(),
            is_exact: true,
        }
    }
}
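Because `Ord` is derived, the declared field order above *is* the comparison order: `query_index` first, then `distance`, then `attribute`, and so on. A small sketch of the consequence (not from the commit; it uses only the public API above):

use raptor::Match;

fn main() {
    let close = Match { distance: 1, attribute_index: 99, ..Match::zero() };
    let far   = Match { distance: 2, attribute_index: 0,  ..Match::zero() };
    // Same query_index, so `distance` decides before any later field is examined.
    assert!(close < far);
}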
src/main.rs (deleted, 50 lines)
@@ -1,50 +0,0 @@
#[macro_use] extern crate serde_derive;

#[cfg(feature = "index")]
mod index;
#[cfg(feature = "serve")]
mod serve;
mod common_words;

use structopt::StructOpt;

#[derive(Debug, StructOpt)]
#[structopt(name = "raptor-cli", about = "A command line to do raptor operations.")]
enum Command {
    #[cfg(feature = "index")]
    /// Index files of different formats.
    #[structopt(name = "index")]
    Index(index::CommandIndex),

    #[cfg(feature = "serve")]
    /// Serve indexes.
    #[structopt(name = "serve")]
    Serve(serve::CommandServe),
}

fn main() {
    let ret = match Command::from_args() {

        #[cfg(feature = "index")]
        Command::Index(i) => match i {

            #[cfg(feature = "index-jsonlines")]
            index::CommandIndex::JsonLines(command) => index::jsonlines_feature::json_lines(command),

            #[cfg(feature = "index-csv")]
            index::CommandIndex::Csv(command) => index::csv_feature::csv(command),
        },

        #[cfg(feature = "serve")]
        Command::Serve(s) => match s {

            #[cfg(feature = "serve-http")]
            serve::CommandServe::Http(command) => serve::http_feature::http(command),

            #[cfg(feature = "serve-console")]
            serve::CommandServe::Console(command) => serve::console_feature::console(command),
        },
    };

    if let Err(e) = ret { eprintln!("{}", e) }
}
src/metadata/difference.rs (new file, 126 lines)
@@ -0,0 +1,126 @@
use fst::{Streamer, Automaton};
use crate::metadata::ops::{self, IndexedDocIndexes};
use crate::metadata::{stream_ops, Metadata};

fn union_with_automatons<'a, A>(metas: &'a [Metadata], autos: Vec<A>) -> ops::Union
where A: 'a + Automaton + Clone,
{
    let mut op = ops::OpBuilder::with_automatons(autos);
    for metadata in metas {
        op.push(metadata);
    }
    op.union()
}

pub struct Difference<'f> {
    inner: stream_ops::Difference<'f>,
}

impl<'f> Difference<'f> {
    pub fn new<A>(positives: &'f [Metadata], negatives: &'f [Metadata], automatons: Vec<A>) -> Self
    where A: 'f + Automaton + Clone
    {
        let positives = union_with_automatons(positives, automatons.clone());
        let negatives = union_with_automatons(negatives, automatons);

        let mut builder = stream_ops::OpBuilder::new();
        builder.push(positives);
        builder.push(negatives);

        Difference { inner: builder.difference() }
    }
}

impl<'a, 'f> Streamer<'a> for Difference<'f> {
    type Item = (&'a [u8], &'a [IndexedDocIndexes]);

    fn next(&'a mut self) -> Option<Self::Item> {
        self.inner.next()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use fst::automaton::AlwaysMatch;
    use crate::metadata::{Metadata, MetadataBuilder};
    use crate::vec_read_only::VecReadOnly;
    use crate::DocIndex;

    fn construct_metadata(documents: Vec<(String, DocIndex)>) -> Metadata {
        let mapw = Vec::new();
        let indexesw = Vec::new();

        let mut builder = MetadataBuilder::new(mapw, indexesw);

        for (string, doc_index) in documents {
            builder.insert(string, doc_index);
        }

        let (map, indexes) = builder.into_inner().unwrap();
        Metadata::from_bytes(map, indexes).unwrap()
    }

    #[test]
    fn empty() {
        let positive_metas = construct_metadata(vec![
            ("chameau".into(), DocIndex { document: 12, attribute: 1, attribute_index: 22 }),
            ("chameau".into(), DocIndex { document: 31, attribute: 0, attribute_index: 1 }),
        ]);

        let negative_metas = construct_metadata(vec![
            ("chameau".into(), DocIndex { document: 12, attribute: 1, attribute_index: 22 }),
            ("chameau".into(), DocIndex { document: 31, attribute: 0, attribute_index: 1 }),
        ]);

        let positives = &[positive_metas];
        let negatives = &[negative_metas];
        let mut diff = Difference::new(positives, negatives, vec![AlwaysMatch]);

        assert_eq!(diff.next(), None);
    }

    #[test]
    fn one_positive() {
        let di1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
        let di2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };

        let positive_metas = construct_metadata(vec![
            ("chameau".into(), di1),
            ("chameau".into(), di2),
        ]);

        let negative_metas = construct_metadata(vec![
            ("chameau".into(), di1),
        ]);

        let positives = &[positive_metas];
        let negatives = &[negative_metas];
        let mut diff = Difference::new(positives, negatives, vec![AlwaysMatch]);

        let idi = IndexedDocIndexes { index: 0, doc_indexes: VecReadOnly::new(vec![di2]) };
        assert_eq!(diff.next(), Some(("chameau".as_bytes(), &[idi][..])));
        assert_eq!(diff.next(), None);
    }

    #[test]
    fn more_negative_than_positive() {
        let di1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
        let di2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };

        let positive_metas = construct_metadata(vec![
            ("chameau".into(), di1),
        ]);

        let negative_metas = construct_metadata(vec![
            ("chameau".into(), di1),
            ("chameau".into(), di2),
        ]);

        let positives = &[positive_metas];
        let negatives = &[negative_metas];
        let mut diff = Difference::new(positives, negatives, vec![AlwaysMatch]);

        assert_eq!(diff.next(), None);
    }
}
src/metadata/doc_indexes.rs (new file, 200 lines)
@@ -0,0 +1,200 @@
use std::collections::btree_map::{BTreeMap, Iter, Entry};
use std::slice::from_raw_parts;
use std::io::{self, Write};
use std::path::Path;
use std::ops::Deref;
use std::sync::Arc;
use std::mem;
use fst::raw::MmapReadOnly;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use crate::DocIndex;

#[repr(C)]
struct Range {
    start: u64,
    end: u64,
}

#[derive(Clone)]
enum DocIndexesData {
    Shared {
        vec: Arc<Vec<u8>>,
        offset: usize,
        len: usize,
    },
    Mmap(MmapReadOnly),
}

impl Deref for DocIndexesData {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        match self {
            DocIndexesData::Shared { vec, offset, len } => {
                &vec[*offset..offset + len]
            },
            DocIndexesData::Mmap(m) => m.as_slice(),
        }
    }
}

#[derive(Clone)]
pub struct DocIndexes {
    ranges: DocIndexesData,
    indexes: DocIndexesData,
}

impl DocIndexes {
    pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
        let mmap = MmapReadOnly::open_path(path)?;

        let range_len = mmap.as_slice().read_u64::<LittleEndian>()?;
        let range_len = range_len as usize * mem::size_of::<Range>();

        let offset = mem::size_of::<u64>() as usize;
        let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len));

        let len = mmap.len() - range_len - offset;
        let offset = offset + range_len;
        let indexes = DocIndexesData::Mmap(mmap.range(offset, len));

        Ok(DocIndexes { ranges, indexes })
    }

    pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
        let vec = Arc::new(vec);

        let range_len = vec.as_slice().read_u64::<LittleEndian>()?;
        let range_len = range_len as usize * mem::size_of::<Range>();

        let offset = mem::size_of::<u64>() as usize;
        let ranges = DocIndexesData::Shared {
            vec: vec.clone(),
            offset,
            len: range_len
        };

        let len = vec.len() - range_len - offset;
        let offset = offset + range_len;
        let indexes = DocIndexesData::Shared { vec, offset, len };

        Ok(DocIndexes { ranges, indexes })
    }

    pub fn get(&self, index: u64) -> Option<&[DocIndex]> {
        self.ranges().get(index as usize).map(|Range { start, end }| {
            let start = *start as usize;
            let end = *end as usize;
            &self.indexes()[start..end]
        })
    }

    fn ranges(&self) -> &[Range] {
        let slice = &self.ranges;
        let ptr = slice.as_ptr() as *const Range;
        let len = slice.len() / mem::size_of::<Range>();
        unsafe { from_raw_parts(ptr, len) }
    }

    fn indexes(&self) -> &[DocIndex] {
        let slice = &self.indexes;
        let ptr = slice.as_ptr() as *const DocIndex;
        let len = slice.len() / mem::size_of::<DocIndex>();
        unsafe { from_raw_parts(ptr, len) }
    }
}

pub struct DocIndexesBuilder<W> {
    keys: BTreeMap<String, u64>,
    indexes: Vec<Vec<DocIndex>>,
    number_docs: usize,
    wtr: W,
}

impl<W: Write> DocIndexesBuilder<W> {
    pub fn new(wtr: W) -> Self {
        Self {
            keys: BTreeMap::new(),
            indexes: Vec::new(),
            number_docs: 0,
            wtr: wtr,
        }
    }

    pub fn number_doc_indexes(&self) -> usize {
        self.number_docs
    }

    pub fn insert(&mut self, key: String, value: DocIndex) {
        match self.keys.entry(key) {
            Entry::Vacant(e) => {
                let index = self.indexes.len() as u64;
                self.indexes.push(vec![value]);
                e.insert(index);
            },
            Entry::Occupied(e) => {
                let index = *e.get();
                let vec = &mut self.indexes[index as usize];
                vec.push(value);
            },
        }
        self.number_docs += 1;
    }

    pub fn keys(&self) -> Iter<String, u64> {
        self.keys.iter()
    }

    pub fn finish(self) -> io::Result<()> {
        self.into_inner().map(|_| ())
    }

    pub fn into_inner(mut self) -> io::Result<W> {
        for vec in &mut self.indexes {
            vec.sort_unstable();
        }

        let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs);
        let len = ranges.len() as u64;

        // TODO check if this is correct
        self.wtr.write_u64::<LittleEndian>(len)?;
        unsafe {
            // write Ranges first
            let slice = into_u8_slice(ranges.as_slice());
            self.wtr.write_all(slice)?;

            // write Values after
            let slice = into_u8_slice(values.as_slice());
            self.wtr.write_all(slice)?;
        }

        self.wtr.flush()?;
        Ok(self.wtr)
    }
}

fn into_sliced_ranges<T>(vecs: Vec<Vec<T>>, number_docs: usize) -> (Vec<Range>, Vec<T>) {
    let cap = vecs.len();
    let mut ranges = Vec::with_capacity(cap);
    let mut values = Vec::with_capacity(number_docs);

    for v in &vecs {
        let len = v.len() as u64;
        let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0);

        let range = Range { start, end: start + len };
        ranges.push(range);
    }

    values.extend(vecs.into_iter().flatten());

    (ranges, values)
}

unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
    let ptr = slice.as_ptr() as *const u8;
    let len = slice.len() * mem::size_of::<T>();
    from_raw_parts(ptr, len)
}
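`into_inner` above writes a little-endian `u64` count, then the `Range` array, then the flattened `DocIndex` values. A minimal round-trip sketch of that layout (not from the commit; it uses only the API above with an in-memory writer):

use raptor::metadata::doc_indexes::{DocIndexes, DocIndexesBuilder};
use raptor::DocIndex;

fn main() {
    let mut builder = DocIndexesBuilder::new(Vec::new()); // Vec<u8> as the writer
    let doc = DocIndex { document: 7, attribute: 0, attribute_index: 3 };
    builder.insert("chameau".into(), doc);

    let bytes = builder.into_inner().unwrap();        // [len: u64][Ranges][DocIndexes]
    let indexes = DocIndexes::from_bytes(bytes).unwrap();
    assert_eq!(indexes.get(0), Some(&[doc][..]));     // first (and only) key is slot 0
}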
src/metadata/mod.rs (new file, 136 lines)
@@ -0,0 +1,136 @@
pub mod ops;
pub mod stream_ops;
pub mod doc_indexes;
pub mod difference;
pub mod ops_indexed_value;

use fst::{Map, MapBuilder};
use std::error::Error;
use std::path::Path;
use std::io::Write;
use crate::DocIndex;
use self::doc_indexes::{DocIndexes, DocIndexesBuilder};

pub struct Metadata {
    map: Map,
    indexes: DocIndexes,
}

impl Metadata {
    pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
    where P: AsRef<Path>,
          Q: AsRef<Path>,
    {
        let map = Map::from_path(map)?;
        let indexes = DocIndexes::from_path(indexes)?;
        Ok(Metadata { map, indexes })
    }

    pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
        let map = Map::from_bytes(map)?;
        let indexes = DocIndexes::from_bytes(indexes)?;
        Ok(Metadata { map, indexes })
    }

    pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
        self.map.get(key).and_then(|index| self.indexes.get(index))
    }

    pub fn as_map(&self) -> &Map {
        &self.map
    }

    pub fn as_indexes(&self) -> &DocIndexes {
        &self.indexes
    }

    pub fn explode(self) -> (Map, DocIndexes) {
        (self.map, self.indexes)
    }
}

pub struct MetadataBuilder<W, X> {
    map: W,
    indexes: DocIndexesBuilder<X>,
}

impl<W: Write, X: Write> MetadataBuilder<W, X> {
    pub fn new(map: W, indexes: X) -> Self {
        Self { map, indexes: DocIndexesBuilder::new(indexes) }
    }

    pub fn insert(&mut self, key: String, index: DocIndex) {
        self.indexes.insert(key, index)
    }

    pub fn finish(self) -> Result<(), Box<Error>> {
        self.into_inner().map(|_| ())
    }

    pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
        // FIXME insert a magic number that indicates if the endianness
        // of the input is the same as the machine that is reading it.

        let map = {
            let mut keys_builder = MapBuilder::new(self.map)?;
            let keys = self.indexes.keys().map(|(s, v)| (s, *v));
            keys_builder.extend_iter(keys)?;
            keys_builder.into_inner()?
        };

        let indexes = self.indexes.into_inner()?;

        Ok((map, indexes))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_serialize_deserialize() {
        let mapw = Vec::new();
        let indexesw = Vec::new();

        let builder = MetadataBuilder::new(mapw, indexesw);
        let (map, indexes) = builder.into_inner().unwrap();

        let metas = Metadata::from_bytes(map, indexes).unwrap();
        assert_eq!(metas.get("chameau"), None);
    }

    #[test]
    fn one_doc_serialize_deserialize() {
        let mapw = Vec::new();
        let indexesw = Vec::new();

        let mut builder = MetadataBuilder::new(mapw, indexesw);

        let doc = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
        builder.insert("chameau".into(), doc);

        let (map, indexes) = builder.into_inner().unwrap();

        let metas = Metadata::from_bytes(map, indexes).unwrap();
        assert_eq!(metas.get("chameau"), Some(&[doc][..]));
    }

    #[test]
    fn multiple_docs_serialize_deserialize() {
        let mapw = Vec::new();
        let indexesw = Vec::new();

        let mut builder = MetadataBuilder::new(mapw, indexesw);

        let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
        builder.insert("chameau".into(), doc1);
        builder.insert("chameau".into(), doc2);

        let (map, indexes) = builder.into_inner().unwrap();

        let metas = Metadata::from_bytes(map, indexes).unwrap();
        assert_eq!(metas.get("chameau"), Some(&[doc1, doc2][..]));
    }
}
src/metadata/ops.rs (new file, 329 lines)
@@ -0,0 +1,329 @@
use std::collections::BTreeMap;
use fst::{map, Streamer, Automaton};
use fst::automaton::AlwaysMatch;
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::{SetOperation, Set};
use crate::metadata::ops_indexed_value::{
    OpIndexedValueBuilder, UnionIndexedValue,
};
use crate::metadata::doc_indexes::DocIndexes;
use crate::metadata::Metadata;
use crate::vec_read_only::VecReadOnly;
use crate::DocIndex;

pub struct OpBuilder<'m, A: Automaton> {
    // the operation on the maps is always a union.
    maps: OpIndexedValueBuilder<'m>,
    automatons: Vec<A>,
    indexes: Vec<&'m DocIndexes>,
}

impl<'m> OpBuilder<'m, AlwaysMatch> {
    pub fn new() -> Self {
        Self {
            maps: OpIndexedValueBuilder::new(),
            automatons: vec![AlwaysMatch],
            indexes: Vec::new(),
        }
    }
}

/// Do a set operation on multiple maps with the same automatons.
impl<'m, A: 'm + Automaton> OpBuilder<'m, A> {
    pub fn with_automatons(automatons: Vec<A>) -> Self {
        Self {
            maps: OpIndexedValueBuilder::new(),
            automatons: automatons,
            indexes: Vec::new(),
        }
    }

    pub fn add(mut self, metadata: &'m Metadata) -> Self where A: Clone {
        self.push(metadata);
        self
    }

    pub fn push(&mut self, metadata: &'m Metadata) where A: Clone {
        let mut op = map::OpBuilder::new();
        for automaton in self.automatons.iter().cloned() {
            let stream = metadata.as_map().search(automaton);
            op.push(stream);
        }

        let stream = op.union();
        let indexes = metadata.as_indexes();

        self.maps.push(stream);
        self.indexes.push(indexes);
    }

    pub fn union(self) -> Union<'m> {
        Union::new(self.maps, self.indexes, self.automatons.len())
    }

    pub fn intersection(self) -> Intersection<'m> {
        Intersection::new(self.maps, self.indexes, self.automatons.len())
    }

    pub fn difference(self) -> Difference<'m> {
        Difference::new(self.maps, self.indexes, self.automatons.len())
    }

    pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
        SymmetricDifference::new(self.maps, self.indexes, self.automatons.len())
    }
}

#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct IndexedDocIndexes {
    pub index: usize,
    pub doc_indexes: VecReadOnly<DocIndex>,
}

struct SlotIndexedDocIndexes {
    index: usize,
    start: usize,
    len: usize,
}

macro_rules! logical_operation {
    (struct $name:ident, $operation:ident) => {

pub struct $name<'m> {
    maps: UnionIndexedValue<'m>,
    indexes: Vec<&'m DocIndexes>,
    number_automatons: usize,
    outs: Vec<IndexedDocIndexes>,
}

impl<'m> $name<'m> {
    fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>, number_automatons: usize) -> Self {
        $name {
            maps: maps.union(),
            indexes: indexes,
            number_automatons: number_automatons,
            outs: Vec::new(),
        }
    }
}

impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
    type Item = (&'a [u8], &'a [IndexedDocIndexes]);

    fn next(&'a mut self) -> Option<Self::Item> {
        match self.maps.next() {
            Some((input, ivalues)) => {
                self.outs.clear();

                let mut builders = vec![BTreeMap::new(); self.number_automatons];
                for iv in ivalues {
                    let builder = &mut builders[iv.aut_index];
                    builder.insert(iv.rdr_index, iv.value);
                }

                let mut doc_indexes = Vec::new();
                let mut doc_indexes_slots = Vec::with_capacity(builders.len());
                for (aut_index, values) in builders.into_iter().enumerate() {
                    let mut builder = SdOpBuilder::with_capacity(values.len());
                    for (rdr_index, value) in values {
                        let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes");
                        let indexes = Set::new_unchecked(indexes);
                        builder.push(indexes);
                    }

                    let start = doc_indexes.len();
                    builder.$operation().extend_vec(&mut doc_indexes);
                    let len = doc_indexes.len() - start;
                    if len != 0 {
                        let slot = SlotIndexedDocIndexes {
                            index: aut_index,
                            start: start,
                            len: len,
                        };
                        doc_indexes_slots.push(slot);
                    }
                }

                let read_only = VecReadOnly::new(doc_indexes);
                self.outs.reserve(doc_indexes_slots.len());
                for slot in doc_indexes_slots {
                    let indexes = IndexedDocIndexes {
                        index: slot.index,
                        doc_indexes: read_only.range(slot.start, slot.len),
                    };
                    self.outs.push(indexes);
                }

                if self.outs.is_empty() { return None }
                Some((input, &self.outs))
            },
            None => None,
        }
    }
}
}}

logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);

#[cfg(test)]
mod tests {
    use super::*;
    use crate::metadata::MetadataBuilder;

    fn get_exact_key<'m, I, S>(stream: I, key: &[u8]) -> Option<VecReadOnly<DocIndex>>
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
    {
        let mut stream = stream.into_stream();
        while let Some((string, indexes)) = stream.next() {
            if string == key {
                return Some(indexes[0].doc_indexes.clone())
            }
        }
        None
    }

    #[test]
    fn union_two_metadata() {
        let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };

        let meta1 = {
            let mapw = Vec::new();
            let indexesw = Vec::new();
            let mut builder = MetadataBuilder::new(mapw, indexesw);

            builder.insert("chameau".into(), doc1);

            let (map, indexes) = builder.into_inner().unwrap();
            Metadata::from_bytes(map, indexes).unwrap()
        };

        let meta2 = {
            let mapw = Vec::new();
            let indexesw = Vec::new();
            let mut builder = MetadataBuilder::new(mapw, indexesw);

            builder.insert("chameau".into(), doc2);

            let (map, indexes) = builder.into_inner().unwrap();
            Metadata::from_bytes(map, indexes).unwrap()
        };

        let metas = OpBuilder::new().add(&meta1).add(&meta2).union();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1, doc2][..]);
    }

    #[test]
    fn intersection_two_metadata() {
        let doc1 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
        let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };

        let meta1 = {
            let mapw = Vec::new();
            let indexesw = Vec::new();
            let mut builder = MetadataBuilder::new(mapw, indexesw);

            builder.insert("chameau".into(), doc1);

            let (map, indexes) = builder.into_inner().unwrap();
            Metadata::from_bytes(map, indexes).unwrap()
        };

        let meta2 = {
            let mapw = Vec::new();
            let indexesw = Vec::new();
            let mut builder = MetadataBuilder::new(mapw, indexesw);

            builder.insert("chameau".into(), doc2);

            let (map, indexes) = builder.into_inner().unwrap();
            Metadata::from_bytes(map, indexes).unwrap()
        };

        let metas = OpBuilder::new().add(&meta1).add(&meta2).intersection();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1][..]);
    }

    #[test]
    fn difference_two_metadata() {
        let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
        let doc3 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };

        let meta1 = {
            let mapw = Vec::new();
            let indexesw = Vec::new();
            let mut builder = MetadataBuilder::new(mapw, indexesw);

            builder.insert("chameau".into(), doc1);
            builder.insert("chameau".into(), doc2);

            let (map, indexes) = builder.into_inner().unwrap();
            Metadata::from_bytes(map, indexes).unwrap()
        };

        let meta2 = {
            let mapw = Vec::new();
            let indexesw = Vec::new();
            let mut builder = MetadataBuilder::new(mapw, indexesw);

            builder.insert("chameau".into(), doc3);

            let (map, indexes) = builder.into_inner().unwrap();
            Metadata::from_bytes(map, indexes).unwrap()
        };

        let metas = OpBuilder::new().add(&meta1).add(&meta2).difference();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1][..]);
    }

    #[test]
    fn symmetric_difference_two_metadata() {
        let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
        let doc3 = DocIndex { document: 32, attribute: 0, attribute_index: 1 };
        let doc4 = DocIndex { document: 34, attribute: 12, attribute_index: 1 };

        let meta1 = {
            let mapw = Vec::new();
            let indexesw = Vec::new();
            let mut builder = MetadataBuilder::new(mapw, indexesw);

            builder.insert("chameau".into(), doc1);
            builder.insert("chameau".into(), doc2);
            builder.insert("chameau".into(), doc3);

            let (map, indexes) = builder.into_inner().unwrap();
            Metadata::from_bytes(map, indexes).unwrap()
        };

        let meta2 = {
            let mapw = Vec::new();
            let indexesw = Vec::new();
            let mut builder = MetadataBuilder::new(mapw, indexesw);

            builder.insert("chameau".into(), doc2);
            builder.insert("chameau".into(), doc3);
            builder.insert("chameau".into(), doc4);

            let (map, indexes) = builder.into_inner().unwrap();
            Metadata::from_bytes(map, indexes).unwrap()
        };

        let metas = OpBuilder::new().add(&meta1).add(&meta2).symmetric_difference();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1, doc4][..]);
    }
}
src/metadata/ops_indexed_value.rs (new file, 203 lines)
@@ -0,0 +1,203 @@
use std::collections::BinaryHeap;
use std::rc::Rc;
use std::cmp;
use fst::raw::{self, Output};
use fst::{self, IntoStreamer, Streamer};

type BoxedStream<'f> = Box<for<'a> Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])> + 'f>;

pub struct OpIndexedValueBuilder<'f> {
    streams: Vec<BoxedStream<'f>>,
}

impl<'f> OpIndexedValueBuilder<'f> {
    pub fn new() -> Self {
        Self { streams: Vec::new() }
    }

    pub fn push<I, S>(&mut self, stream: I)
    where
        I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [raw::IndexedValue])>,
        S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])>,
    {
        self.streams.push(Box::new(stream.into_stream()));
    }

    pub fn union(self) -> UnionIndexedValue<'f> {
        UnionIndexedValue {
            heap: StreamIndexedValueHeap::new(self.streams),
            outs: Vec::new(),
            cur_slot: None,
        }
    }
}

pub struct UnionIndexedValue<'f> {
    heap: StreamIndexedValueHeap<'f>,
    outs: Vec<IndexedValue>,
    cur_slot: Option<SlotIndexedValue>,
}

impl<'f> UnionIndexedValue<'f> {
    pub fn len(&self) -> usize {
        self.heap.num_slots()
    }
}

impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> {
    type Item = (&'a [u8], &'a [IndexedValue]);

    fn next(&'a mut self) -> Option<Self::Item> {
        if let Some(slot) = self.cur_slot.take() {
            self.heap.refill(slot);
        }
        let slot = match self.heap.pop() {
            None => return None,
            Some(slot) => {
                self.cur_slot = Some(slot);
                self.cur_slot.as_mut().unwrap()
            }
        };
        self.outs.clear();
        self.outs.push(slot.indexed_value());
        while let Some(slot2) = self.heap.pop_if_equal(slot.input()) {
            self.outs.push(slot2.indexed_value());
            self.heap.refill(slot2);
        }
        Some((slot.input(), &self.outs))
    }
}

struct StreamIndexedValueHeap<'f> {
    rdrs: Vec<BoxedStream<'f>>,
    heap: BinaryHeap<SlotIndexedValue>,
}

impl<'f> StreamIndexedValueHeap<'f> {
    fn new(streams: Vec<BoxedStream<'f>>) -> StreamIndexedValueHeap<'f> {
        let mut u = StreamIndexedValueHeap {
            rdrs: streams,
            heap: BinaryHeap::new(),
        };
        for i in 0..u.rdrs.len() {
            u.refill(SlotIndexedValue::new(i));
        }
        u
    }

    fn pop(&mut self) -> Option<SlotIndexedValue> {
        self.heap.pop()
    }

    fn peek_is_duplicate(&self, key: &[u8]) -> bool {
        self.heap.peek().map(|s| s.input() == key).unwrap_or(false)
    }

    fn pop_if_equal(&mut self, key: &[u8]) -> Option<SlotIndexedValue> {
        if self.peek_is_duplicate(key) {
            self.pop()
        } else {
            None
        }
    }

    fn pop_if_le(&mut self, key: &[u8]) -> Option<SlotIndexedValue> {
        if self.heap.peek().map(|s| s.input() <= key).unwrap_or(false) {
            self.pop()
        } else {
            None
        }
    }

    fn num_slots(&self) -> usize {
        self.rdrs.len()
    }

    fn refill(&mut self, mut slot: SlotIndexedValue) {
        if let Some((input, ivalues)) = self.rdrs[slot.rdr_index].next() {
            slot.set_input(input);
            for values in ivalues {
                slot.set_aut_index(values.index);
                slot.set_output(values.value);
                self.heap.push(slot.clone());
            }
        }
    }
}

#[derive(Debug, Clone)]
struct SlotIndexedValue {
    rdr_index: usize,
    aut_index: usize,
    input: Rc<Vec<u8>>,
    output: Output,
}

#[derive(Debug)]
pub struct IndexedValue {
    pub rdr_index: usize,
    pub aut_index: usize,
    pub value: u64,
}

impl PartialEq for SlotIndexedValue {
    fn eq(&self, other: &Self) -> bool {
        (&self.input, self.rdr_index, self.aut_index, self.output)
            .eq(&(&other.input, other.rdr_index, other.aut_index, other.output))
    }
}

impl Eq for SlotIndexedValue { }

impl PartialOrd for SlotIndexedValue {
    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
        (&self.input, self.rdr_index, self.aut_index, self.output)
            .partial_cmp(&(&other.input, other.rdr_index, other.aut_index, other.output))
            .map(|ord| ord.reverse())
    }
}

impl Ord for SlotIndexedValue {
    fn cmp(&self, other: &Self) -> cmp::Ordering {
        self.partial_cmp(other).unwrap()
    }
}

impl SlotIndexedValue {
    fn new(rdr_index: usize) -> SlotIndexedValue {
        SlotIndexedValue {
            rdr_index: rdr_index,
            aut_index: 0,
            input: Rc::new(Vec::with_capacity(64)),
            output: Output::zero(),
        }
    }

    fn indexed_value(&self) -> IndexedValue {
        IndexedValue {
            rdr_index: self.rdr_index,
            aut_index: self.aut_index,
            value: self.output.value(),
        }
    }

    fn input(&self) -> &[u8] {
        &self.input
    }

    fn set_aut_index(&mut self, aut_index: usize) {
        self.aut_index = aut_index;
    }

    fn set_input(&mut self, input: &[u8]) {
        if *self.input != input {
            let inner = Rc::make_mut(&mut self.input);
            inner.clear();
            inner.extend(input);
        }
    }

    fn set_output(&mut self, output: u64) {
        self.output = Output::new(output);
    }
}
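Note that `SlotIndexedValue` reverses its ordering (`.map(|ord| ord.reverse())`) so that `BinaryHeap`, a max-heap, behaves as a min-heap: `pop` yields the smallest input first, which is what a sorted k-way merge needs. That is the same trick `std::cmp::Reverse` encodes; a minimal standalone demonstration of the idea:

use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(Reverse("chameau"));
    heap.push(Reverse("abeille"));
    // With the ordering reversed, the lexicographically smallest key pops first.
    assert_eq!(heap.pop(), Some(Reverse("abeille")));
}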
src/metadata/stream_ops.rs (new file, 309 lines)
@@ -0,0 +1,309 @@
use std::rc::Rc;
|
||||
use std::collections::{BinaryHeap, HashMap, BTreeMap};
|
||||
use std::cmp;
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use sdset::multi::OpBuilder as SdOpBuilder;
|
||||
use sdset::{SetOperation, Set};
|
||||
use crate::metadata::ops::IndexedDocIndexes;
|
||||
use crate::vec_read_only::VecReadOnly;
|
||||
use crate::DocIndex;
|
||||
|
||||
type BoxedStream<'f> = Box<for<'a> Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])> + 'f>;
|
||||
|
||||
pub struct OpBuilder<'f> {
|
||||
streams: Vec<BoxedStream<'f>>,
|
||||
}
|
||||
|
||||
impl<'f> OpBuilder<'f> {
|
||||
pub fn new() -> Self {
|
||||
Self { streams: Vec::new() }
|
||||
}
|
||||
|
||||
/// Push a stream of `IndexedDocIndexes`.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// You must ensure yourself that the automatons are
|
||||
/// all the same in the same order for each stream you push.
|
||||
pub fn push<I, S>(&mut self, stream: I)
|
||||
where
|
||||
I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
|
||||
S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
|
||||
{
|
||||
self.streams.push(Box::new(stream.into_stream()));
|
||||
}
|
||||
|
||||
pub fn union(self) -> Union<'f> {
|
||||
Union {
|
||||
heap: StreamHeap::new(self.streams),
|
||||
outs: Vec::new(),
|
||||
cur_slot: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn intersection(self) -> Intersection<'f> {
|
||||
Intersection {
|
||||
heap: StreamHeap::new(self.streams),
|
||||
outs: Vec::new(),
|
||||
cur_slot: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn difference(self) -> Difference<'f> {
|
||||
Difference {
|
||||
heap: StreamHeap::new(self.streams),
|
||||
outs: Vec::new(),
|
||||
cur_slot: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn symmetric_difference(self) -> SymmetricDifference<'f> {
|
||||
SymmetricDifference {
|
||||
heap: StreamHeap::new(self.streams),
|
||||
outs: Vec::new(),
|
||||
cur_slot: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME reuse it from metadata::ops
|
||||
struct SlotIndexedDocIndexes {
|
||||
aut_index: usize,
|
||||
start: usize,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
macro_rules! logical_operation {
|
||||
(struct $name:ident, $operation:ident) => {
|
||||
|
||||
pub struct $name<'f> {
|
||||
heap: StreamHeap<'f>,
|
||||
outs: Vec<IndexedDocIndexes>,
|
||||
cur_slot: Option<Slot>,
|
||||
}
|
||||
|
||||
impl<'a, 'f> Streamer<'a> for $name<'f> {
|
||||
type Item = (&'a [u8], &'a [IndexedDocIndexes]);
|
||||
|
||||
// The Metadata could be types as "key-values present" and "key-values possibly not present"
|
||||
// in other words Metadata that "needs" to have key-values and other that doesn't needs.
|
||||
//
|
||||
// We could probably allow the user to define in Metadata some Document
|
||||
// that needs to be deleted and only declare the DocumentId, and not every DocIndex of each words.
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
if let Some(slot) = self.cur_slot.take() {
|
||||
self.heap.refill(slot);
|
||||
}
|
||||
let slot = match self.heap.pop() {
|
||||
None => return None,
|
||||
Some(slot) => {
|
||||
self.cur_slot = Some(slot);
|
||||
self.cur_slot.as_mut().unwrap()
|
||||
}
|
||||
};
|
||||
|
||||
self.outs.clear();
|
||||
|
||||
// retrieve all the doc_indexes of all the streams,
|
||||
// store them in an HashMap which the key is
|
||||
// the aut_index (associated with the state that is ignored),
|
||||
// the doc_indexes must be stored in another BTreeMap which the key
|
||||
// is the rdr_index.
|
||||
//
|
||||
// This will permit us to do set operations on readers (using the rdr_index)
|
||||
// the BTreeMap will gives the rdr_index in order and the final result
|
||||
// will be aggregated in a Vec of IndexedDocIndexes which the aut_index and state
|
||||
// are the key of the first HashMap
|
||||
|
||||
// TODO use the fnv Hasher!
|
||||
|
||||
let mut builders = HashMap::new();
|
||||
let iv = slot.indexed_value();
|
||||
let builder = builders.entry(iv.index).or_insert_with(BTreeMap::new);
|
||||
builder.insert(slot.rdr_index, iv.doc_indexes);
|
||||
|
||||
while let Some(mut slot) = self.heap.pop_if_equal(slot.input()) {
|
||||
let iv = slot.indexed_value();
|
||||
let builder = builders.entry(iv.index).or_insert_with(BTreeMap::new);
|
||||
builder.insert(slot.rdr_index, iv.doc_indexes);
|
||||
|
||||
self.heap.refill(slot);
|
||||
}
|
||||
|
||||
// now that we have accumulated all the doc_indexes like so:
|
||||
// HashMap<(aut_index, state*), BtreeMap<rdr_index, doc_indexes>>
|
||||
// we will be able to retrieve, for each aut_index, the doc_indexes
|
||||
// that are needed to do the set operation
|
||||
|
||||
let mut doc_indexes = Vec::new();
|
||||
let mut doc_indexes_slots = Vec::with_capacity(builders.len());
|
||||
for (aut_index, values) in builders {
|
||||
|
||||
let sets = values.iter().map(|(_, v)| Set::new_unchecked(v.as_slice())).collect();
|
||||
let builder = SdOpBuilder::from_vec(sets);
|
||||
|
||||
let start = doc_indexes.len();
|
||||
builder.$operation().extend_vec(&mut doc_indexes);
|
||||
let len = doc_indexes.len() - start;
|
||||
if len == 0 { continue }
|
||||
|
||||
let slot = SlotIndexedDocIndexes {
|
||||
aut_index: aut_index,
|
||||
start: start,
|
||||
len: len,
|
||||
};
|
||||
doc_indexes_slots.push(slot);
|
||||
}
|
||||
|
||||
let read_only = VecReadOnly::new(doc_indexes);
|
||||
self.outs.reserve(doc_indexes_slots.len());
|
||||
for slot in doc_indexes_slots {
|
||||
let indexes = IndexedDocIndexes {
|
||||
index: slot.aut_index,
|
||||
doc_indexes: read_only.range(slot.start, slot.len),
|
||||
};
|
||||
self.outs.push(indexes);
|
||||
}
|
||||
|
||||
if self.outs.is_empty() { return None }
|
||||
Some((slot.input(), &self.outs))
|
||||
}
|
||||
}
|
||||
}}
|
||||
|
||||
logical_operation!(struct Union, union);
|
||||
logical_operation!(struct Intersection, intersection);
|
||||
logical_operation!(struct Difference, difference);
|
||||
logical_operation!(struct SymmetricDifference, symmetric_difference);
|
||||
struct StreamHeap<'f> {
    rdrs: Vec<BoxedStream<'f>>,
    heap: BinaryHeap<Slot>,
}

impl<'f> StreamHeap<'f> {
    fn new(streams: Vec<BoxedStream<'f>>) -> StreamHeap<'f> {
        let mut heap = StreamHeap {
            rdrs: streams,
            heap: BinaryHeap::new(),
        };
        for i in 0..heap.rdrs.len() {
            heap.refill(Slot::new(i));
        }
        heap
    }

    fn pop(&mut self) -> Option<Slot> {
        self.heap.pop()
    }

    fn peek_is_duplicate(&self, key: &[u8]) -> bool {
        self.heap.peek().map(|s| s.input() == key).unwrap_or(false)
    }

    fn pop_if_equal(&mut self, key: &[u8]) -> Option<Slot> {
        if self.peek_is_duplicate(key) {
            self.pop()
        } else {
            None
        }
    }

    fn pop_if_le(&mut self, key: &[u8]) -> Option<Slot> {
        if self.heap.peek().map(|s| s.input() <= key).unwrap_or(false) {
            self.pop()
        } else {
            None
        }
    }

    fn num_slots(&self) -> usize {
        self.rdrs.len()
    }

    fn refill(&mut self, mut slot: Slot) {
        if let Some((input, outputs)) = self.rdrs[slot.rdr_index].next() {
            slot.set_input(input);
            for output in outputs {
                slot.set_aut_index(output.index);
                slot.set_output(output.doc_indexes.clone());
                self.heap.push(slot.clone());
            }
        }
    }
}

#[derive(Debug, Clone)]
struct Slot {
    rdr_index: usize,
    aut_index: usize,
    input: Rc<Vec<u8>>,
    output: Option<VecReadOnly<DocIndex>>,
}

impl PartialEq for Slot {
    fn eq(&self, other: &Self) -> bool {
        (&self.input, self.rdr_index, self.aut_index)
            .eq(&(&other.input, other.rdr_index, other.aut_index))
    }
}

impl Eq for Slot { }

impl PartialOrd for Slot {
    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
        (&self.input, self.rdr_index, self.aut_index)
            .partial_cmp(&(&other.input, other.rdr_index, other.aut_index))
            .map(|ord| ord.reverse())
    }
}

impl Ord for Slot {
    fn cmp(&self, other: &Self) -> cmp::Ordering {
        self.partial_cmp(other).unwrap()
    }
}

impl Slot {
    fn new(rdr_index: usize) -> Self {
        Slot {
            rdr_index: rdr_index,
            aut_index: 0,
            input: Rc::new(Vec::with_capacity(64)),
            output: None,
        }
    }

    fn indexed_value(&mut self) -> IndexedDocIndexes {
        IndexedDocIndexes {
            index: self.aut_index,
            doc_indexes: self.output.take().unwrap(),
        }
    }

    fn input(&self) -> &[u8] {
        &self.input
    }

    fn set_input(&mut self, input: &[u8]) {
        if *self.input != input {
            let inner = Rc::make_mut(&mut self.input);
            inner.clear();
            inner.extend(input);
        }
    }

    fn set_aut_index(&mut self, aut_index: usize) {
        self.aut_index = aut_index;
    }

    fn set_output(&mut self, output: VecReadOnly<DocIndex>) {
        self.output = Some(output);
    }
}

#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct IndexedValueWithState {
    pub index: usize,
    pub value: u64,
}
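Slot's comparison above reverses the natural ordering because BinaryHeap is a max-heap; with the reversal, pop() returns the slot with the smallest input first, which is exactly what a k-way merge over sorted streams needs. A minimal sketch of the same trick using std::cmp::Reverse instead of a hand-written Ord:

use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn main() {
    // Keys as three sorted streams would yield them, pushed in any order.
    let keys: [&[u8]; 3] = [b"soup", b"day", b"of"];
    let mut heap: BinaryHeap<Reverse<&[u8]>> = keys.iter().map(|k| Reverse(*k)).collect();

    // Popping drains the heap in ascending key order.
    let mut merged = Vec::new();
    while let Some(Reverse(key)) = heap.pop() {
        merged.push(key);
    }
    assert_eq!(merged, [b"day" as &[u8], b"of", b"soup"]);
}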
22
src/rank/exact.rs
Normal file
@ -0,0 +1,22 @@

use std::cmp::Ordering;
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};

#[inline]
fn contains_exact(matches: &[Match]) -> bool {
    matches.iter().any(|m| m.is_exact)
}

#[inline]
fn number_exact_matches(matches: &[Match]) -> usize {
    GroupBy::new(matches, match_query_index).filter(|group| contains_exact(group)).count()
}

#[inline]
pub fn exact(lhs: &Document, rhs: &Document) -> Ordering {
    let lhs = number_exact_matches(&lhs.matches);
    let rhs = number_exact_matches(&rhs.matches);

    lhs.cmp(&rhs).reverse()
}
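A standalone sketch of the counting done by number_exact_matches, using the standard library's slice::chunk_by in place of the group_by crate and a trimmed-down Match type (illustrative, not the crate's):

struct Match { query_index: u32, is_exact: bool }

// Count the query words for which at least one match is exact,
// assuming `matches` is sorted by query_index (as in the crate).
fn number_exact_matches(matches: &[Match]) -> usize {
    matches
        .chunk_by(|a, b| a.query_index == b.query_index)
        .filter(|group| group.iter().any(|m| m.is_exact))
        .count()
}

fn main() {
    let matches = [
        Match { query_index: 0, is_exact: true },
        Match { query_index: 0, is_exact: false },
        Match { query_index: 1, is_exact: false },
    ];
    // query word 0 has an exact match, query word 1 does not
    assert_eq!(number_exact_matches(&matches), 1);
}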
169
src/rank/mod.rs
Normal file
@ -0,0 +1,169 @@

mod sum_of_typos;
mod number_of_words;
mod words_proximity;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;

use std::cmp::Ordering;
use std::rc::Rc;
use std::{mem, vec};
use fst::Streamer;
use fnv::FnvHashMap;
use group_by::GroupByMut;
use crate::automaton::{DfaExt, AutomatonExt};
use crate::metadata::Metadata;
use crate::metadata::ops::{OpBuilder, Union};
use crate::{Match, DocumentId};

use self::{
    sum_of_typos::sum_of_typos,
    number_of_words::number_of_words,
    words_proximity::words_proximity,
    sum_of_words_attribute::sum_of_words_attribute,
    sum_of_words_position::sum_of_words_position,
    exact::exact,
};

#[inline]
fn match_query_index(a: &Match, b: &Match) -> bool {
    a.query_index == b.query_index
}

#[derive(Debug, Clone)]
pub struct Document {
    pub document_id: DocumentId,
    pub matches: Vec<Match>,
}

impl Document {
    pub fn new(doc: DocumentId, match_: Match) -> Self {
        Self::from_sorted_matches(doc, vec![match_])
    }

    pub fn from_sorted_matches(doc: DocumentId, matches: Vec<Match>) -> Self {
        Self {
            document_id: doc,
            matches: matches,
        }
    }
}

fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize) -> vec::IntoIter<Document> {
    let mut documents: Vec<_> = matches.into_iter().map(|(id, mut matches)| {
        matches.sort_unstable();
        Document::from_sorted_matches(id, matches)
    }).collect();

    let sorts = &[
        sum_of_typos,
        number_of_words,
        words_proximity,
        sum_of_words_attribute,
        sum_of_words_position,
        exact,
    ];

    let mut groups = vec![documents.as_mut_slice()];

    for sort in sorts {
        let temp = mem::replace(&mut groups, Vec::new());
        let mut computed = 0;

        'grp: for group in temp {
            group.sort_unstable_by(sort);
            for group in GroupByMut::new(group, |a, b| sort(a, b) == Ordering::Equal) {
                computed += group.len();
                groups.push(group);
                if computed >= limit { break 'grp }
            }
        }
    }

    documents.truncate(limit);
    documents.into_iter()
}

pub struct RankedStream<'m>(RankedStreamInner<'m>);

impl<'m> RankedStream<'m> {
    pub fn new(metadata: &'m Metadata, automatons: Vec<DfaExt>, limit: usize) -> Self {
        let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect();
        let mut builder = OpBuilder::with_automatons(automatons.clone());
        builder.push(metadata);

        let inner = RankedStreamInner::Fed {
            inner: builder.union(),
            automatons: automatons,
            limit: limit,
            matches: FnvHashMap::default(),
        };

        RankedStream(inner)
    }
}

impl<'m, 'a> fst::Streamer<'a> for RankedStream<'m> {
    type Item = Document;

    fn next(&'a mut self) -> Option<Self::Item> {
        self.0.next()
    }
}

enum RankedStreamInner<'m> {
    Fed {
        inner: Union<'m>,
        automatons: Vec<Rc<DfaExt>>,
        limit: usize,
        matches: FnvHashMap<DocumentId, Vec<Match>>,
    },
    Pours {
        inner: vec::IntoIter<Document>,
    },
}

impl<'m, 'a> fst::Streamer<'a> for RankedStreamInner<'m> {
    type Item = Document;

    fn next(&'a mut self) -> Option<Self::Item> {
        loop {
            match self {
                RankedStreamInner::Fed { inner, automatons, limit, matches } => {
                    match inner.next() {
                        Some((string, indexed_values)) => {
                            for iv in indexed_values {

                                let automaton = &automatons[iv.index];
                                let distance = automaton.eval(string).to_u8();
                                let same_length = string.len() == automaton.query_len();

                                for di in iv.doc_indexes.as_slice() {
                                    let match_ = Match {
                                        query_index: iv.index as u32,
                                        distance: distance,
                                        attribute: di.attribute,
                                        attribute_index: di.attribute_index,
                                        is_exact: distance == 0 && same_length,
                                    };
                                    matches.entry(di.document)
                                           .or_insert_with(Vec::new)
                                           .push(match_);
                                }
                            }
                        },
                        None => {
                            let matches = mem::replace(matches, FnvHashMap::default());
                            *self = RankedStreamInner::Pours {
                                inner: matches_into_iter(matches, *limit)
                            };
                        },
                    }
                },
                RankedStreamInner::Pours { inner } => {
                    return inner.next()
                },
            }
        }
    }
}
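The loop in matches_into_iter above is a bucket-refinement sort: sort everything by the first criterion, split the result into groups of ties, then let each following criterion reorder documents only inside a tie group, stopping once limit documents are fully ordered. A standalone sketch of that refinement on plain integers, with the early exit elided:

use std::cmp::Ordering;

fn main() {
    // Rank integers by two criteria: evens before odds, then ascending value.
    let mut docs = vec![5, 2, 8, 1, 4, 7];
    let sorts: &[fn(&i32, &i32) -> Ordering] = &[
        |a, b| (a % 2).cmp(&(b % 2)), // criterion 1: parity
        |a, b| a.cmp(b),              // criterion 2: value
    ];

    let mut groups: Vec<&mut [i32]> = vec![docs.as_mut_slice()];
    for sort in sorts {
        let temp = std::mem::take(&mut groups);
        for group in temp {
            group.sort_unstable_by(sort);
            // keep refining only inside groups the last criterion left tied
            for tie in group.chunk_by_mut(|a, b| sort(a, b) == Ordering::Equal) {
                groups.push(tie);
            }
        }
    }

    // criterion 2 never moved an odd number before an even one
    assert_eq!(docs, [2, 4, 8, 1, 5, 7]);
}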
17
src/rank/number_of_words.rs
Normal file
@ -0,0 +1,17 @@

use std::cmp::Ordering;
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};

#[inline]
fn number_of_query_words(matches: &[Match]) -> usize {
    GroupBy::new(matches, match_query_index).count()
}

#[inline]
pub fn number_of_words(lhs: &Document, rhs: &Document) -> Ordering {
    let lhs = number_of_query_words(&lhs.matches);
    let rhs = number_of_query_words(&rhs.matches);

    lhs.cmp(&rhs).reverse()
}
123
src/rank/sum_of_typos.rs
Normal file
@ -0,0 +1,123 @@

use std::cmp::Ordering;
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};

#[inline]
fn sum_matches_typos(matches: &[Match]) -> i8 {
    let mut sum_typos = 0;
    let mut number_words = 0;

    // note that GroupBy never returns an empty group,
    // so the unchecked access to the first match is safe
    for group in GroupBy::new(matches, match_query_index) {
        sum_typos += unsafe { group.get_unchecked(0).distance } as i8;
        number_words += 1;
    }

    sum_typos - number_words
}

#[inline]
pub fn sum_of_typos(lhs: &Document, rhs: &Document) -> Ordering {
    let lhs = sum_matches_typos(&lhs.matches);
    let rhs = sum_matches_typos(&rhs.matches);

    lhs.cmp(&rhs)
}

#[cfg(test)]
mod tests {
    use super::*;

    // typing: "Geox CEO"
    //
    // doc0: "Geox SpA: CEO and Executive"
    // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
    #[test]
    fn one_typo_reference() {
        let doc0 = {
            let matches = vec![
                Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
                Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
            ];
            Document {
                document_id: 0,
                matches: matches,
            }
        };

        let doc1 = {
            let matches = vec![
                Match { query_index: 0, distance: 1, attribute: 0, attribute_index: 0, is_exact: false },
                Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
            ];
            Document {
                document_id: 1,
                matches: matches,
            }
        };

        assert_eq!(sum_of_typos(&doc0, &doc1), Ordering::Less);
    }

    // typing: "bouton manchette"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn no_typo() {
        let doc0 = {
            let matches = vec![
                Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
                Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 1, is_exact: false },
            ];
            Document {
                document_id: 0,
                matches: matches,
            }
        };

        let doc1 = {
            let matches = vec![
                Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
            ];
            Document {
                document_id: 1,
                matches: matches,
            }
        };

        assert_eq!(sum_of_typos(&doc0, &doc1), Ordering::Less);
    }

    // typing: "bouton manchztte"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn one_typo() {
        let doc0 = {
            let matches = vec![
                Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
                Match { query_index: 1, distance: 1, attribute: 0, attribute_index: 1, is_exact: false },
            ];
            Document {
                document_id: 0,
                matches: matches,
            }
        };

        let doc1 = {
            let matches = vec![
                Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
            ];
            Document {
                document_id: 1,
                matches: matches,
            }
        };

        assert_eq!(sum_of_typos(&doc0, &doc1), Ordering::Equal);
    }
}
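The score computed by sum_matches_typos is the sum of per-word typo distances minus the number of matched query words, so matching one extra word offsets exactly one typo, and lower is better. The arithmetic behind the three tests above, with one distance per matched query word:

fn score(distances: &[u8]) -> i8 {
    distances.iter().map(|&d| d as i8).sum::<i8>() - distances.len() as i8
}

fn main() {
    assert_eq!(score(&[0, 0]), -2); // "Geox ... CEO": two exact words
    assert_eq!(score(&[1, 0]), -1); // "Gox ... CEO": one typo
    assert_eq!(score(&[0]), -1);    // "bouton" alone: one exact word
    // -2 < -1, so matching both words ranks first; the two-word document
    // with one typo ties with the exact one-word document.
}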
21
src/rank/sum_of_words_attribute.rs
Normal file
@ -0,0 +1,21 @@

use std::cmp::Ordering;
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};

#[inline]
fn sum_matches_attributes(matches: &[Match]) -> u8 {
    // note that GroupBy never returns an empty group,
    // so the unchecked access to the first match is safe
    GroupBy::new(matches, match_query_index).map(|group| unsafe {
        group.get_unchecked(0).attribute
    }).sum()
}

#[inline]
pub fn sum_of_words_attribute(lhs: &Document, rhs: &Document) -> Ordering {
    let lhs = sum_matches_attributes(&lhs.matches);
    let rhs = sum_matches_attributes(&rhs.matches);

    lhs.cmp(&rhs)
}
21
src/rank/sum_of_words_position.rs
Normal file
@ -0,0 +1,21 @@

use std::cmp::Ordering;
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};

#[inline]
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
    // note that GroupBy never returns an empty group,
    // so the unchecked access to the first match is safe
    GroupBy::new(matches, match_query_index).map(|group| unsafe {
        group.get_unchecked(0).attribute_index
    }).sum()
}

#[inline]
pub fn sum_of_words_position(lhs: &Document, rhs: &Document) -> Ordering {
    let lhs = sum_matches_attribute_index(&lhs.matches);
    let rhs = sum_matches_attribute_index(&rhs.matches);

    lhs.cmp(&rhs)
}
104
src/rank/words_proximity.rs
Normal file
@ -0,0 +1,104 @@

use std::cmp::{self, Ordering};
use group_by::GroupBy;
use crate::Match;
use crate::rank::{match_query_index, Document};

const MAX_DISTANCE: u32 = 8;

fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs < rhs {
        cmp::min(rhs - lhs, MAX_DISTANCE)
    } else {
        cmp::min(lhs - rhs, MAX_DISTANCE) + 1
    }
}

fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
    if lhs.attribute != rhs.attribute { return MAX_DISTANCE }
    index_proximity(lhs.attribute_index, rhs.attribute_index)
}

fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
    let mut min_prox = u32::max_value();
    for a in lhs {
        for b in rhs {
            min_prox = cmp::min(min_prox, attribute_proximity(a, b));
        }
    }
    min_prox
}

fn matches_proximity(matches: &[Match]) -> u32 {
    let mut proximity = 0;
    let mut iter = GroupBy::new(matches, match_query_index);

    // iterate over the groups by windows of size 2
    let mut last = iter.next();
    while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
        proximity += min_proximity(lhs, rhs);
        last = Some(rhs);
    }

    proximity
}

pub fn words_proximity(lhs: &Document, rhs: &Document) -> Ordering {
    matches_proximity(&lhs.matches).cmp(&matches_proximity(&rhs.matches))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn three_different_attributes() {

        // "soup" "of the" "the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 0 }
        // { id: 2, attr: 1, attr_index: 1 }
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }

        let matches = &[
            Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() },
            Match { query_index: 1, attribute: 1, attribute_index: 0, ..Match::zero() },
            Match { query_index: 2, attribute: 1, attribute_index: 1, ..Match::zero() },
            Match { query_index: 2, attribute: 2, attribute_index: 0, ..Match::zero() },
            Match { query_index: 3, attribute: 3, attribute_index: 1, ..Match::zero() },
        ];

        // soup -> of = 8
        // + of -> the = 1
        // + the -> day = 8 (not 1)
        assert_eq!(matches_proximity(matches), 17);
    }

    #[test]
    fn two_different_attributes() {

        // "soup day" "soup of the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 0, attr: 1, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 1 }
        // { id: 2, attr: 1, attr_index: 2 }
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }

        let matches = &[
            Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() },
            Match { query_index: 0, attribute: 1, attribute_index: 0, ..Match::zero() },
            Match { query_index: 1, attribute: 1, attribute_index: 1, ..Match::zero() },
            Match { query_index: 2, attribute: 1, attribute_index: 2, ..Match::zero() },
            Match { query_index: 3, attribute: 0, attribute_index: 1, ..Match::zero() },
            Match { query_index: 3, attribute: 1, attribute_index: 3, ..Match::zero() },
        ];

        // soup -> of = 1
        // + of -> the = 1
        // + the -> day = 1
        assert_eq!(matches_proximity(matches), 3);
    }
}
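index_proximity above is deliberately asymmetric: a pair of words in query order costs the index gap, a reversed pair costs one extra step, and both are capped at MAX_DISTANCE. The boundary cases, spelled out against a standalone copy of the function:

use std::cmp;

const MAX_DISTANCE: u32 = 8;

fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs < rhs {
        cmp::min(rhs - lhs, MAX_DISTANCE)
    } else {
        cmp::min(lhs - rhs, MAX_DISTANCE) + 1
    }
}

fn main() {
    assert_eq!(index_proximity(0, 1), 1);             // "soup day": in order
    assert_eq!(index_proximity(1, 0), 2);             // "day soup": reversed
    assert_eq!(index_proximity(0, 42), MAX_DISTANCE); // far apart: capped
}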
@ -1,72 +0,0 @@

use std::str::from_utf8_unchecked;
use std::io::{self, Write};

use fst::Streamer;
use elapsed::measure_time;
use rocksdb::{DB, DBOptions, IngestExternalFileOptions};
use raptor::{automaton, Metadata, RankedStream};

use crate::serve::console_feature::CommandConsole;
use crate::common_words::{self, CommonWords};

pub struct ConsoleSearch {
    common_words: CommonWords,
    metadata: Metadata,
    db: DB,
}

impl ConsoleSearch {
    pub fn from_command(command: CommandConsole) -> io::Result<ConsoleSearch> {
        let common_words = common_words::from_file(command.stop_words)?;

        let meta_name = command.meta_name.display();
        let map_file = format!("{}.map", meta_name);
        let idx_file = format!("{}.idx", meta_name);
        let sst_file = format!("{}.sst", meta_name);
        let metadata = unsafe { Metadata::from_paths(map_file, idx_file).unwrap() };

        let rocksdb = "rocksdb/storage";
        let db = DB::open_default(rocksdb).unwrap();
        db.ingest_external_file(&IngestExternalFileOptions::new(), &[&sst_file]).unwrap();
        drop(db);
        let db = DB::open_for_read_only(DBOptions::default(), rocksdb, false).unwrap();

        Ok(ConsoleSearch { common_words, metadata, db })
    }

    pub fn serve(self) {
        loop {
            print!("Searching for: ");
            io::stdout().flush().unwrap();

            let mut query = String::new();
            io::stdin().read_line(&mut query).unwrap();
            let query = query.trim().to_lowercase();

            if query.is_empty() { break }

            let (elapsed, _) = measure_time(|| search(&self.metadata, &self.db, &self.common_words, &query));
            println!("Finished in {}", elapsed);
        }
    }
}

fn search(metadata: &Metadata, database: &DB, common_words: &CommonWords, query: &str) {
    let mut automatons = Vec::new();
    for query in query.split_whitespace().filter(|q| !common_words.contains(*q)) {
        let lev = automaton::build(query);
        automatons.push(lev);
    }

    let mut stream = RankedStream::new(&metadata, automatons, 20);
    while let Some(document) = stream.next() {
        print!("{:?}", document.document_id);

        let title_key = format!("{}-title", document.document_id);
        let title = database.get(title_key.as_bytes()).unwrap().unwrap();
        let title = unsafe { from_utf8_unchecked(&title) };
        print!(" {:?}", title);

        println!();
    }
}
@ -1,120 +0,0 @@

use std::str::from_utf8_unchecked;
use std::io::{self, Write};
use std::net::SocketAddr;
use std::error::Error;
use std::sync::Arc;

use rocksdb::{DB, DBOptions, IngestExternalFileOptions};
use raptor::{automaton, Metadata};
use raptor::rank::RankedStream;
use fst::Streamer;
use warp::Filter;

use crate::serve::http_feature::CommandHttp;
use crate::common_words::{self, CommonWords};

#[derive(Debug, Serialize)]
struct Document<'a> {
    id: u64,
    title: &'a str,
    description: &'a str,
    image: &'a str,
}

#[derive(Debug, Deserialize)]
struct SearchQuery { q: String }

pub struct HttpServer {
    listen_addr: SocketAddr,
    common_words: Arc<CommonWords>,
    metadata: Arc<Metadata>,
    db: Arc<DB>,
}

impl HttpServer {
    pub fn from_command(command: CommandHttp) -> io::Result<HttpServer> {
        let common_words = common_words::from_file(command.stop_words)?;

        let meta_name = command.meta_name.display();
        let map_file = format!("{}.map", meta_name);
        let idx_file = format!("{}.idx", meta_name);
        let sst_file = format!("{}.sst", meta_name);
        let metadata = unsafe { Metadata::from_paths(map_file, idx_file).unwrap() };

        let rocksdb = "rocksdb/storage";
        let db = DB::open_default(rocksdb).unwrap();
        db.ingest_external_file(&IngestExternalFileOptions::new(), &[&sst_file]).unwrap();
        drop(db);
        let db = DB::open_for_read_only(DBOptions::default(), rocksdb, false).unwrap();

        Ok(HttpServer {
            listen_addr: command.listen_addr,
            common_words: Arc::new(common_words),
            metadata: Arc::new(metadata),
            db: Arc::new(db),
        })
    }

    pub fn serve(self) {
        let HttpServer { listen_addr, common_words, metadata, db } = self;

        let routes = warp::path("search")
            .and(warp::query())
            .map(move |query: SearchQuery| {
                let body = search(metadata.clone(), db.clone(), common_words.clone(), &query.q).unwrap();
                body
            })
            .with(warp::reply::with::header("Content-Type", "application/json"))
            .with(warp::reply::with::header("Access-Control-Allow-Origin", "*"));

        warp::serve(routes).run(listen_addr)
    }
}

fn search<M, D, C>(metadata: M, database: D, common_words: C, query: &str) -> Result<String, Box<Error>>
where M: AsRef<Metadata>,
      D: AsRef<DB>,
      C: AsRef<CommonWords>,
{
    let mut automatons = Vec::new();
    for query in query.split_whitespace().map(str::to_lowercase) {
        if common_words.as_ref().contains(&query) { continue }
        let lev = automaton::build(&query);
        automatons.push(lev);
    }

    let mut stream = RankedStream::new(metadata.as_ref(), automatons, 20);
    let mut body = Vec::new();
    write!(&mut body, "[")?;

    let mut first = true;
    while let Some(document) = stream.next() {
        let title_key = format!("{}-title", document.document_id);
        let title = database.as_ref().get(title_key.as_bytes()).unwrap().unwrap();
        let title = unsafe { from_utf8_unchecked(&title) };

        let description_key = format!("{}-description", document.document_id);
        let description = database.as_ref().get(description_key.as_bytes()).unwrap().unwrap();
        let description = unsafe { from_utf8_unchecked(&description) };

        let image_key = format!("{}-image", document.document_id);
        let image = database.as_ref().get(image_key.as_bytes()).unwrap().unwrap();
        let image = unsafe { from_utf8_unchecked(&image) };

        let document = Document {
            id: document.document_id,
            title: title,
            description: description,
            image: image,
        };

        if !first { write!(&mut body, ",")? }
        serde_json::to_writer(&mut body, &document)?;

        first = false;
    }

    write!(&mut body, "]")?;

    Ok(String::from_utf8(body)?)
}
@ -1,76 +0,0 @@

#[cfg(feature = "serve-http")]
mod http;

#[cfg(feature = "serve-console")]
mod console;

use structopt::StructOpt;

#[derive(Debug, StructOpt)]
pub enum CommandServe {

    #[cfg(feature = "serve-http")]
    /// Serve an index over HTTP.
    #[structopt(name = "http")]
    Http(self::http_feature::CommandHttp),

    #[cfg(feature = "serve-console")]
    /// Serve an index through a simple console.
    #[structopt(name = "console")]
    Console(self::console_feature::CommandConsole),
}

#[cfg(feature = "serve-http")]
pub mod http_feature {
    use std::error;
    use std::path::PathBuf;
    use std::net::SocketAddr;
    use structopt::StructOpt;

    #[derive(Debug, StructOpt)]
    pub struct CommandHttp {
        /// The address and port to bind the server to.
        #[structopt(short = "l", default_value = "127.0.0.1:3030")]
        pub listen_addr: SocketAddr,

        /// The stop words file, one word per line.
        #[structopt(long = "stop-words", parse(from_os_str))]
        pub stop_words: PathBuf,

        /// Meta file name (e.g. relaxed-colden).
        #[structopt(parse(from_os_str))]
        pub meta_name: PathBuf,
    }

    pub fn http(command: CommandHttp) -> Result<(), Box<error::Error>> {
        use super::http::HttpServer;

        let server = HttpServer::from_command(command)?;
        Ok(server.serve())
    }
}

#[cfg(feature = "serve-console")]
pub mod console_feature {
    use std::error;
    use std::path::PathBuf;
    use structopt::StructOpt;

    #[derive(Debug, StructOpt)]
    pub struct CommandConsole {
        /// The stop words file, one word per line.
        #[structopt(long = "stop-words", parse(from_os_str))]
        pub stop_words: PathBuf,

        /// Meta file name (e.g. relaxed-colden).
        #[structopt(parse(from_os_str))]
        pub meta_name: PathBuf,
    }

    pub fn console(command: CommandConsole) -> Result<(), Box<error::Error>> {
        use super::console::ConsoleSearch;

        let search = ConsoleSearch::from_command(command)?;
        Ok(search.serve())
    }
}
137
src/tokenizer/mod.rs
Normal file
@ -0,0 +1,137 @@

use std::mem;
use self::Separator::*;

pub struct Tokenizer<'a> {
    inner: &'a str,
}

impl<'a> Tokenizer<'a> {
    pub fn new(string: &str) -> Tokenizer {
        Tokenizer { inner: string }
    }

    pub fn iter(&self) -> Tokens {
        Tokens::new(self.inner)
    }
}

pub struct Tokens<'a> {
    index: usize,
    inner: &'a str,
}

impl<'a> Tokens<'a> {
    fn new(string: &str) -> Tokens {
        Tokens {
            index: 0,
            inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-'][..]),
        }
    }
}

#[derive(Debug, Clone, Copy)]
enum Separator {
    Short,
    Long,
}

impl Separator {
    fn add(self, add: Separator) -> Separator {
        match (self, add) {
            (_, Long) => Long,
            (Short, Short) => Short,
            (Long, Short) => Long,
        }
    }

    fn to_usize(self) -> usize {
        match self {
            Short => 1,
            Long => 8,
        }
    }
}

impl<'a> Iterator for Tokens<'a> {
    type Item = (usize, &'a str);

    fn next(&mut self) -> Option<Self::Item> {
        let mut start_word = None;
        let mut distance = None;

        for (i, c) in self.inner.char_indices() {
            let separator = match c {
                '.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
                ' ' => Some(Short),
                _ => None,
            };

            match separator {
                Some(dist) => {
                    if let Some(start_word) = start_word {
                        let (word, tail) = self.inner.split_at(i);

                        self.inner = tail;
                        self.index += distance.map(Separator::to_usize).unwrap_or(0);

                        let word = &word[start_word..];
                        return Some((self.index, word))
                    }
                    distance = Some(distance.map(|s| s.add(dist)).unwrap_or(dist));
                },
                None => { start_word.get_or_insert(i); },
            }
        }

        if let Some(start_word) = start_word {
            let word = mem::replace(&mut self.inner, "");
            self.index += distance.map(Separator::to_usize).unwrap_or(0);

            let word = &word[start_word..];
            return Some((self.index, word))
        }

        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn easy() {
        let tokenizer = Tokenizer::new("salut");
        let mut tokens = tokenizer.iter();

        assert_eq!(tokens.next(), Some((0, "salut")));
        assert_eq!(tokens.next(), None);

        let tokenizer = Tokenizer::new("yo ");
        let mut tokens = tokenizer.iter();

        assert_eq!(tokens.next(), Some((0, "yo")));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn hard() {
        let tokenizer = Tokenizer::new(" .? yo lolo. aïe");
        let mut tokens = tokenizer.iter();

        assert_eq!(tokens.next(), Some((0, "yo")));
        assert_eq!(tokens.next(), Some((1, "lolo")));
        assert_eq!(tokens.next(), Some((9, "aïe")));
        assert_eq!(tokens.next(), None);

        let tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
        let mut tokens = tokenizer.iter();

        assert_eq!(tokens.next(), Some((0, "yo")));
        assert_eq!(tokens.next(), Some((8, "lolo")));
        assert_eq!(tokens.next(), Some((16, "wtf")));
        assert_eq!(tokens.next(), Some((24, "lol")));
        assert_eq!(tokens.next(), Some((32, "aïe")));
        assert_eq!(tokens.next(), None);
    }
}
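The word index advances by the combined weight of the separators between two words: Separator::add makes any long separator dominate, so a gap containing '.', '!' and the like counts as 8 while plain spaces count as 1 (this is what produces the indexes 0, 8, 16, ... in the hard test above). A standalone sketch of that combination rule (names are illustrative):

#[derive(Clone, Copy)]
enum Sep { Short, Long }

// A gap counts as Long as soon as it contains one long separator.
fn gap_weight(gap: &[Sep]) -> usize {
    if gap.iter().any(|s| matches!(s, Sep::Long)) { 8 } else { 1 }
}

fn main() {
    // "yo ! lolo": the gap [Short, Long, Short] advances the index by 8.
    assert_eq!(gap_weight(&[Sep::Short, Sep::Long, Sep::Short]), 8);
    // "yo lolo": a single space advances it by 1.
    assert_eq!(gap_weight(&[Sep::Short]), 1);
}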
44
src/vec_read_only.rs
Normal file
@ -0,0 +1,44 @@

use std::ops::Deref;
use std::sync::Arc;

#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct VecReadOnly<T> {
    inner: Arc<Vec<T>>,
    offset: usize,
    len: usize,
}

impl<T> VecReadOnly<T> {
    pub fn new(vec: Vec<T>) -> Self {
        let len = vec.len();
        Self {
            inner: Arc::new(vec),
            offset: 0,
            len: len,
        }
    }

    pub fn len(&self) -> usize {
        self.len
    }

    pub fn range(&self, offset: usize, len: usize) -> Self {
        Self {
            inner: self.inner.clone(),
            offset: self.offset + offset,
            len: len,
        }
    }

    pub fn as_slice(&self) -> &[T] {
        &self.inner[self.offset..self.offset + self.len]
    }
}

impl<T> Deref for VecReadOnly<T> {
    type Target = [T];

    fn deref(&self) -> &Self::Target {
        self.as_slice()
    }
}
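VecReadOnly::range hands out windows over one shared allocation: cloning bumps an Arc count instead of copying doc indexes, which is how the ops streams above return many IndexedDocIndexes slices into a single Vec. A minimal sketch of the same sharing with a bare Arc<Vec<u32>>:

use std::sync::Arc;

fn main() {
    // One allocation of doc indexes...
    let all: Arc<Vec<u32>> = Arc::new(vec![10, 11, 12, 13, 14]);

    // ...and two read-only windows (buffer, offset, len) over it,
    // like read_only.range(start, len): no copying involved.
    let a = (Arc::clone(&all), 0usize, 2usize);
    let b = (Arc::clone(&all), 2usize, 3usize);

    assert_eq!(&a.0[a.1..a.1 + a.2], &[10, 11]);
    assert_eq!(&b.0[b.1..b.1 + b.2], &[12, 13, 14]);
    assert_eq!(Arc::strong_count(&all), 3); // three handles, one buffer
}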