MeiliSearch/src/rank/mod.rs

mod sum_of_typos;
mod number_of_words;
mod words_proximity;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;

use std::cmp::Ordering;
use std::rc::Rc;
use std::{mem, vec};
use fst::Streamer;
use fnv::FnvHashMap;
use group_by::GroupByMut;
use crate::automaton::{DfaExt, AutomatonExt};
use crate::metadata::Metadata;
use crate::metadata::ops::{OpBuilder, Union};
use crate::{Match, DocumentId};

use self::{
    sum_of_typos::sum_of_typos,
    number_of_words::number_of_words,
    words_proximity::words_proximity,
    sum_of_words_attribute::sum_of_words_attribute,
    sum_of_words_position::sum_of_words_position,
    exact::exact,
};

/// Tells whether two matches carry the same `query_index`.
///
/// In this file `query_index` is filled with the index of the automaton
/// that produced the match, so equal values mean "same automaton".
#[inline]
fn match_query_index(a: &Match, b: &Match) -> bool {
    let (left, right) = (a.query_index, b.query_index);
    left == right
}

/// A document identifier paired with every match recorded for that document.
///
/// This is the item type yielded by the ranked streams defined in this module.
#[derive(Debug, Clone)]
pub struct Document {
    /// Identifier of the document the matches belong to.
    pub document_id: DocumentId,
    /// Matches collected for this document.
    pub matches: Vec<Match>,
}

impl Document {
    /// Builds a document that holds exactly one match.
    pub fn new(doc: DocumentId, match_: Match) -> Self {
        // A single-element list is trivially sorted.
        let matches = vec![match_];
        Self::from_sorted_matches(doc, matches)
    }

    /// Builds a document from a ready-made list of matches.
    ///
    /// The matches are stored exactly as given: nothing here sorts or
    /// validates them. NOTE(review): the name suggests callers must pass an
    /// already sorted list — confirm against the ranking functions.
    pub fn from_sorted_matches(doc: DocumentId, matches: Vec<Match>) -> Self {
        Self { document_id: doc, matches }
    }
}

/// Turns the per-document matches into an iterator of ranked documents.
///
/// Each document's matches are sorted, then the documents are ordered by
/// applying the ranking criteria one after another: every criterion sorts
/// the buckets left by the previous one and splits them into runs of
/// documents it considers equal. Only as many buckets as needed to cover
/// `limit` documents are kept for the next criterion, so documents past
/// that point are not refined any further.
///
/// At most `limit` documents are returned.
fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize) -> vec::IntoIter<Document> {
    let mut documents: Vec<Document> = Vec::with_capacity(matches.len());
    for (document_id, mut doc_matches) in matches {
        doc_matches.sort_unstable();
        documents.push(Document::from_sorted_matches(document_id, doc_matches));
    }

    // Ranking criteria, applied in this order.
    let criteria = &[
        sum_of_typos,
        number_of_words,
        words_proximity,
        sum_of_words_attribute,
        sum_of_words_position,
        exact,
    ];

    // Start with one bucket spanning every document.
    let mut buckets = vec![documents.as_mut_slice()];

    for criterion in criteria {
        let mut covered = 0;
        let previous = mem::replace(&mut buckets, Vec::new());

        'refine: for bucket in previous {
            bucket.sort_unstable_by(criterion);

            let ties = GroupByMut::new(bucket, |lhs, rhs| criterion(lhs, rhs) == Ordering::Equal);
            for tie in ties {
                covered += tie.len();
                buckets.push(tie);

                // Enough documents are covered: the remaining buckets can
                // never reach the output, so stop refining them.
                if covered >= limit {
                    break 'refine;
                }
            }
        }
    }

    documents.truncate(limit);
    documents.into_iter()
}

/// Public stream of ranked [`Document`]s.
///
/// Thin wrapper that hides the internal `RankedStreamInner` state machine.
pub struct RankedStream<'m>(RankedStreamInner<'m>);

impl<'m> RankedStream<'m> {
    /// Creates a ranked stream over `metadata` driven by `automatons`.
    ///
    /// Every automaton is wrapped in an `Rc` so that the same instances can
    /// be handed to the `OpBuilder` and kept by the stream itself. `limit`
    /// is stored and later passed to the ranking step as the maximum number
    /// of documents to return.
    pub fn new(metadata: &'m Metadata, automatons: Vec<DfaExt>, limit: usize) -> Self {
        let automatons = automatons.into_iter().map(Rc::new).collect::<Vec<_>>();

        let inner = {
            let mut builder = OpBuilder::with_automatons(automatons.clone());
            builder.push(metadata);
            builder.union()
        };

        // The stream starts in the accumulating state with no match recorded.
        RankedStream(RankedStreamInner::Fed {
            inner,
            automatons,
            limit,
            matches: FnvHashMap::default(),
        })
    }
}

impl<'m, 'a> fst::Streamer<'a> for RankedStream<'m> {
    type Item = Document;

    fn next(&'a mut self) -> Option<Self::Item> {
        self.0.next()
    }
}

/// Two-state machine behind [`RankedStream`].
///
/// The stream starts as `Fed` and switches to `Pours` once the underlying
/// union stream has been exhausted.
enum RankedStreamInner<'m> {
    /// Accumulating state: matches are being collected from `inner`.
    Fed {
        /// Union stream that yields the matched strings and their indexed values.
        inner: Union<'m>,
        /// Automatons, indexed by the `index` of each indexed value.
        automatons: Vec<Rc<DfaExt>>,
        /// Maximum number of documents handed to the ranking step.
        limit: usize,
        /// Matches collected so far, grouped by document.
        matches: FnvHashMap<DocumentId, Vec<Match>>,
    },
    /// Draining state: the ranked documents are returned one by one.
    Pours {
        /// Iterator over the already ranked documents.
        inner: vec::IntoIter<Document>,
    },
}

impl<'m, 'a> fst::Streamer<'a> for RankedStreamInner<'m> {
    type Item = Document;

    /// Returns the next ranked document, or `None` once all have been yielded.
    ///
    /// While in the `Fed` state the whole union stream is consumed and one
    /// `Match` is recorded per document index; the collected matches are then
    /// ranked by `matches_into_iter` and the stream switches to `Pours`,
    /// from which the documents are returned one at a time.
    fn next(&'a mut self) -> Option<Self::Item> {
        loop {
            match self {
                RankedStreamInner::Pours { inner } => return inner.next(),
                RankedStreamInner::Fed { inner, automatons, limit, matches } => {
                    // Consume the union stream completely before ranking.
                    while let Some((string, indexed_values)) = inner.next() {
                        for iv in indexed_values {
                            let automaton = &automatons[iv.index];
                            let distance = automaton.eval(string).to_u8();
                            let same_length = string.len() == automaton.query_len();

                            // These values are identical for every document
                            // index of this indexed value.
                            let is_exact = distance == 0 && same_length;
                            let query_index = iv.index as u32;

                            for di in iv.doc_indexes.as_slice() {
                                let found = Match {
                                    query_index,
                                    distance,
                                    attribute: di.attribute,
                                    attribute_index: di.attribute_index,
                                    is_exact,
                                };

                                matches
                                    .entry(di.document)
                                    .or_insert_with(Vec::new)
                                    .push(found);
                            }
                        }
                    }

                    // Input exhausted: rank what was collected and switch to
                    // the draining state; the next loop turn returns from it.
                    let collected = mem::replace(matches, FnvHashMap::default());
                    *self = RankedStreamInner::Pours {
                        inner: matches_into_iter(collected, *limit),
                    };
                },
            }
        }
    }
}
chore: Move ranking functions in separated files 2018-07-07 14:43:29 +02:00			`mod sum_of_typos;`
			`mod number_of_words;`
			`mod words_proximity;`
			`mod sum_of_words_attribute;`
			`mod sum_of_words_position;`
			`mod exact;`

			`use std::cmp::Ordering;`
feat: Simplify the levenshtein construction 2018-09-09 11:13:58 +02:00			`use std::rc::Rc;`
chore: Move ranking functions in separated files 2018-07-07 14:43:29 +02:00			`use std::{mem, vec};`
feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`use fst::Streamer;`
feat: Improve performances by using a fnv Hasher 2018-08-23 21:32:31 +02:00			`use fnv::FnvHashMap;`
chore: Move ranking functions in separated files 2018-07-07 14:43:29 +02:00			`use group_by::GroupByMut;`
feat: Simplify the levenshtein construction 2018-09-09 11:13:58 +02:00			`use crate::automaton::{DfaExt, AutomatonExt};`
feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`use crate::metadata::Metadata;`
			`use crate::metadata::ops::{OpBuilder, Union};`
feat: Simplify the levenshtein construction 2018-09-09 11:13:58 +02:00			`use crate::{Match, DocumentId};`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00
feat: Clean-up ranking functions 2018-08-25 13:15:04 +02:00			`use self::{`
			`sum_of_typos::sum_of_typos,`
			`number_of_words::number_of_words,`
			`words_proximity::words_proximity,`
			`sum_of_words_attribute::sum_of_words_attribute,`
			`sum_of_words_position::sum_of_words_position,`
			`exact::exact,`
			`};`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00
feat: Fix the ranking algorithm to sort only the needed documents 2018-06-25 22:26:49 +02:00			`#[inline]`
			`fn match_query_index(a: &Match, b: &Match) -> bool {`
			`a.query_index == b.query_index`
			`}`

			`#[derive(Debug, Clone)]`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`pub struct Document {`
feat: Make the Stream return a Document 2018-07-06 22:05:51 +02:00			`pub document_id: DocumentId,`
			`pub matches: Vec<Match>,`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`}`

			`impl Document {`
			`pub fn new(doc: DocumentId, match_: Match) -> Self {`
			`Self::from_sorted_matches(doc, vec![match_])`
			`}`

			`pub fn from_sorted_matches(doc: DocumentId, matches: Vec<Match>) -> Self {`
			`Self {`
			`document_id: doc,`
			`matches: matches,`
			`}`
			`}`
			`}`

feat: Improve performances by reusing the documents HashMap 2018-08-24 21:01:53 +02:00			`fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize) -> vec::IntoIter<Document> {`
			`let mut documents: Vec<_> = matches.into_iter().map(\|(id, mut matches)\| {`
			`matches.sort_unstable();`
			`Document::from_sorted_matches(id, matches)`
			`}).collect();`

			`let sorts = &[`
			`sum_of_typos,`
			`number_of_words,`
			`words_proximity,`
			`sum_of_words_attribute,`
			`sum_of_words_position,`
			`exact,`
			`];`

feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`let mut groups = vec![documents.as_mut_slice()];`

			`for sort in sorts {`
			`let temp = mem::replace(&mut groups, Vec::new());`
			`let mut computed = 0;`

feat: Break the groups loops when limit is reached 2018-09-09 13:40:37 +02:00			`'grp: for group in temp {`
feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`group.sort_unstable_by(sort);`
			`for group in GroupByMut::new(group, \|a, b\| sort(a, b) == Ordering::Equal) {`
			`computed += group.len();`
			`groups.push(group);`
feat: Break the groups loops when limit is reached 2018-09-09 13:40:37 +02:00			`if computed >= limit { break 'grp }`
feat: Fix the ranking algorithm to sort only the needed documents 2018-06-25 22:26:49 +02:00			`}`
			`}`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`}`
feat: Improve performances by reusing the documents HashMap 2018-08-24 21:01:53 +02:00
			`documents.truncate(limit);`
			`documents.into_iter()`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`}`

feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`pub struct RankedStream<'m>(RankedStreamInner<'m>);`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00
feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`impl<'m> RankedStream<'m> {`
			`pub fn new(metadata: &'m Metadata, automatons: Vec<DfaExt>, limit: usize) -> Self {`
feat: Simplify the levenshtein construction 2018-09-09 11:13:58 +02:00			`let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect();`
feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`let mut builder = OpBuilder::with_automatons(automatons.clone());`
			`builder.push(metadata);`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00
feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`let inner = RankedStreamInner::Fed {`
feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`inner: builder.union(),`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`automatons: automatons,`
feat: Improve performances by reusing the documents HashMap 2018-08-24 21:01:53 +02:00			`limit: limit,`
			`matches: FnvHashMap::default(),`
feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`};`

			`RankedStream(inner)`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`}`
			`}`

feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`impl<'m, 'a> fst::Streamer<'a> for RankedStream<'m> {`
feat: Make the Stream return a Document 2018-07-06 22:05:51 +02:00			`type Item = Document;`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00
			`fn next(&'a mut self) -> Option<Self::Item> {`
feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`self.0.next()`
			`}`
			`}`

feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`enum RankedStreamInner<'m> {`
feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`Fed {`
feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`inner: Union<'m>,`
feat: Simplify the levenshtein construction 2018-09-09 11:13:58 +02:00			`automatons: Vec<Rc<DfaExt>>,`
feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`limit: usize,`
			`matches: FnvHashMap<DocumentId, Vec<Match>>,`
			`},`
			`Pours {`
			`inner: vec::IntoIter<Document>,`
			`},`
			`}`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00
feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`impl<'m, 'a> fst::Streamer<'a> for RankedStreamInner<'m> {`
feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`type Item = Document;`

			`fn next(&'a mut self) -> Option<Self::Item> {`
			`loop {`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`match self {`
feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`RankedStreamInner::Fed { inner, automatons, limit, matches } => {`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`match inner.next() {`
feat: Implement the excat match ranking rule 2018-07-06 20:58:06 +02:00			`Some((string, indexed_values)) => {`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`for iv in indexed_values {`

feat: Implement the excat match ranking rule 2018-07-06 20:58:06 +02:00			`let automaton = &automatons[iv.index];`
feat: Simplify the levenshtein construction 2018-09-09 11:13:58 +02:00			`let distance = automaton.eval(string).to_u8();`
			`let same_length = string.len() == automaton.query_len();`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00
feat: Remove the State from most of the code 2018-09-09 13:35:12 +02:00			`for di in iv.doc_indexes.as_slice() {`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`let match_ = Match {`
			`query_index: iv.index as u32,`
			`distance: distance,`
			`attribute: di.attribute,`
			`attribute_index: di.attribute_index,`
feat: Simplify the levenshtein construction 2018-09-09 11:13:58 +02:00			`is_exact: distance == 0 && same_length,`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`};`
			`matches.entry(di.document)`
feat: Simplify the levenshtein construction 2018-09-09 11:13:58 +02:00			`.or_insert_with(Vec::new)`
			`.push(match_);`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`}`
			`}`
			`},`
			`None => {`
feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`let matches = mem::replace(matches, FnvHashMap::default());`
			`*self = RankedStreamInner::Pours {`
			`inner: matches_into_iter(matches, *limit).into_iter()`
			`};`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`},`
			`}`
			`},`
feat: Simplify the RankedStrem code logic 2018-08-25 12:35:29 +02:00			`RankedStreamInner::Pours { inner } => {`
feat: Make the Stream return a Document 2018-07-06 22:05:51 +02:00			`return inner.next()`
feat: Introduce basic ranking rules 2018-05-27 15:23:43 +02:00			`},`
			`}`
			`}`
			`}`
			`}`