MeiliSearch/src/rank/ranked_stream.rs

118 lines
3.9 KiB
Rust
Raw Normal View History

use std::ops::Range;
2018-10-10 16:57:21 +02:00
use std::rc::Rc;
use std::{mem, vec, cmp};
2018-10-10 16:57:21 +02:00
use fnv::FnvHashMap;
use fst::Streamer;
use group_by::GroupByMut;
use crate::automaton::{DfaExt, AutomatonExt};
use crate::metadata::Metadata;
2018-10-11 14:04:41 +02:00
use crate::metadata::ops::OpBuilder;
use crate::rank::criterion::Criterion;
2018-10-10 16:57:21 +02:00
use crate::rank::Document;
2018-10-11 14:04:41 +02:00
use crate::Match;
2018-10-10 16:57:21 +02:00
2018-10-11 14:04:41 +02:00
/// Reusable configuration from which [`RankedStream`]s are built.
///
/// The builder owns the automatons and the criteria; every stream created by
/// `build` only borrows them, so the builder must outlive its streams.
#[derive(Clone)]
pub struct RankedStreamBuilder<'m, C> {
    /// Metadata that is pushed into the operation builder on every `build`.
    metadata: &'m Metadata,
    /// Query automatons, reference-counted so that each `build` can hand a
    /// cheap clone of the list to the operation builder.
    automatons: Vec<Rc<DfaExt>>,
    /// Ranking criteria, applied by the stream in the order stored here.
    criteria: Vec<C>,
}
2018-10-11 14:04:41 +02:00
impl<'m, C> RankedStreamBuilder<'m, C> {
pub fn new(metadata: &'m Metadata, automatons: Vec<DfaExt>) -> Self {
RankedStreamBuilder {
metadata: metadata,
automatons: automatons.into_iter().map(Rc::new).collect(),
criteria: Vec::new(), // hummm... prefer the criterion::default() ones !
}
}
2018-10-10 16:57:21 +02:00
2018-10-11 14:04:41 +02:00
pub fn criteria(&mut self, criteria: Vec<C>) {
self.criteria = criteria;
2018-10-10 16:57:21 +02:00
}
2018-10-11 14:04:41 +02:00
pub fn build(&self) -> RankedStream<C> {
let mut builder = OpBuilder::with_automatons(self.automatons.clone());
builder.push(self.metadata);
2018-10-10 16:57:21 +02:00
2018-10-11 14:04:41 +02:00
RankedStream {
stream: builder.union(),
automatons: &self.automatons,
criteria: &self.criteria,
}
2018-10-10 16:57:21 +02:00
}
}
2018-10-11 14:04:41 +02:00
/// A stream of matching words that can be turned into a ranked list of
/// documents with `retrieve_documents`.
///
/// `'a` is the borrow of the automatons and criteria; `'m` is the lifetime
/// parameter of the underlying union stream.
pub struct RankedStream<'a, 'm, C> {
    /// Union stream yielding each matching word together with its indexed
    /// values.
    stream: crate::metadata::ops::Union<'m>,
    /// Query automatons, looked up by the `index` of each indexed value the
    /// stream yields.
    automatons: &'a [Rc<DfaExt>],
    /// Ranking criteria, applied in slice order.
    criteria: &'a [C],
}
2018-10-11 14:04:41 +02:00
impl<'a, 'm, C> RankedStream<'a, 'm, C> {
pub fn retrieve_documents(&mut self, range: Range<usize>) -> Vec<Document>
2018-10-11 14:04:41 +02:00
where C: Criterion
{
let mut matches = FnvHashMap::default();
while let Some((string, indexed_values)) = self.stream.next() {
for iv in indexed_values {
let automaton = &self.automatons[iv.index];
let distance = automaton.eval(string).to_u8();
let is_exact = distance == 0 && string.len() == automaton.query_len();
for di in iv.doc_indexes.as_slice() {
let match_ = Match {
query_index: iv.index as u32,
distance: distance,
attribute: di.attribute,
attribute_index: di.attribute_index,
is_exact: is_exact,
};
matches.entry(di.document).or_insert_with(Vec::new).push(match_);
}
2018-10-10 16:57:21 +02:00
}
}
2018-10-11 14:04:41 +02:00
// collect matches from an HashMap into a Vec
let mut documents: Vec<_> = matches.into_iter().map(|(id, mut matches)| {
matches.sort_unstable();
unsafe { Document::from_sorted_matches(id, matches) }
}).collect();
let mut groups = vec![documents.as_mut_slice()];
for criterion in self.criteria {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut current_range = Range { start: 0, end: 0 };
'grp: for group in tmp_groups {
current_range.end += group.len();
// if a part of the current group is in the range returned
// we must sort it and emit the sub-groups
if current_range.contains(&range.start) {
group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b)) {
groups.push(group);
if current_range.end >= range.end { break 'grp }
}
} else {
groups.push(group)
2018-10-11 14:04:41 +02:00
}
current_range.start = current_range.end;
2018-10-10 16:57:21 +02:00
}
}
// TODO find a better algorithm, here we allocate for too many documents
// and we do a useless allocation, we should reuse the documents Vec
let start = cmp::min(range.start, documents.len());
let mut documents = documents.split_off(start);
documents.truncate(range.len());
2018-10-11 14:04:41 +02:00
documents
}
2018-10-10 16:57:21 +02:00
}