MeiliSearch/src/rank/ranked_stream.rs

104 lines
3.2 KiB
Rust
Raw Normal View History

2018-10-10 16:57:21 +02:00
use std::rc::Rc;
use std::{mem, vec};
use fnv::FnvHashMap;
use fst::Streamer;
use group_by::GroupByMut;
use crate::automaton::{DfaExt, AutomatonExt};
use crate::metadata::Metadata;
2018-10-11 14:04:41 +02:00
use crate::metadata::ops::OpBuilder;
use crate::rank::criterion::Criterion;
2018-10-10 16:57:21 +02:00
use crate::rank::Document;
2018-10-11 14:04:41 +02:00
use crate::Match;
2018-10-10 16:57:21 +02:00
2018-10-11 14:04:41 +02:00
/// Builder for `RankedStream`: holds the metadata to search in, the query
/// automatons and the ranking criteria.
#[derive(Clone)]
pub struct RankedStreamBuilder<'m, C> {
    /// Metadata that `build` pushes into the `OpBuilder`.
    metadata: &'m Metadata,
    /// Query automatons, each wrapped in an `Rc` so that `build` can hand a
    /// cheap clone of the whole list to the `OpBuilder`.
    automatons: Vec<Rc<DfaExt>>,
    /// Ranking criteria; `build` lends them to the stream it creates.
    criteria: Vec<C>,
}
2018-10-11 14:04:41 +02:00
impl<'m, C> RankedStreamBuilder<'m, C> {
pub fn new(metadata: &'m Metadata, automatons: Vec<DfaExt>) -> Self {
RankedStreamBuilder {
metadata: metadata,
automatons: automatons.into_iter().map(Rc::new).collect(),
criteria: Vec::new(), // hummm... prefer the criterion::default() ones !
}
}
2018-10-10 16:57:21 +02:00
2018-10-11 14:04:41 +02:00
pub fn criteria(&mut self, criteria: Vec<C>) {
self.criteria = criteria;
2018-10-10 16:57:21 +02:00
}
2018-10-11 14:04:41 +02:00
pub fn build(&self) -> RankedStream<C> {
let mut builder = OpBuilder::with_automatons(self.automatons.clone());
builder.push(self.metadata);
2018-10-10 16:57:21 +02:00
2018-10-11 14:04:41 +02:00
RankedStream {
stream: builder.union(),
automatons: &self.automatons,
criteria: &self.criteria,
}
2018-10-10 16:57:21 +02:00
}
}
2018-10-11 14:04:41 +02:00
/// A stream of matching entries together with what is needed to turn them
/// into ranked documents (see `retrieve_documents`).
pub struct RankedStream<'a, 'm, C> {
    /// Union stream produced by the `OpBuilder`; yields a string along with
    /// the indexed values attached to it.
    stream: crate::metadata::ops::Union<'m>,
    /// Query automatons, indexed by the `index` field of each indexed value
    /// coming out of `stream`.
    automatons: &'a [Rc<DfaExt>],
    /// Ranking criteria, applied in slice order.
    criteria: &'a [C],
}
2018-10-11 14:04:41 +02:00
impl<'a, 'm, C> RankedStream<'a, 'm, C> {
pub fn retrieve_documents(&mut self, limit: usize) -> Vec<Document>
where C: Criterion
{
let mut matches = FnvHashMap::default();
while let Some((string, indexed_values)) = self.stream.next() {
for iv in indexed_values {
let automaton = &self.automatons[iv.index];
let distance = automaton.eval(string).to_u8();
let is_exact = distance == 0 && string.len() == automaton.query_len();
for di in iv.doc_indexes.as_slice() {
let match_ = Match {
query_index: iv.index as u32,
distance: distance,
attribute: di.attribute,
attribute_index: di.attribute_index,
is_exact: is_exact,
};
matches.entry(di.document).or_insert_with(Vec::new).push(match_);
}
2018-10-10 16:57:21 +02:00
}
}
2018-10-11 14:04:41 +02:00
// collect matches from an HashMap into a Vec
let mut documents: Vec<_> = matches.into_iter().map(|(id, mut matches)| {
matches.sort_unstable();
unsafe { Document::from_sorted_matches(id, matches) }
}).collect();
let mut groups = vec![documents.as_mut_slice()];
for criterion in self.criteria {
let temp = mem::replace(&mut groups, Vec::new());
let mut computed = 0;
'grp: for group in temp {
group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b)) {
computed += group.len();
groups.push(group);
if computed >= limit { break 'grp }
}
2018-10-10 16:57:21 +02:00
}
}
2018-10-11 14:04:41 +02:00
documents.truncate(limit);
documents
}
2018-10-10 16:57:21 +02:00
}