2019-12-11 17:02:10 +01:00
|
|
|
use std::cmp::{self, Ordering};
|
2020-01-13 14:36:06 +01:00
|
|
|
use std::collections::HashMap;
|
|
|
|
use std::ops::Range;
|
2019-12-11 17:02:10 +01:00
|
|
|
|
|
|
|
use compact_arena::SmallArena;
|
|
|
|
use sdset::SetBuf;
|
|
|
|
use slice_group_by::GroupBy;
|
2019-10-02 17:34:32 +02:00
|
|
|
|
2020-01-16 14:24:45 +01:00
|
|
|
use crate::bucket_sort::{SimpleMatch, PostingsListView};
|
2019-12-13 11:14:12 +01:00
|
|
|
use crate::database::MainT;
|
2020-01-13 14:36:06 +01:00
|
|
|
use crate::query_tree::QueryId;
|
|
|
|
use crate::{store, RawDocument, MResult};
|
2019-10-02 17:34:32 +02:00
|
|
|
|
2019-12-11 17:02:10 +01:00
|
|
|
mod typo;
|
|
|
|
mod words;
|
|
|
|
mod proximity;
|
|
|
|
mod attribute;
|
|
|
|
mod words_position;
|
|
|
|
mod exact;
|
|
|
|
mod document_id;
|
|
|
|
mod sort_by_attr;
|
2019-10-02 17:34:32 +02:00
|
|
|
|
2019-12-11 17:02:10 +01:00
|
|
|
pub use self::typo::Typo;
|
|
|
|
pub use self::words::Words;
|
|
|
|
pub use self::proximity::Proximity;
|
|
|
|
pub use self::attribute::Attribute;
|
|
|
|
pub use self::words_position::WordsPosition;
|
|
|
|
pub use self::exact::Exact;
|
|
|
|
pub use self::document_id::DocumentId;
|
|
|
|
pub use self::sort_by_attr::SortByAttr;
|
2019-10-02 17:34:32 +02:00
|
|
|
|
2019-12-11 17:02:10 +01:00
|
|
|
pub trait Criterion {
|
2019-10-09 16:15:31 +02:00
|
|
|
fn name(&self) -> &str;
|
2019-10-02 17:34:32 +02:00
|
|
|
|
2020-01-13 14:36:06 +01:00
|
|
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
2019-12-11 17:02:10 +01:00
|
|
|
&self,
|
2020-01-13 14:36:06 +01:00
|
|
|
_ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
2019-12-13 11:14:12 +01:00
|
|
|
_documents: &mut [RawDocument<'r, 'tag>],
|
|
|
|
) -> MResult<()>
|
|
|
|
{
|
|
|
|
Ok(())
|
2019-12-12 11:33:39 +01:00
|
|
|
}
|
2019-10-02 17:34:32 +02:00
|
|
|
|
2020-01-13 14:36:06 +01:00
|
|
|
fn evaluate<'p, 'tag, 'txn, 'q, 'r>(
|
2019-12-11 17:02:10 +01:00
|
|
|
&self,
|
2020-01-13 14:36:06 +01:00
|
|
|
ctx: &Context<'p, 'tag, 'txn, 'q>,
|
2019-12-12 11:33:39 +01:00
|
|
|
lhs: &RawDocument<'r, 'tag>,
|
|
|
|
rhs: &RawDocument<'r, 'tag>,
|
2019-12-11 17:02:10 +01:00
|
|
|
) -> Ordering;
|
2019-10-02 17:34:32 +02:00
|
|
|
|
2019-12-11 17:02:10 +01:00
|
|
|
#[inline]
|
2020-01-13 14:36:06 +01:00
|
|
|
fn eq<'p, 'tag, 'txn, 'q, 'r>(
|
2019-12-11 17:02:10 +01:00
|
|
|
&self,
|
2020-01-13 14:36:06 +01:00
|
|
|
ctx: &Context<'p, 'tag, 'txn, 'q>,
|
2019-12-12 11:33:39 +01:00
|
|
|
lhs: &RawDocument<'r, 'tag>,
|
|
|
|
rhs: &RawDocument<'r, 'tag>,
|
2019-12-11 17:02:10 +01:00
|
|
|
) -> bool
|
|
|
|
{
|
2019-12-12 11:33:39 +01:00
|
|
|
self.evaluate(ctx, lhs, rhs) == Ordering::Equal
|
2019-10-02 17:34:32 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-01-13 14:36:06 +01:00
|
|
|
pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q> {
|
2019-12-13 11:14:12 +01:00
|
|
|
pub reader: &'h heed::RoTxn<MainT>,
|
2019-12-12 11:33:39 +01:00
|
|
|
pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
|
2020-01-13 14:36:06 +01:00
|
|
|
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
|
2019-12-13 11:14:12 +01:00
|
|
|
pub documents_fields_counts_store: store::DocumentsFieldsCounts,
|
2019-12-12 11:33:39 +01:00
|
|
|
}
|
|
|
|
|
2020-01-13 14:36:06 +01:00
|
|
|
pub struct Context<'p, 'tag, 'txn, 'q> {
|
2019-12-12 11:33:39 +01:00
|
|
|
pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
|
2020-01-13 14:36:06 +01:00
|
|
|
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
|
2019-12-12 11:33:39 +01:00
|
|
|
}
|
|
|
|
|
2019-10-02 17:34:32 +02:00
|
|
|
#[derive(Default)]
|
|
|
|
pub struct CriteriaBuilder<'a> {
|
2019-10-18 13:05:28 +02:00
|
|
|
inner: Vec<Box<dyn Criterion + 'a>>,
|
2019-10-02 17:34:32 +02:00
|
|
|
}
|
|
|
|
|
2019-10-18 13:05:28 +02:00
|
|
|
impl<'a> CriteriaBuilder<'a> {
|
2019-10-02 17:34:32 +02:00
|
|
|
pub fn new() -> CriteriaBuilder<'a> {
|
|
|
|
CriteriaBuilder { inner: Vec::new() }
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
|
2019-10-18 13:05:28 +02:00
|
|
|
CriteriaBuilder {
|
|
|
|
inner: Vec::with_capacity(capacity),
|
|
|
|
}
|
2019-10-02 17:34:32 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn reserve(&mut self, additional: usize) {
|
|
|
|
self.inner.reserve(additional)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
|
2019-10-18 13:05:28 +02:00
|
|
|
where
|
|
|
|
C: Criterion,
|
2019-10-02 17:34:32 +02:00
|
|
|
{
|
|
|
|
self.push(criterion);
|
|
|
|
self
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn push<C: 'a>(&mut self, criterion: C)
|
2019-10-18 13:05:28 +02:00
|
|
|
where
|
|
|
|
C: Criterion,
|
2019-10-02 17:34:32 +02:00
|
|
|
{
|
|
|
|
self.inner.push(Box::new(criterion));
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn build(self) -> Criteria<'a> {
|
|
|
|
Criteria { inner: self.inner }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub struct Criteria<'a> {
|
|
|
|
inner: Vec<Box<dyn Criterion + 'a>>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'a> Default for Criteria<'a> {
|
|
|
|
fn default() -> Self {
|
|
|
|
CriteriaBuilder::with_capacity(7)
|
2019-12-11 17:02:10 +01:00
|
|
|
.add(Typo)
|
|
|
|
.add(Words)
|
|
|
|
.add(Proximity)
|
|
|
|
.add(Attribute)
|
|
|
|
.add(WordsPosition)
|
2019-10-02 17:34:32 +02:00
|
|
|
.add(Exact)
|
|
|
|
.add(DocumentId)
|
|
|
|
.build()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
    /// Exposes the criteria as a slice, in registration order.
    fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
        self.inner.as_slice()
    }
}
|
2019-12-11 17:02:10 +01:00
|
|
|
|
|
|
|
fn prepare_query_distances<'a, 'tag, 'txn>(
|
|
|
|
documents: &mut [RawDocument<'a, 'tag>],
|
2020-01-13 14:36:06 +01:00
|
|
|
query_mapping: &HashMap<QueryId, Range<usize>>,
|
2019-12-11 17:02:10 +01:00
|
|
|
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
|
|
|
) {
|
|
|
|
for document in documents {
|
|
|
|
if !document.processed_distances.is_empty() { continue }
|
|
|
|
|
|
|
|
let mut processed = Vec::new();
|
2019-12-13 12:38:54 +01:00
|
|
|
for m in document.bare_matches.iter() {
|
2019-12-11 17:02:10 +01:00
|
|
|
if postings_lists[m.postings_list].is_empty() { continue }
|
|
|
|
|
2020-01-13 14:36:06 +01:00
|
|
|
let range = query_mapping[&(m.query_index as usize)].clone();
|
2019-12-11 17:02:10 +01:00
|
|
|
let new_len = cmp::max(range.end as usize, processed.len());
|
|
|
|
processed.resize(new_len, None);
|
|
|
|
|
|
|
|
for index in range {
|
|
|
|
let index = index as usize;
|
|
|
|
processed[index] = match processed[index] {
|
|
|
|
Some(distance) if distance > m.distance => Some(m.distance),
|
|
|
|
Some(distance) => Some(distance),
|
|
|
|
None => Some(m.distance),
|
|
|
|
};
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
document.processed_distances = processed;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-13 12:38:54 +01:00
|
|
|
fn prepare_bare_matches<'a, 'tag, 'txn>(
|
2019-12-11 17:02:10 +01:00
|
|
|
documents: &mut [RawDocument<'a, 'tag>],
|
|
|
|
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
2020-01-13 14:36:06 +01:00
|
|
|
query_mapping: &HashMap<QueryId, Range<usize>>,
|
2019-12-11 17:02:10 +01:00
|
|
|
) {
|
|
|
|
for document in documents {
|
|
|
|
if !document.processed_matches.is_empty() { continue }
|
|
|
|
|
|
|
|
let mut processed = Vec::new();
|
2019-12-13 12:38:54 +01:00
|
|
|
for m in document.bare_matches.iter() {
|
2019-12-11 17:02:10 +01:00
|
|
|
let postings_list = &postings_lists[m.postings_list];
|
|
|
|
processed.reserve(postings_list.len());
|
|
|
|
for di in postings_list.as_ref() {
|
|
|
|
let simple_match = SimpleMatch {
|
|
|
|
query_index: m.query_index,
|
|
|
|
distance: m.distance,
|
|
|
|
attribute: di.attribute,
|
|
|
|
word_index: di.word_index,
|
|
|
|
is_exact: m.is_exact,
|
|
|
|
};
|
|
|
|
processed.push(simple_match);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-01-13 14:36:06 +01:00
|
|
|
let processed = multiword_rewrite_matches(&mut processed, query_mapping);
|
2019-12-11 17:02:10 +01:00
|
|
|
document.processed_matches = processed.into_vec();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn multiword_rewrite_matches(
|
|
|
|
matches: &mut [SimpleMatch],
|
2020-01-13 14:36:06 +01:00
|
|
|
query_mapping: &HashMap<QueryId, Range<usize>>,
|
2019-12-11 17:02:10 +01:00
|
|
|
) -> SetBuf<SimpleMatch>
|
|
|
|
{
|
|
|
|
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
|
|
|
|
|
|
|
|
let mut padded_matches = Vec::with_capacity(matches.len());
|
|
|
|
|
|
|
|
// let before_padding = Instant::now();
|
|
|
|
// for each attribute of each document
|
|
|
|
for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
|
|
|
|
// padding will only be applied
|
|
|
|
// to word indices in the same attribute
|
|
|
|
let mut padding = 0;
|
|
|
|
let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
|
|
|
|
|
|
|
|
// for each match at the same position
|
|
|
|
// in this document attribute
|
|
|
|
while let Some(same_word_index) = iter.next() {
|
|
|
|
// find the biggest padding
|
|
|
|
let mut biggest = 0;
|
|
|
|
for match_ in same_word_index {
|
2020-01-13 14:36:06 +01:00
|
|
|
let mut replacement = query_mapping[&(match_.query_index as usize)].clone();
|
2019-12-11 17:02:10 +01:00
|
|
|
let replacement_len = replacement.len();
|
|
|
|
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
|
|
|
|
|
|
|
|
if let Some(query_index) = replacement.next() {
|
|
|
|
let word_index = match_.word_index + padding as u16;
|
|
|
|
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
|
|
|
padded_matches.push(match_);
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut found = false;
|
|
|
|
|
|
|
|
// look ahead and if there already is a match
|
|
|
|
// corresponding to this padding word, abort the padding
|
|
|
|
'padding: for (x, next_group) in nexts.enumerate() {
|
|
|
|
for (i, query_index) in replacement.clone().enumerate().skip(x) {
|
|
|
|
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
|
|
|
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
|
|
|
|
|
|
|
|
for nmatch_ in next_group {
|
2020-01-13 14:36:06 +01:00
|
|
|
let mut rep = query_mapping[&(nmatch_.query_index as usize)].clone();
|
2020-01-14 12:13:41 +01:00
|
|
|
let query_index = rep.next().unwrap();
|
2019-12-11 17:02:10 +01:00
|
|
|
if query_index == padmatch.query_index {
|
|
|
|
if !found {
|
|
|
|
// if we find a corresponding padding for the
|
|
|
|
// first time we must push preceding paddings
|
2020-01-14 13:30:12 +01:00
|
|
|
for (i, query_index) in replacement.clone().enumerate().take(i) {
|
2019-12-11 17:02:10 +01:00
|
|
|
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
|
|
|
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
|
|
|
padded_matches.push(match_);
|
|
|
|
biggest = biggest.max(i + 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
padded_matches.push(padmatch);
|
|
|
|
found = true;
|
|
|
|
continue 'padding;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// if we do not find a corresponding padding in the
|
|
|
|
// next groups so stop here and pad what was found
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if !found {
|
|
|
|
// if no padding was found in the following matches
|
|
|
|
// we must insert the entire padding
|
|
|
|
for (i, query_index) in replacement.enumerate() {
|
|
|
|
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
|
|
|
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
|
|
|
padded_matches.push(match_);
|
|
|
|
}
|
|
|
|
|
|
|
|
biggest = biggest.max(replacement_len - 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
padding += biggest;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// debug!("padding matches took {:.02?}", before_padding.elapsed());
|
|
|
|
|
|
|
|
// With this check we can see that the loop above takes something
|
|
|
|
// like 43% of the search time even when no rewrite is needed.
|
|
|
|
// assert_eq!(before_matches, padded_matches);
|
|
|
|
|
|
|
|
SetBuf::from_dirty(padded_matches)
|
|
|
|
}
|