2019-11-30 16:53:34 +01:00
|
|
|
use std::borrow::Cow;
|
2020-01-16 14:56:16 +01:00
|
|
|
use std::collections::HashMap;
|
2019-11-30 16:53:34 +01:00
|
|
|
use std::mem;
|
2019-12-30 14:37:31 +01:00
|
|
|
use std::ops::Deref;
|
2019-11-30 16:53:34 +01:00
|
|
|
use std::ops::Range;
|
|
|
|
use std::rc::Rc;
|
2019-12-23 12:42:22 +01:00
|
|
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
2020-01-16 15:55:55 +01:00
|
|
|
use std::time::Instant;
|
|
|
|
use std::fmt;
|
2019-11-30 16:53:34 +01:00
|
|
|
|
|
|
|
use compact_arena::{SmallArena, Idx32, mk_arena};
|
|
|
|
use log::debug;
|
2019-12-11 17:36:53 +01:00
|
|
|
use meilisearch_types::DocIndex;
|
2020-01-14 16:52:24 +01:00
|
|
|
use sdset::{Set, SetBuf, exponential_search};
|
2019-11-30 16:53:34 +01:00
|
|
|
use slice_group_by::{GroupBy, GroupByMut};
|
|
|
|
|
2020-02-02 22:59:19 +01:00
|
|
|
use crate::error::Error;
|
2019-12-12 11:33:39 +01:00
|
|
|
use crate::criterion::{Criteria, Context, ContextMut};
|
2019-12-11 17:36:53 +01:00
|
|
|
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
2019-12-11 17:02:10 +01:00
|
|
|
use crate::raw_document::RawDocument;
|
2019-11-30 16:53:34 +01:00
|
|
|
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
|
|
|
use crate::{store, Document, DocumentId, MResult};
|
2020-01-16 14:56:16 +01:00
|
|
|
use crate::query_tree::{create_query_tree, traverse_query_tree};
|
|
|
|
use crate::query_tree::{Operation, QueryResult, QueryKind, QueryId, PostingsKey};
|
2020-01-08 13:37:22 +01:00
|
|
|
use crate::query_tree::Context as QTContext;
|
2019-11-30 16:53:34 +01:00
|
|
|
|
2019-12-11 17:36:53 +01:00
|
|
|
/// Executes a search for `query` and returns the documents of the requested
/// `range`, ranked bucket by bucket according to the given `criteria`.
///
/// When a `filter` is provided the whole job is delegated to
/// [`bucket_sort_with_distinct`] with a no-op distinct function, so that the
/// filtering logic lives in a single place.
///
/// Returns an empty `Vec` when the index has no words FST yet, and
/// `Error::SchemaMissing` when no schema is stored.
pub fn bucket_sort<'c, FI>(
    reader: &heed::RoTxn<MainT>,
    query: &str,
    range: Range<usize>,
    filter: Option<FI>,
    criteria: Criteria<'c>,
    searchable_attrs: Option<ReorderedAttrs>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
    prefix_documents_cache_store: store::PrefixDocumentsCache,
    prefix_postings_lists_cache_store: store::PrefixPostingsListsCache,
) -> MResult<Vec<Document>>
where
    FI: Fn(DocumentId) -> bool,
{
    // We delegate the filter work to the distinct query builder,
    // specifying a distinct rule that has no effect.
    if filter.is_some() {
        let distinct = |_| None;
        let distinct_size = 1;
        return bucket_sort_with_distinct(
            reader,
            query,
            range,
            filter,
            distinct,
            distinct_size,
            criteria,
            searchable_attrs,
            main_store,
            postings_lists_store,
            documents_fields_counts_store,
            synonyms_store,
            prefix_documents_cache_store,
            prefix_postings_lists_cache_store,
        );
    }

    // Without a words FST there is nothing indexed: no possible result.
    let words_set = match unsafe { main_store.static_words_fst(reader)? } {
        Some(words) => words,
        None => return Ok(Vec::new()),
    };

    let stop_words = main_store.stop_words_fst(reader)?.unwrap_or_default();

    // Context handed to the query-tree builder/traverser.
    let context = QTContext {
        words_set,
        stop_words,
        synonyms: synonyms_store,
        postings_lists: postings_lists_store,
        prefix_postings_lists: prefix_postings_lists_cache_store,
    };

    let (operation, mapping) = create_query_tree(reader, &context, query)?;
    debug!("operation:\n{:?}", operation);
    debug!("mapping:\n{:?}", mapping);

    // Collects, for every query node id, a reference to its kind;
    // used later to build the final `Document`s.
    fn recurs_operation<'o>(map: &mut HashMap<QueryId, &'o QueryKind>, operation: &'o Operation) {
        match operation {
            Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
            Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
            Operation::Query(query) => { map.insert(query.id, &query.kind); },
        }
    }

    let mut queries_kinds = HashMap::new();
    recurs_operation(&mut queries_kinds, &operation);

    let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation)?;
    debug!("found {} documents", docids.len());
    debug!("number of postings {:?}", queries.len());

    let before = Instant::now();
    // Arena holding every postings-list view; `BareMatch` stores indices into it.
    mk_arena!(arena);
    let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries);
    debug!("matches cleaned in {:.02?}", before.elapsed());

    let before_bucket_sort = Instant::now();

    let before_raw_documents_building = Instant::now();
    let mut raw_documents = Vec::new();
    // `bare_matches` is sorted by document id (see cleanup_bare_matches),
    // so grouping by id yields one group per candidate document.
    for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
        let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
        raw_documents.push(raw_document);
    }
    debug!("creating {} candidates documents took {:.02?}",
        raw_documents.len(),
        before_raw_documents_building.elapsed(),
    );

    let before_criterion_loop = Instant::now();
    // NOTE(review): this counter is never incremented in this function, so the
    // log below always reports 0 — confirm whether it is still needed.
    let proximity_count = AtomicUsize::new(0);

    // Each criterion splits the current groups into finer, ordered sub-groups.
    let mut groups = vec![raw_documents.as_mut_slice()];

    'criteria: for criterion in criteria.as_ref() {
        let tmp_groups = mem::replace(&mut groups, Vec::new());
        let mut documents_seen = 0;

        for mut group in tmp_groups {
            let before_criterion_preparation = Instant::now();

            let ctx = ContextMut {
                reader,
                postings_lists: &mut arena,
                query_mapping: &mapping,
                documents_fields_counts_store,
            };

            criterion.prepare(ctx, &mut group)?;
            debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());

            let ctx = Context {
                postings_lists: &arena,
                query_mapping: &mapping,
            };

            let before_criterion_sort = Instant::now();
            group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
            debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());

            // Documents the criterion considers equal form one sub-group,
            // to be refined by the next criterion.
            for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
                debug!("{:?} produced a group of size {}", criterion.name(), group.len());

                documents_seen += group.len();
                groups.push(group);

                // we have sorted enough documents: the last one sorted is past
                // the end of the requested range, move on to the next criterion
                if documents_seen >= range.end {
                    continue 'criteria;
                }
            }
        }
    }

    debug!("criterion loop took {:.02?}", before_criterion_loop.elapsed());
    debug!("proximity evaluation called {} times", proximity_count.load(Ordering::Relaxed));

    let schema = main_store.schema(reader)?.ok_or(Error::SchemaMissing)?;
    // Materialize only the documents inside the requested range.
    let iter = raw_documents.into_iter().skip(range.start).take(range.len());
    let iter = iter.map(|rd| Document::from_raw(rd, &queries_kinds, &arena, searchable_attrs.as_ref(), &schema));
    let documents = iter.collect();

    debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed());

    Ok(documents)
}
|
|
|
|
|
2019-12-11 17:36:53 +01:00
|
|
|
/// Like [`bucket_sort`] but additionally applies a `filter` predicate and a
/// `distinct` function: at most one document per distinct key is returned,
/// keeping up to `distinct_size` keys in the distinct map.
///
/// `distinct` maps a document id to an optional distinct key; documents whose
/// key was already seen are skipped when building the final result.
pub fn bucket_sort_with_distinct<'c, FI, FD>(
    reader: &heed::RoTxn<MainT>,
    query: &str,
    range: Range<usize>,
    filter: Option<FI>,
    distinct: FD,
    distinct_size: usize,
    criteria: Criteria<'c>,
    searchable_attrs: Option<ReorderedAttrs>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
    _prefix_documents_cache_store: store::PrefixDocumentsCache,
    prefix_postings_lists_cache_store: store::PrefixPostingsListsCache,
) -> MResult<Vec<Document>>
where
    FI: Fn(DocumentId) -> bool,
    FD: Fn(DocumentId) -> Option<u64>,
{
    // Without a words FST there is nothing indexed: no possible result.
    let words_set = match unsafe { main_store.static_words_fst(reader)? } {
        Some(words) => words,
        None => return Ok(Vec::new()),
    };

    let stop_words = main_store.stop_words_fst(reader)?.unwrap_or_default();

    // Context handed to the query-tree builder/traverser.
    let context = QTContext {
        words_set,
        stop_words,
        synonyms: synonyms_store,
        postings_lists: postings_lists_store,
        prefix_postings_lists: prefix_postings_lists_cache_store,
    };

    let (operation, mapping) = create_query_tree(reader, &context, query)?;
    debug!("operation:\n{:?}", operation);
    debug!("mapping:\n{:?}", mapping);

    // Collects, for every query node id, a reference to its kind;
    // used later to build the final `Document`s.
    fn recurs_operation<'o>(map: &mut HashMap<QueryId, &'o QueryKind>, operation: &'o Operation) {
        match operation {
            Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
            Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
            Operation::Query(query) => { map.insert(query.id, &query.kind); },
        }
    }

    let mut queries_kinds = HashMap::new();
    recurs_operation(&mut queries_kinds, &operation);

    let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation)?;
    debug!("found {} documents", docids.len());
    debug!("number of postings {:?}", queries.len());

    let before = Instant::now();
    // Arena holding every postings-list view; `BareMatch` stores indices into it.
    mk_arena!(arena);
    let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries);
    debug!("matches cleaned in {:.02?}", before.elapsed());

    let before_raw_documents_building = Instant::now();
    let mut raw_documents = Vec::new();
    // `bare_matches` is sorted by document id (see cleanup_bare_matches),
    // so grouping by id yields one group per candidate document.
    for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
        let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
        raw_documents.push(raw_document);
    }
    debug!("creating {} candidates documents took {:.02?}",
        raw_documents.len(),
        before_raw_documents_building.elapsed(),
    );

    // Each criterion splits the current groups into finer, ordered sub-groups.
    let mut groups = vec![raw_documents.as_mut_slice()];
    // Memoized distinct keys per document id, shared across criteria.
    let mut key_cache = HashMap::new();

    // Memoized filter results per document id, reused in the final pass below.
    let mut filter_map = HashMap::new();
    // these two variables informs on the current distinct map and
    // on the raw offset of the start of the group where the
    // range.start bound is located according to the distinct function
    let mut distinct_map = DistinctMap::new(distinct_size);
    let mut distinct_raw_offset = 0;

    'criteria: for criterion in criteria.as_ref() {
        let tmp_groups = mem::replace(&mut groups, Vec::new());
        // Buffered layer: registrations are only committed to `distinct_map`
        // via `transfert_to_internal` below.
        let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
        let mut documents_seen = 0;

        for mut group in tmp_groups {
            // if this group does not overlap with the requested range,
            // push it without sorting and splitting it
            if documents_seen + group.len() < distinct_raw_offset {
                documents_seen += group.len();
                groups.push(group);
                continue;
            }

            let ctx = ContextMut {
                reader,
                postings_lists: &mut arena,
                query_mapping: &mapping,
                documents_fields_counts_store,
            };

            let before_criterion_preparation = Instant::now();
            criterion.prepare(ctx, &mut group)?;
            debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());

            let ctx = Context {
                postings_lists: &arena,
                query_mapping: &mapping,
            };

            let before_criterion_sort = Instant::now();
            group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
            debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());

            // Documents the criterion considers equal form one sub-group,
            // to be refined by the next criterion.
            for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
                // we must compute the real distinguished len of this sub-group
                for document in group.iter() {
                    let filter_accepted = match &filter {
                        Some(filter) => {
                            let entry = filter_map.entry(document.id);
                            *entry.or_insert_with(|| (filter)(document.id))
                        }
                        None => true,
                    };

                    if filter_accepted {
                        let entry = key_cache.entry(document.id);
                        // `Rc` makes the key cheap to clone into the distinct map.
                        let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));

                        match key.clone() {
                            Some(key) => buf_distinct.register(key),
                            None => buf_distinct.register_without_key(),
                        };
                    }

                    // the requested range end is reached: stop computing distinct
                    if buf_distinct.len() >= range.end {
                        break;
                    }
                }

                documents_seen += group.len();
                groups.push(group);

                // if this sub-group does not overlap with the requested range
                // we must update the distinct map and its start index
                if buf_distinct.len() < range.start {
                    buf_distinct.transfert_to_internal();
                    distinct_raw_offset = documents_seen;
                }

                // we have sorted enough documents: the last one sorted is past
                // the end of the requested range, move on to the next criterion
                if buf_distinct.len() >= range.end {
                    continue 'criteria;
                }
            }
        }
    }

    // once we classified the documents related to the current
    // automatons we save that as the next valid result
    let mut seen = BufferedDistinctMap::new(&mut distinct_map);
    let schema = main_store.schema(reader)?.ok_or(Error::SchemaMissing)?;

    let mut documents = Vec::with_capacity(range.len());
    // Replay filter/distinct decisions (from the caches filled above) starting
    // at the raw offset where `range.start` is known to be located.
    for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) {
        let filter_accepted = match &filter {
            Some(_) => filter_map.remove(&raw_document.id).unwrap(),
            None => true,
        };

        if filter_accepted {
            let key = key_cache.remove(&raw_document.id).unwrap();
            let distinct_accepted = match key {
                Some(key) => seen.register(key),
                None => seen.register_without_key(),
            };

            if distinct_accepted && seen.len() > range.start {
                documents.push(Document::from_raw(raw_document, &queries_kinds, &arena, searchable_attrs.as_ref(), &schema));
                if documents.len() == range.len() {
                    break;
                }
            }
        }
    }

    Ok(documents)
}
|
|
|
|
|
|
|
|
/// Converts the postings lists returned by the query-tree traversal into flat
/// `BareMatch`es, keeping only the matches that belong to documents in
/// `docids`. Each retained slice of a postings list is stored in `arena` and
/// referenced by index from the corresponding `BareMatch`.
///
/// The returned vector is sorted by document id, which the callers rely on to
/// group matches per document.
fn cleanup_bare_matches<'tag, 'txn>(
    arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
    docids: &Set<DocumentId>,
    queries: HashMap<PostingsKey, Cow<'txn, Set<DocIndex>>>,
) -> Vec<BareMatch<'tag>>
{
    let docidslen = docids.len() as f32;
    let mut bare_matches = Vec::new();

    for (PostingsKey { query, input, distance, is_exact }, matches) in queries {
        let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
        let pllen = postings_list_view.len() as f32;

        // Heuristic: when the docids set covers most of the postings list
        // (ratio >= 0.8), a single linear scan of the postings list is used;
        // otherwise each docid is located with an exponential search.
        if docidslen / pllen >= 0.8 {
            let mut offset = 0;
            // Postings lists are grouped by document id; `offset` tracks the
            // absolute position of each group inside the view.
            for matches in postings_list_view.linear_group_by_key(|m| m.document_id) {
                let document_id = matches[0].document_id;
                if docids.contains(&document_id) {
                    let range = postings_list_view.range(offset, matches.len());
                    let posting_list_index = arena.add(range);

                    let bare_match = BareMatch {
                        document_id,
                        query_index: query.id,
                        distance,
                        is_exact,
                        postings_list: posting_list_index,
                    };

                    bare_matches.push(bare_match);
                }

                offset += matches.len();
            }

        } else {
            let mut offset = 0;
            for id in docids.as_slice() {
                // Probe for the first possible entry of this document:
                // DocIndex ordering puts the default (zeroed) fields first.
                let di = DocIndex { document_id: *id, ..DocIndex::default() };
                // `unwrap_or_else(|x| x)` keeps the insertion point when the
                // exact probe value is absent.
                let pos = exponential_search(&postings_list_view[offset..], &di).unwrap_or_else(|x| x);

                // Searches are cumulative: docids is sorted, so the next id
                // can only be at or after this position.
                offset += pos;

                // Take the first group at `offset` only if it actually
                // belongs to the searched document.
                let group = postings_list_view[offset..]
                    .linear_group_by_key(|m| m.document_id)
                    .next()
                    .filter(|matches| matches[0].document_id == *id);

                if let Some(matches) = group {
                    let range = postings_list_view.range(offset, matches.len());
                    let posting_list_index = arena.add(range);

                    let bare_match = BareMatch {
                        document_id: *id,
                        query_index: query.id,
                        distance,
                        is_exact,
                        postings_list: posting_list_index,
                    };

                    bare_matches.push(bare_match);
                }
            }
        }
    }

    let before_raw_documents_presort = Instant::now();
    bare_matches.sort_unstable_by_key(|sm| sm.document_id);
    debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());

    bare_matches
}
|
|
|
|
|
2019-11-30 16:53:34 +01:00
|
|
|
/// A match of one query-tree node inside one document, pointing into the
/// arena-allocated postings-list slice that backs it.
pub struct BareMatch<'tag> {
    pub document_id: DocumentId,
    // Id of the query-tree node that produced this match
    // (filled with `query.id` in `cleanup_bare_matches`).
    pub query_index: usize,
    // Typo distance between the typed word and the matched word.
    pub distance: u8,
    pub is_exact: bool,
    // Index of the postings-list slice inside the search's `SmallArena`.
    pub postings_list: Idx32<'tag>,
}
|
|
|
|
|
2019-12-07 13:32:43 +01:00
|
|
|
impl fmt::Debug for BareMatch<'_> {
    /// Manual impl: the `postings_list` arena index is deliberately omitted
    /// from the output (it is meaningless without the arena itself).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("BareMatch")
            .field("document_id", &self.document_id)
            .field("query_index", &self.query_index)
            .field("distance", &self.distance)
            .field("is_exact", &self.is_exact)
            .finish()
    }
}
|
|
|
|
|
2019-11-30 16:53:34 +01:00
|
|
|
/// A fully-resolved match position inside a document.
///
/// Note: the derived `Ord` compares fields in declaration order, so matches
/// sort by `query_index` first, then `distance`, `attribute`, `word_index`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct SimpleMatch {
    // Id of the query-tree node that produced this match.
    pub query_index: usize,
    // Typo distance between the typed word and the matched word.
    pub distance: u8,
    // Attribute (field) in which the word was found.
    pub attribute: u16,
    // Position of the word inside the attribute.
    pub word_index: u16,
    pub is_exact: bool,
}
|
|
|
|
|
|
|
|
/// A cheaply-cloneable window over a postings list.
#[derive(Clone)]
pub enum PostingsListView<'txn> {
    /// A sub-slice (`offset`/`len`) of an untouched postings list coming from
    /// the store; the `Rc`s make cloning a view O(1).
    Original {
        // Raw bytes of the query word this postings list belongs to.
        input: Rc<[u8]>,
        postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
        offset: usize,
        len: usize,
    },
    /// A postings list that has been rebuilt (see `rewrite_with`); it owns its
    /// data and can no longer be sub-sliced with `range`.
    Rewritten {
        input: Rc<[u8]>,
        postings_list: SetBuf<DocIndex>,
    },
}
|
|
|
|
|
2019-12-09 15:30:14 +01:00
|
|
|
impl fmt::Debug for PostingsListView<'_> {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
|
f.debug_struct("PostingsListView")
|
2019-12-10 12:19:38 +01:00
|
|
|
.field("input", &std::str::from_utf8(&self.input()).unwrap())
|
2019-12-09 15:30:14 +01:00
|
|
|
.field("postings_list", &self.as_ref())
|
|
|
|
.finish()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-11-30 16:53:34 +01:00
|
|
|
impl<'txn> PostingsListView<'txn> {
|
2019-12-10 12:19:38 +01:00
|
|
|
pub fn original(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
|
2019-11-30 16:53:34 +01:00
|
|
|
let len = postings_list.len();
|
2019-12-10 12:19:38 +01:00
|
|
|
PostingsListView::Original { input, postings_list, offset: 0, len }
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf<DocIndex>) -> PostingsListView<'txn> {
|
|
|
|
PostingsListView::Rewritten { input, postings_list }
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
|
2019-12-11 15:34:30 +01:00
|
|
|
let input = match self {
|
|
|
|
PostingsListView::Original { input, .. } => input.clone(),
|
|
|
|
PostingsListView::Rewritten { input, .. } => input.clone(),
|
2019-12-10 12:19:38 +01:00
|
|
|
};
|
2019-12-11 15:34:30 +01:00
|
|
|
*self = PostingsListView::rewritten(input, postings_list);
|
2019-11-30 16:53:34 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn len(&self) -> usize {
|
2019-12-10 12:19:38 +01:00
|
|
|
match self {
|
|
|
|
PostingsListView::Original { len, .. } => *len,
|
|
|
|
PostingsListView::Rewritten { postings_list, .. } => postings_list.len(),
|
|
|
|
}
|
2019-11-30 16:53:34 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn input(&self) -> &[u8] {
|
2019-12-10 12:19:38 +01:00
|
|
|
match self {
|
|
|
|
PostingsListView::Original { ref input, .. } => input,
|
|
|
|
PostingsListView::Rewritten { ref input, .. } => input,
|
|
|
|
}
|
2019-11-30 16:53:34 +01:00
|
|
|
}
|
|
|
|
|
2019-12-10 12:19:38 +01:00
|
|
|
pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> {
|
|
|
|
match self {
|
|
|
|
PostingsListView::Original { input, postings_list, offset, len } => {
|
|
|
|
assert!(range_offset + range_len <= *len);
|
|
|
|
PostingsListView::Original {
|
|
|
|
input: input.clone(),
|
|
|
|
postings_list: postings_list.clone(),
|
|
|
|
offset: offset + range_offset,
|
|
|
|
len: range_len,
|
|
|
|
}
|
|
|
|
},
|
|
|
|
PostingsListView::Rewritten { .. } => {
|
|
|
|
panic!("Cannot create a range on a rewritten postings list view");
|
|
|
|
}
|
2019-11-30 16:53:34 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
|
|
|
|
fn as_ref(&self) -> &Set<DocIndex> {
|
2019-12-10 12:19:38 +01:00
|
|
|
self
|
2019-11-30 16:53:34 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Deref for PostingsListView<'_> {
|
|
|
|
type Target = Set<DocIndex>;
|
|
|
|
|
|
|
|
fn deref(&self) -> &Set<DocIndex> {
|
2019-12-10 12:19:38 +01:00
|
|
|
match *self {
|
|
|
|
PostingsListView::Original { ref postings_list, offset, len, .. } => {
|
|
|
|
Set::new_unchecked(&postings_list[offset..offset + len])
|
|
|
|
},
|
|
|
|
PostingsListView::Rewritten { ref postings_list, .. } => postings_list,
|
|
|
|
}
|
2019-11-30 16:53:34 +01:00
|
|
|
}
|
|
|
|
}
|