mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-06-06 12:57:48 +02:00
480 lines
15 KiB
Rust
480 lines
15 KiB
Rust
use std::cmp::{self, Ordering, Reverse};
|
|
use std::borrow::Cow;
|
|
use std::sync::atomic::{self, AtomicUsize};
|
|
|
|
use slice_group_by::{GroupBy, GroupByMut};
|
|
use compact_arena::SmallArena;
|
|
use sdset::{Set, SetBuf};
|
|
|
|
use crate::{DocIndex, DocumentId};
|
|
use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView};
|
|
use crate::automaton::QueryEnhancer;
|
|
|
|
/// Arena of `PostingsListView`s shared by the criteria in this file.
/// Entries are looked up by indexing the arena with a match's `postings_list` field.
type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>;
|
|
|
|
pub trait Criterion {
|
|
fn name(&self) -> &str;
|
|
|
|
fn prepare<'a, 'tag, 'txn>(
|
|
&self,
|
|
documents: &mut [RawDocument<'a, 'tag>],
|
|
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
|
|
query_enhancer: &QueryEnhancer,
|
|
);
|
|
|
|
fn evaluate<'a, 'tag, 'txn>(
|
|
&self,
|
|
lhs: &RawDocument<'a, 'tag>,
|
|
rhs: &RawDocument<'a, 'tag>,
|
|
postings_lists: &PostingsListsArena<'tag, 'txn>,
|
|
) -> Ordering;
|
|
|
|
#[inline]
|
|
fn eq<'a, 'tag, 'txn>(
|
|
&self,
|
|
lhs: &RawDocument<'a, 'tag>,
|
|
rhs: &RawDocument<'a, 'tag>,
|
|
postings_lists: &PostingsListsArena<'tag, 'txn>,
|
|
) -> bool
|
|
{
|
|
self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal
|
|
}
|
|
}
|
|
|
|
/// The "typo" criterion: compares documents through a score built from the number
/// of matched query words and the typo distance of the first match of each word.
pub struct Typo;
|
|
|
|
impl Criterion for Typo {
|
|
fn name(&self) -> &str { "typo" }
|
|
|
|
fn prepare(
|
|
&self,
|
|
documents: &mut [RawDocument],
|
|
postings_lists: &mut PostingsListsArena,
|
|
query_enhancer: &QueryEnhancer,
|
|
) {
|
|
for document in documents {
|
|
document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, bm.distance));
|
|
}
|
|
}
|
|
|
|
fn evaluate(
|
|
&self,
|
|
lhs: &RawDocument,
|
|
rhs: &RawDocument,
|
|
postings_lists: &PostingsListsArena,
|
|
) -> Ordering
|
|
{
|
|
// This function is a wrong logarithmic 10 function.
|
|
// It is safe to panic on input number higher than 3,
|
|
// the number of typos is never bigger than that.
|
|
#[inline]
|
|
fn custom_log10(n: u8) -> f32 {
|
|
match n {
|
|
0 => 0.0, // log(1)
|
|
1 => 0.30102, // log(2)
|
|
2 => 0.47712, // log(3)
|
|
3 => 0.60205, // log(4)
|
|
_ => panic!("invalid number"),
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn compute_typos(matches: &[BareMatch]) -> usize {
|
|
let mut number_words: usize = 0;
|
|
let mut sum_typos = 0.0;
|
|
|
|
for group in matches.linear_group_by_key(|bm| bm.query_index) {
|
|
sum_typos += custom_log10(group[0].distance);
|
|
number_words += 1;
|
|
}
|
|
|
|
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
|
|
}
|
|
|
|
let lhs = compute_typos(&lhs.raw_matches);
|
|
let rhs = compute_typos(&rhs.raw_matches);
|
|
|
|
lhs.cmp(&rhs).reverse()
|
|
}
|
|
}
|
|
|
|
/// The "words" criterion: compares documents by how many distinct query indices
/// appear among their raw matches.
pub struct Words;
|
|
|
|
impl Criterion for Words {
|
|
fn name(&self) -> &str { "words" }
|
|
|
|
fn prepare(
|
|
&self,
|
|
documents: &mut [RawDocument],
|
|
postings_lists: &mut PostingsListsArena,
|
|
query_enhancer: &QueryEnhancer,
|
|
) {
|
|
for document in documents {
|
|
document.raw_matches.sort_unstable_by_key(|bm| bm.query_index);
|
|
}
|
|
}
|
|
|
|
fn evaluate(
|
|
&self,
|
|
lhs: &RawDocument,
|
|
rhs: &RawDocument,
|
|
postings_lists: &PostingsListsArena,
|
|
) -> Ordering
|
|
{
|
|
#[inline]
|
|
fn number_of_query_words(matches: &[BareMatch]) -> usize {
|
|
matches.linear_group_by_key(|bm| bm.query_index).count()
|
|
}
|
|
|
|
let lhs = number_of_query_words(&lhs.raw_matches);
|
|
let rhs = number_of_query_words(&rhs.raw_matches);
|
|
|
|
lhs.cmp(&rhs).reverse()
|
|
}
|
|
}
|
|
|
|
fn process_raw_matches<'a, 'tag, 'txn>(
|
|
documents: &mut [RawDocument<'a, 'tag>],
|
|
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
|
|
query_enhancer: &QueryEnhancer,
|
|
) {
|
|
for document in documents {
|
|
if document.processed_matches.is_some() { continue }
|
|
|
|
let mut processed = Vec::new();
|
|
let document_id = document.raw_matches[0].document_id;
|
|
|
|
for m in document.raw_matches.iter() {
|
|
let postings_list = &postings_lists[m.postings_list];
|
|
processed.reserve(postings_list.len());
|
|
for di in postings_list.as_ref() {
|
|
let simple_match = SimpleMatch {
|
|
query_index: m.query_index,
|
|
distance: m.distance,
|
|
attribute: di.attribute,
|
|
word_index: di.word_index,
|
|
is_exact: m.is_exact,
|
|
};
|
|
processed.push(simple_match);
|
|
}
|
|
}
|
|
|
|
let processed = multiword_rewrite_matches(&mut processed, query_enhancer);
|
|
document.processed_matches = Some(processed.into_vec());
|
|
}
|
|
}
|
|
|
|
/// The "proximity" criterion: compares documents by the summed minimal distance
/// between the matches of consecutive query-index groups.
pub struct Proximity;
|
|
|
|
impl Criterion for Proximity {
|
|
fn name(&self) -> &str { "proximity" }
|
|
|
|
fn prepare<'a, 'tag, 'txn>(
|
|
&self,
|
|
documents: &mut [RawDocument<'a, 'tag>],
|
|
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
|
|
query_enhancer: &QueryEnhancer,
|
|
) {
|
|
process_raw_matches(documents, postings_lists, query_enhancer);
|
|
}
|
|
|
|
fn evaluate<'a, 'tag, 'txn>(
|
|
&self,
|
|
lhs: &RawDocument<'a, 'tag>,
|
|
rhs: &RawDocument<'a, 'tag>,
|
|
postings_lists: &PostingsListsArena<'tag, 'txn>,
|
|
) -> Ordering
|
|
{
|
|
const MAX_DISTANCE: u16 = 8;
|
|
|
|
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
|
|
if lhs < rhs {
|
|
cmp::min(rhs - lhs, MAX_DISTANCE)
|
|
} else {
|
|
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
|
|
}
|
|
}
|
|
|
|
fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
|
|
if lhs.attribute != rhs.attribute { MAX_DISTANCE }
|
|
else { index_proximity(lhs.word_index, rhs.word_index) }
|
|
}
|
|
|
|
fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
|
|
let mut min_prox = u16::max_value();
|
|
for a in lhs {
|
|
for b in rhs {
|
|
let prox = attribute_proximity(*a, *b);
|
|
min_prox = cmp::min(min_prox, prox);
|
|
}
|
|
}
|
|
min_prox
|
|
}
|
|
|
|
fn matches_proximity(matches: &[SimpleMatch],) -> u16 {
|
|
let mut proximity = 0;
|
|
let mut iter = matches.linear_group_by_key(|m| m.query_index);
|
|
|
|
// iterate over groups by windows of size 2
|
|
let mut last = iter.next();
|
|
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
|
|
proximity += min_proximity(lhs, rhs);
|
|
last = Some(rhs);
|
|
}
|
|
|
|
proximity
|
|
}
|
|
|
|
let lhs = matches_proximity(&lhs.processed_matches.as_ref().unwrap());
|
|
let rhs = matches_proximity(&rhs.processed_matches.as_ref().unwrap());
|
|
|
|
lhs.cmp(&rhs)
|
|
}
|
|
}
|
|
|
|
/// The "attribute" criterion: compares documents by the sum of the `attribute`
/// value of the first processed match of each query-index group.
pub struct Attribute;
|
|
|
|
impl Criterion for Attribute {
|
|
fn name(&self) -> &str { "attribute" }
|
|
|
|
fn prepare<'a, 'tag, 'txn>(
|
|
&self,
|
|
documents: &mut [RawDocument<'a, 'tag>],
|
|
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
|
|
query_enhancer: &QueryEnhancer,
|
|
) {
|
|
process_raw_matches(documents, postings_lists, query_enhancer);
|
|
}
|
|
|
|
fn evaluate<'a, 'tag, 'txn>(
|
|
&self,
|
|
lhs: &RawDocument<'a, 'tag>,
|
|
rhs: &RawDocument<'a, 'tag>,
|
|
postings_lists: &PostingsListsArena<'tag, 'txn>,
|
|
) -> Ordering
|
|
{
|
|
#[inline]
|
|
fn sum_attribute(matches: &[SimpleMatch]) -> usize {
|
|
let mut sum_attribute = 0;
|
|
for group in matches.linear_group_by_key(|bm| bm.query_index) {
|
|
sum_attribute += group[0].attribute as usize;
|
|
}
|
|
sum_attribute
|
|
}
|
|
|
|
let lhs = sum_attribute(&lhs.processed_matches.as_ref().unwrap());
|
|
let rhs = sum_attribute(&rhs.processed_matches.as_ref().unwrap());
|
|
|
|
lhs.cmp(&rhs)
|
|
}
|
|
}
|
|
|
|
/// The "words position" criterion: compares documents by the sum of the
/// `word_index` of the first processed match of each query-index group.
pub struct WordsPosition;
|
|
|
|
impl Criterion for WordsPosition {
|
|
fn name(&self) -> &str { "words position" }
|
|
|
|
fn prepare<'a, 'tag, 'txn>(
|
|
&self,
|
|
documents: &mut [RawDocument<'a, 'tag>],
|
|
postings_lists: &mut PostingsListsArena<'tag, 'txn>,
|
|
query_enhancer: &QueryEnhancer,
|
|
) {
|
|
process_raw_matches(documents, postings_lists, query_enhancer);
|
|
}
|
|
|
|
fn evaluate<'a, 'tag, 'txn>(
|
|
&self,
|
|
lhs: &RawDocument<'a, 'tag>,
|
|
rhs: &RawDocument<'a, 'tag>,
|
|
postings_lists: &PostingsListsArena<'tag, 'txn>,
|
|
) -> Ordering
|
|
{
|
|
#[inline]
|
|
fn sum_words_position(matches: &[SimpleMatch]) -> usize {
|
|
let mut sum_words_position = 0;
|
|
for group in matches.linear_group_by_key(|bm| bm.query_index) {
|
|
sum_words_position += group[0].word_index as usize;
|
|
}
|
|
sum_words_position
|
|
}
|
|
|
|
let lhs = sum_words_position(&lhs.processed_matches.as_ref().unwrap());
|
|
let rhs = sum_words_position(&rhs.processed_matches.as_ref().unwrap());
|
|
|
|
lhs.cmp(&rhs)
|
|
}
|
|
}
|
|
|
|
/// The "exact" criterion: compares documents by how many query-index groups have
/// an exact match (`is_exact`) as their first raw match.
pub struct Exact;
|
|
|
|
impl Criterion for Exact {
|
|
fn name(&self) -> &str { "exact" }
|
|
|
|
fn prepare(
|
|
&self,
|
|
documents: &mut [RawDocument],
|
|
postings_lists: &mut PostingsListsArena,
|
|
query_enhancer: &QueryEnhancer,
|
|
) {
|
|
for document in documents {
|
|
document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
|
|
}
|
|
}
|
|
|
|
fn evaluate(
|
|
&self,
|
|
lhs: &RawDocument,
|
|
rhs: &RawDocument,
|
|
postings_lists: &PostingsListsArena,
|
|
) -> Ordering
|
|
{
|
|
#[inline]
|
|
fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
|
|
let mut sum_exact_query_words = 0;
|
|
|
|
for group in matches.linear_group_by_key(|bm| bm.query_index) {
|
|
sum_exact_query_words += group[0].is_exact as usize;
|
|
}
|
|
|
|
sum_exact_query_words
|
|
}
|
|
|
|
let lhs = sum_exact_query_words(&lhs.raw_matches);
|
|
let rhs = sum_exact_query_words(&rhs.raw_matches);
|
|
|
|
lhs.cmp(&rhs).reverse()
|
|
}
|
|
}
|
|
|
|
/// The "stable document id" criterion: compares documents by the `document_id`
/// of their first raw match.
pub struct StableDocId;
|
|
|
|
impl Criterion for StableDocId {
|
|
fn name(&self) -> &str { "stable document id" }
|
|
|
|
fn prepare(
|
|
&self,
|
|
documents: &mut [RawDocument],
|
|
postings_lists: &mut PostingsListsArena,
|
|
query_enhancer: &QueryEnhancer,
|
|
) {
|
|
// ...
|
|
}
|
|
|
|
fn evaluate(
|
|
&self,
|
|
lhs: &RawDocument,
|
|
rhs: &RawDocument,
|
|
postings_lists: &PostingsListsArena,
|
|
) -> Ordering
|
|
{
|
|
let lhs = &lhs.raw_matches[0].document_id;
|
|
let rhs = &rhs.raw_matches[0].document_id;
|
|
|
|
lhs.cmp(rhs)
|
|
}
|
|
}
|
|
|
|
pub fn multiword_rewrite_matches(
|
|
matches: &mut [SimpleMatch],
|
|
query_enhancer: &QueryEnhancer,
|
|
) -> SetBuf<SimpleMatch>
|
|
{
|
|
let mut padded_matches = Vec::with_capacity(matches.len());
|
|
|
|
// let before_sort = Instant::now();
|
|
// we sort the matches by word index to make them rewritable
|
|
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
|
|
// debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
|
|
|
|
// let before_padding = Instant::now();
|
|
// for each attribute of each document
|
|
for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
|
|
// padding will only be applied
|
|
// to word indices in the same attribute
|
|
let mut padding = 0;
|
|
let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
|
|
|
|
// for each match at the same position
|
|
// in this document attribute
|
|
while let Some(same_word_index) = iter.next() {
|
|
// find the biggest padding
|
|
let mut biggest = 0;
|
|
for match_ in same_word_index {
|
|
let mut replacement = query_enhancer.replacement(match_.query_index as u32);
|
|
let replacement_len = replacement.len();
|
|
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
|
|
|
|
if let Some(query_index) = replacement.next() {
|
|
let word_index = match_.word_index + padding as u16;
|
|
let query_index = query_index as u16;
|
|
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
|
padded_matches.push(match_);
|
|
}
|
|
|
|
let mut found = false;
|
|
|
|
// look ahead and if there already is a match
|
|
// corresponding to this padding word, abort the padding
|
|
'padding: for (x, next_group) in nexts.enumerate() {
|
|
for (i, query_index) in replacement.clone().enumerate().skip(x) {
|
|
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
|
let query_index = query_index as u16;
|
|
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
|
|
|
|
for nmatch_ in next_group {
|
|
let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
|
|
let query_index = rep.next().unwrap() as u16;
|
|
if query_index == padmatch.query_index {
|
|
if !found {
|
|
// if we find a corresponding padding for the
|
|
// first time we must push preceding paddings
|
|
for (i, query_index) in replacement.clone().enumerate().take(i)
|
|
{
|
|
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
|
let query_index = query_index as u16;
|
|
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
|
padded_matches.push(match_);
|
|
biggest = biggest.max(i + 1);
|
|
}
|
|
}
|
|
|
|
padded_matches.push(padmatch);
|
|
found = true;
|
|
continue 'padding;
|
|
}
|
|
}
|
|
}
|
|
|
|
// if we do not find a corresponding padding in the
|
|
// next groups so stop here and pad what was found
|
|
break;
|
|
}
|
|
|
|
if !found {
|
|
// if no padding was found in the following matches
|
|
// we must insert the entire padding
|
|
for (i, query_index) in replacement.enumerate() {
|
|
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
|
let query_index = query_index as u16;
|
|
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
|
padded_matches.push(match_);
|
|
}
|
|
|
|
biggest = biggest.max(replacement_len - 1);
|
|
}
|
|
}
|
|
|
|
padding += biggest;
|
|
}
|
|
}
|
|
|
|
// debug!("padding matches took {:.02?}", before_padding.elapsed());
|
|
|
|
// With this check we can see that the loop above takes something
|
|
// like 43% of the search time even when no rewrite is needed.
|
|
// assert_eq!(before_matches, padded_matches);
|
|
|
|
SetBuf::from_dirty(padded_matches)
|
|
}
|