mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
Update the criteria to the new ones
This commit is contained in:
parent
ea148575cf
commit
248ccfc0d8
20 changed files with 693 additions and 1775 deletions
48
meilisearch-core/src/criterion/attribute.rs
Normal file
48
meilisearch-core/src/criterion/attribute.rs
Normal file
|
@ -0,0 +1,48 @@
|
|||
use std::cmp::{self, Ordering};
|
||||
|
||||
use compact_arena::SmallArena;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::automaton::QueryEnhancer;
|
||||
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
|
||||
use crate::RawDocument;
|
||||
|
||||
use super::{Criterion, prepare_raw_matches};
|
||||
|
||||
pub struct Attribute;
|
||||
|
||||
impl Criterion for Attribute {
|
||||
fn name(&self) -> &str { "attribute" }
|
||||
|
||||
fn prepare<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
) {
|
||||
prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
|
||||
}
|
||||
|
||||
fn evaluate<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
lhs: &RawDocument<'a, 'tag>,
|
||||
rhs: &RawDocument<'a, 'tag>,
|
||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
) -> Ordering
|
||||
{
|
||||
#[inline]
|
||||
fn best_attribute(matches: &[SimpleMatch]) -> u16 {
|
||||
let mut best_attribute = u16::max_value();
|
||||
for group in matches.linear_group_by_key(|bm| bm.query_index) {
|
||||
best_attribute = cmp::min(best_attribute, group[0].attribute);
|
||||
}
|
||||
best_attribute
|
||||
}
|
||||
|
||||
let lhs = best_attribute(&lhs.processed_matches);
|
||||
let rhs = best_attribute(&rhs.processed_matches);
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
}
|
|
@ -1,16 +1,37 @@
|
|||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
use compact_arena::SmallArena;
|
||||
|
||||
use crate::automaton::QueryEnhancer;
|
||||
use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
|
||||
use crate::RawDocument;
|
||||
use super::Criterion;
|
||||
|
||||
pub struct DocumentId;
|
||||
|
||||
impl Criterion for DocumentId {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
lhs.id.cmp(&rhs.id)
|
||||
fn name(&self) -> &str { "stable document id" }
|
||||
|
||||
fn prepare(
|
||||
&self,
|
||||
documents: &mut [RawDocument],
|
||||
postings_lists: &mut SmallArena<PostingsListView>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
) {
|
||||
// ...
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"DocumentId"
|
||||
fn evaluate(
|
||||
&self,
|
||||
lhs: &RawDocument,
|
||||
rhs: &RawDocument,
|
||||
postings_lists: &SmallArena<PostingsListView>,
|
||||
) -> Ordering
|
||||
{
|
||||
let lhs = &lhs.id;
|
||||
let rhs = &rhs.id;
|
||||
|
||||
lhs.cmp(rhs)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,131 +1,51 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
|
||||
use sdset::Set;
|
||||
use compact_arena::SmallArena;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::criterion::Criterion;
|
||||
use crate::{AttrCount, RawDocument};
|
||||
use crate::automaton::QueryEnhancer;
|
||||
use crate::bucket_sort::{PostingsListView, BareMatch, QueryWordAutomaton};
|
||||
use crate::RawDocument;
|
||||
use super::Criterion;
|
||||
|
||||
#[inline]
|
||||
fn number_exact_matches(
|
||||
query_index: &[u32],
|
||||
attribute: &[u16],
|
||||
is_exact: &[bool],
|
||||
fields_counts: &Set<AttrCount>,
|
||||
) -> usize {
|
||||
let mut count = 0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
let len = group.len();
|
||||
|
||||
let mut found_exact = false;
|
||||
for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() {
|
||||
if *is_exact {
|
||||
found_exact = true;
|
||||
let attr = &attribute[index + pos];
|
||||
if let Ok(pos) = fields_counts.binary_search_by_key(attr, |ac| ac.attr) {
|
||||
let AttrCount { count, .. } = fields_counts[pos];
|
||||
if count == 1 {
|
||||
return usize::max_value();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
count += found_exact as usize;
|
||||
index += len;
|
||||
}
|
||||
|
||||
count
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct Exact;
|
||||
|
||||
impl Criterion for Exact {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let is_exact = lhs.is_exact();
|
||||
let attribute = lhs.attribute();
|
||||
let fields_counts = lhs.fields_counts.as_ref().unwrap();
|
||||
fn name(&self) -> &str { "exact" }
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
fn prepare(
|
||||
&self,
|
||||
documents: &mut [RawDocument],
|
||||
postings_lists: &mut SmallArena<PostingsListView>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
) {
|
||||
for document in documents {
|
||||
document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
|
||||
}
|
||||
}
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let is_exact = rhs.is_exact();
|
||||
let attribute = rhs.attribute();
|
||||
let fields_counts = rhs.fields_counts.as_ref().unwrap();
|
||||
fn evaluate(
|
||||
&self,
|
||||
lhs: &RawDocument,
|
||||
rhs: &RawDocument,
|
||||
postings_lists: &SmallArena<PostingsListView>,
|
||||
) -> Ordering
|
||||
{
|
||||
#[inline]
|
||||
fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
|
||||
let mut sum_exact_query_words = 0;
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
for group in matches.linear_group_by_key(|bm| bm.query_index) {
|
||||
sum_exact_query_words += group[0].is_exact as usize;
|
||||
}
|
||||
|
||||
sum_exact_query_words
|
||||
}
|
||||
|
||||
let lhs = sum_exact_query_words(&lhs.raw_matches);
|
||||
let rhs = sum_exact_query_words(&rhs.raw_matches);
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"Exact"
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// typing: "soulier"
|
||||
//
|
||||
// doc0: "Soulier bleu"
|
||||
// doc1: "souliereres rouge"
|
||||
#[test]
|
||||
fn easy_case() {
|
||||
let doc0 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
let doc1 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[false];
|
||||
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||
}
|
||||
|
||||
// typing: "soulier"
|
||||
//
|
||||
// doc0: { 0. "soulier" }
|
||||
// doc1: { 0. "soulier bleu et blanc" }
|
||||
#[test]
|
||||
fn basic() {
|
||||
let doc0 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
let doc1 = {
|
||||
let query_index = &[0];
|
||||
let attribute = &[0];
|
||||
let is_exact = &[true];
|
||||
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap();
|
||||
|
||||
number_exact_matches(query_index, attribute, is_exact, fields_counts)
|
||||
};
|
||||
|
||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,58 +1,58 @@
|
|||
mod document_id;
|
||||
mod exact;
|
||||
mod number_of_words;
|
||||
mod sort_by_attr;
|
||||
mod sum_of_typos;
|
||||
mod sum_of_words_attribute;
|
||||
mod sum_of_words_position;
|
||||
mod words_proximity;
|
||||
use std::cmp::{self, Ordering};
|
||||
|
||||
use compact_arena::SmallArena;
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::automaton::QueryEnhancer;
|
||||
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
|
||||
use crate::RawDocument;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
pub use self::{
|
||||
document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords,
|
||||
sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos,
|
||||
sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition,
|
||||
words_proximity::WordsProximity,
|
||||
};
|
||||
mod typo;
|
||||
mod words;
|
||||
mod proximity;
|
||||
mod attribute;
|
||||
mod words_position;
|
||||
mod exact;
|
||||
mod document_id;
|
||||
mod sort_by_attr;
|
||||
|
||||
pub trait Criterion: Send + Sync {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
|
||||
pub use self::typo::Typo;
|
||||
pub use self::words::Words;
|
||||
pub use self::proximity::Proximity;
|
||||
pub use self::attribute::Attribute;
|
||||
pub use self::words_position::WordsPosition;
|
||||
pub use self::exact::Exact;
|
||||
pub use self::document_id::DocumentId;
|
||||
pub use self::sort_by_attr::SortByAttr;
|
||||
|
||||
pub trait Criterion {
|
||||
fn name(&self) -> &str;
|
||||
|
||||
fn prepare<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
);
|
||||
|
||||
fn evaluate<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
lhs: &RawDocument<'a, 'tag>,
|
||||
rhs: &RawDocument<'a, 'tag>,
|
||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
) -> Ordering;
|
||||
|
||||
#[inline]
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
self.evaluate(lhs, rhs) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
(**self).evaluate(lhs, rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
(**self).name()
|
||||
}
|
||||
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
(**self).eq(lhs, rhs)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Criterion + ?Sized> Criterion for Box<T> {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
(**self).evaluate(lhs, rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
(**self).name()
|
||||
}
|
||||
|
||||
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
|
||||
(**self).eq(lhs, rhs)
|
||||
fn eq<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
lhs: &RawDocument<'a, 'tag>,
|
||||
rhs: &RawDocument<'a, 'tag>,
|
||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
) -> bool
|
||||
{
|
||||
self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -103,11 +103,11 @@ pub struct Criteria<'a> {
|
|||
impl<'a> Default for Criteria<'a> {
|
||||
fn default() -> Self {
|
||||
CriteriaBuilder::with_capacity(7)
|
||||
.add(SumOfTypos)
|
||||
.add(NumberOfWords)
|
||||
.add(WordsProximity)
|
||||
.add(SumOfWordsAttribute)
|
||||
.add(SumOfWordsPosition)
|
||||
.add(Typo)
|
||||
.add(Words)
|
||||
.add(Proximity)
|
||||
.add(Attribute)
|
||||
.add(WordsPosition)
|
||||
.add(Exact)
|
||||
.add(DocumentId)
|
||||
.build()
|
||||
|
@ -119,3 +119,165 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
|
|||
&self.inner
|
||||
}
|
||||
}
|
||||
|
||||
fn prepare_query_distances<'a, 'tag, 'txn>(
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
) {
|
||||
for document in documents {
|
||||
if !document.processed_distances.is_empty() { continue }
|
||||
|
||||
let mut processed = Vec::new();
|
||||
for m in document.raw_matches.iter() {
|
||||
if postings_lists[m.postings_list].is_empty() { continue }
|
||||
|
||||
let range = query_enhancer.replacement(m.query_index as u32);
|
||||
let new_len = cmp::max(range.end as usize, processed.len());
|
||||
processed.resize(new_len, None);
|
||||
|
||||
for index in range {
|
||||
let index = index as usize;
|
||||
processed[index] = match processed[index] {
|
||||
Some(distance) if distance > m.distance => Some(m.distance),
|
||||
Some(distance) => Some(distance),
|
||||
None => Some(m.distance),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
document.processed_distances = processed;
|
||||
}
|
||||
}
|
||||
|
||||
fn prepare_raw_matches<'a, 'tag, 'txn>(
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
) {
|
||||
for document in documents {
|
||||
if !document.processed_matches.is_empty() { continue }
|
||||
|
||||
let mut processed = Vec::new();
|
||||
for m in document.raw_matches.iter() {
|
||||
let postings_list = &postings_lists[m.postings_list];
|
||||
processed.reserve(postings_list.len());
|
||||
for di in postings_list.as_ref() {
|
||||
let simple_match = SimpleMatch {
|
||||
query_index: m.query_index,
|
||||
distance: m.distance,
|
||||
attribute: di.attribute,
|
||||
word_index: di.word_index,
|
||||
is_exact: m.is_exact,
|
||||
};
|
||||
processed.push(simple_match);
|
||||
}
|
||||
}
|
||||
|
||||
let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons);
|
||||
document.processed_matches = processed.into_vec();
|
||||
}
|
||||
}
|
||||
|
||||
fn multiword_rewrite_matches(
|
||||
matches: &mut [SimpleMatch],
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
) -> SetBuf<SimpleMatch>
|
||||
{
|
||||
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
|
||||
|
||||
let mut padded_matches = Vec::with_capacity(matches.len());
|
||||
|
||||
// let before_padding = Instant::now();
|
||||
// for each attribute of each document
|
||||
for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
|
||||
// padding will only be applied
|
||||
// to word indices in the same attribute
|
||||
let mut padding = 0;
|
||||
let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
|
||||
|
||||
// for each match at the same position
|
||||
// in this document attribute
|
||||
while let Some(same_word_index) = iter.next() {
|
||||
// find the biggest padding
|
||||
let mut biggest = 0;
|
||||
for match_ in same_word_index {
|
||||
let mut replacement = query_enhancer.replacement(match_.query_index as u32);
|
||||
let replacement_len = replacement.len();
|
||||
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
|
||||
|
||||
if let Some(query_index) = replacement.next() {
|
||||
let word_index = match_.word_index + padding as u16;
|
||||
let query_index = query_index as u16;
|
||||
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
||||
padded_matches.push(match_);
|
||||
}
|
||||
|
||||
let mut found = false;
|
||||
|
||||
// look ahead and if there already is a match
|
||||
// corresponding to this padding word, abort the padding
|
||||
'padding: for (x, next_group) in nexts.enumerate() {
|
||||
for (i, query_index) in replacement.clone().enumerate().skip(x) {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let query_index = query_index as u16;
|
||||
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
|
||||
|
||||
for nmatch_ in next_group {
|
||||
let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
|
||||
let query_index = rep.next().unwrap() as u16;
|
||||
if query_index == padmatch.query_index {
|
||||
if !found {
|
||||
// if we find a corresponding padding for the
|
||||
// first time we must push preceding paddings
|
||||
for (i, query_index) in replacement.clone().enumerate().take(i)
|
||||
{
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let query_index = query_index as u16;
|
||||
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
||||
padded_matches.push(match_);
|
||||
biggest = biggest.max(i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
padded_matches.push(padmatch);
|
||||
found = true;
|
||||
continue 'padding;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if we do not find a corresponding padding in the
|
||||
// next groups so stop here and pad what was found
|
||||
break;
|
||||
}
|
||||
|
||||
if !found {
|
||||
// if no padding was found in the following matches
|
||||
// we must insert the entire padding
|
||||
for (i, query_index) in replacement.enumerate() {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let query_index = query_index as u16;
|
||||
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
||||
padded_matches.push(match_);
|
||||
}
|
||||
|
||||
biggest = biggest.max(replacement_len - 1);
|
||||
}
|
||||
}
|
||||
|
||||
padding += biggest;
|
||||
}
|
||||
}
|
||||
|
||||
// debug!("padding matches took {:.02?}", before_padding.elapsed());
|
||||
|
||||
// With this check we can see that the loop above takes something
|
||||
// like 43% of the search time even when no rewrite is needed.
|
||||
// assert_eq!(before_matches, padded_matches);
|
||||
|
||||
SetBuf::from_dirty(padded_matches)
|
||||
}
|
||||
|
|
|
@ -1,31 +0,0 @@
|
|||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[inline]
|
||||
fn number_of_query_words(query_index: &[u32]) -> usize {
|
||||
query_index.linear_group().count()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct NumberOfWords;
|
||||
|
||||
impl Criterion for NumberOfWords {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
number_of_query_words(query_index)
|
||||
};
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
number_of_query_words(query_index)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"NumberOfWords"
|
||||
}
|
||||
}
|
79
meilisearch-core/src/criterion/proximity.rs
Normal file
79
meilisearch-core/src/criterion/proximity.rs
Normal file
|
@ -0,0 +1,79 @@
|
|||
use std::cmp::{self, Ordering};
|
||||
|
||||
use compact_arena::SmallArena;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::automaton::QueryEnhancer;
|
||||
use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton};
|
||||
use crate::RawDocument;
|
||||
|
||||
use super::{Criterion, prepare_raw_matches};
|
||||
|
||||
pub struct Proximity;
|
||||
|
||||
impl Criterion for Proximity {
|
||||
fn name(&self) -> &str { "proximity" }
|
||||
|
||||
fn prepare<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
) {
|
||||
prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
|
||||
}
|
||||
|
||||
fn evaluate<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
lhs: &RawDocument<'a, 'tag>,
|
||||
rhs: &RawDocument<'a, 'tag>,
|
||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
) -> Ordering
|
||||
{
|
||||
const MAX_DISTANCE: u16 = 8;
|
||||
|
||||
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
|
||||
if lhs < rhs {
|
||||
cmp::min(rhs - lhs, MAX_DISTANCE)
|
||||
} else {
|
||||
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
|
||||
}
|
||||
}
|
||||
|
||||
fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
|
||||
if lhs.attribute != rhs.attribute { MAX_DISTANCE }
|
||||
else { index_proximity(lhs.word_index, rhs.word_index) }
|
||||
}
|
||||
|
||||
fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
|
||||
let mut min_prox = u16::max_value();
|
||||
for a in lhs {
|
||||
for b in rhs {
|
||||
let prox = attribute_proximity(*a, *b);
|
||||
min_prox = cmp::min(min_prox, prox);
|
||||
}
|
||||
}
|
||||
min_prox
|
||||
}
|
||||
|
||||
fn matches_proximity(matches: &[SimpleMatch],) -> u16 {
|
||||
let mut proximity = 0;
|
||||
let mut iter = matches.linear_group_by_key(|m| m.query_index);
|
||||
|
||||
// iterate over groups by windows of size 2
|
||||
let mut last = iter.next();
|
||||
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
|
||||
proximity += min_proximity(lhs, rhs);
|
||||
last = Some(rhs);
|
||||
}
|
||||
|
||||
proximity
|
||||
}
|
||||
|
||||
let lhs = matches_proximity(&lhs.processed_matches);
|
||||
let rhs = matches_proximity(&rhs.processed_matches);
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
}
|
|
@ -2,9 +2,13 @@ use std::cmp::Ordering;
|
|||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use compact_arena::SmallArena;
|
||||
use meilisearch_schema::{Schema, SchemaAttr};
|
||||
|
||||
use crate::automaton::QueryEnhancer;
|
||||
use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
|
||||
use crate::criterion::Criterion;
|
||||
use crate::{RankedMap, RawDocument};
|
||||
use meilisearch_schema::{Schema, SchemaAttr};
|
||||
|
||||
/// A helper struct that permits sorting documents by
|
||||
/// some of their stored attributes.
|
||||
|
@ -28,11 +32,11 @@ use meilisearch_schema::{Schema, SchemaAttr};
|
|||
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
|
||||
///
|
||||
/// let builder = CriteriaBuilder::with_capacity(8)
|
||||
/// .add(SumOfTypos)
|
||||
/// .add(NumberOfWords)
|
||||
/// .add(WordsProximity)
|
||||
/// .add(SumOfWordsAttribute)
|
||||
/// .add(SumOfWordsPosition)
|
||||
/// .add(Typo)
|
||||
/// .add(Words)
|
||||
/// .add(Proximity)
|
||||
/// .add(Attribute)
|
||||
/// .add(WordsPosition)
|
||||
/// .add(Exact)
|
||||
/// .add(custom_ranking)
|
||||
/// .add(DocumentId);
|
||||
|
@ -86,8 +90,28 @@ impl<'a> SortByAttr<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> Criterion for SortByAttr<'a> {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
impl Criterion for SortByAttr<'_> {
|
||||
fn name(&self) -> &str {
|
||||
"sort by attribute"
|
||||
}
|
||||
|
||||
fn prepare<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
) {
|
||||
// ...
|
||||
}
|
||||
|
||||
fn evaluate<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
lhs: &RawDocument<'a, 'tag>,
|
||||
rhs: &RawDocument<'a, 'tag>,
|
||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
) -> Ordering
|
||||
{
|
||||
let lhs = self.ranked_map.get(lhs.id, self.attr);
|
||||
let rhs = self.ranked_map.get(rhs.id, self.attr);
|
||||
|
||||
|
@ -105,10 +129,6 @@ impl<'a> Criterion for SortByAttr<'a> {
|
|||
(None, None) => Ordering::Equal,
|
||||
}
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"SortByAttr"
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
|
|
|
@ -1,116 +0,0 @@
|
|||
use std::cmp::Ordering;
|
||||
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
|
||||
// This function is a wrong logarithmic 10 function.
|
||||
// It is safe to panic on input number higher than 3,
|
||||
// the number of typos is never bigger than that.
|
||||
#[inline]
|
||||
fn custom_log10(n: u8) -> f32 {
|
||||
match n {
|
||||
0 => 0.0, // log(1)
|
||||
1 => 0.30102, // log(2)
|
||||
2 => 0.47712, // log(3)
|
||||
3 => 0.60205, // log(4)
|
||||
_ => panic!("invalid number"),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
|
||||
let mut number_words: usize = 0;
|
||||
let mut sum_typos = 0.0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
sum_typos += custom_log10(distance[index]);
|
||||
number_words += 1;
|
||||
index += group.len();
|
||||
}
|
||||
|
||||
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SumOfTypos;
|
||||
|
||||
impl Criterion for SumOfTypos {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let distance = lhs.distance();
|
||||
sum_matches_typos(query_index, distance)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let distance = rhs.distance();
|
||||
sum_matches_typos(query_index, distance)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"SumOfTypos"
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// typing: "Geox CEO"
|
||||
//
|
||||
// doc0: "Geox SpA: CEO and Executive"
|
||||
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
|
||||
#[test]
|
||||
fn one_typo_reference() {
|
||||
let query_index0 = &[0, 1];
|
||||
let distance0 = &[0, 0];
|
||||
|
||||
let query_index1 = &[0, 1];
|
||||
let distance1 = &[1, 0];
|
||||
|
||||
let doc0 = sum_matches_typos(query_index0, distance0);
|
||||
let doc1 = sum_matches_typos(query_index1, distance1);
|
||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||
}
|
||||
|
||||
// typing: "bouton manchette"
|
||||
//
|
||||
// doc0: "bouton manchette"
|
||||
// doc1: "bouton"
|
||||
#[test]
|
||||
fn no_typo() {
|
||||
let query_index0 = &[0, 1];
|
||||
let distance0 = &[0, 0];
|
||||
|
||||
let query_index1 = &[0];
|
||||
let distance1 = &[0];
|
||||
|
||||
let doc0 = sum_matches_typos(query_index0, distance0);
|
||||
let doc1 = sum_matches_typos(query_index1, distance1);
|
||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||
}
|
||||
|
||||
// typing: "bouton manchztte"
|
||||
//
|
||||
// doc0: "bouton manchette"
|
||||
// doc1: "bouton"
|
||||
#[test]
|
||||
fn one_typo() {
|
||||
let query_index0 = &[0, 1];
|
||||
let distance0 = &[0, 1];
|
||||
|
||||
let query_index1 = &[0];
|
||||
let distance1 = &[0];
|
||||
|
||||
let doc0 = sum_matches_typos(query_index0, distance0);
|
||||
let doc1 = sum_matches_typos(query_index1, distance1);
|
||||
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
||||
}
|
||||
}
|
|
@ -1,64 +0,0 @@
|
|||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
|
||||
let mut sum_attributes = 0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
sum_attributes += attribute[index] as usize;
|
||||
index += group.len();
|
||||
}
|
||||
|
||||
sum_attributes
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SumOfWordsAttribute;
|
||||
|
||||
impl Criterion for SumOfWordsAttribute {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let attribute = lhs.attribute();
|
||||
sum_matches_attributes(query_index, attribute)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let attribute = rhs.attribute();
|
||||
sum_matches_attributes(query_index, attribute)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"SumOfWordsAttribute"
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// typing: "soulier"
|
||||
//
|
||||
// doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
|
||||
// doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
|
||||
#[test]
|
||||
fn title_vs_description() {
|
||||
let query_index0 = &[0];
|
||||
let attribute0 = &[0];
|
||||
|
||||
let query_index1 = &[0];
|
||||
let attribute1 = &[1];
|
||||
|
||||
let doc0 = sum_matches_attributes(query_index0, attribute0);
|
||||
let doc1 = sum_matches_attributes(query_index1, attribute1);
|
||||
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
|
||||
}
|
||||
}
|
|
@ -1,64 +0,0 @@
|
|||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
|
||||
let mut sum_word_index = 0;
|
||||
let mut index = 0;
|
||||
|
||||
for group in query_index.linear_group() {
|
||||
sum_word_index += word_index[index] as usize;
|
||||
index += group.len();
|
||||
}
|
||||
|
||||
sum_word_index
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct SumOfWordsPosition;
|
||||
|
||||
impl Criterion for SumOfWordsPosition {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let word_index = lhs.word_index();
|
||||
sum_matches_attribute_index(query_index, word_index)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let word_index = rhs.word_index();
|
||||
sum_matches_attribute_index(query_index, word_index)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"SumOfWordsPosition"
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// typing: "soulier"
|
||||
//
|
||||
// doc0: "Soulier bleu"
|
||||
// doc1: "Botte rouge et soulier noir"
|
||||
#[test]
|
||||
fn easy_case() {
|
||||
let query_index0 = &[0];
|
||||
let word_index0 = &[0];
|
||||
|
||||
let query_index1 = &[0];
|
||||
let word_index1 = &[3];
|
||||
|
||||
let doc0 = sum_matches_attribute_index(query_index0, word_index0);
|
||||
let doc1 = sum_matches_attribute_index(query_index1, word_index1);
|
||||
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
|
||||
}
|
||||
}
|
67
meilisearch-core/src/criterion/typo.rs
Normal file
67
meilisearch-core/src/criterion/typo.rs
Normal file
|
@ -0,0 +1,67 @@
|
|||
use std::cmp::Ordering;
|
||||
|
||||
use compact_arena::SmallArena;
|
||||
|
||||
use crate::automaton::QueryEnhancer;
|
||||
use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
|
||||
use crate::RawDocument;
|
||||
|
||||
use super::{Criterion, prepare_query_distances};
|
||||
|
||||
pub struct Typo;
|
||||
|
||||
impl Criterion for Typo {
|
||||
fn name(&self) -> &str { "typo" }
|
||||
|
||||
fn prepare<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
) {
|
||||
prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
|
||||
}
|
||||
|
||||
fn evaluate(
|
||||
&self,
|
||||
lhs: &RawDocument,
|
||||
rhs: &RawDocument,
|
||||
postings_lists: &SmallArena<PostingsListView>,
|
||||
) -> Ordering
|
||||
{
|
||||
// This function is a wrong logarithmic 10 function.
|
||||
// It is safe to panic on input number higher than 3,
|
||||
// the number of typos is never bigger than that.
|
||||
#[inline]
|
||||
fn custom_log10(n: u8) -> f32 {
|
||||
match n {
|
||||
0 => 0.0, // log(1)
|
||||
1 => 0.30102, // log(2)
|
||||
2 => 0.47712, // log(3)
|
||||
3 => 0.60205, // log(4)
|
||||
_ => panic!("invalid number"),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn compute_typos(distances: &[Option<u8>]) -> usize {
|
||||
let mut number_words: usize = 0;
|
||||
let mut sum_typos = 0.0;
|
||||
|
||||
for distance in distances {
|
||||
if let Some(distance) = distance {
|
||||
sum_typos += custom_log10(*distance);
|
||||
number_words += 1;
|
||||
}
|
||||
}
|
||||
|
||||
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
|
||||
}
|
||||
|
||||
let lhs = compute_typos(&lhs.processed_distances);
|
||||
let rhs = compute_typos(&rhs.processed_distances);
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
}
|
43
meilisearch-core/src/criterion/words.rs
Normal file
43
meilisearch-core/src/criterion/words.rs
Normal file
|
@ -0,0 +1,43 @@
|
|||
use std::cmp::Ordering;
|
||||
|
||||
use compact_arena::SmallArena;
|
||||
|
||||
use crate::automaton::QueryEnhancer;
|
||||
use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
|
||||
use crate::RawDocument;
|
||||
|
||||
use super::{Criterion, prepare_query_distances};
|
||||
|
||||
pub struct Words;
|
||||
|
||||
impl Criterion for Words {
|
||||
fn name(&self) -> &str { "words" }
|
||||
|
||||
fn prepare<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
) {
|
||||
prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
|
||||
}
|
||||
|
||||
fn evaluate(
|
||||
&self,
|
||||
lhs: &RawDocument,
|
||||
rhs: &RawDocument,
|
||||
postings_lists: &SmallArena<PostingsListView>,
|
||||
) -> Ordering
|
||||
{
|
||||
#[inline]
|
||||
fn number_of_query_words(distances: &[Option<u8>]) -> usize {
|
||||
distances.iter().cloned().filter(Option::is_some).count()
|
||||
}
|
||||
|
||||
let lhs = number_of_query_words(&lhs.processed_distances);
|
||||
let rhs = number_of_query_words(&rhs.processed_distances);
|
||||
|
||||
lhs.cmp(&rhs).reverse()
|
||||
}
|
||||
}
|
48
meilisearch-core/src/criterion/words_position.rs
Normal file
48
meilisearch-core/src/criterion/words_position.rs
Normal file
|
@ -0,0 +1,48 @@
|
|||
use std::cmp::Ordering;
|
||||
|
||||
use compact_arena::SmallArena;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::automaton::QueryEnhancer;
|
||||
use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton};
|
||||
use crate::RawDocument;
|
||||
|
||||
use super::{Criterion, prepare_raw_matches};
|
||||
|
||||
pub struct WordsPosition;
|
||||
|
||||
impl Criterion for WordsPosition {
|
||||
fn name(&self) -> &str { "words position" }
|
||||
|
||||
fn prepare<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
documents: &mut [RawDocument<'a, 'tag>],
|
||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
automatons: &[QueryWordAutomaton],
|
||||
) {
|
||||
prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
|
||||
}
|
||||
|
||||
fn evaluate<'a, 'tag, 'txn>(
|
||||
&self,
|
||||
lhs: &RawDocument<'a, 'tag>,
|
||||
rhs: &RawDocument<'a, 'tag>,
|
||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||
) -> Ordering
|
||||
{
|
||||
#[inline]
|
||||
fn sum_words_position(matches: &[SimpleMatch]) -> usize {
|
||||
let mut sum_words_position = 0;
|
||||
for group in matches.linear_group_by_key(|bm| bm.query_index) {
|
||||
sum_words_position += group[0].word_index as usize;
|
||||
}
|
||||
sum_words_position
|
||||
}
|
||||
|
||||
let lhs = sum_words_position(&lhs.processed_matches);
|
||||
let rhs = sum_words_position(&rhs.processed_matches);
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
}
|
|
@ -1,164 +0,0 @@
|
|||
use crate::criterion::Criterion;
|
||||
use crate::RawDocument;
|
||||
use slice_group_by::GroupBy;
|
||||
use std::cmp::{self, Ordering};
|
||||
|
||||
/// Cap applied to word-index gaps in `index_proximity`; also the value
/// `attribute_proximity` returns when two matches are in different attributes.
const MAX_DISTANCE: u16 = 8;
|
||||
|
||||
/// Turns a pair of references into an owned pair by cloning both sides.
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
    let first = T::clone(a);
    let second = U::clone(b);
    (first, second)
}
|
||||
|
||||
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
|
||||
if lhs < rhs {
|
||||
cmp::min(rhs - lhs, MAX_DISTANCE)
|
||||
} else {
|
||||
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
|
||||
}
|
||||
}
|
||||
|
||||
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
|
||||
if lattr != rattr {
|
||||
return MAX_DISTANCE;
|
||||
}
|
||||
index_proximity(lwi, rwi)
|
||||
}
|
||||
|
||||
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
|
||||
let mut min_prox = u16::max_value();
|
||||
|
||||
for a in lattr.iter().zip(lwi) {
|
||||
for b in rattr.iter().zip(rwi) {
|
||||
let a = clone_tuple(a);
|
||||
let b = clone_tuple(b);
|
||||
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
|
||||
}
|
||||
}
|
||||
|
||||
min_prox
|
||||
}
|
||||
|
||||
fn matches_proximity(
|
||||
query_index: &[u32],
|
||||
distance: &[u8],
|
||||
attribute: &[u16],
|
||||
word_index: &[u16],
|
||||
) -> u16 {
|
||||
let mut query_index_groups = query_index.linear_group();
|
||||
let mut proximity = 0;
|
||||
let mut index = 0;
|
||||
|
||||
let get_attr_wi = |index: usize, group_len: usize| {
|
||||
// retrieve the first distance group (with the lowest values)
|
||||
let len = distance[index..index + group_len]
|
||||
.linear_group()
|
||||
.next()
|
||||
.unwrap()
|
||||
.len();
|
||||
|
||||
let rattr = &attribute[index..index + len];
|
||||
let rwi = &word_index[index..index + len];
|
||||
|
||||
(rattr, rwi)
|
||||
};
|
||||
|
||||
let mut last = query_index_groups.next().map(|group| {
|
||||
let attr_wi = get_attr_wi(index, group.len());
|
||||
index += group.len();
|
||||
attr_wi
|
||||
});
|
||||
|
||||
// iter by windows of size 2
|
||||
while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
|
||||
let attr_wi = get_attr_wi(index, rhs.len());
|
||||
proximity += min_proximity(lhs, attr_wi);
|
||||
last = Some(attr_wi);
|
||||
index += rhs.len();
|
||||
}
|
||||
|
||||
proximity
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct WordsProximity;
|
||||
|
||||
impl Criterion for WordsProximity {
|
||||
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
||||
let lhs = {
|
||||
let query_index = lhs.query_index();
|
||||
let distance = lhs.distance();
|
||||
let attribute = lhs.attribute();
|
||||
let word_index = lhs.word_index();
|
||||
matches_proximity(query_index, distance, attribute, word_index)
|
||||
};
|
||||
|
||||
let rhs = {
|
||||
let query_index = rhs.query_index();
|
||||
let distance = rhs.distance();
|
||||
let attribute = rhs.attribute();
|
||||
let word_index = rhs.word_index();
|
||||
matches_proximity(query_index, distance, attribute, word_index)
|
||||
};
|
||||
|
||||
lhs.cmp(&rhs)
|
||||
}
|
||||
|
||||
fn name(&self) -> &str {
|
||||
"WordsProximity"
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn three_different_attributes() {
        // "soup" "of the" "the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 0 }
        // { id: 2, attr: 1, attr_index: 1 }
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }

        let queries = &[0, 1, 2, 2, 3];
        let distances = &[0, 0, 0, 0, 0];
        let attributes = &[0, 1, 1, 2, 3];
        let positions = &[0, 0, 1, 0, 1];

        //   soup -> of  = 8
        // + of   -> the = 1
        // + the  -> day = 8 (not 1)
        let proximity = matches_proximity(queries, distances, attributes, positions);
        assert_eq!(proximity, 17);
    }

    #[test]
    fn two_different_attributes() {
        // "soup day" "soup of the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 0, attr: 1, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 1 }
        // { id: 2, attr: 1, attr_index: 2 }
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }

        let queries = &[0, 0, 1, 2, 3, 3];
        let distances = &[0, 0, 0, 0, 0, 0];
        let attributes = &[0, 1, 1, 1, 0, 1];
        let positions = &[0, 0, 1, 2, 1, 3];

        //   soup -> of  = 1
        // + of   -> the = 1
        // + the  -> day = 1
        let proximity = matches_proximity(queries, distances, attributes, positions);
        assert_eq!(proximity, 3);
    }
}
|
Loading…
Add table
Add a link
Reference in a new issue