117 lines
2.9 KiB
Rust
117 lines
2.9 KiB
Rust
use std::cmp::Ordering;
|
|
|
|
use slice_group_by::GroupBy;
|
|
|
|
use crate::criterion::Criterion;
|
|
use crate::RawDocument;
|
|
|
|
// This function is a wrong logarithmic 10 function.
|
|
// It is safe to panic on input number higher than 3,
|
|
// the number of typos is never bigger than that.
|
|
#[inline]
|
|
fn custom_log10(n: u8) -> f32 {
|
|
match n {
|
|
0 => 0.0, // log(1)
|
|
1 => 0.30102, // log(2)
|
|
2 => 0.47712, // log(3)
|
|
3 => 0.60205, // log(4)
|
|
_ => panic!("invalid number"),
|
|
}
|
|
}
|
|
|
|
#[inline]
|
|
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
|
|
let mut number_words: usize = 0;
|
|
let mut sum_typos = 0.0;
|
|
let mut index = 0;
|
|
|
|
for group in query_index.linear_group() {
|
|
sum_typos += custom_log10(distance[index]);
|
|
number_words += 1;
|
|
index += group.len();
|
|
}
|
|
|
|
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
|
|
}
|
|
|
|
#[derive(Debug, Clone, Copy)]
|
|
pub struct SumOfTypos;
|
|
|
|
impl Criterion for SumOfTypos {
|
|
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
|
|
let lhs = {
|
|
let query_index = lhs.query_index();
|
|
let distance = lhs.distance();
|
|
sum_matches_typos(query_index, distance)
|
|
};
|
|
|
|
let rhs = {
|
|
let query_index = rhs.query_index();
|
|
let distance = rhs.distance();
|
|
sum_matches_typos(query_index, distance)
|
|
};
|
|
|
|
lhs.cmp(&rhs).reverse()
|
|
}
|
|
|
|
fn name(&self) -> &str {
|
|
"SumOfTypos"
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
// typing: "Geox CEO"
|
|
//
|
|
// doc0: "Geox SpA: CEO and Executive"
|
|
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
|
|
#[test]
|
|
fn one_typo_reference() {
|
|
let query_index0 = &[0, 1];
|
|
let distance0 = &[0, 0];
|
|
|
|
let query_index1 = &[0, 1];
|
|
let distance1 = &[1, 0];
|
|
|
|
let doc0 = sum_matches_typos(query_index0, distance0);
|
|
let doc1 = sum_matches_typos(query_index1, distance1);
|
|
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
|
}
|
|
|
|
// typing: "bouton manchette"
|
|
//
|
|
// doc0: "bouton manchette"
|
|
// doc1: "bouton"
|
|
#[test]
|
|
fn no_typo() {
|
|
let query_index0 = &[0, 1];
|
|
let distance0 = &[0, 0];
|
|
|
|
let query_index1 = &[0];
|
|
let distance1 = &[0];
|
|
|
|
let doc0 = sum_matches_typos(query_index0, distance0);
|
|
let doc1 = sum_matches_typos(query_index1, distance1);
|
|
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
|
}
|
|
|
|
// typing: "bouton manchztte"
|
|
//
|
|
// doc0: "bouton manchette"
|
|
// doc1: "bouton"
|
|
#[test]
|
|
fn one_typo() {
|
|
let query_index0 = &[0, 1];
|
|
let distance0 = &[0, 1];
|
|
|
|
let query_index1 = &[0];
|
|
let distance1 = &[0];
|
|
|
|
let doc0 = sum_matches_typos(query_index0, distance0);
|
|
let doc1 = sum_matches_typos(query_index1, distance1);
|
|
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
|
|
}
|
|
}
|