MeiliSearch/milli/src/search/matches/matching_words.rs

use std::cmp::{min, Reverse};
use std::collections::BTreeMap;
use std::fmt;
use std::ops::{Index, IndexMut};
use charabia::Token;
use levenshtein_automata::{Distance, DFA};
use crate::search::build_dfa;
type IsPrefix = bool;
/// Structure created from a query tree,
/// referencing the words that match the given query tree.
#[derive(Default)]
pub struct MatchingWords {
inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
}
impl MatchingWords {
pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self {
// Sort entries by length in descending order, prioritizing the longest matches,
// in order to highlight the longest part of the matched word.
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
Self { inner: matching_words }
}
/// Returns an iterator over terms that match or partially match the given token.
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { inner: Box::new(self.inner.iter()), token }
}
}
/// Iterator over terms that match the given token.
/// This allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
token: &'b Token<'b>,
}
impl<'a> Iterator for MatchesIter<'a, '_> {
type Item = MatchType<'a>;
fn next(&mut self) -> Option<Self::Item> {
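// Entries are already sorted by decreasing word count and first-word length,
// so the first entry whose first word matches the token wins; entries that
// don't match are skipped by recursing into `self.next()`.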
match self.inner.next() {
Some((matching_words, ids)) => match matching_words[0].match_token(&self.token) {
Some(char_len) => {
if matching_words.len() > 1 {
Some(MatchType::Partial(PartialMatch {
matching_words: &matching_words[1..],
ids,
char_len,
}))
} else {
Some(MatchType::Full { char_len, ids })
}
}
None => self.next(),
},
None => None,
}
}
}
/// Id of a matching term corresponding to a word written by the end user.
pub type PrimitiveWordId = u8;
/// Structure used to match a specific term.
pub struct MatchingWord {
pub dfa: DFA,
pub word: String,
pub typo: u8,
pub prefix: IsPrefix,
}
impl fmt::Debug for MatchingWord {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("MatchingWord")
.field("word", &self.word)
.field("typo", &self.typo)
.field("prefix", &self.prefix)
.finish()
}
}
impl PartialEq for MatchingWord {
fn eq(&self, other: &Self) -> bool {
self.prefix == other.prefix && self.typo == other.typo && self.word == other.word
}
}
impl MatchingWord {
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self {
let dfa = build_dfa(&word, typo, prefix);
Self { dfa, word, typo, prefix }
}
/// Returns the length in chars of the match if the token matches the term.
pub fn match_token(&self, token: &Token) -> Option<usize> {
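// Evaluate the token's lemma against the Levenshtein DFA built from the query
// word; the distance is the number of typos needed to reach the word, and only
// tokens within the allowed typo budget are accepted.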
match self.dfa.eval(token.lemma()) {
Distance::Exact(t) if t <= self.typo => {
if self.prefix {
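// For a prefix query word the token may be longer than the word itself:
// only the leading part of the token that lines up with the query word
// is highlighted.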
let len = bytes_to_highlight(token.lemma(), &self.word);
Some(token.original_lengths(len).0)
} else {
Some(token.original_lengths(token.lemma().len()).0)
}
}
_otherwise => None,
}
}
}
/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases, several consecutive tokens must match before the match is considered full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_len: usize, ids: &'a [PrimitiveWordId] },
Partial(PartialMatch<'a>),
}
/// Helper structure to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: &'a [MatchingWord],
ids: &'a [PrimitiveWordId],
char_len: usize,
}
impl<'a> PartialMatch<'a> {
/// Returns:
/// - None if the given token breaks the partial match
/// - Partial if the given token matches the partial match but doesn't complete it
/// - Full if the given token completes the partial match
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
self.matching_words[0].match_token(token).map(|char_len| {
if self.matching_words.len() > 1 {
MatchType::Partial(PartialMatch {
matching_words: &self.matching_words[1..],
ids: self.ids,
char_len,
})
} else {
MatchType::Full { char_len, ids: self.ids }
}
})
}
pub fn char_len(&self) -> usize {
self.char_len
}
}
// A simple wrapper around a Vec so the data stays contiguous but can be indexed like a 2D array.
struct N2Array<T> {
y_size: usize,
buf: Vec<T>,
}
impl<T: Clone> N2Array<T> {
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
N2Array { y_size: y, buf: vec![value; x * y] }
}
}
impl<T> Index<(usize, usize)> for N2Array<T> {
type Output = T;
#[inline]
fn index(&self, (x, y): (usize, usize)) -> &T {
&self.buf[(x * self.y_size) + y]
}
}
impl<T> IndexMut<(usize, usize)> for N2Array<T> {
#[inline]
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
&mut self.buf[(x * self.y_size) + y]
}
}
/// Returns the number of **bytes** we want to highlight in the `source` word.
/// Basically we want to highlight as many characters as possible in the source until it has too many
/// typos (= 2).
/// The algorithm is a modified
/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
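///
/// Illustrative example: `bytes_to_highlight("splitted", "split")` returns
/// `"split".len()`, so only the part of the token that lines up with the query
/// word ends up highlighted.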
fn bytes_to_highlight(source: &str, target: &str) -> usize {
let n = source.chars().count();
let m = target.chars().count();
if n == 0 {
return 0;
}
// since we allow two typos we can highlight up to two characters even if they are completely wrong
if m < 3 {
return source.chars().take(m).map(|c| c.len_utf8()).sum();
}
if n == m && source == target {
return source.len();
}
let inf = n + m;
let mut matrix = N2Array::new(n + 2, m + 2, 0);
matrix[(0, 0)] = inf;
for i in 0..=n {
matrix[(i + 1, 0)] = inf;
matrix[(i + 1, 1)] = i;
}
for j in 0..=m {
matrix[(0, j + 1)] = inf;
matrix[(1, j + 1)] = j;
}
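// Fill the matrix following the Damerau-Levenshtein variant with adjacent
// transpositions: `last_row` remembers, for each source character, the last row
// where it appeared, and `last_match_col` the last column of the current row
// where both characters matched; together they give the transposition cost
// (`dist_trans`). The `inf` borders above act as sentinels for these lookups.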
let mut last_row = BTreeMap::new();
for (row, char_s) in source.chars().enumerate() {
let mut last_match_col = 0;
let row = row + 1;
for (col, char_t) in target.chars().enumerate() {
let col = col + 1;
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
let cost = if char_s == char_t { 0 } else { 1 };
let dist_add = matrix[(row, col + 1)] + 1;
let dist_del = matrix[(row + 1, col)] + 1;
let dist_sub = matrix[(row, col)] + cost;
let dist_trans = matrix[(last_match_row, last_match_col)]
+ (row - last_match_row - 1)
+ 1
+ (col - last_match_col - 1);
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
matrix[(row + 1, col + 1)] = dist;
if cost == 0 {
last_match_col = col;
}
}
last_row.insert(char_s, row);
}
let mut minimum = (u32::max_value(), 0);
for x in 0..=m {
let dist = matrix[(n + 1, x + 1)] as u32;
if dist < minimum.0 {
minimum = (dist, x);
}
}
// everything was done character-wise and now we want to return a number of bytes
source.chars().take(minimum.1).map(|c| c.len_utf8()).sum()
}
#[cfg(test)]
mod tests {
use std::borrow::Cow;
use std::str::from_utf8;
use charabia::TokenKind;
use super::*;
use crate::MatchingWords;
#[test]
fn test_bytes_to_highlight() {
struct TestBytesToHighlight {
query: &'static str,
text: &'static str,
length: usize,
}
let tests = [
TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() },
TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() },
TestBytesToHighlight {
query: "Levenshtein",
text: "Levenshtein",
length: "Levenshtein".len(),
},
// we get to the end of our word with only one typo
TestBytesToHighlight {
query: "Levenste",
text: "Levenshtein",
length: "Levenste".len(),
},
// we get our third and last authorized typo right on the last character
TestBytesToHighlight {
query: "Levenstein",
text: "Levenshte",
length: "Levenste".len(),
},
// we get to the end of our word with only two typos at the beginning
TestBytesToHighlight {
query: "Bavenshtein",
text: "Levenshtein",
length: "Bavenshtein".len(),
},
TestBytesToHighlight {
query: "Альфа", text: "Альфой", length: "Альф".len()
},
TestBytesToHighlight {
query: "Go💼", text: "Go💼od luck.", length: "Go💼".len()
},
TestBytesToHighlight {
query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len()
},
TestBytesToHighlight {
query: "chäräcters",
text: "chäräcters",
length: "chäräcters".len(),
},
TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() },
TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() },
];
for test in &tests {
let length = bytes_to_highlight(test.text, test.query);
assert_eq!(length, test.length, r#"length between: "{}" "{}""#, test.query, test.text);
assert!(
from_utf8(&test.query.as_bytes()[..length]).is_ok(),
r#"converting {}[..{}] to an utf8 str failed"#,
test.query,
length
);
}
}
#[test]
fn matching_words() {
let matching_words = vec![
(vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]),
(vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]),
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
];
let matching_words = MatchingWords::new(matching_words);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("word"),
char_end: "word".chars().count(),
byte_end: "word".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 3, ids: &[2] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
})
.next(),
None
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("splitted"),
char_end: "splitted".chars().count(),
byte_end: "splitted".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[0] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
})
.next(),
None
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("borld"),
char_end: "borld".chars().count(),
byte_end: "borld".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("wordsplit"),
char_end: "wordsplit".chars().count(),
byte_end: "wordsplit".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 4, ids: &[2] })
);
}
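    // Illustrative sketch (not part of the original test suite): a multi-word
    // entry, such as one coming from a split word or a multi-word synonym, first
    // yields a `MatchType::Partial`, and the following consecutive token has to
    // be fed to `PartialMatch::match_token` to turn it into a `MatchType::Full`.
    #[test]
    fn partial_match_over_consecutive_tokens() {
        // One entry made of two consecutive words sharing the same primitive word id.
        let matching_words = vec![(
            vec![
                MatchingWord::new("split".to_string(), 0, false),
                MatchingWord::new("this".to_string(), 0, false),
            ],
            vec![0],
        )];
        let matching_words = MatchingWords::new(matching_words);

        let first = Token {
            kind: TokenKind::Word,
            lemma: Cow::Borrowed("split"),
            char_end: "split".chars().count(),
            byte_end: "split".len(),
            ..Default::default()
        };
        let second = Token {
            kind: TokenKind::Word,
            lemma: Cow::Borrowed("this"),
            char_end: "this".chars().count(),
            byte_end: "this".len(),
            ..Default::default()
        };

        // The first token only matches the beginning of the entry.
        match matching_words.match_token(&first).next() {
            Some(MatchType::Partial(partial)) => {
                assert_eq!(partial.char_len(), "split".chars().count());
                // The next consecutive token completes the match.
                assert_eq!(
                    partial.match_token(&second),
                    Some(MatchType::Full { char_len: "this".chars().count(), ids: &[0] })
                );
            }
            other => panic!("expected a partial match, got {:?}", other),
        }
    }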
}