mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 20:07:09 +02:00
Merge branch 'search-refactor-highlighter' into search-refactor-highlighter-merged
This commit is contained in:
commit
e7bb8c940f
8 changed files with 470 additions and 631 deletions
|
@ -98,8 +98,8 @@ pub use self::heed_codec::{
|
|||
};
|
||||
pub use self::index::Index;
|
||||
pub use self::search::{
|
||||
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord,
|
||||
MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
|
||||
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search,
|
||||
SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
|
||||
};
|
||||
|
||||
pub type Result<T> = std::result::Result<T, error::Error>;
|
||||
|
|
|
@ -1,458 +0,0 @@
|
|||
use std::cmp::{min, Reverse};
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::ops::{Index, IndexMut};
|
||||
use std::rc::Rc;
|
||||
|
||||
use charabia::Token;
|
||||
use levenshtein_automata::{Distance, DFA};
|
||||
|
||||
use crate::error::InternalError;
|
||||
use crate::search::build_dfa;
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
|
||||
type IsPrefix = bool;
|
||||
|
||||
/// Structure created from a query tree
|
||||
/// referencing words that match the given query tree.
|
||||
#[derive(Default)]
|
||||
pub struct MatchingWords {
|
||||
inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for MatchingWords {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
writeln!(f, "[")?;
|
||||
for (matching_words, primitive_word_id) in self.inner.iter() {
|
||||
writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?;
|
||||
}
|
||||
writeln!(f, "]")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
pub fn new(
|
||||
mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||
) -> crate::Result<Self> {
|
||||
// if one of the matching_words vec doesn't contain a word.
|
||||
if matching_words.iter().any(|(mw, _)| mw.is_empty()) {
|
||||
return Err(InternalError::InvalidMatchingWords.into());
|
||||
}
|
||||
|
||||
// Sort word by len in DESC order prioritizing the longuest matches,
|
||||
// in order to highlight the longuest part of the matched word.
|
||||
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
|
||||
|
||||
Ok(Self { inner: matching_words })
|
||||
}
|
||||
|
||||
/// Returns an iterator over terms that match or partially match the given token.
|
||||
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
|
||||
MatchesIter { inner: Box::new(self.inner.iter()), token }
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator over terms that match the given token,
|
||||
/// This allow to lazily evaluate matches.
|
||||
pub struct MatchesIter<'a, 'b> {
|
||||
#[allow(clippy::type_complexity)]
|
||||
inner: Box<dyn Iterator<Item = &'a (Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)> + 'a>,
|
||||
token: &'b Token<'b>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for MatchesIter<'a, '_> {
|
||||
type Item = MatchType<'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.inner.next() {
|
||||
Some((matching_words, ids)) => match matching_words[0].match_token(self.token) {
|
||||
Some(char_len) => {
|
||||
if matching_words.len() > 1 {
|
||||
Some(MatchType::Partial(PartialMatch {
|
||||
matching_words: &matching_words[1..],
|
||||
ids,
|
||||
char_len,
|
||||
}))
|
||||
} else {
|
||||
Some(MatchType::Full { char_len, ids })
|
||||
}
|
||||
}
|
||||
None => self.next(),
|
||||
},
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Id of a matching term corresponding to a word written by the end user.
|
||||
pub type PrimitiveWordId = u8;
|
||||
|
||||
/// Structure used to match a specific term.
|
||||
pub struct MatchingWord {
|
||||
pub dfa: DFA,
|
||||
pub word: String,
|
||||
pub typo: u8,
|
||||
pub prefix: IsPrefix,
|
||||
}
|
||||
|
||||
impl fmt::Debug for MatchingWord {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("MatchingWord")
|
||||
.field("word", &self.word)
|
||||
.field("typo", &self.typo)
|
||||
.field("prefix", &self.prefix)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for MatchingWord {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.prefix == other.prefix && self.typo == other.typo && self.word == other.word
|
||||
}
|
||||
}
|
||||
|
||||
impl MatchingWord {
|
||||
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option<Self> {
|
||||
if word.len() > MAX_WORD_LENGTH {
|
||||
return None;
|
||||
}
|
||||
let dfa = build_dfa(&word, typo, prefix);
|
||||
|
||||
Some(Self { dfa, word, typo, prefix })
|
||||
}
|
||||
|
||||
/// Returns the lenght in chars of the match in case of the token matches the term.
|
||||
pub fn match_token(&self, token: &Token) -> Option<usize> {
|
||||
match self.dfa.eval(token.lemma()) {
|
||||
Distance::Exact(t) if t <= self.typo => {
|
||||
if self.prefix {
|
||||
let len = bytes_to_highlight(token.lemma(), &self.word);
|
||||
Some(token.original_lengths(len).0)
|
||||
} else {
|
||||
Some(token.original_lengths(token.lemma().len()).0)
|
||||
}
|
||||
}
|
||||
_otherwise => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A given token can partially match a query word for several reasons:
|
||||
/// - split words
|
||||
/// - multi-word synonyms
|
||||
/// In these cases we need to match consecutively several tokens to consider that the match is full.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum MatchType<'a> {
|
||||
Full { char_len: usize, ids: &'a [PrimitiveWordId] },
|
||||
Partial(PartialMatch<'a>),
|
||||
}
|
||||
|
||||
/// Structure helper to match several tokens in a row in order to complete a partial match.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PartialMatch<'a> {
|
||||
matching_words: &'a [Rc<MatchingWord>],
|
||||
ids: &'a [PrimitiveWordId],
|
||||
char_len: usize,
|
||||
}
|
||||
|
||||
impl<'a> PartialMatch<'a> {
|
||||
/// Returns:
|
||||
/// - None if the given token breaks the partial match
|
||||
/// - Partial if the given token matches the partial match but doesn't complete it
|
||||
/// - Full if the given token completes the partial match
|
||||
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
|
||||
self.matching_words[0].match_token(token).map(|char_len| {
|
||||
if self.matching_words.len() > 1 {
|
||||
MatchType::Partial(PartialMatch {
|
||||
matching_words: &self.matching_words[1..],
|
||||
ids: self.ids,
|
||||
char_len,
|
||||
})
|
||||
} else {
|
||||
MatchType::Full { char_len, ids: self.ids }
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn char_len(&self) -> usize {
|
||||
self.char_len
|
||||
}
|
||||
}
|
||||
|
||||
/// A flat `Vec` that is indexed like a two-dimensional array with `(x, y)` tuples.
///
/// The cell `(x, y)` is stored at offset `x * y_size + y` of the backing buffer.
struct N2Array<T> {
    y_size: usize,
    buf: Vec<T>,
}

impl<T: Clone> N2Array<T> {
    /// Creates an array of `x` rows and `y` columns with every cell set to `value`.
    fn new(x: usize, y: usize, value: T) -> N2Array<T> {
        N2Array { y_size: y, buf: vec![value; x * y] }
    }
}

impl<T> Index<(usize, usize)> for N2Array<T> {
    type Output = T;

    #[inline]
    fn index(&self, (x, y): (usize, usize)) -> &T {
        &self.buf[(x * self.y_size) + y]
    }
}

impl<T> IndexMut<(usize, usize)> for N2Array<T> {
    #[inline]
    fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
        &mut self.buf[(x * self.y_size) + y]
    }
}

/// Returns the number of **bytes** we want to highlight in the `source` word.
///
/// Basically we want to highlight as many characters as possible in the source until it
/// has too many typos (= 2).
/// The algorithm is a modified
/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance):
/// the distance between the whole `source` and every prefix of `target` is computed, the
/// shortest prefix with the smallest distance is selected, and the byte length of the
/// first characters of `source` (as many as that prefix has) is returned.
///
/// Special cases:
/// - an empty `source` yields `0`;
/// - a `target` shorter than 3 characters yields the byte length of as many leading
///   characters of `source` as `target` has;
/// - identical strings yield `source.len()`.
fn bytes_to_highlight(source: &str, target: &str) -> usize {
    let n = source.chars().count();
    let m = target.chars().count();

    if n == 0 {
        return 0;
    }
    // since we allow two typos we can send two characters even if it's completely wrong
    if m < 3 {
        return source.chars().take(m).map(char::len_utf8).sum();
    }
    if n == m && source == target {
        return source.len();
    }

    // larger than any reachable distance, used to pad the first row and column.
    let inf = n + m;
    let mut matrix = N2Array::new(n + 2, m + 2, 0);

    matrix[(0, 0)] = inf;
    for i in 0..=n {
        matrix[(i + 1, 0)] = inf;
        matrix[(i + 1, 1)] = i;
    }
    for j in 0..=m {
        matrix[(0, j + 1)] = inf;
        matrix[(1, j + 1)] = j;
    }

    // last row in which each character of `source` has been seen.
    let mut last_row = BTreeMap::new();

    for (row, char_s) in source.chars().enumerate() {
        // last column of the current row where the two characters were equal.
        let mut last_match_col = 0;
        let row = row + 1;

        for (col, char_t) in target.chars().enumerate() {
            let col = col + 1;
            let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
            let cost = usize::from(char_s != char_t);

            let dist_add = matrix[(row, col + 1)] + 1;
            let dist_del = matrix[(row + 1, col)] + 1;
            let dist_sub = matrix[(row, col)] + cost;
            let dist_trans = matrix[(last_match_row, last_match_col)]
                + (row - last_match_row - 1)
                + 1
                + (col - last_match_col - 1);
            let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
            matrix[(row + 1, col + 1)] = dist;

            if cost == 0 {
                last_match_col = col;
            }
        }

        last_row.insert(char_s, row);
    }

    // Distances stay in `usize`: narrowing them to `u32` could wrap on very long inputs.
    // `min_by_key` returns the first minimum, i.e. the shortest prefix of `target`.
    let best_prefix_len = (0..=m).min_by_key(|&x| matrix[(n + 1, x + 1)]).unwrap_or(0);

    // everything was done character-wise and now we want to return a number of bytes
    source.chars().take(best_prefix_len).map(char::len_utf8).sum()
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::borrow::Cow;
|
||||
use std::str::from_utf8;
|
||||
|
||||
use charabia::TokenKind;
|
||||
|
||||
use super::*;
|
||||
use crate::MatchingWords;
|
||||
|
||||
/// Checks the byte length returned by `bytes_to_highlight` on ASCII and multi-byte
/// inputs, and that this length falls on a character boundary of the highlighted text.
#[test]
fn test_bytes_to_highlight() {
    struct TestBytesToHighlight {
        query: &'static str,
        text: &'static str,
        length: usize,
    }
    let tests = [
        TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() },
        TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() },
        TestBytesToHighlight {
            query: "Levenshtein",
            text: "Levenshtein",
            length: "Levenshtein".len(),
        },
        // we get to the end of our word with only one typo
        TestBytesToHighlight {
            query: "Levenste",
            text: "Levenshtein",
            length: "Levenste".len(),
        },
        // we get our third and last authorized typo right on the last character
        TestBytesToHighlight {
            query: "Levenstein",
            text: "Levenshte",
            length: "Levenste".len(),
        },
        // we get to the end of our word with only two typos at the beginning
        TestBytesToHighlight {
            query: "Bavenshtein",
            text: "Levenshtein",
            length: "Bavenshtein".len(),
        },
        TestBytesToHighlight { query: "Альфа", text: "Альфой", length: "Альф".len() },
        TestBytesToHighlight { query: "Go💼", text: "Go💼od luck.", length: "Go💼".len() },
        TestBytesToHighlight { query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len() },
        TestBytesToHighlight {
            query: "chäräcters",
            text: "chäräcters",
            length: "chäräcters".len(),
        },
        TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() },
        TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() },
    ];

    for test in &tests {
        let length = bytes_to_highlight(test.text, test.query);
        assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text);
        // `length` counts bytes of the first argument (`test.text`), so the boundary
        // check has to slice the text, not the query.
        assert!(
            from_utf8(&test.text.as_bytes()[..length]).is_ok(),
            r#"converting {}[..{}] to an utf8 str failed"#,
            test.text,
            length
        );
    }
}
|
||||
|
||||
/// Matches a series of word tokens against three single-word terms
/// ("split" and "world" as prefixes with one typo allowed, "this" exact)
/// and checks the first match reported for each of them.
#[test]
fn matching_words() {
    let split = Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap());
    let this = Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap());
    let world = Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap());

    let entries = vec![(vec![split], vec![0]), (vec![this], vec![1]), (vec![world], vec![2])];
    let matching_words = MatchingWords::new(entries).unwrap();

    // (lemma of the token, expected first match)
    let cases: Vec<(&str, Option<MatchType>)> = vec![
        ("word", Some(MatchType::Full { char_len: 3, ids: &[2] })),
        ("nyc", None),
        ("world", Some(MatchType::Full { char_len: 5, ids: &[2] })),
        ("splitted", Some(MatchType::Full { char_len: 5, ids: &[0] })),
        ("thisnew", None),
        ("borld", Some(MatchType::Full { char_len: 5, ids: &[2] })),
        ("wordsplit", Some(MatchType::Full { char_len: 4, ids: &[2] })),
    ];

    for (lemma, expected) in cases {
        let token = Token {
            kind: TokenKind::Word,
            lemma: Cow::Borrowed(lemma),
            char_end: lemma.chars().count(),
            byte_end: lemma.len(),
            ..Default::default()
        };
        assert_eq!(matching_words.match_token(&token).next(), expected);
    }
}
|
||||
}
|
|
@ -5,9 +5,8 @@ use once_cell::sync::Lazy;
|
|||
use roaring::bitmap::RoaringBitmap;
|
||||
|
||||
pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
|
||||
pub use self::matches::{
|
||||
FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
|
||||
};
|
||||
pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
|
||||
use self::new::PartialSearchResult;
|
||||
use crate::{
|
||||
execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
|
||||
};
|
||||
|
@ -19,7 +18,6 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
|
|||
|
||||
pub mod facet;
|
||||
mod fst_utils;
|
||||
mod matches;
|
||||
pub mod new;
|
||||
|
||||
pub struct Search<'a> {
|
||||
|
@ -110,19 +108,28 @@ impl<'a> Search<'a> {
|
|||
|
||||
pub fn execute(&self) -> Result<SearchResult> {
|
||||
let mut ctx = SearchContext::new(self.index, self.rtxn);
|
||||
execute_search(
|
||||
&mut ctx,
|
||||
&self.query,
|
||||
self.terms_matching_strategy,
|
||||
self.exhaustive_number_hits,
|
||||
&self.filter,
|
||||
&self.sort_criteria,
|
||||
self.offset,
|
||||
self.limit,
|
||||
Some(self.words_limit),
|
||||
&mut DefaultSearchLogger,
|
||||
&mut DefaultSearchLogger,
|
||||
)
|
||||
let PartialSearchResult { located_query_terms, candidates, documents_ids } =
|
||||
execute_search(
|
||||
&mut ctx,
|
||||
&self.query,
|
||||
self.terms_matching_strategy,
|
||||
self.exhaustive_number_hits,
|
||||
&self.filter,
|
||||
&self.sort_criteria,
|
||||
self.offset,
|
||||
self.limit,
|
||||
Some(self.words_limit),
|
||||
&mut DefaultSearchLogger,
|
||||
&mut DefaultSearchLogger,
|
||||
)?;
|
||||
|
||||
// consume context and located_query_terms to build MatchingWords.
|
||||
let matching_words = match located_query_terms {
|
||||
Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
|
||||
None => MatchingWords::default(),
|
||||
};
|
||||
|
||||
Ok(SearchResult { matching_words, candidates, documents_ids })
|
||||
}
|
||||
}
|
||||
|
||||
|
|
377
milli/src/search/new/matches/matching_words.rs
Normal file
377
milli/src/search/new/matches/matching_words.rs
Normal file
|
@ -0,0 +1,377 @@
|
|||
use std::cmp::Reverse;
|
||||
use std::fmt;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use charabia::Token;
|
||||
|
||||
use super::super::interner::Interned;
|
||||
use super::super::query_term::{
|
||||
Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm,
|
||||
};
|
||||
use super::super::{DedupInterner, Phrase};
|
||||
use crate::SearchContext;
|
||||
|
||||
pub struct LocatedMatchingPhrase {
|
||||
pub value: Interned<Phrase>,
|
||||
pub positions: RangeInclusive<WordId>,
|
||||
}
|
||||
|
||||
pub struct LocatedMatchingWords {
|
||||
pub value: Vec<Interned<String>>,
|
||||
pub positions: RangeInclusive<WordId>,
|
||||
pub is_prefix: bool,
|
||||
pub original_char_count: usize,
|
||||
}
|
||||
|
||||
/// Structure created from a query tree
|
||||
/// referencing words that match the given query tree.
|
||||
#[derive(Default)]
|
||||
pub struct MatchingWords {
|
||||
word_interner: DedupInterner<String>,
|
||||
phrase_interner: DedupInterner<Phrase>,
|
||||
phrases: Vec<LocatedMatchingPhrase>,
|
||||
words: Vec<LocatedMatchingWords>,
|
||||
}
|
||||
|
||||
/// Extract and centralize the different phrases and words to match stored in a QueryTerm.
|
||||
fn extract_matching_terms(term: &QueryTerm) -> (Vec<Interned<Phrase>>, Vec<Interned<String>>) {
|
||||
let mut matching_words = Vec::new();
|
||||
let mut matching_phrases = Vec::new();
|
||||
|
||||
// the structure is exhaustively extracted to ensure that no field is missing.
|
||||
let QueryTerm {
|
||||
original: _,
|
||||
is_multiple_words: _,
|
||||
max_nbr_typos: _,
|
||||
is_prefix: _,
|
||||
zero_typo,
|
||||
one_typo,
|
||||
two_typo,
|
||||
} = term;
|
||||
|
||||
// the structure is exhaustively extracted to ensure that no field is missing.
|
||||
let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo;
|
||||
|
||||
// zero typo
|
||||
if let Some(phrase) = phrase {
|
||||
matching_phrases.push(*phrase);
|
||||
}
|
||||
if let Some(zero_typo) = zero_typo {
|
||||
matching_words.push(*zero_typo);
|
||||
}
|
||||
for synonym in synonyms {
|
||||
matching_phrases.push(*synonym);
|
||||
}
|
||||
|
||||
// one typo
|
||||
// the structure is exhaustively extracted to ensure that no field is missing.
|
||||
if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo {
|
||||
if let Some(split_words) = split_words {
|
||||
matching_phrases.push(*split_words);
|
||||
}
|
||||
for one_typo in one_typo {
|
||||
matching_words.push(*one_typo);
|
||||
}
|
||||
}
|
||||
|
||||
// two typos
|
||||
// the structure is exhaustively extracted to ensure that no field is missing.
|
||||
if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo {
|
||||
for two_typos in two_typos {
|
||||
matching_words.push(*two_typos);
|
||||
}
|
||||
}
|
||||
|
||||
(matching_phrases, matching_words)
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
|
||||
let mut phrases = Vec::new();
|
||||
let mut words = Vec::new();
|
||||
|
||||
// Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms
|
||||
// and wrap them in dedicated structures.
|
||||
for located_term in located_terms {
|
||||
let term = ctx.term_interner.get(located_term.value);
|
||||
let (matching_phrases, matching_words) = extract_matching_terms(term);
|
||||
|
||||
for matching_phrase in matching_phrases {
|
||||
phrases.push(LocatedMatchingPhrase {
|
||||
value: matching_phrase,
|
||||
positions: located_term.positions.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
words.push(LocatedMatchingWords {
|
||||
value: matching_words,
|
||||
positions: located_term.positions.clone(),
|
||||
is_prefix: term.is_prefix,
|
||||
original_char_count: ctx.word_interner.get(term.original).chars().count(),
|
||||
});
|
||||
}
|
||||
|
||||
// Sort word to put prefixes at the bottom prioritizing the exact matches.
|
||||
words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
|
||||
|
||||
Self {
|
||||
phrases,
|
||||
words,
|
||||
word_interner: ctx.word_interner,
|
||||
phrase_interner: ctx.phrase_interner,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator over terms that match or partially match the given token.
|
||||
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
|
||||
MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
|
||||
}
|
||||
|
||||
/// Try to match the token with one of the located_words.
|
||||
fn match_unique_words<'a>(&'a self, token: &Token) -> Option<MatchType<'a>> {
|
||||
for located_words in &self.words {
|
||||
for word in &located_words.value {
|
||||
let word = self.word_interner.get(*word);
|
||||
// if the word is a prefix we match using starts_with.
|
||||
if located_words.is_prefix && token.lemma().starts_with(word) {
|
||||
let Some((char_index, c)) = word.char_indices().take(located_words.original_char_count).last() else {
|
||||
continue;
|
||||
};
|
||||
let prefix_length = char_index + c.len_utf8();
|
||||
let char_len = token.original_lengths(prefix_length).0;
|
||||
let ids = &located_words.positions;
|
||||
return Some(MatchType::Full { char_len, ids });
|
||||
// else we exact match the token.
|
||||
} else if token.lemma() == word {
|
||||
let char_len = token.char_end - token.char_start;
|
||||
let ids = &located_words.positions;
|
||||
return Some(MatchType::Full { char_len, ids });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator over terms that match the given token,
|
||||
/// This allow to lazily evaluate matches.
|
||||
pub struct MatchesIter<'a, 'b> {
|
||||
matching_words: &'a MatchingWords,
|
||||
phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
|
||||
token: &'b Token<'b>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for MatchesIter<'a, '_> {
|
||||
type Item = MatchType<'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.phrases.next() {
|
||||
// Try to match all the phrases first.
|
||||
Some(located_phrase) => {
|
||||
let phrase = self.matching_words.phrase_interner.get(located_phrase.value);
|
||||
|
||||
// create a PartialMatch struct to make it compute the first match
|
||||
// instead of duplicating the code.
|
||||
let ids = &located_phrase.positions;
|
||||
// collect the references of words from the interner.
|
||||
let words = phrase
|
||||
.words
|
||||
.iter()
|
||||
.map(|word| {
|
||||
word.map(|word| self.matching_words.word_interner.get(word).as_str())
|
||||
})
|
||||
.collect();
|
||||
let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
|
||||
|
||||
partial.match_token(self.token).or_else(|| self.next())
|
||||
}
|
||||
// If no phrases matches, try to match uiques words.
|
||||
None => self.matching_words.match_unique_words(self.token),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Id of a matching term corresponding to a word written by the end user.
|
||||
pub type WordId = u16;
|
||||
|
||||
/// A given token can partially match a query word for several reasons:
|
||||
/// - split words
|
||||
/// - multi-word synonyms
|
||||
/// In these cases we need to match consecutively several tokens to consider that the match is full.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum MatchType<'a> {
|
||||
Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
|
||||
Partial(PartialMatch<'a>),
|
||||
}
|
||||
|
||||
/// Structure helper to match several tokens in a row in order to complete a partial match.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PartialMatch<'a> {
|
||||
matching_words: Vec<Option<&'a str>>,
|
||||
ids: &'a RangeInclusive<WordId>,
|
||||
char_len: usize,
|
||||
}
|
||||
|
||||
impl<'a> PartialMatch<'a> {
|
||||
/// Returns:
|
||||
/// - None if the given token breaks the partial match
|
||||
/// - Partial if the given token matches the partial match but doesn't complete it
|
||||
/// - Full if the given token completes the partial match
|
||||
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
|
||||
let Self { mut matching_words, ids, .. } = self;
|
||||
|
||||
let is_matching = match matching_words.first()? {
|
||||
Some(word) => &token.lemma() == word,
|
||||
// a None value in the phrase corresponds to a stop word,
|
||||
// the walue is considered a match if the current token is categorized as a stop word.
|
||||
None => token.is_stopword(),
|
||||
};
|
||||
|
||||
let char_len = token.char_end - token.char_start;
|
||||
// if there are remaining words to match in the phrase and the current token is matching,
|
||||
// return a new Partial match allowing the highlighter to continue.
|
||||
if is_matching && matching_words.len() > 1 {
|
||||
matching_words.remove(0);
|
||||
Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
|
||||
// if there is no remaining word to match in the phrase and the current token is matching,
|
||||
// return a Full match.
|
||||
} else if is_matching {
|
||||
Some(MatchType::Full { char_len, ids })
|
||||
// if the current token doesn't match, return None to break the match sequence.
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn char_len(&self) -> usize {
|
||||
self.char_len
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for MatchingWords {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
|
||||
|
||||
let phrases: Vec<_> = phrases
|
||||
.iter()
|
||||
.map(|p| {
|
||||
(
|
||||
phrase_interner
|
||||
.get(p.value)
|
||||
.words
|
||||
.iter()
|
||||
.map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" "),
|
||||
p.positions.clone(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let words: Vec<_> = words
|
||||
.iter()
|
||||
.flat_map(|w| {
|
||||
w.value
|
||||
.iter()
|
||||
.map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.collect();
|
||||
|
||||
f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use std::borrow::Cow;
|
||||
|
||||
use charabia::{TokenKind, TokenizerBuilder};
|
||||
|
||||
use super::super::super::located_query_terms_from_string;
|
||||
use super::*;
|
||||
use crate::index::tests::TempIndex;
|
||||
|
||||
pub(crate) fn temp_index_with_documents() -> TempIndex {
|
||||
let temp_index = TempIndex::new();
|
||||
temp_index
|
||||
.add_documents(documents!([
|
||||
{ "id": 1, "name": "split this world westfali westfalia the" },
|
||||
]))
|
||||
.unwrap();
|
||||
temp_index
|
||||
}
|
||||
|
||||
/// Builds the matching words of the query "split this world" and checks
/// the first match reported for a series of word tokens.
#[test]
fn matching_words() {
    let temp_index = temp_index_with_documents();
    let rtxn = temp_index.read_txn().unwrap();
    let mut ctx = SearchContext::new(&temp_index, &rtxn);
    let tokenizer = TokenizerBuilder::new().build();
    let tokens = tokenizer.tokenize("split this world");
    let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
    let matching_words = MatchingWords::new(ctx, query_terms);

    // (lemma of the token, expected `(char_len, ids)` of the first full match)
    let cases: Vec<(&str, Option<(usize, RangeInclusive<WordId>)>)> = vec![
        ("split", Some((5, 0..=0))),
        ("nyc", None),
        ("world", Some((5, 2..=2))),
        ("worlded", Some((5, 2..=2))),
        ("thisnew", None),
    ];

    for (lemma, expected) in &cases {
        let token = Token {
            kind: TokenKind::Word,
            lemma: Cow::Borrowed(*lemma),
            char_end: lemma.chars().count(),
            byte_end: lemma.len(),
            ..Default::default()
        };
        let expected = expected
            .as_ref()
            .map(|(char_len, ids)| MatchType::Full { char_len: *char_len, ids });

        assert_eq!(matching_words.match_token(&token).next(), expected);
    }
}
|
||||
}
|
|
@ -1,8 +1,8 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use charabia::{SeparatorKind, Token, Tokenizer};
|
||||
use matching_words::{MatchType, PartialMatch, PrimitiveWordId};
|
||||
pub use matching_words::{MatchingWord, MatchingWords};
|
||||
pub use matching_words::MatchingWords;
|
||||
use matching_words::{MatchType, PartialMatch, WordId};
|
||||
use serde::Serialize;
|
||||
|
||||
pub mod matching_words;
|
||||
|
@ -88,7 +88,7 @@ impl FormatOptions {
|
|||
pub struct Match {
|
||||
match_len: usize,
|
||||
// ids of the query words that matches.
|
||||
ids: Vec<PrimitiveWordId>,
|
||||
ids: Vec<WordId>,
|
||||
// position of the word in the whole text.
|
||||
word_position: usize,
|
||||
// position of the token in the whole text.
|
||||
|
@ -137,11 +137,12 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||
}
|
||||
// partial match is now full, we keep this matches and we advance positions
|
||||
Some(MatchType::Full { char_len, ids }) => {
|
||||
let ids: Vec<_> = ids.clone().into_iter().collect();
|
||||
// save previously matched tokens as matches.
|
||||
let iter = potential_matches.into_iter().map(
|
||||
|(token_position, word_position, match_len)| Match {
|
||||
match_len,
|
||||
ids: ids.to_vec(),
|
||||
ids: ids.clone(),
|
||||
word_position,
|
||||
token_position,
|
||||
},
|
||||
|
@ -151,7 +152,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||
// save the token that closes the partial match as a match.
|
||||
matches.push(Match {
|
||||
match_len: char_len,
|
||||
ids: ids.to_vec(),
|
||||
ids,
|
||||
word_position,
|
||||
token_position,
|
||||
});
|
||||
|
@ -191,9 +192,10 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||
// we match, we save the current token as a match,
|
||||
// then we continue the rest of the tokens.
|
||||
MatchType::Full { char_len, ids } => {
|
||||
let ids: Vec<_> = ids.clone().into_iter().collect();
|
||||
matches.push(Match {
|
||||
match_len: char_len,
|
||||
ids: ids.to_vec(),
|
||||
ids,
|
||||
word_position,
|
||||
token_position,
|
||||
});
|
||||
|
@ -334,7 +336,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||
/// 2) calculate distance between matches
|
||||
/// 3) count ordered matches
|
||||
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
|
||||
let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
|
||||
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
|
||||
let mut order_score = 0;
|
||||
let mut distance_score = 0;
|
||||
|
||||
|
@ -494,39 +496,29 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::rc::Rc;
|
||||
|
||||
use charabia::TokenizerBuilder;
|
||||
use matching_words::tests::temp_index_with_documents;
|
||||
|
||||
use super::super::located_query_terms_from_string;
|
||||
use super::*;
|
||||
use crate::search::matches::matching_words::MatchingWord;
|
||||
use crate::SearchContext;
|
||||
|
||||
fn matching_words() -> MatchingWords {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone()], vec![1]),
|
||||
(vec![all[2].clone()], vec![2]),
|
||||
];
|
||||
|
||||
MatchingWords::new(matching_words).unwrap()
|
||||
}
|
||||
|
||||
impl MatcherBuilder<'_, Vec<u8>> {
|
||||
pub fn from_matching_words(matching_words: MatchingWords) -> Self {
|
||||
Self::new(matching_words, TokenizerBuilder::default().build())
|
||||
impl<'a> MatcherBuilder<'a, &[u8]> {
|
||||
pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self {
|
||||
let tokenizer = TokenizerBuilder::new().build();
|
||||
let tokens = tokenizer.tokenize(query);
|
||||
let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
|
||||
let matching_words = MatchingWords::new(ctx, query_terms);
|
||||
Self::new(matching_words, TokenizerBuilder::new().build())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn format_identity() {
|
||||
let matching_words = matching_words();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "split the world");
|
||||
|
||||
let format_options = FormatOptions { highlight: false, crop: None };
|
||||
|
||||
|
@ -551,9 +543,10 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn format_highlight() {
|
||||
let matching_words = matching_words();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "split the world");
|
||||
|
||||
let format_options = FormatOptions { highlight: true, crop: None };
|
||||
|
||||
|
@ -594,16 +587,10 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn highlight_unicode() {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||
];
|
||||
let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words).unwrap();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "world");
|
||||
let format_options = FormatOptions { highlight: true, crop: None };
|
||||
|
||||
// Text containing prefix match.
|
||||
|
@ -624,6 +611,10 @@ mod tests {
|
|||
@"<em>Ŵôřlḑ</em>"
|
||||
);
|
||||
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "westfali");
|
||||
let format_options = FormatOptions { highlight: true, crop: None };
|
||||
|
||||
// Text containing unicode match.
|
||||
let text = "Westfália";
|
||||
let mut matcher = builder.build(text);
|
||||
|
@ -636,9 +627,10 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn format_crop() {
|
||||
let matching_words = matching_words();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "split the world");
|
||||
|
||||
let format_options = FormatOptions { highlight: false, crop: Some(10) };
|
||||
|
||||
|
@ -733,9 +725,10 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn format_highlight_crop() {
|
||||
let matching_words = matching_words();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "split the world");
|
||||
|
||||
let format_options = FormatOptions { highlight: true, crop: Some(10) };
|
||||
|
||||
|
@ -795,9 +788,10 @@ mod tests {
|
|||
#[test]
|
||||
fn smaller_crop_size() {
|
||||
//! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
|
||||
let matching_words = matching_words();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "split the world");
|
||||
|
||||
let text = "void void split the world void void.";
|
||||
|
||||
|
@ -831,25 +825,10 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn partial_matches() {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone(), all[2].clone()], vec![0]),
|
||||
(vec![all[3].clone()], vec![1]),
|
||||
(vec![all[4].clone(), all[5].clone()], vec![1]),
|
||||
(vec![all[4].clone()], vec![2]),
|
||||
];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words).unwrap();
|
||||
|
||||
let mut builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let mut builder = MatcherBuilder::new_test(ctx, "the \"t he\" door \"do or\"");
|
||||
builder.highlight_prefix("_".to_string());
|
||||
builder.highlight_suffix("_".to_string());
|
||||
|
||||
|
@ -859,7 +838,7 @@ mod tests {
|
|||
let mut matcher = builder.build(text);
|
||||
insta::assert_snapshot!(
|
||||
matcher.format(format_options),
|
||||
@"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_"
|
||||
@"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
|
||||
);
|
||||
}
|
||||
}
|
|
@ -5,6 +5,7 @@ mod graph_based_ranking_rule;
|
|||
mod interner;
|
||||
mod limits;
|
||||
mod logger;
|
||||
pub mod matches;
|
||||
mod query_graph;
|
||||
mod query_term;
|
||||
mod ranking_rule_graph;
|
||||
|
@ -33,8 +34,8 @@ use interner::DedupInterner;
|
|||
pub use logger::detailed::DetailedSearchLogger;
|
||||
pub use logger::{DefaultSearchLogger, SearchLogger};
|
||||
use query_graph::{QueryGraph, QueryNode};
|
||||
use query_term::{located_query_terms_from_string, Phrase, QueryTerm};
|
||||
use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
|
||||
use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm};
|
||||
use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
|
||||
use resolve_query_graph::PhraseDocIdsCache;
|
||||
use roaring::RoaringBitmap;
|
||||
use words::Words;
|
||||
|
@ -47,10 +48,7 @@ use self::ranking_rules::{BoxRankingRule, RankingRule};
|
|||
use self::resolve_query_graph::compute_query_graph_docids;
|
||||
use self::sort::Sort;
|
||||
use crate::search::new::distinct::apply_distinct_rule;
|
||||
use crate::{
|
||||
AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy,
|
||||
UserError,
|
||||
};
|
||||
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
|
||||
|
||||
/// A structure used throughout the execution of a search query.
|
||||
pub struct SearchContext<'ctx> {
|
||||
|
@ -62,6 +60,7 @@ pub struct SearchContext<'ctx> {
|
|||
pub term_interner: Interner<QueryTerm>,
|
||||
pub phrase_docids: PhraseDocIdsCache,
|
||||
}
|
||||
|
||||
impl<'ctx> SearchContext<'ctx> {
|
||||
pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self {
|
||||
Self {
|
||||
|
@ -291,13 +290,14 @@ pub fn execute_search(
|
|||
words_limit: Option<usize>,
|
||||
placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
|
||||
query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) -> Result<SearchResult> {
|
||||
) -> Result<PartialSearchResult> {
|
||||
let mut universe = if let Some(filters) = filters {
|
||||
filters.evaluate(ctx.txn, ctx.index)?
|
||||
} else {
|
||||
ctx.index.documents_ids(ctx.txn)?
|
||||
};
|
||||
|
||||
let mut located_query_terms = None;
|
||||
let bucket_sort_output = if let Some(query) = query {
|
||||
// We make sure that the analyzer is aware of the stop words
|
||||
// this ensures that the query builder is able to properly remove them.
|
||||
|
@ -317,6 +317,7 @@ pub fn execute_search(
|
|||
|
||||
let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
|
||||
let graph = QueryGraph::from_query(ctx, &query_terms)?;
|
||||
located_query_terms = Some(query_terms);
|
||||
|
||||
check_sort_criteria(ctx, sort_criteria.as_ref())?;
|
||||
|
||||
|
@ -357,9 +358,7 @@ pub fn execute_search(
|
|||
}
|
||||
}
|
||||
|
||||
Ok(SearchResult {
|
||||
// TODO: correct matching words
|
||||
matching_words: MatchingWords::default(),
|
||||
Ok(PartialSearchResult {
|
||||
candidates: all_candidates,
|
||||
documents_ids: docids,
|
||||
})
|
||||
|
@ -406,3 +405,9 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>>
|
|||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub struct PartialSearchResult {
|
||||
pub located_query_terms: Option<Vec<LocatedQueryTerm>>,
|
||||
pub candidates: RoaringBitmap,
|
||||
pub documents_ids: Vec<DocumentId>,
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue