mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-23 13:24:27 +01:00
Merge pull request #166 from meilisearch/split-query-words
Split query words
This commit is contained in:
commit
97cc3c7cce
@ -1,8 +1,7 @@
|
|||||||
use fst::Automaton;
|
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use levenshtein_automata::{
|
use levenshtein_automata::{
|
||||||
LevenshteinAutomatonBuilder as LevBuilder,
|
LevenshteinAutomatonBuilder as LevBuilder,
|
||||||
DFA, Distance,
|
DFA,
|
||||||
};
|
};
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
@ -11,55 +10,16 @@ lazy_static! {
|
|||||||
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
|
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct DfaExt {
|
|
||||||
query_len: usize,
|
|
||||||
automaton: DFA,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Automaton for DfaExt {
|
|
||||||
type State = <DFA as Automaton>::State;
|
|
||||||
|
|
||||||
fn start(&self) -> Self::State {
|
|
||||||
self.automaton.start()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_match(&self, state: &Self::State) -> bool {
|
|
||||||
self.automaton.is_match(state)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn can_match(&self, state: &Self::State) -> bool {
|
|
||||||
self.automaton.can_match(state)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn will_always_match(&self, state: &Self::State) -> bool {
|
|
||||||
self.automaton.will_always_match(state)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
|
|
||||||
self.automaton.accept(state, byte)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl AutomatonExt for DfaExt {
|
|
||||||
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
|
|
||||||
self.automaton.eval(s)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn query_len(&self) -> usize {
|
|
||||||
self.query_len
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone)]
|
||||||
enum PrefixSetting {
|
enum PrefixSetting {
|
||||||
Prefix,
|
Prefix,
|
||||||
NoPrefix,
|
NoPrefix,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt {
|
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
|
||||||
use self::PrefixSetting::{Prefix, NoPrefix};
|
use self::PrefixSetting::{Prefix, NoPrefix};
|
||||||
|
|
||||||
let dfa = match query.len() {
|
match query.len() {
|
||||||
0 ..= 4 => match setting {
|
0 ..= 4 => match setting {
|
||||||
Prefix => LEVDIST0.build_prefix_dfa(query),
|
Prefix => LEVDIST0.build_prefix_dfa(query),
|
||||||
NoPrefix => LEVDIST0.build_dfa(query),
|
NoPrefix => LEVDIST0.build_dfa(query),
|
||||||
@ -72,20 +32,13 @@ fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt {
|
|||||||
Prefix => LEVDIST2.build_prefix_dfa(query),
|
Prefix => LEVDIST2.build_prefix_dfa(query),
|
||||||
NoPrefix => LEVDIST2.build_dfa(query),
|
NoPrefix => LEVDIST2.build_dfa(query),
|
||||||
},
|
},
|
||||||
};
|
}
|
||||||
|
|
||||||
DfaExt { query_len: query.len(), automaton: dfa }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn build_prefix_dfa(query: &str) -> DfaExt {
|
pub fn build_prefix_dfa(query: &str) -> DFA {
|
||||||
build_dfa_with_setting(query, PrefixSetting::Prefix)
|
build_dfa_with_setting(query, PrefixSetting::Prefix)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn build_dfa(query: &str) -> DfaExt {
|
pub fn build_dfa(query: &str) -> DFA {
|
||||||
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
|
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub trait AutomatonExt: Automaton {
|
|
||||||
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
|
|
||||||
fn query_len(&self) -> usize;
|
|
||||||
}
|
|
||||||
|
@ -11,8 +11,9 @@ use meilidb_tokenizer::{is_cjk, split_query_string};
|
|||||||
use rayon::slice::ParallelSliceMut;
|
use rayon::slice::ParallelSliceMut;
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use slice_group_by::GroupByMut;
|
use slice_group_by::GroupByMut;
|
||||||
|
use levenshtein_automata::DFA;
|
||||||
|
|
||||||
use crate::automaton::{DfaExt, AutomatonExt, build_dfa, build_prefix_dfa};
|
use crate::automaton::{build_dfa, build_prefix_dfa};
|
||||||
use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
|
use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
|
||||||
use crate::criterion::Criteria;
|
use crate::criterion::Criteria;
|
||||||
use crate::raw_documents_from_matches;
|
use crate::raw_documents_from_matches;
|
||||||
@ -21,18 +22,38 @@ use crate::{Match, DocumentId, Store, RawDocument, Document};
|
|||||||
const NGRAMS: usize = 3;
|
const NGRAMS: usize = 3;
|
||||||
|
|
||||||
struct Automaton {
|
struct Automaton {
|
||||||
index: usize,
|
query_index: usize,
|
||||||
|
query_len: usize,
|
||||||
is_exact: bool,
|
is_exact: bool,
|
||||||
dfa: DfaExt,
|
dfa: DFA,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Automaton {
|
impl Automaton {
|
||||||
fn exact(index: usize, dfa: DfaExt) -> Automaton {
|
fn exact(query_index: usize, query: &str) -> Automaton {
|
||||||
Automaton { index, is_exact: true, dfa }
|
Automaton {
|
||||||
|
query_index,
|
||||||
|
query_len: query.len(),
|
||||||
|
is_exact: true,
|
||||||
|
dfa: build_dfa(query),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn non_exact(index: usize, dfa: DfaExt) -> Automaton {
|
fn prefix_exact(query_index: usize, query: &str) -> Automaton {
|
||||||
Automaton { index, is_exact: false, dfa }
|
Automaton {
|
||||||
|
query_index,
|
||||||
|
query_len: query.len(),
|
||||||
|
is_exact: true,
|
||||||
|
dfa: build_prefix_dfa(query),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn non_exact(query_index: usize, query: &str) -> Automaton {
|
||||||
|
Automaton {
|
||||||
|
query_index,
|
||||||
|
query_len: query.len(),
|
||||||
|
is_exact: false,
|
||||||
|
dfa: build_dfa(query),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -46,6 +67,29 @@ pub fn normalize_str(string: &str) -> String {
|
|||||||
string
|
string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn split_best_frequency<'a, S: Store>(
|
||||||
|
word: &'a str,
|
||||||
|
store: &S,
|
||||||
|
) -> Result<Option<(&'a str, &'a str)>, S::Error>
|
||||||
|
{
|
||||||
|
let chars = word.char_indices().skip(1);
|
||||||
|
let mut best = None;
|
||||||
|
|
||||||
|
for (i, _) in chars {
|
||||||
|
let (left, right) = word.split_at(i);
|
||||||
|
|
||||||
|
let left_freq = store.word_indexes(left.as_bytes())?.map_or(0, |i| i.len());
|
||||||
|
let right_freq = store.word_indexes(right.as_bytes())?.map_or(0, |i| i.len());
|
||||||
|
let min_freq = cmp::min(left_freq, right_freq);
|
||||||
|
|
||||||
|
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
||||||
|
best = Some((min_freq, left, right));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(best.map(|(_, l, r)| (l, r)))
|
||||||
|
}
|
||||||
|
|
||||||
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
|
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
|
||||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||||
@ -54,19 +98,12 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
|
|||||||
let synonyms = store.synonyms()?;
|
let synonyms = store.synonyms()?;
|
||||||
|
|
||||||
for n in 1..=NGRAMS {
|
for n in 1..=NGRAMS {
|
||||||
let mut index = 0;
|
let mut query_index = 0;
|
||||||
let mut ngrams = query_words.windows(n).peekable();
|
let mut ngrams = query_words.windows(n).peekable();
|
||||||
|
|
||||||
while let Some(ngram_slice) = ngrams.next() {
|
while let Some(ngram_slice) = ngrams.next() {
|
||||||
let ngram_nb_words = ngram_slice.len();
|
let ngram_nb_words = ngram_slice.len();
|
||||||
let ngram = ngram_slice.join(" ");
|
let ngram = ngram_slice.join(" ");
|
||||||
let concat = ngram_slice.concat();
|
|
||||||
|
|
||||||
// automaton of concatenation of query words
|
|
||||||
let normalized = normalize_str(&concat);
|
|
||||||
let lev = build_dfa(&normalized);
|
|
||||||
let automaton = Automaton::exact(index, lev);
|
|
||||||
automatons.push((automaton, normalized));
|
|
||||||
|
|
||||||
let has_following_word = ngrams.peek().is_some();
|
let has_following_word = ngrams.peek().is_some();
|
||||||
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||||
@ -92,11 +129,10 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
|
|||||||
let nb_synonym_words = split_query_string(synonyms).count();
|
let nb_synonym_words = split_query_string(synonyms).count();
|
||||||
|
|
||||||
for synonym in split_query_string(synonyms) {
|
for synonym in split_query_string(synonyms) {
|
||||||
let lev = build_dfa(synonym);
|
|
||||||
let automaton = if nb_synonym_words == 1 {
|
let automaton = if nb_synonym_words == 1 {
|
||||||
Automaton::exact(index, lev)
|
Automaton::exact(query_index, synonym)
|
||||||
} else {
|
} else {
|
||||||
Automaton::non_exact(index, lev)
|
Automaton::non_exact(query_index, synonym)
|
||||||
};
|
};
|
||||||
automatons.push((automaton, synonym.to_owned()));
|
automatons.push((automaton, synonym.to_owned()));
|
||||||
}
|
}
|
||||||
@ -105,22 +141,55 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
|
|||||||
}
|
}
|
||||||
|
|
||||||
if n == 1 {
|
if n == 1 {
|
||||||
let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
|
// TODO we do not support "phrase query" in other words:
|
||||||
let automaton = Automaton::exact(index, lev);
|
// first term *must* follow the second term
|
||||||
|
if let Some((left, right)) = split_best_frequency(&ngram, store)? {
|
||||||
|
let automaton = Automaton::exact(query_index, left);
|
||||||
|
automatons.push((automaton, left.to_owned()));
|
||||||
|
|
||||||
|
let automaton = Automaton::exact(query_index, right);
|
||||||
|
automatons.push((automaton, right.to_owned()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let automaton = if not_prefix_dfa {
|
||||||
|
Automaton::exact(query_index, &ngram)
|
||||||
|
} else {
|
||||||
|
Automaton::prefix_exact(query_index, &ngram)
|
||||||
|
};
|
||||||
automatons.push((automaton, ngram));
|
automatons.push((automaton, ngram));
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// automaton of concatenation of query words
|
||||||
|
let concat = ngram_slice.concat();
|
||||||
|
let normalized = normalize_str(&concat);
|
||||||
|
let automaton = Automaton::exact(query_index, &normalized);
|
||||||
|
automatons.push((automaton, normalized));
|
||||||
}
|
}
|
||||||
|
|
||||||
index += 1;
|
query_index += 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
automatons.sort_unstable_by(|a, b| (a.0.index, &a.1).cmp(&(b.0.index, &b.1)));
|
automatons.sort_unstable_by(|a, b| (a.0.query_index, &a.1).cmp(&(b.0.query_index, &b.1)));
|
||||||
automatons.dedup_by(|a, b| (a.0.index, &a.1) == (b.0.index, &b.1));
|
automatons.dedup_by(|a, b| (a.0.query_index, &a.1) == (b.0.query_index, &b.1));
|
||||||
let automatons = automatons.into_iter().map(|(a, _)| a).collect();
|
let automatons = automatons.into_iter().map(|(a, _)| a).collect();
|
||||||
|
|
||||||
Ok(automatons)
|
Ok(automatons)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn rewrite_matched_positions(matches: &mut [(DocumentId, Match)]) {
|
||||||
|
for document_matches in matches.linear_group_by_mut(|(a, _), (b, _)| a == b) {
|
||||||
|
let mut offset = 0;
|
||||||
|
for query_indexes in document_matches.linear_group_by_mut(|(_, a), (_, b)| a.query_index == b.query_index) {
|
||||||
|
let word_index = query_indexes[0].1.word_index - offset as u16;
|
||||||
|
for (_, match_) in query_indexes.iter_mut() {
|
||||||
|
match_.word_index = word_index;
|
||||||
|
}
|
||||||
|
offset += query_indexes.len() - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> {
|
pub struct QueryBuilder<'c, S, FI = fn(DocumentId) -> bool> {
|
||||||
store: S,
|
store: S,
|
||||||
criteria: Criteria<'c>,
|
criteria: Criteria<'c>,
|
||||||
@ -184,9 +253,9 @@ where S: Store,
|
|||||||
|
|
||||||
while let Some((input, indexed_values)) = stream.next() {
|
while let Some((input, indexed_values)) = stream.next() {
|
||||||
for iv in indexed_values {
|
for iv in indexed_values {
|
||||||
let Automaton { index, is_exact, ref dfa } = automatons[iv.index];
|
let Automaton { query_index, is_exact, query_len, ref dfa } = automatons[iv.index];
|
||||||
let distance = dfa.eval(input).to_u8();
|
let distance = dfa.eval(input).to_u8();
|
||||||
let is_exact = is_exact && distance == 0 && input.len() == dfa.query_len();
|
let is_exact = is_exact && distance == 0 && input.len() == query_len;
|
||||||
|
|
||||||
let doc_indexes = self.store.word_indexes(input)?;
|
let doc_indexes = self.store.word_indexes(input)?;
|
||||||
let doc_indexes = match doc_indexes {
|
let doc_indexes = match doc_indexes {
|
||||||
@ -197,8 +266,8 @@ where S: Store,
|
|||||||
for di in doc_indexes.as_slice() {
|
for di in doc_indexes.as_slice() {
|
||||||
if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
|
if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
|
||||||
let match_ = Match {
|
let match_ = Match {
|
||||||
query_index: index as u32,
|
query_index: query_index as u32,
|
||||||
distance: distance,
|
distance,
|
||||||
attribute: di.attribute,
|
attribute: di.attribute,
|
||||||
word_index: di.word_index,
|
word_index: di.word_index,
|
||||||
is_exact,
|
is_exact,
|
||||||
@ -206,23 +275,15 @@ where S: Store,
|
|||||||
char_length: di.char_length,
|
char_length: di.char_length,
|
||||||
};
|
};
|
||||||
matches.push((di.document_id, match_));
|
matches.push((di.document_id, match_));
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// rewrite the matched positions for next criteria evaluations
|
||||||
matches.par_sort_unstable();
|
matches.par_sort_unstable();
|
||||||
|
rewrite_matched_positions(&mut matches);
|
||||||
for document_matches in matches.linear_group_by_mut(|(a, _), (b, _)| a == b) {
|
|
||||||
let mut offset = 0;
|
|
||||||
for query_indexes in document_matches.linear_group_by_mut(|(_, a), (_, b)| a.query_index == b.query_index) {
|
|
||||||
let word_index = query_indexes[0].1.word_index - offset as u16;
|
|
||||||
for (_, match_) in query_indexes.iter_mut() {
|
|
||||||
match_.word_index = word_index;
|
|
||||||
}
|
|
||||||
offset += query_indexes.len() - 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let total_matches = matches.len();
|
let total_matches = matches.len();
|
||||||
let padded_matches = SetBuf::from_dirty(matches);
|
let padded_matches = SetBuf::from_dirty(matches);
|
||||||
@ -543,6 +604,28 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn simple() {
|
||||||
|
let store = InMemorySetStore::from_iter(vec![
|
||||||
|
("iphone", &[doc_char_index(0, 0, 0)][..]),
|
||||||
|
("from", &[doc_char_index(0, 1, 1)][..]),
|
||||||
|
("apple", &[doc_char_index(0, 2, 2)][..]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let builder = QueryBuilder::new(&store);
|
||||||
|
let results = builder.query("iphone from apple", 0..20).unwrap();
|
||||||
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
|
let mut matches = matches.into_iter();
|
||||||
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
|
assert_matches!(matches.next(), Some(Match { query_index: 1, word_index: 1, .. }));
|
||||||
|
assert_matches!(matches.next(), Some(Match { query_index: 2, word_index: 2, .. }));
|
||||||
|
assert_matches!(matches.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn simple_synonyms() {
|
fn simple_synonyms() {
|
||||||
let mut store = InMemorySetStore::from_iter(vec![
|
let mut store = InMemorySetStore::from_iter(vec![
|
||||||
@ -556,10 +639,9 @@ mod tests {
|
|||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 0);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
@ -568,10 +650,9 @@ mod tests {
|
|||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 0);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
@ -590,10 +671,9 @@ mod tests {
|
|||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 0);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
@ -602,10 +682,9 @@ mod tests {
|
|||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 0);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
@ -635,10 +714,9 @@ mod tests {
|
|||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 0);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
@ -647,10 +725,9 @@ mod tests {
|
|||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 0);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
@ -672,22 +749,19 @@ mod tests {
|
|||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 0);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 3, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 3);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 5, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 5);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
@ -696,22 +770,19 @@ mod tests {
|
|||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 0);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 3, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 3);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 5, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 5);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
@ -720,22 +791,19 @@ mod tests {
|
|||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 0, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 0);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 3, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 3);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
|
||||||
assert_eq!(matches.len(), 1);
|
let mut matches = matches.into_iter();
|
||||||
let match_ = matches[0];
|
assert_matches!(matches.next(), Some(Match { query_index: 0, word_index: 5, .. }));
|
||||||
assert_eq!(match_.query_index, 0);
|
assert_matches!(matches.next(), None);
|
||||||
assert_eq!(match_.word_index, 5);
|
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
@ -933,6 +1001,12 @@ mod tests {
|
|||||||
("blue", &[doc_char_index(1, 1, 1)][..]),
|
("blue", &[doc_char_index(1, 1, 1)][..]),
|
||||||
("subway", &[doc_char_index(1, 2, 2)][..]),
|
("subway", &[doc_char_index(1, 2, 2)][..]),
|
||||||
("broken", &[doc_char_index(1, 3, 3)][..]),
|
("broken", &[doc_char_index(1, 3, 3)][..]),
|
||||||
|
|
||||||
|
("new", &[doc_char_index(2, 0, 0)][..]),
|
||||||
|
("york", &[doc_char_index(2, 1, 1)][..]),
|
||||||
|
("underground", &[doc_char_index(2, 2, 2)][..]),
|
||||||
|
("train", &[doc_char_index(2, 3, 3)][..]),
|
||||||
|
("broken", &[doc_char_index(2, 4, 4)][..]),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]));
|
store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"]));
|
||||||
@ -943,6 +1017,16 @@ mod tests {
|
|||||||
let results = builder.query("new york underground train broken", 0..20).unwrap();
|
let results = builder.query("new york underground train broken", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // york
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // new
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 0, char_index: 1, .. })); // york
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 1, char_index: 2, .. })); // underground
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 2, char_index: 3, .. })); // train
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 4, word_index: 3, char_index: 4, .. })); // broken
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york
|
||||||
@ -962,6 +1046,16 @@ mod tests {
|
|||||||
let results = builder.query("new york city underground train broken", 0..20).unwrap();
|
let results = builder.query("new york city underground train broken", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // york
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // new
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 0, char_index: 1, .. })); // york
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 1, char_index: 2, .. })); // underground
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 4, word_index: 2, char_index: 3, .. })); // train
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 5, word_index: 3, char_index: 4, .. })); // broken
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york city
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york city
|
||||||
@ -1033,9 +1127,7 @@ mod tests {
|
|||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 0, .. })); // téléphone
|
assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 1, .. })); // téléphone
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 1, .. })); // telephone
|
|
||||||
assert_matches!(iter.next(), Some(Match { query_index: 0, distance: 2, .. })); // télephone
|
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
@ -1061,4 +1153,38 @@ mod tests {
|
|||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn simple_split() {
|
||||||
|
let store = InMemorySetStore::from_iter(vec![
|
||||||
|
("porte", &[doc_char_index(0, 0, 0)][..]),
|
||||||
|
("feuille", &[doc_char_index(0, 1, 1)][..]),
|
||||||
|
("search", &[doc_char_index(1, 0, 0)][..]),
|
||||||
|
("engine", &[doc_char_index(1, 1, 1)][..]),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let builder = QueryBuilder::new(&store);
|
||||||
|
let results = builder.query("portefeuille", 0..20).unwrap();
|
||||||
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // porte
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // feuille
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
|
let builder = QueryBuilder::new(&store);
|
||||||
|
let results = builder.query("searchengine", 0..20).unwrap();
|
||||||
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 0, .. })); // search
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, char_index: 1, .. })); // engine
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user