mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
Cargo fmt pass
This commit is contained in:
parent
47d777c8f7
commit
ca26a0f2e4
48 changed files with 1599 additions and 979 deletions
|
@ -1,8 +1,5 @@
|
|||
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
|
||||
use once_cell::sync::OnceCell;
|
||||
use levenshtein_automata::{
|
||||
LevenshteinAutomatonBuilder as LevBuilder,
|
||||
DFA,
|
||||
};
|
||||
|
||||
static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
|
||||
static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
|
||||
|
@ -15,30 +12,30 @@ enum PrefixSetting {
|
|||
}
|
||||
|
||||
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
|
||||
use PrefixSetting::{Prefix, NoPrefix};
|
||||
use PrefixSetting::{NoPrefix, Prefix};
|
||||
|
||||
match query.len() {
|
||||
0 ..= 4 => {
|
||||
0..=4 => {
|
||||
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
},
|
||||
5 ..= 8 => {
|
||||
}
|
||||
5..=8 => {
|
||||
let builder = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
},
|
||||
}
|
||||
_ => {
|
||||
let builder = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
|
||||
match setting {
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
Prefix => builder.build_prefix_dfa(query),
|
||||
NoPrefix => builder.build_dfa(query),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -6,14 +6,14 @@ use std::vec;
|
|||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use levenshtein_automata::DFA;
|
||||
use meilidb_tokenizer::{split_query_string, is_cjk};
|
||||
use meilidb_tokenizer::{is_cjk, split_query_string};
|
||||
|
||||
use crate::store;
|
||||
use crate::error::MResult;
|
||||
use crate::store;
|
||||
|
||||
use self::dfa::{build_dfa, build_prefix_dfa};
|
||||
use self::query_enhancer::QueryEnhancerBuilder;
|
||||
pub use self::query_enhancer::QueryEnhancer;
|
||||
use self::query_enhancer::QueryEnhancerBuilder;
|
||||
|
||||
const NGRAMS: usize = 3;
|
||||
|
||||
|
@ -27,14 +27,9 @@ impl AutomatonProducer {
|
|||
query: &str,
|
||||
main_store: store::Main,
|
||||
synonyms_store: store::Synonyms,
|
||||
) -> MResult<(AutomatonProducer, QueryEnhancer)>
|
||||
{
|
||||
let (automatons, query_enhancer) = generate_automatons(
|
||||
reader,
|
||||
query,
|
||||
main_store,
|
||||
synonyms_store,
|
||||
)?;
|
||||
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
|
||||
let (automatons, query_enhancer) =
|
||||
generate_automatons(reader, query, main_store, synonyms_store)?;
|
||||
|
||||
Ok((AutomatonProducer { automatons }, query_enhancer))
|
||||
}
|
||||
|
@ -112,8 +107,7 @@ fn generate_automatons(
|
|||
query: &str,
|
||||
main_store: store::Main,
|
||||
synonym_store: store::Synonyms,
|
||||
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)>
|
||||
{
|
||||
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||
let synonyms = match main_store.synonyms_fst(reader)? {
|
||||
|
@ -130,7 +124,6 @@ fn generate_automatons(
|
|||
let mut original_automatons = Vec::new();
|
||||
let mut original_words = query_words.iter().peekable();
|
||||
while let Some(word) = original_words.next() {
|
||||
|
||||
let has_following_word = original_words.peek().is_some();
|
||||
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
|
||||
|
||||
|
@ -148,29 +141,33 @@ fn generate_automatons(
|
|||
for n in 1..=NGRAMS {
|
||||
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
||||
while let Some((query_index, ngram_slice)) = ngrams.next() {
|
||||
|
||||
let query_range = query_index..query_index + n;
|
||||
let ngram_nb_words = ngram_slice.len();
|
||||
let ngram = ngram_slice.join(" ");
|
||||
|
||||
let has_following_word = ngrams.peek().is_some();
|
||||
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||
let not_prefix_dfa =
|
||||
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||
|
||||
// automaton of synonyms of the ngrams
|
||||
let normalized = normalize_str(&ngram);
|
||||
let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) };
|
||||
let lev = if not_prefix_dfa {
|
||||
build_dfa(&normalized)
|
||||
} else {
|
||||
build_prefix_dfa(&normalized)
|
||||
};
|
||||
|
||||
let mut stream = synonyms.search(&lev).into_stream();
|
||||
while let Some(base) = stream.next() {
|
||||
|
||||
// only trigger alternatives when the last word has been typed
|
||||
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
|
||||
let base = std::str::from_utf8(base).unwrap();
|
||||
let base_nb_words = split_query_string(base).count();
|
||||
if ngram_nb_words != base_nb_words { continue }
|
||||
if ngram_nb_words != base_nb_words {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
|
||||
|
||||
let mut stream = synonyms.into_stream();
|
||||
while let Some(synonyms) = stream.next() {
|
||||
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
||||
|
@ -178,7 +175,11 @@ fn generate_automatons(
|
|||
let nb_synonym_words = synonyms_words.len();
|
||||
|
||||
let real_query_index = automaton_index;
|
||||
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
|
||||
enhancer_builder.declare(
|
||||
query_range.clone(),
|
||||
real_query_index,
|
||||
&synonyms_words,
|
||||
);
|
||||
|
||||
for synonym in synonyms_words {
|
||||
let automaton = if nb_synonym_words == 1 {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
use std::ops::Range;
|
||||
use std::cmp::Ordering::{Less, Greater, Equal};
|
||||
|
||||
/// Return `true` if the specified range can accept the given replacements words.
|
||||
/// Returns `false` if the replacements words are already present in the original query
|
||||
|
@ -34,13 +34,14 @@ use std::cmp::Ordering::{Less, Greater, Equal};
|
|||
// [new york city]
|
||||
//
|
||||
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
|
||||
where S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
where
|
||||
S: AsRef<str>,
|
||||
T: AsRef<str>,
|
||||
{
|
||||
if words.len() <= range.len() {
|
||||
// there is fewer or equal replacement words
|
||||
// than there is already in the replaced range
|
||||
return false
|
||||
return false;
|
||||
}
|
||||
|
||||
// retrieve the part to rewrite but with the length
|
||||
|
@ -49,7 +50,9 @@ where S: AsRef<str>,
|
|||
|
||||
// check if the original query doesn't already contain
|
||||
// the replacement words
|
||||
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
|
||||
!original
|
||||
.map(AsRef::as_ref)
|
||||
.eq(words.iter().map(AsRef::as_ref))
|
||||
}
|
||||
|
||||
type Origin = usize;
|
||||
|
@ -68,11 +71,20 @@ impl FakeIntervalTree {
|
|||
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
|
||||
let element = self.intervals.binary_search_by(|(r, _)| {
|
||||
if point >= r.start {
|
||||
if point < r.end { Equal } else { Less }
|
||||
} else { Greater }
|
||||
if point < r.end {
|
||||
Equal
|
||||
} else {
|
||||
Less
|
||||
}
|
||||
} else {
|
||||
Greater
|
||||
}
|
||||
});
|
||||
|
||||
let n = match element { Ok(n) => n, Err(n) => n };
|
||||
let n = match element {
|
||||
Ok(n) => n,
|
||||
Err(n) => n,
|
||||
};
|
||||
|
||||
match self.intervals.get(n) {
|
||||
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
|
||||
|
@ -91,9 +103,13 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
|||
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
|
||||
// we initialize origins query indices based on their positions
|
||||
let origins: Vec<_> = (0..query.len() + 1).collect();
|
||||
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
|
||||
let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect();
|
||||
|
||||
QueryEnhancerBuilder { query, origins, real_to_origin }
|
||||
QueryEnhancerBuilder {
|
||||
query,
|
||||
origins,
|
||||
real_to_origin,
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the final real to origin query indices mapping.
|
||||
|
@ -101,12 +117,12 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
|||
/// `range` is the original words range that this `replacement` words replace
|
||||
/// and `real` is the first real query index of these replacement words.
|
||||
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
|
||||
where T: AsRef<str>,
|
||||
where
|
||||
T: AsRef<str>,
|
||||
{
|
||||
// check if the range of original words
|
||||
// can be rewritten with the replacement words
|
||||
if rewrite_range_with(self.query, range.clone(), replacement) {
|
||||
|
||||
// this range can be replaced so we need to
|
||||
// modify the origins accordingly
|
||||
let offset = replacement.len() - range.len();
|
||||
|
@ -126,7 +142,8 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
|||
// we need to pad real query indices
|
||||
let real_range = real..real + replacement.len().max(range.len());
|
||||
let real_length = replacement.len();
|
||||
self.real_to_origin.push((real_range, (range.start, real_length)));
|
||||
self.real_to_origin
|
||||
.push((real_range, (range.start, real_length)));
|
||||
}
|
||||
|
||||
pub fn build(self) -> QueryEnhancer {
|
||||
|
@ -148,10 +165,10 @@ impl QueryEnhancer {
|
|||
let real = real as usize;
|
||||
|
||||
// query the fake interval tree with the real query index
|
||||
let (range, (origin, real_length)) =
|
||||
self.real_to_origin
|
||||
.query(real)
|
||||
.expect("real has never been declared");
|
||||
let (range, (origin, real_length)) = self
|
||||
.real_to_origin
|
||||
.query(real)
|
||||
.expect("real has never been declared");
|
||||
|
||||
// if `real` is the end bound of the range
|
||||
if (range.start + real_length - 1) == real {
|
||||
|
@ -160,7 +177,10 @@ impl QueryEnhancer {
|
|||
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
|
||||
let len = slice[1] - slice[0];
|
||||
count = count.saturating_sub(len);
|
||||
if count == 0 { new_origin = origin + i; break }
|
||||
if count == 0 {
|
||||
new_origin = origin + i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let n = real - range.start;
|
||||
|
@ -168,15 +188,20 @@ impl QueryEnhancer {
|
|||
let end = self.origins[new_origin + 1];
|
||||
let remaining = (end - start) - n;
|
||||
|
||||
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
|
||||
|
||||
Range {
|
||||
start: (start + n) as u32,
|
||||
end: (start + n + remaining) as u32,
|
||||
}
|
||||
} else {
|
||||
// just return the origin along with
|
||||
// the real position of the word
|
||||
let n = real as usize - range.start;
|
||||
let origin = self.origins[origin];
|
||||
|
||||
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
|
||||
Range {
|
||||
start: (origin + n) as u32,
|
||||
end: (origin + n + 1) as u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -382,16 +407,16 @@ mod tests {
|
|||
|
||||
let enhancer = builder.build();
|
||||
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
assert_eq!(enhancer.replacement(9), 0..2); // good
|
||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
||||
assert_eq!(enhancer.replacement(9), 0..2); // good
|
||||
assert_eq!(enhancer.replacement(10), 1..5); // NY
|
||||
assert_eq!(enhancer.replacement(11), 2..5); // metro
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue