Mirror of https://github.com/meilisearch/MeiliSearch, synced 2025-07-03 20:07:09 +02:00
Make the project be a workspace
This commit is contained in:
commit 62a0aefe44 (parent 3476939b7e)
40 changed files with 49 additions and 42 deletions
@@ -1,202 +0,0 @@
mod dfa;
mod query_enhancer;

use std::cmp::Reverse;
use std::vec;

use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
use meilidb_tokenizer::{split_query_string, is_cjk};

use crate::store;

use self::dfa::{build_dfa, build_prefix_dfa};
use self::query_enhancer::QueryEnhancerBuilder;
pub use self::query_enhancer::QueryEnhancer;

// Query words are combined into n-grams of at most this many words.
const NGRAMS: usize = 3;

pub struct AutomatonProducer {
    automatons: Vec<Vec<Automaton>>,
}

impl AutomatonProducer {
    pub fn new(
        reader: &impl rkv::Readable,
        query: &str,
        synonyms_store: store::Synonyms,
    ) -> (AutomatonProducer, QueryEnhancer)
    {
        let (automatons, query_enhancer) = generate_automatons(reader, query, synonyms_store).unwrap();
        (AutomatonProducer { automatons }, query_enhancer)
    }

    pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> {
        self.automatons.into_iter()
    }
}
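
For context, a caller drives the producer roughly like this; a minimal sketch, assuming an open rkv reader and a synonyms store are already available (search_example is a hypothetical helper, not part of this file):

fn search_example(reader: &impl rkv::Readable, synonyms: store::Synonyms) {
    // Build one automaton group per candidate interpretation of the query.
    let (producer, _enhancer) = AutomatonProducer::new(reader, "new york subway", synonyms);
    for group in producer.into_iter() {
        for automaton in group {
            // Each automaton yields a Levenshtein DFA to run against a words fst,
            // the way generate_automatons below runs one against the synonyms fst.
            let _dfa = automaton.dfa();
        }
    }
}
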
pub struct Automaton {
    pub index: usize,
    pub ngram: usize,
    pub query_len: usize,
    pub is_exact: bool,
    pub is_prefix: bool,
    pub query: String,
}

impl Automaton {
    pub fn dfa(&self) -> DFA {
        if self.is_prefix {
            build_prefix_dfa(&self.query)
        } else {
            build_dfa(&self.query)
        }
    }

    fn exact(index: usize, ngram: usize, query: &str) -> Automaton {
        Automaton {
            index,
            ngram,
            query_len: query.len(),
            is_exact: true,
            is_prefix: false,
            query: query.to_string(),
        }
    }

    fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton {
        Automaton {
            index,
            ngram,
            query_len: query.len(),
            is_exact: true,
            is_prefix: true,
            query: query.to_string(),
        }
    }

    fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton {
        Automaton {
            index,
            ngram,
            query_len: query.len(),
            is_exact: false,
            is_prefix: false,
            query: query.to_string(),
        }
    }
}
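
The dfa module itself is not part of this hunk. Based on the levenshtein_automata crate imported above, build_dfa and build_prefix_dfa presumably resemble the following sketch; the distance thresholds here are illustrative assumptions, not values taken from the diff:

use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};

// Illustrative sketch: derive an allowed edit distance from the query
// length, then build a plain or prefix-matching Levenshtein DFA.
fn build_dfa_sketch(query: &str, prefix: bool) -> DFA {
    let distance = match query.len() {
        0..=4 => 0, // short words must match exactly
        5..=8 => 1, // allow one typo
        _ => 2,     // allow two typos on longer words
    };
    let builder = LevenshteinAutomatonBuilder::new(distance, true);
    if prefix {
        builder.build_prefix_dfa(query)
    } else {
        builder.build_dfa(query)
    }
}
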
// Lowercase the string and, when it contains no CJK characters,
// transliterate it to ASCII.
pub fn normalize_str(string: &str) -> String {
    let mut string = string.to_lowercase();

    if !string.contains(is_cjk) {
        string = deunicode::deunicode_with_tofu(&string, "");
    }

    string
}
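
A quick illustration of the behavior, as a test one could add: non-CJK input is lowercased and transliterated to ASCII, while CJK input skips the transliteration entirely.

#[test]
fn normalize_str_examples() {
    // Accented Latin text: lowercased, then transliterated by deunicode.
    assert_eq!(normalize_str("Héllo Wörld"), "hello world");
    // CJK text contains is_cjk characters, so only lowercasing applies.
    assert_eq!(normalize_str("東京"), "東京");
}
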
fn generate_automatons(
    reader: &impl rkv::Readable,
    query: &str,
    synonym_store: store::Synonyms,
) -> Result<(Vec<Vec<Automaton>>, QueryEnhancer), rkv::StoreError>
{
    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
    let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
    let synonyms = synonym_store.synonyms_fst(reader)?;

    let mut automatons = Vec::new();
    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);

    // We must not declare the original words to the query enhancer
    // *but* we need to push them into the automatons list first
    let mut original_automatons = Vec::new();
    let mut original_words = query_words.iter().peekable();
    while let Some(word) = original_words.next() {

        let has_following_word = original_words.peek().is_some();
        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);

        let automaton = if not_prefix_dfa {
            Automaton::exact(automatons.len(), 1, word)
        } else {
            Automaton::prefix_exact(automatons.len(), 1, word)
        };
        original_automatons.push(automaton);
    }

    automatons.push(original_automatons);

    for n in 1..=NGRAMS {
        let mut ngrams = query_words.windows(n).enumerate().peekable();
        while let Some((query_index, ngram_slice)) = ngrams.next() {

            let query_range = query_index..query_index + n;
            let ngram_nb_words = ngram_slice.len();
            let ngram = ngram_slice.join(" ");

            let has_following_word = ngrams.peek().is_some();
            let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);

            // automaton of the synonyms of the n-gram
            let normalized = normalize_str(&ngram);
            let lev = if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) };

            let mut stream = synonyms.search(&lev).into_stream();
            while let Some(base) = stream.next() {

                // only trigger alternatives when the last word has been typed,
                // i.e. "new " does not trigger them but "new yo" triggers alternatives to "new york"
                let base = std::str::from_utf8(base).unwrap();
                let base_nb_words = split_query_string(base).count();
                if ngram_nb_words != base_nb_words { continue }

                if let Some(synonyms) = synonym_store.alternatives_to(reader, base.as_bytes())? {

                    let mut stream = synonyms.into_stream();
                    while let Some(synonyms) = stream.next() {
                        let synonyms = std::str::from_utf8(synonyms).unwrap();
                        let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
                        let nb_synonym_words = synonyms_words.len();

                        let real_query_index = automatons.len();
                        enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);

                        for synonym in synonyms_words {
                            let automaton = if nb_synonym_words == 1 {
                                Automaton::exact(automatons.len(), n, synonym)
                            } else {
                                Automaton::non_exact(automatons.len(), n, synonym)
                            };
                            automatons.push(vec![automaton]);
                        }
                    }
                }
            }

            if n != 1 {
                // automaton of the concatenation of the query words
                let concat = ngram_slice.concat();
                let normalized = normalize_str(&concat);

                let real_query_index = automatons.len();
                enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);

                let automaton = Automaton::exact(automatons.len(), n, &normalized);
                automatons.push(vec![automaton]);
            }
        }
    }

    // order the automatons, the most important first;
    // the original automatons are kept at the front.
    automatons[1..].sort_unstable_by_key(|a| {
        let a = a.first().unwrap();
        (Reverse(a.is_exact), Reverse(a.ngram))
    });

    Ok((automatons, enhancer_builder.build()))
}
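
To make the final ordering concrete: Reverse on a bool sorts true before false, so exact automaton groups come first and larger n-grams win among them, while the original query group stays pinned at index 0. A standalone sketch of the same comparator, with tuples standing in for (is_exact, ngram):

use std::cmp::Reverse;

fn main() {
    // Same key as the sort above: exact first, then larger n-grams.
    let mut groups = vec![(false, 1), (true, 1), (true, 3), (false, 2)];
    groups.sort_unstable_by_key(|&(is_exact, ngram)| (Reverse(is_exact), Reverse(ngram)));
    assert_eq!(groups, vec![(true, 3), (true, 1), (false, 2), (false, 1)]);
}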