MeiliSearch/meilisearch-core/src/bucket_sort.rs
Clément Renault 0f698d6bd9
Work in progress: Bad Typo detection
I have an issue where "speakers" is split into "speaker" and "s":
when I compute the distances for the Typo criterion,
it takes "s" into account and puts a distance of zero in bucket 0
(the "speakers" bucket), therefore any document matching "s"
without typos is reported among the best results.

I need to make sure to ignore "s" when its associated part "speaker"
doesn't even exist in the document, or is not in the place
it should be ("speaker" followed by "s").

It is hard to imagine that this will add as much computation time to
the Typo criterion as the previous algorithm did, where I computed
the real query/word indexes and removed the invalid ones
before sending the documents to the bucket sort.
2019-12-13 14:38:22 +01:00
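A minimal sketch of the rule described above; the helper name, its parameters and the word-position convention are illustrative assumptions, not the committed implementation:

    // Hypothetical helper, for illustration only: a match for the right part
    // of a split word (e.g. "s" from "speakers") is kept only when the left
    // part ("speaker") matches at the immediately preceding word position.
    fn right_part_is_valid(left_word_indexes: &[u16], right_word_index: u16) -> bool {
        left_word_indexes.iter().any(|&wi| wi + 1 == right_word_index)
    }

With such a check, a document matching only "s" would be filtered out before the Typo criterion instead of landing in bucket 0.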


use std::ops::Deref;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::HashSet;
use std::io::Write;
use std::mem;
use std::ops::Range;
use std::rc::Rc;
use std::time::{Duration, Instant};
use compact_arena::{SmallArena, Idx32, mk_arena};
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
use log::debug;
use meilisearch_tokenizer::{is_cjk, split_query_string};
use meilisearch_types::{DocIndex, Highlight};
use sdset::Set;
use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::NGRAMS;
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::automaton::{normalize_str, split_best_frequency};
use crate::criterion2::*;
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult};
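/// Sorts the documents matching `query` using the hardcoded list of criteria
/// (Typo, Words, Proximity, Attribute, WordsPosition, Exact, StableDocId)
/// and returns, with their highlights, the documents that fall in `range`.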
pub fn bucket_sort<'c>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
{
// let automatons = construct_automatons(query);
let (automatons, query_enhancer) =
construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?;
debug!("{:?}", query_enhancer);
let before_postings_lists_fetching = Instant::now();
mk_arena!(arena);
let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
debug!("bare matches ({}) retrieved in {:.02?}",
bare_matches.len(),
before_postings_lists_fetching.elapsed(),
);
let before_raw_documents_presort = Instant::now();
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
let before_raw_documents_building = Instant::now();
let mut raw_documents = Vec::new();
for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
raw_documents.push(RawDocument {
raw_matches,
processed_matches: Vec::new(),
processed_distances: Vec::new(),
});
}
debug!("creating {} candidates documents took {:.02?}",
raw_documents.len(),
before_raw_documents_building.elapsed(),
);
dbg!(mem::size_of::<BareMatch>());
dbg!(mem::size_of::<SimpleMatch>());
let mut groups = vec![raw_documents.as_mut_slice()];
let criteria = [
Box::new(Typo) as Box<dyn Criterion>,
Box::new(Words) as Box<dyn Criterion>,
Box::new(Proximity),
Box::new(Attribute),
Box::new(WordsPosition),
Box::new(Exact),
Box::new(StableDocId),
];
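// refine the groups criterion after criterion: each criterion sorts the
// documents of every group produced by the previous one, then splits it
// into sub-groups of documents that it considers equal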
'criteria: for criterion in &criteria {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for mut group in tmp_groups {
let before_criterion_preparation = Instant::now();
criterion.prepare(&mut group, &mut arena, &query_enhancer, &automatons);
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let before_criterion_sort = Instant::now();
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, &arena));
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, &arena)) {
debug!("{:?} produced a group of size {}", criterion.name(), group.len());
documents_seen += group.len();
groups.push(group);
// once enough documents have been seen to cover the end of the
// requested range, the documents of the remaining groups fall
// after it and can be left unsorted; continue with the next criterion
if documents_seen >= range.end {
continue 'criteria;
}
}
}
}
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
let iter = iter.map(|d| {
let highlights = d.raw_matches.iter().flat_map(|sm| {
let postings_list = &arena[sm.postings_list];
let input = postings_list.input();
let query = &automatons[sm.query_index as usize].query;
debug!("{:?} contains {:?}", d.raw_matches[0].document_id, query);
postings_list.iter().map(move |m| {
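// number of bytes of the indexed word covered by the query word,
// capped at the length of the indexed word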
let covered_area = if query.len() > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
Highlight { attribute: m.attribute, char_index: m.char_index, char_length: covered_area as u16 }
})
}).collect();
debug!("{:?} contains {:?}", d.raw_matches[0].document_id, d.processed_distances);
Document {
id: d.raw_matches[0].document_id,
highlights,
#[cfg(test)] matches: Vec::new(),
}
});
Ok(iter.collect())
}
pub struct RawDocument<'a, 'tag> {
pub raw_matches: &'a mut [BareMatch<'tag>],
pub processed_matches: Vec<SimpleMatch>,
/// The list of minimum `distance`s found,
/// indexed by automaton `query_index`
pub processed_distances: Vec<Option<u8>>,
}
pub struct BareMatch<'tag> {
pub document_id: DocumentId,
pub query_index: u16,
pub distance: u8,
pub is_exact: bool,
pub postings_list: Idx32<'tag>,
}
// TODO remove that
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct SimpleMatch {
pub query_index: u16,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
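/// A cheaply cloneable view (`offset` + `len`) over a shared postings list,
/// along with the indexed word (`input`) that produced it.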
#[derive(Clone)]
pub struct PostingsListView<'txn> {
input: Rc<[u8]>,
postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
offset: usize,
len: usize,
}
impl<'txn> PostingsListView<'txn> {
pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
let len = postings_list.len();
PostingsListView { input, postings_list, offset: 0, len }
}
pub fn len(&self) -> usize {
self.len
}
pub fn input(&self) -> &[u8] {
&self.input
}
pub fn range(&self, offset: usize, len: usize) -> PostingsListView<'txn> {
assert!(offset + len <= self.len);
PostingsListView {
input: self.input.clone(),
postings_list: self.postings_list.clone(),
offset: self.offset + offset,
len,
}
}
}
impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
fn as_ref(&self) -> &Set<DocIndex> {
Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
}
}
impl Deref for PostingsListView<'_> {
type Target = Set<DocIndex>;
fn deref(&self) -> &Set<DocIndex> {
Set::new_unchecked(&self.postings_list[self.offset..self.offset + self.len])
}
}
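/// Runs every query word automaton over the words FST and, for each matching
/// word, cuts its postings list into per-document ranges stored in `arena`,
/// producing one `BareMatch` per (document, matching word) pair.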
fn fetch_matches<'txn, 'tag>(
reader: &'txn heed::RoTxn<MainT>,
automatons: &[QueryWordAutomaton],
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
) -> MResult<Vec<BareMatch<'tag>>>
{
let before_words_fst = Instant::now();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
debug!("words fst took {:.02?}", before_words_fst.elapsed());
let mut total_postings_lists = Vec::new();
let mut dfa_time = Duration::default();
let mut stream_next_time = Duration::default();
let mut postings_lists_fetching_time = Duration::default();
for (query_index, automaton) in automatons.iter().enumerate() {
let before_dfa = Instant::now();
let dfa = automaton.dfa();
let QueryWordAutomaton { query, is_exact, is_prefix, .. } = automaton;
dfa_time += before_dfa.elapsed();
let mut number_of_words = 0;
let before_fst_search = Instant::now();
let mut stream = words.search(&dfa).into_stream();
debug!("fst search took {:.02?}", before_fst_search.elapsed());
// an explicit `loop` is used instead of `while let Some(input) = stream.next()`
// so that the time spent in `stream.next()` can be measured separately
loop {
let before_stream_next = Instant::now();
let input = match stream.next() {
Some(input) => input,
None => break,
};
stream_next_time += before_stream_next.elapsed();
number_of_words += 1;
let distance = dfa.eval(input).to_u8();
let is_exact = *is_exact && distance == 0 && input.len() == query.len();
let before_postings_lists_fetching = Instant::now();
if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
let input = Rc::from(input);
let postings_list = Rc::new(postings_list);
let postings_list_view = PostingsListView::new(input, postings_list);
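// cut the postings list into one arena-backed range per document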
let mut offset = 0;
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
let document_id = group[0].document_id;
let stuffed = BareMatch {
document_id,
query_index: query_index as u16,
distance,
is_exact,
postings_list: posting_list_index,
};
total_postings_lists.push(stuffed);
offset += group.len();
}
}
postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
}
debug!("{:?} gives {} words", query, number_of_words);
}
debug!("stream next took {:.02?}", stream_next_time);
debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
debug!("dfa creation took {:.02?}", dfa_time);
Ok(total_postings_lists)
}
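/// A query word (or a synonym/ngram derived from one) along with
/// the flags used to decide which kind of DFA to build for it.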
#[derive(Debug)]
pub struct QueryWordAutomaton {
pub query: String,
/// Is it a word that must be considered exact
/// or is it some derived word (i.e. a synonym)
pub is_exact: bool,
pub is_prefix: bool,
/// If it's part of a phrase query, its index
/// in the phrase and the length of the phrase
pub phrase_query: Option<(u16, u16)>,
}
impl QueryWordAutomaton {
pub fn exact(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: true,
is_prefix: false,
phrase_query: None,
}
}
pub fn exact_prefix(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: true,
is_prefix: true,
phrase_query: None,
}
}
pub fn non_exact(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton {
query: query.to_string(),
is_exact: false,
is_prefix: false,
phrase_query: None,
}
}
pub fn dfa(&self) -> DFA {
if self.phrase_query.is_some() {
build_exact_dfa(&self.query)
} else if self.is_prefix {
build_prefix_dfa(&self.query)
} else {
build_dfa(&self.query)
}
}
}
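/// Builds one automaton per original query word, per matching synonym,
/// per best-frequency word split and per ngram concatenation, and records
/// in the returned `QueryEnhancer` how each one maps back to the original words.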
fn construct_automatons2(
reader: &heed::RoTxn<MainT>,
query: &str,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
synonym_store: store::Synonyms,
) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? {
Some(synonym) => synonym,
None => fst::Set::default(),
};
let mut automaton_index = 0;
let mut automatons = Vec::new();
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
// we must not declare the original words to the query enhancer,
// *but* we do need to push them into the automatons list first
let mut original_words = query_words.iter().peekable();
while let Some(word) = original_words.next() {
let has_following_word = original_words.peek().is_some();
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
let automaton = if not_prefix_dfa {
QueryWordAutomaton::exact(word)
} else {
QueryWordAutomaton::exact_prefix(word)
};
automaton_index += 1;
automatons.push(automaton);
}
for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable();
while let Some((query_index, ngram_slice)) = ngrams.next() {
let query_range = query_index..query_index + n;
let ngram_nb_words = ngram_slice.len();
let ngram = ngram_slice.join(" ");
let has_following_word = ngrams.peek().is_some();
let not_prefix_dfa =
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
// automatons for the synonyms of the ngram
let normalized = normalize_str(&ngram);
let lev = if not_prefix_dfa {
build_dfa(&normalized)
} else {
build_prefix_dfa(&normalized)
};
let mut stream = synonyms.search(&lev).into_stream();
while let Some(base) = stream.next() {
// only trigger alternatives when the last word has been typed,
// i.e. "new " does not trigger alternatives but "new yo" triggers
// alternatives to "new york"
let base = std::str::from_utf8(base).unwrap();
let base_nb_words = split_query_string(base).count();
if ngram_nb_words != base_nb_words {
continue;
}
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
let mut stream = synonyms.into_stream();
while let Some(synonyms) = stream.next() {
let synonyms = std::str::from_utf8(synonyms).unwrap();
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
let nb_synonym_words = synonyms_words.len();
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
for synonym in synonyms_words {
let automaton = if nb_synonym_words == 1 {
QueryWordAutomaton::exact(synonym)
} else {
QueryWordAutomaton::non_exact(synonym)
};
automaton_index += 1;
automatons.push(automaton);
}
}
}
}
if n == 1 {
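// try to split the word into two words that both appear in the index
// (e.g. "speakers" into "speaker" followed by "s") and register them
// as a two-word phrase query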
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
let mut left_automaton = QueryWordAutomaton::exact(left);
left_automaton.phrase_query = Some((0, 2));
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1;
automatons.push(left_automaton);
let mut right_automaton = QueryWordAutomaton::exact(right);
right_automaton.phrase_query = Some((1, 2));
enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
automaton_index += 1;
automatons.push(right_automaton);
}
} else {
// automaton for the concatenation of the ngram words
let concat = ngram_slice.concat();
let normalized = normalize_str(&concat);
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
let automaton = QueryWordAutomaton::exact(&normalized);
automaton_index += 1;
automatons.push(automaton);
}
}
}
Ok((automatons, enhancer_builder.build()))
}