Work in progress: Bad Typo detection

I have an issue where "speakers" is split into "speaker" and "s":
when I compute the distances for the Typo criterion,
it takes "s" into account and puts a distance of zero in bucket 0
(the "speakers" bucket), therefore it reports any document matching "s"
without typos as the best results.

I need to make sure to ignore "s" when its associated part "speaker"
doesn't even exist in the document, or is not in the place
it should be ("speaker" followed by "s").

It is hard to imagine that this will add much computation time to
the Typo criterion, unlike the previous algorithm, where I computed
the real query/word indexes and removed the invalid ones
before sending the documents to the bucket sort.
This commit is contained in:
Clément Renault 2019-12-06 19:15:19 +01:00
parent 4e91b31b1f
commit 0f698d6bd9
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
4 changed files with 111 additions and 29 deletions

View File

@ -46,3 +46,8 @@ pub fn build_prefix_dfa(query: &str) -> DFA {
pub fn build_dfa(query: &str) -> DFA { pub fn build_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::NoPrefix) build_dfa_with_setting(query, PrefixSetting::NoPrefix)
} }
pub fn build_exact_dfa(query: &str) -> DFA {
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
builder.build_dfa(query)
}

View File

@ -13,7 +13,7 @@ use crate::database::MainT;
use crate::error::MResult; use crate::error::MResult;
use crate::store; use crate::store;
pub use self::dfa::{build_dfa, build_prefix_dfa}; pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
pub use self::query_enhancer::QueryEnhancer; pub use self::query_enhancer::QueryEnhancer;
pub use self::query_enhancer::QueryEnhancerBuilder; pub use self::query_enhancer::QueryEnhancerBuilder;

View File

@ -19,7 +19,7 @@ use slice_group_by::{GroupBy, GroupByMut};
use crate::automaton::NGRAMS; use crate::automaton::NGRAMS;
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder}; use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
use crate::automaton::{build_dfa, build_prefix_dfa}; use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::automaton::{normalize_str, split_best_frequency}; use crate::automaton::{normalize_str, split_best_frequency};
use crate::criterion2::*; use crate::criterion2::*;
@ -41,6 +41,8 @@ pub fn bucket_sort<'c>(
let (automatons, query_enhancer) = let (automatons, query_enhancer) =
construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?; construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?;
debug!("{:?}", query_enhancer);
let before_postings_lists_fetching = Instant::now(); let before_postings_lists_fetching = Instant::now();
mk_arena!(arena); mk_arena!(arena);
let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?; let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
@ -74,7 +76,7 @@ pub fn bucket_sort<'c>(
let criteria = [ let criteria = [
Box::new(Typo) as Box<dyn Criterion>, Box::new(Typo) as Box<dyn Criterion>,
Box::new(Words), Box::new(Words) as Box<dyn Criterion>,
Box::new(Proximity), Box::new(Proximity),
Box::new(Attribute), Box::new(Attribute),
Box::new(WordsPosition), Box::new(WordsPosition),
@ -88,7 +90,7 @@ pub fn bucket_sort<'c>(
for mut group in tmp_groups { for mut group in tmp_groups {
let before_criterion_preparation = Instant::now(); let before_criterion_preparation = Instant::now();
criterion.prepare(&mut group, &mut arena, &query_enhancer); criterion.prepare(&mut group, &mut arena, &query_enhancer, &automatons);
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed()); debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let before_criterion_sort = Instant::now(); let before_criterion_sort = Instant::now();
@ -116,6 +118,7 @@ pub fn bucket_sort<'c>(
let postings_list = &arena[sm.postings_list]; let postings_list = &arena[sm.postings_list];
let input = postings_list.input(); let input = postings_list.input();
let query = &automatons[sm.query_index as usize].query; let query = &automatons[sm.query_index as usize].query;
debug!("{:?} contains {:?}", d.raw_matches[0].document_id, query);
postings_list.iter().map(move |m| { postings_list.iter().map(move |m| {
let covered_area = if query.len() > input.len() { let covered_area = if query.len() > input.len() {
input.len() input.len()
@ -126,6 +129,8 @@ pub fn bucket_sort<'c>(
}) })
}).collect(); }).collect();
debug!("{:?} contains {:?}", d.raw_matches[0].document_id, d.processed_distances);
Document { Document {
id: d.raw_matches[0].document_id, id: d.raw_matches[0].document_id,
highlights, highlights,
@ -233,7 +238,7 @@ fn fetch_matches<'txn, 'tag>(
for (query_index, automaton) in automatons.iter().enumerate() { for (query_index, automaton) in automatons.iter().enumerate() {
let before_dfa = Instant::now(); let before_dfa = Instant::now();
let dfa = automaton.dfa(); let dfa = automaton.dfa();
let QueryWordAutomaton { query, is_exact, is_prefix } = automaton; let QueryWordAutomaton { query, is_exact, is_prefix, .. } = automaton;
dfa_time += before_dfa.elapsed(); dfa_time += before_dfa.elapsed();
let mut number_of_words = 0; let mut number_of_words = 0;
@ -294,28 +299,48 @@ fn fetch_matches<'txn, 'tag>(
#[derive(Debug)] #[derive(Debug)]
pub struct QueryWordAutomaton { pub struct QueryWordAutomaton {
query: String, pub query: String,
/// Is it a word that must be considered exact /// Is it a word that must be considered exact
/// or is it some derived word (i.e. a synonym) /// or is it some derived word (i.e. a synonym)
is_exact: bool, pub is_exact: bool,
is_prefix: bool, pub is_prefix: bool,
/// If it's a phrase query and what is
/// its index and the length of the phrase
pub phrase_query: Option<(u16, u16)>,
} }
impl QueryWordAutomaton { impl QueryWordAutomaton {
pub fn exact(query: &str) -> QueryWordAutomaton { pub fn exact(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton { query: query.to_string(), is_exact: true, is_prefix: false } QueryWordAutomaton {
query: query.to_string(),
is_exact: true,
is_prefix: false,
phrase_query: None,
}
} }
pub fn exact_prefix(query: &str) -> QueryWordAutomaton { pub fn exact_prefix(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton { query: query.to_string(), is_exact: true, is_prefix: true } QueryWordAutomaton {
query: query.to_string(),
is_exact: true,
is_prefix: true,
phrase_query: None,
}
} }
pub fn non_exact(query: &str) -> QueryWordAutomaton { pub fn non_exact(query: &str) -> QueryWordAutomaton {
QueryWordAutomaton { query: query.to_string(), is_exact: false, is_prefix: false } QueryWordAutomaton {
query: query.to_string(),
is_exact: false,
is_prefix: false,
phrase_query: None,
}
} }
pub fn dfa(&self) -> DFA { pub fn dfa(&self) -> DFA {
if self.is_prefix { if self.phrase_query.is_some() {
build_exact_dfa(&self.query)
} else if self.is_prefix {
build_prefix_dfa(&self.query) build_prefix_dfa(&self.query)
} else { } else {
build_dfa(&self.query) build_dfa(&self.query)
@ -411,16 +436,17 @@ fn construct_automatons2(
if n == 1 { if n == 1 {
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? { if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
let left_automaton = QueryWordAutomaton::exact(left); let mut left_automaton = QueryWordAutomaton::exact(left);
left_automaton.phrase_query = Some((0, 2));
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
automaton_index += 1; automaton_index += 1;
automatons.push(left_automaton); automatons.push(left_automaton);
let right_automaton = QueryWordAutomaton::exact(right); let mut right_automaton = QueryWordAutomaton::exact(right);
right_automaton.phrase_query = Some((1, 2));
enhancer_builder.declare(query_range.clone(), automaton_index, &[right]); enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
automaton_index += 1; automaton_index += 1;
automatons.push(right_automaton); automatons.push(right_automaton);
} }
} else { } else {
// automaton of concatenation of query words // automaton of concatenation of query words

View File

@ -5,9 +5,10 @@ use std::sync::atomic::{self, AtomicUsize};
use slice_group_by::{GroupBy, GroupByMut}; use slice_group_by::{GroupBy, GroupByMut};
use compact_arena::SmallArena; use compact_arena::SmallArena;
use sdset::{Set, SetBuf}; use sdset::{Set, SetBuf};
use log::debug;
use crate::{DocIndex, DocumentId}; use crate::{DocIndex, DocumentId};
use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView}; use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView, QueryWordAutomaton};
use crate::automaton::QueryEnhancer; use crate::automaton::QueryEnhancer;
type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>; type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>;
@ -20,6 +21,7 @@ pub trait Criterion {
documents: &mut [RawDocument<'a, 'tag>], documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>, postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
); );
fn evaluate<'a, 'tag, 'txn>( fn evaluate<'a, 'tag, 'txn>(
@ -77,6 +79,7 @@ impl Criterion for Typo {
documents: &mut [RawDocument], documents: &mut [RawDocument],
postings_lists: &mut PostingsListsArena, postings_lists: &mut PostingsListsArena,
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) { ) {
prepare_query_distances(documents, query_enhancer); prepare_query_distances(documents, query_enhancer);
} }
@ -134,6 +137,7 @@ impl Criterion for Words {
documents: &mut [RawDocument], documents: &mut [RawDocument],
postings_lists: &mut PostingsListsArena, postings_lists: &mut PostingsListsArena,
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) { ) {
prepare_query_distances(documents, query_enhancer); prepare_query_distances(documents, query_enhancer);
} }
@ -161,6 +165,7 @@ fn prepare_raw_matches<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>], documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>, postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) { ) {
for document in documents { for document in documents {
if !document.processed_matches.is_empty() { continue } if !document.processed_matches.is_empty() { continue }
@ -181,7 +186,7 @@ fn prepare_raw_matches<'a, 'tag, 'txn>(
} }
} }
let processed = multiword_rewrite_matches(&mut processed, query_enhancer); let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons);
document.processed_matches = processed.into_vec(); document.processed_matches = processed.into_vec();
} }
} }
@ -196,8 +201,9 @@ impl Criterion for Proximity {
documents: &mut [RawDocument<'a, 'tag>], documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>, postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) { ) {
prepare_raw_matches(documents, postings_lists, query_enhancer); prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
} }
fn evaluate<'a, 'tag, 'txn>( fn evaluate<'a, 'tag, 'txn>(
@ -264,8 +270,9 @@ impl Criterion for Attribute {
documents: &mut [RawDocument<'a, 'tag>], documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>, postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) { ) {
prepare_raw_matches(documents, postings_lists, query_enhancer); prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
} }
fn evaluate<'a, 'tag, 'txn>( fn evaluate<'a, 'tag, 'txn>(
@ -276,16 +283,16 @@ impl Criterion for Attribute {
) -> Ordering ) -> Ordering
{ {
#[inline] #[inline]
fn sum_attribute(matches: &[SimpleMatch]) -> usize { fn best_attribute(matches: &[SimpleMatch]) -> u16 {
let mut sum_attribute = 0; let mut best_attribute = u16::max_value();
for group in matches.linear_group_by_key(|bm| bm.query_index) { for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_attribute += group[0].attribute as usize; best_attribute = cmp::min(best_attribute, group[0].attribute);
} }
sum_attribute best_attribute
} }
let lhs = sum_attribute(&lhs.processed_matches); let lhs = best_attribute(&lhs.processed_matches);
let rhs = sum_attribute(&rhs.processed_matches); let rhs = best_attribute(&rhs.processed_matches);
lhs.cmp(&rhs) lhs.cmp(&rhs)
} }
@ -301,8 +308,9 @@ impl Criterion for WordsPosition {
documents: &mut [RawDocument<'a, 'tag>], documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut PostingsListsArena<'tag, 'txn>, postings_lists: &mut PostingsListsArena<'tag, 'txn>,
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) { ) {
prepare_raw_matches(documents, postings_lists, query_enhancer); prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
} }
fn evaluate<'a, 'tag, 'txn>( fn evaluate<'a, 'tag, 'txn>(
@ -338,6 +346,7 @@ impl Criterion for Exact {
documents: &mut [RawDocument], documents: &mut [RawDocument],
postings_lists: &mut PostingsListsArena, postings_lists: &mut PostingsListsArena,
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) { ) {
for document in documents { for document in documents {
document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact))); document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
@ -379,6 +388,7 @@ impl Criterion for StableDocId {
documents: &mut [RawDocument], documents: &mut [RawDocument],
postings_lists: &mut PostingsListsArena, postings_lists: &mut PostingsListsArena,
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) { ) {
// ... // ...
} }
@ -398,17 +408,58 @@ impl Criterion for StableDocId {
} }
pub fn multiword_rewrite_matches( pub fn multiword_rewrite_matches(
matches: &mut [SimpleMatch], simple_matches: &mut [SimpleMatch],
query_enhancer: &QueryEnhancer, query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) -> SetBuf<SimpleMatch> ) -> SetBuf<SimpleMatch>
{ {
let mut padded_matches = Vec::with_capacity(matches.len()); let mut matches = Vec::with_capacity(simple_matches.len());
// let before_sort = Instant::now(); // let before_sort = Instant::now();
// we sort the matches by word index to make them rewritable // we sort the matches by word index to make them rewritable
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index)); simple_matches.sort_unstable_by_key(|m| (m.attribute, m.query_index, m.word_index));
// debug!("sorting dirty matches took {:.02?}", before_sort.elapsed()); // debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
for same_attribute in simple_matches.linear_group_by_key(|m| m.attribute) {
let iter = same_attribute.linear_group_by_key(|m| m.query_index);
let mut iter = iter.peekable();
while let Some(same_query_index) = iter.next() {
let query_index = same_query_index[0].query_index;
// TODO we need to support phrase query of longer length
if let Some((i, len)) = automatons[query_index as usize].phrase_query {
if i != 0 { continue }
// is the next query_index group the required one
if iter.peek().map_or(false, |g| g[0].query_index == query_index + 1) {
if let Some(next) = iter.next() {
for ma in same_query_index {
for mb in next {
if ma.word_index == mb.word_index + 1 {
matches.push(*ma);
matches.push(*mb);
}
}
}
}
}
} else {
matches.extend_from_slice(same_query_index);
}
}
}
// let is_phrase_query = automatons[match_.query_index as usize].phrase_query_len.is_some();
// let next_query_index = match_.query_index + 1;
// if is_phrase_query && iter.remainder().iter().find(|m| m.query_index == next_query_index).is_none() {
// continue
// }
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
let mut padded_matches = Vec::with_capacity(matches.len());
// let before_padding = Instant::now(); // let before_padding = Instant::now();
// for each attribute of each document // for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) { for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {