Fix the processed distance algorithm

Clément Renault 2019-12-07 13:32:43 +01:00
parent 0f698d6bd9
commit 22b19c0d93
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
2 changed files with 27 additions and 11 deletions
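
Summary (reconstructed from the diff below, since the commit carries no description): `prepare_query_distances` resized each document's `processed_distances` with `processed.resize(range.end as usize, None)`, and `Vec::resize` truncates when the requested length is smaller than the current one, so a match whose replacement range ended early could erase minimum distances already recorded for other query indexes. The fix clamps the resize target to `cmp::max(range.end as usize, processed.len())` so the vector only ever grows; the commit also threads the `automatons` slice into `prepare_query_distances` for a split-word check that is left commented out for now.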

View File

@@ -1,4 +1,5 @@
 use std::ops::Deref;
+use std::fmt;
 use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::collections::HashSet;
@@ -145,7 +146,6 @@ pub struct RawDocument<'a, 'tag> {
     pub raw_matches: &'a mut [BareMatch<'tag>],
     pub processed_matches: Vec<SimpleMatch>,
     /// The list of minimum `distance` found
-    /// where the `query_index` is the index
     pub processed_distances: Vec<Option<u8>>,
 }
 
@@ -157,6 +157,17 @@ pub struct BareMatch<'tag> {
     pub postings_list: Idx32<'tag>,
 }
 
+impl fmt::Debug for BareMatch<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("BareMatch")
+            .field("document_id", &self.document_id)
+            .field("query_index", &self.query_index)
+            .field("distance", &self.distance)
+            .field("is_exact", &self.is_exact)
+            .finish()
+    }
+}
+
 // TODO remove that
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub struct SimpleMatch {
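
A plausible reading of the new hand-written `Debug` impl (an assumption; the diff itself doesn't say): the arena index `Idx32<'tag>` stored in `postings_list` has no `Debug` impl, so `#[derive(Debug)]` would not compile, and the manual impl prints every field except that one. A minimal sketch of the pattern, with simplified stand-in types:

```rust
use std::fmt;
use std::marker::PhantomData;

// Hypothetical stand-in for the arena index, which has no Debug impl.
struct Idx32<'tag>(u32, PhantomData<&'tag ()>);

struct BareMatch<'tag> {
    document_id: u32,
    query_index: u16,
    distance: u8,
    is_exact: bool,
    postings_list: Idx32<'tag>,
}

// Same shape as the impl added above: print every field
// except the non-Debug `postings_list`.
impl fmt::Debug for BareMatch<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("BareMatch")
            .field("document_id", &self.document_id)
            .field("query_index", &self.query_index)
            .field("distance", &self.distance)
            .field("is_exact", &self.is_exact)
            .finish()
    }
}

fn main() {
    let m = BareMatch {
        document_id: 42,
        query_index: 0,
        distance: 1,
        is_exact: true,
        postings_list: Idx32(7, PhantomData),
    };
    // Prints: BareMatch { document_id: 42, query_index: 0, distance: 1, is_exact: true }
    println!("{:?}", m);
}
```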
@@ -238,14 +249,11 @@ fn fetch_matches<'txn, 'tag>(
     for (query_index, automaton) in automatons.iter().enumerate() {
         let before_dfa = Instant::now();
         let dfa = automaton.dfa();
-        let QueryWordAutomaton { query, is_exact, is_prefix, .. } = automaton;
+        let QueryWordAutomaton { query, is_exact, is_prefix, phrase_query } = automaton;
         dfa_time += before_dfa.elapsed();
 
         let mut number_of_words = 0;
-
-        let before_fst_search = Instant::now();
         let mut stream = words.search(&dfa).into_stream();
-        debug!("fst search took {:.02?}", before_fst_search.elapsed());
 
         // while let Some(input) = stream.next() {
         loop {
@@ -272,7 +280,7 @@ fn fetch_matches<'txn, 'tag>(
             let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
 
             let document_id = group[0].document_id;
-            let stuffed = BareMatch {
+            let bare_match = BareMatch {
                 document_id,
                 query_index: query_index as u16,
                 distance,
@@ -280,7 +288,7 @@ fn fetch_matches<'txn, 'tag>(
                 postings_list: posting_list_index,
             };
 
-            total_postings_lists.push(stuffed);
+            total_postings_lists.push(bare_match);
             offset += group.len();
         }
     }
@@ -434,7 +442,7 @@ fn construct_automatons2(
        }
    }
 
-    if n == 1 {
+    if false && n == 1 {
        if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
            let mut left_automaton = QueryWordAutomaton::exact(left);
            left_automaton.phrase_query = Some((0, 2));
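
A note on the `if false && n == 1` change: `false && …` short-circuits, so the split-word branch below, which builds the two phrase-query automatons from `split_best_frequency`, is never entered anymore while still being compiled and type-checked. Presumably a temporary switch-off: it matches the FIXME about split words added to the criterion file below.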

View File

@@ -46,14 +46,22 @@ pub trait Criterion {
 fn prepare_query_distances(
     documents: &mut [RawDocument],
     query_enhancer: &QueryEnhancer,
+    automatons: &[QueryWordAutomaton],
 ) {
     for document in documents {
         if !document.processed_distances.is_empty() { continue }
 
         let mut processed = Vec::new();
         for m in document.raw_matches.iter() {
+            // FIXME we really need to take splitted words into account
+            // those must be seen at the same level as the non-splitteds
+            // if automatons[m.query_index as usize].phrase_query.is_some() {
+            //     continue
+            // }
+
             let range = query_enhancer.replacement(m.query_index as u32);
-            processed.resize(range.end as usize, None);
+            let new_len = cmp::max(range.end as usize, processed.len());
+            processed.resize(new_len, None);
 
             for index in range {
                 let index = index as usize;
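
This hunk is the fix the commit title names. `Vec::resize(new_len, value)` truncates when `new_len` is below the current length, and raw matches are not ordered by `range.end`, so a later match with a shorter replacement range used to shrink `processed` and throw away minimum distances that were already stored. A self-contained sketch of the failure mode and the grow-only fix, with hypothetical ranges and a fixed distance of 1 to keep it short:

```rust
use std::cmp;

fn main() {
    // Replacement ranges in the order the matches happen to arrive;
    // the second range ends before the first (hypothetical values).
    let ranges = [(0usize, 3usize), (0usize, 1usize)];
    let mut processed: Vec<Option<u8>> = Vec::new();

    for &(start, end) in &ranges {
        // Old behaviour was `processed.resize(end, None)`: on the second
        // iteration that truncates the vector from 3 down to 1, wiping the
        // distances already recorded at indexes 1 and 2.
        let new_len = cmp::max(end, processed.len());
        processed.resize(new_len, None);

        for index in start..end {
            // Keep the minimum distance seen for each query index.
            processed[index] = match processed[index] {
                Some(d) => Some(cmp::min(d, 1)),
                None => Some(1),
            };
        }
    }

    assert_eq!(processed.len(), 3); // with the old resize this would be 1
    println!("{:?}", processed);    // [Some(1), Some(1), Some(1)]
}
```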
@@ -81,7 +89,7 @@ impl Criterion for Typo {
         query_enhancer: &QueryEnhancer,
         automatons: &[QueryWordAutomaton],
     ) {
-        prepare_query_distances(documents, query_enhancer);
+        prepare_query_distances(documents, query_enhancer, automatons);
     }
 
     fn evaluate(
@@ -139,7 +147,7 @@ impl Criterion for Words {
         query_enhancer: &QueryEnhancer,
         automatons: &[QueryWordAutomaton],
     ) {
-        prepare_query_distances(documents, query_enhancer);
+        prepare_query_distances(documents, query_enhancer, automatons);
     }
 
     fn evaluate(