From 22b19c0d9316a93af04e274a015fade56943cd72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 7 Dec 2019 13:32:43 +0100 Subject: [PATCH] Fix the processed distance algorithm --- meilisearch-core/src/bucket_sort.rs | 24 ++++++++++++++++-------- meilisearch-core/src/criterion2.rs | 14 +++++++++++--- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 7477ff383..303e94e50 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -1,4 +1,5 @@ use std::ops::Deref; +use std::fmt; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::HashSet; @@ -145,7 +146,6 @@ pub struct RawDocument<'a, 'tag> { pub raw_matches: &'a mut [BareMatch<'tag>], pub processed_matches: Vec, /// The list of minimum `distance` found - /// where the `query_index` is the index pub processed_distances: Vec>, } @@ -157,6 +157,17 @@ pub struct BareMatch<'tag> { pub postings_list: Idx32<'tag>, } +impl fmt::Debug for BareMatch<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("BareMatch") + .field("document_id", &self.document_id) + .field("query_index", &self.query_index) + .field("distance", &self.distance) + .field("is_exact", &self.is_exact) + .finish() + } +} + // TODO remove that #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct SimpleMatch { @@ -238,14 +249,11 @@ fn fetch_matches<'txn, 'tag>( for (query_index, automaton) in automatons.iter().enumerate() { let before_dfa = Instant::now(); let dfa = automaton.dfa(); - let QueryWordAutomaton { query, is_exact, is_prefix, .. } = automaton; + let QueryWordAutomaton { query, is_exact, is_prefix, phrase_query } = automaton; dfa_time += before_dfa.elapsed(); let mut number_of_words = 0; - - let before_fst_search = Instant::now(); let mut stream = words.search(&dfa).into_stream(); - debug!("fst search took {:.02?}", before_fst_search.elapsed()); // while let Some(input) = stream.next() { loop { @@ -272,7 +280,7 @@ fn fetch_matches<'txn, 'tag>( let posting_list_index = arena.add(postings_list_view.range(offset, group.len())); let document_id = group[0].document_id; - let stuffed = BareMatch { + let bare_match = BareMatch { document_id, query_index: query_index as u16, distance, @@ -280,7 +288,7 @@ fn fetch_matches<'txn, 'tag>( postings_list: posting_list_index, }; - total_postings_lists.push(stuffed); + total_postings_lists.push(bare_match); offset += group.len(); } } @@ -434,7 +442,7 @@ fn construct_automatons2( } } - if n == 1 { + if false && n == 1 { if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? { let mut left_automaton = QueryWordAutomaton::exact(left); left_automaton.phrase_query = Some((0, 2)); diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs index dd7769261..4c40b9969 100644 --- a/meilisearch-core/src/criterion2.rs +++ b/meilisearch-core/src/criterion2.rs @@ -46,14 +46,22 @@ pub trait Criterion { fn prepare_query_distances( documents: &mut [RawDocument], query_enhancer: &QueryEnhancer, + automatons: &[QueryWordAutomaton], ) { for document in documents { if !document.processed_distances.is_empty() { continue } let mut processed = Vec::new(); for m in document.raw_matches.iter() { + // FIXME we really need to take splitted words into account + // those must be seen at the same level as the non-splitteds + // if automatons[m.query_index as usize].phrase_query.is_some() { + // continue + // } + let range = query_enhancer.replacement(m.query_index as u32); - processed.resize(range.end as usize, None); + let new_len = cmp::max(range.end as usize, processed.len()); + processed.resize(new_len, None); for index in range { let index = index as usize; @@ -81,7 +89,7 @@ impl Criterion for Typo { query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], ) { - prepare_query_distances(documents, query_enhancer); + prepare_query_distances(documents, query_enhancer, automatons); } fn evaluate( @@ -139,7 +147,7 @@ impl Criterion for Words { query_enhancer: &QueryEnhancer, automatons: &[QueryWordAutomaton], ) { - prepare_query_distances(documents, query_enhancer); + prepare_query_distances(documents, query_enhancer, automatons); } fn evaluate(