Fix the processed distance algorithm

commit 22b19c0d93
parent 0f698d6bd9
@@ -1,4 +1,5 @@
 use std::ops::Deref;
+use std::fmt;
 use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::collections::HashSet;
@@ -145,7 +146,6 @@ pub struct RawDocument<'a, 'tag> {
     pub raw_matches: &'a mut [BareMatch<'tag>],
     pub processed_matches: Vec<SimpleMatch>,
     /// The list of minimum `distance` found
-    /// where the `query_index` is the index
     pub processed_distances: Vec<Option<u8>>,
 }
 
@@ -157,6 +157,17 @@ pub struct BareMatch<'tag> {
     pub postings_list: Idx32<'tag>,
 }
 
+impl fmt::Debug for BareMatch<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("BareMatch")
+            .field("document_id", &self.document_id)
+            .field("query_index", &self.query_index)
+            .field("distance", &self.distance)
+            .field("is_exact", &self.is_exact)
+            .finish()
+    }
+}
+
 // TODO remove that
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub struct SimpleMatch {
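
SimpleMatch below keeps its derived Debug, while BareMatch gets a hand-written impl, presumably so the arena index in `postings_list` is left out of the output. A minimal, self-contained sketch of the same pattern (the `ArenaIndex` stand-in and the plain `u64` document id are hypothetical, not the real MeiliSearch types):

    use std::fmt;

    // Hypothetical stand-in for Idx32<'tag>, an index into the postings-list
    // arena; it is deliberately omitted from the Debug output below.
    struct ArenaIndex(u32);

    struct BareMatch {
        document_id: u64,
        query_index: u16,
        distance: u8,
        is_exact: bool,
        postings_list: ArenaIndex,
    }

    // Manual Debug impl: print only the fields that are meaningful to read,
    // skipping the arena index, as in the impl added by this commit.
    impl fmt::Debug for BareMatch {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            f.debug_struct("BareMatch")
                .field("document_id", &self.document_id)
                .field("query_index", &self.query_index)
                .field("distance", &self.distance)
                .field("is_exact", &self.is_exact)
                .finish()
        }
    }

    fn main() {
        let m = BareMatch {
            document_id: 42,
            query_index: 0,
            distance: 1,
            is_exact: false,
            postings_list: ArenaIndex(7),
        };
        // Prints: BareMatch { document_id: 42, query_index: 0, distance: 1, is_exact: false }
        println!("{:?}", m);
    }
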
@@ -238,14 +249,11 @@ fn fetch_matches<'txn, 'tag>(
     for (query_index, automaton) in automatons.iter().enumerate() {
         let before_dfa = Instant::now();
         let dfa = automaton.dfa();
-        let QueryWordAutomaton { query, is_exact, is_prefix, .. } = automaton;
+        let QueryWordAutomaton { query, is_exact, is_prefix, phrase_query } = automaton;
         dfa_time += before_dfa.elapsed();
 
         let mut number_of_words = 0;
 
-        let before_fst_search = Instant::now();
         let mut stream = words.search(&dfa).into_stream();
-        debug!("fst search took {:.02?}", before_fst_search.elapsed());
-
         // while let Some(input) = stream.next() {
         loop {
@@ -272,7 +280,7 @@ fn fetch_matches<'txn, 'tag>(
 
                 let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
                 let document_id = group[0].document_id;
-                let stuffed = BareMatch {
+                let bare_match = BareMatch {
                     document_id,
                     query_index: query_index as u16,
                     distance,
@@ -280,7 +288,7 @@ fn fetch_matches<'txn, 'tag>(
                     postings_list: posting_list_index,
                 };
 
-                total_postings_lists.push(stuffed);
+                total_postings_lists.push(bare_match);
                 offset += group.len();
             }
         }
@@ -434,7 +442,7 @@ fn construct_automatons2(
             }
         }
 
-        if n == 1 {
+        if false && n == 1 {
             if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
                 let mut left_automaton = QueryWordAutomaton::exact(left);
                 left_automaton.phrase_query = Some((0, 2));
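
The `if false && n == 1` change switches off the single-word split path (`split_best_frequency`) without deleting it: `&&` short-circuits, so the rest of the condition and the whole block never run, yet the code still compiles against the current signatures. A tiny sketch of that pattern, using a hypothetical helper in place of the real call:

    // Hypothetical stand-in for split_best_frequency; with the `false &&`
    // guard in front of it, it is never evaluated.
    fn expensive_check(n: usize) -> bool {
        println!("never printed");
        n == 1
    }

    fn main() {
        let n = 1;
        if false && expensive_check(n) {
            println!("disabled branch");
        }
        println!("done");
    }
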
@@ -46,14 +46,22 @@ pub trait Criterion {
 fn prepare_query_distances(
     documents: &mut [RawDocument],
     query_enhancer: &QueryEnhancer,
+    automatons: &[QueryWordAutomaton],
 ) {
     for document in documents {
         if !document.processed_distances.is_empty() { continue }
 
         let mut processed = Vec::new();
         for m in document.raw_matches.iter() {
+            // FIXME we really need to take splitted words into account
+            // those must be seen at the same level as the non-splitteds
+            // if automatons[m.query_index as usize].phrase_query.is_some() {
+            //     continue
+            // }
+
             let range = query_enhancer.replacement(m.query_index as u32);
-            processed.resize(range.end as usize, None);
+            let new_len = cmp::max(range.end as usize, processed.len());
+            processed.resize(new_len, None);
 
             for index in range {
                 let index = index as usize;
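
This hunk is the fix the commit title refers to. `Vec::resize` both grows and shrinks, so resizing `processed` straight to `range.end` could truncate distances that an earlier match had already stored at higher query indices; clamping the target length with `cmp::max` makes the resize grow-only. A standalone sketch of the difference, using plain vectors rather than the real RawDocument plumbing:

    use std::cmp;

    // Grow-only resize, as introduced by this commit: never truncate entries
    // that a previous match has already written.
    fn grow_to(processed: &mut Vec<Option<u8>>, end: usize) {
        let new_len = cmp::max(end, processed.len());
        processed.resize(new_len, None);
    }

    fn main() {
        let mut processed: Vec<Option<u8>> = Vec::new();

        // First match covers query indices 0..4 and records a distance at index 3.
        grow_to(&mut processed, 4);
        processed[3] = Some(1);

        // Second match only covers 0..2. With the old `processed.resize(range.end, None)`
        // the vector would shrink to length 2 and the distance at index 3 would be lost;
        // the grow-only version preserves it.
        grow_to(&mut processed, 2);

        assert_eq!(processed.len(), 4);
        assert_eq!(processed[3], Some(1));
        println!("{:?}", processed); // [None, None, None, Some(1)]
    }
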
@@ -81,7 +89,7 @@ impl Criterion for Typo {
         query_enhancer: &QueryEnhancer,
         automatons: &[QueryWordAutomaton],
     ) {
-        prepare_query_distances(documents, query_enhancer);
+        prepare_query_distances(documents, query_enhancer, automatons);
     }
 
     fn evaluate(
@@ -139,7 +147,7 @@ impl Criterion for Words {
         query_enhancer: &QueryEnhancer,
         automatons: &[QueryWordAutomaton],
     ) {
-        prepare_query_distances(documents, query_enhancer);
+        prepare_query_distances(documents, query_enhancer, automatons);
     }
 
     fn evaluate(