From 4e91b31b1f68e976779571139bb78949d928f32a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 6 Dec 2019 13:41:22 +0100 Subject: [PATCH] Make the Typo and Words work with synonyms --- .../src/automaton/query_enhancer.rs | 2 +- meilisearch-core/src/bucket_sort.rs | 72 ++++++---------- meilisearch-core/src/criterion2.rs | 82 ++++++++++++------- 3 files changed, 76 insertions(+), 80 deletions(-) diff --git a/meilisearch-core/src/automaton/query_enhancer.rs b/meilisearch-core/src/automaton/query_enhancer.rs index f564239d7..4b7582dd5 100644 --- a/meilisearch-core/src/automaton/query_enhancer.rs +++ b/meilisearch-core/src/automaton/query_enhancer.rs @@ -188,7 +188,7 @@ fn replacement( let n = real - range.start; let start = origins[origin]; - let end = origins[new_origin + 1]; + let end = origins.get(new_origin + 1)?; let remaining = (end - start) - n; Some(Range { diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 0fb1fed3b..8e4612c22 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -56,7 +56,11 @@ pub fn bucket_sort<'c>( let before_raw_documents_building = Instant::now(); let mut raw_documents = Vec::new(); for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) { - raw_documents.push(RawDocument { raw_matches, processed_matches: None }); + raw_documents.push(RawDocument { + raw_matches, + processed_matches: Vec::new(), + processed_distances: Vec::new(), + }); } debug!("creating {} candidates documents took {:.02?}", raw_documents.len(), @@ -134,7 +138,10 @@ pub fn bucket_sort<'c>( pub struct RawDocument<'a, 'tag> { pub raw_matches: &'a mut [BareMatch<'tag>], - pub processed_matches: Option>, + pub processed_matches: Vec, + /// The list of minimum `distance` found + /// where the `query_index` is the index + pub processed_distances: Vec>, } pub struct BareMatch<'tag> { @@ -226,7 +233,7 @@ fn fetch_matches<'txn, 'tag>( for 
(query_index, automaton) in automatons.iter().enumerate() { let before_dfa = Instant::now(); let dfa = automaton.dfa(); - let QueryWordAutomaton { index, query, is_exact, is_prefix } = automaton; + let QueryWordAutomaton { query, is_exact, is_prefix } = automaton; dfa_time += before_dfa.elapsed(); let mut number_of_words = 0; @@ -287,7 +294,6 @@ fn fetch_matches<'txn, 'tag>( #[derive(Debug)] pub struct QueryWordAutomaton { - index: usize, query: String, /// Is it a word that must be considered exact /// or is it some derived word (i.e. a synonym) @@ -296,16 +302,16 @@ pub struct QueryWordAutomaton { } impl QueryWordAutomaton { - pub fn exact(query: &str, index: usize) -> QueryWordAutomaton { - QueryWordAutomaton { index, query: query.to_string(), is_exact: true, is_prefix: false } + pub fn exact(query: &str) -> QueryWordAutomaton { + QueryWordAutomaton { query: query.to_string(), is_exact: true, is_prefix: false } } - pub fn exact_prefix(query: &str, index: usize) -> QueryWordAutomaton { - QueryWordAutomaton { index, query: query.to_string(), is_exact: true, is_prefix: true } + pub fn exact_prefix(query: &str) -> QueryWordAutomaton { + QueryWordAutomaton { query: query.to_string(), is_exact: true, is_prefix: true } } - pub fn non_exact(query: &str, index: usize) -> QueryWordAutomaton { - QueryWordAutomaton { index, query: query.to_string(), is_exact: false, is_prefix: false } + pub fn non_exact(query: &str) -> QueryWordAutomaton { + QueryWordAutomaton { query: query.to_string(), is_exact: false, is_prefix: false } } pub fn dfa(&self) -> DFA { @@ -317,27 +323,6 @@ impl QueryWordAutomaton { } } -// fn construct_automatons(query: &str) -> Vec { -// let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); -// let mut original_words = split_query_string(query).map(str::to_lowercase).peekable(); -// let mut automatons = Vec::new(); - -// while let Some(word) = original_words.next() { -// let has_following_word = original_words.peek().is_some(); 
-// let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); - -// let automaton = if not_prefix_dfa { -// QueryWordAutomaton::exact(word) -// } else { -// QueryWordAutomaton::exact_prefix(word) -// }; - -// automatons.push(automaton); -// } - -// automatons -// } - fn construct_automatons2( reader: &heed::RoTxn, query: &str, @@ -364,9 +349,9 @@ fn construct_automatons2( let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); let automaton = if not_prefix_dfa { - QueryWordAutomaton::exact(word, automaton_index) + QueryWordAutomaton::exact(word) } else { - QueryWordAutomaton::exact_prefix(word, automaton_index) + QueryWordAutomaton::exact_prefix(word) }; automaton_index += 1; automatons.push(automaton); @@ -413,9 +398,9 @@ fn construct_automatons2( for synonym in synonyms_words { let automaton = if nb_synonym_words == 1 { - QueryWordAutomaton::exact(synonym, automaton_index) + QueryWordAutomaton::exact(synonym) } else { - QueryWordAutomaton::non_exact(synonym, automaton_index) + QueryWordAutomaton::non_exact(synonym) }; automaton_index += 1; automatons.push(automaton); @@ -426,12 +411,12 @@ fn construct_automatons2( if n == 1 { if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? 
{ - let left_automaton = QueryWordAutomaton::exact(left, automaton_index); + let left_automaton = QueryWordAutomaton::exact(left); enhancer_builder.declare(query_range.clone(), automaton_index, &[left]); automaton_index += 1; automatons.push(left_automaton); - let right_automaton = QueryWordAutomaton::exact(right, automaton_index); + let right_automaton = QueryWordAutomaton::exact(right); enhancer_builder.declare(query_range.clone(), automaton_index, &[right]); automaton_index += 1; automatons.push(right_automaton); @@ -445,23 +430,12 @@ fn construct_automatons2( let real_query_index = automaton_index; enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]); - let automaton = QueryWordAutomaton::exact(&normalized, automaton_index); + let automaton = QueryWordAutomaton::exact(&normalized); automaton_index += 1; automatons.push(automaton); } } } - // // order automatons, the most important first, - // // we keep the original automatons at the front. - // automatons[1..].sort_by_key(|group| { - // let a = group.automatons.first().unwrap(); - // ( - // Reverse(a.is_exact), - // a.ngram, - // Reverse(group.automatons.len()), - // ) - // }); - Ok((automatons, enhancer_builder.build())) } diff --git a/meilisearch-core/src/criterion2.rs b/meilisearch-core/src/criterion2.rs index 469f936fa..4adb69dea 100644 --- a/meilisearch-core/src/criterion2.rs +++ b/meilisearch-core/src/criterion2.rs @@ -41,6 +41,32 @@ pub trait Criterion { } } +fn prepare_query_distances( + documents: &mut [RawDocument], + query_enhancer: &QueryEnhancer, +) { + for document in documents { + if !document.processed_distances.is_empty() { continue } + + let mut processed = Vec::new(); + for m in document.raw_matches.iter() { + let range = query_enhancer.replacement(m.query_index as u32); + processed.resize(processed.len().max(range.end as usize), None); // grow only: a smaller range.end must not truncate distances already recorded + + for index in range { + let index = index as usize; + processed[index] = match processed[index] { + Some(distance) if distance > m.distance => 
Some(m.distance), + Some(distance) => Some(distance), + None => Some(m.distance), + }; + } + } + + document.processed_distances = processed; + } +} + pub struct Typo; impl Criterion for Typo { @@ -52,9 +78,7 @@ impl Criterion for Typo { postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, ) { - for document in documents { - document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, bm.distance)); - } + prepare_query_distances(documents, query_enhancer); } fn evaluate( @@ -79,20 +103,22 @@ impl Criterion for Typo { } #[inline] - fn compute_typos(matches: &[BareMatch]) -> usize { + fn compute_typos(distances: &[Option]) -> usize { let mut number_words: usize = 0; let mut sum_typos = 0.0; - for group in matches.linear_group_by_key(|bm| bm.query_index) { - sum_typos += custom_log10(group[0].distance); - number_words += 1; + for distance in distances { + if let Some(distance) = distance { + sum_typos += custom_log10(*distance); + number_words += 1; + } } (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize } - let lhs = compute_typos(&lhs.raw_matches); - let rhs = compute_typos(&rhs.raw_matches); + let lhs = compute_typos(&lhs.processed_distances); + let rhs = compute_typos(&rhs.processed_distances); lhs.cmp(&rhs).reverse() } @@ -109,9 +135,7 @@ impl Criterion for Words { postings_lists: &mut PostingsListsArena, query_enhancer: &QueryEnhancer, ) { - for document in documents { - document.raw_matches.sort_unstable_by_key(|bm| bm.query_index); - } + prepare_query_distances(documents, query_enhancer); } fn evaluate( @@ -122,28 +146,26 @@ impl Criterion for Words { ) -> Ordering { #[inline] - fn number_of_query_words(matches: &[BareMatch]) -> usize { - matches.linear_group_by_key(|bm| bm.query_index).count() + fn number_of_query_words(distances: &[Option]) -> usize { + distances.iter().cloned().filter(Option::is_some).count() } - let lhs = number_of_query_words(&lhs.raw_matches); - let rhs = number_of_query_words(&rhs.raw_matches); + let lhs 
= number_of_query_words(&lhs.processed_distances); + let rhs = number_of_query_words(&rhs.processed_distances); lhs.cmp(&rhs).reverse() } } -fn process_raw_matches<'a, 'tag, 'txn>( +fn prepare_raw_matches<'a, 'tag, 'txn>( documents: &mut [RawDocument<'a, 'tag>], postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, ) { for document in documents { - if document.processed_matches.is_some() { continue } + if !document.processed_matches.is_empty() { continue } let mut processed = Vec::new(); - let document_id = document.raw_matches[0].document_id; - for m in document.raw_matches.iter() { let postings_list = &postings_lists[m.postings_list]; processed.reserve(postings_list.len()); @@ -160,7 +182,7 @@ fn process_raw_matches<'a, 'tag, 'txn>( } let processed = multiword_rewrite_matches(&mut processed, query_enhancer); - document.processed_matches = Some(processed.into_vec()); + document.processed_matches = processed.into_vec(); } } @@ -175,7 +197,7 @@ impl Criterion for Proximity { postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, ) { - process_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer); } fn evaluate<'a, 'tag, 'txn>( @@ -225,8 +247,8 @@ impl Criterion for Proximity { proximity } - let lhs = matches_proximity(&lhs.processed_matches.as_ref().unwrap()); - let rhs = matches_proximity(&rhs.processed_matches.as_ref().unwrap()); + let lhs = matches_proximity(&lhs.processed_matches); + let rhs = matches_proximity(&rhs.processed_matches); lhs.cmp(&rhs) } @@ -243,7 +265,7 @@ impl Criterion for Attribute { postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, ) { - process_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer); } fn evaluate<'a, 'tag, 'txn>( @@ -262,8 +284,8 @@ impl Criterion for Attribute { sum_attribute } - let lhs = 
sum_attribute(&lhs.processed_matches.as_ref().unwrap()); - let rhs = sum_attribute(&rhs.processed_matches.as_ref().unwrap()); + let lhs = sum_attribute(&lhs.processed_matches); + let rhs = sum_attribute(&rhs.processed_matches); lhs.cmp(&rhs) } @@ -280,7 +302,7 @@ impl Criterion for WordsPosition { postings_lists: &mut PostingsListsArena<'tag, 'txn>, query_enhancer: &QueryEnhancer, ) { - process_raw_matches(documents, postings_lists, query_enhancer); + prepare_raw_matches(documents, postings_lists, query_enhancer); } fn evaluate<'a, 'tag, 'txn>( @@ -299,8 +321,8 @@ impl Criterion for WordsPosition { sum_words_position } - let lhs = sum_words_position(&lhs.processed_matches.as_ref().unwrap()); - let rhs = sum_words_position(&rhs.processed_matches.as_ref().unwrap()); + let lhs = sum_words_position(&lhs.processed_matches); + let rhs = sum_words_position(&rhs.processed_matches); lhs.cmp(&rhs) }