Speed-up the MatchingWords highlighting struct

This commit is contained in:
Kerollmops 2021-02-24 17:44:35 +01:00
parent 4510bbccca
commit 5af63c74e0
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
5 changed files with 91 additions and 93 deletions

View File

@ -32,7 +32,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use milli::facet::FacetValue; use milli::facet::FacetValue;
use milli::update::UpdateIndexingStep::*; use milli::update::UpdateIndexingStep::*;
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; use milli::{obkv_to_json, Index, UpdateStore, SearchResult, MatchingWords, FacetCondition};
static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new(); static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
@ -132,7 +132,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
Self { analyzer } Self { analyzer }
} }
fn highlight_value(&self, value: Value, words_to_highlight: &HashSet<String>) -> Value { fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value {
match value { match value {
Value::Null => Value::Null, Value::Null => Value::Null,
Value::Bool(boolean) => Value::Bool(boolean), Value::Bool(boolean) => Value::Bool(boolean),
@ -142,7 +142,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
let analyzed = self.analyzer.analyze(&old_string); let analyzed = self.analyzer.analyze(&old_string);
for (word, token) in analyzed.reconstruct() { for (word, token) in analyzed.reconstruct() {
if token.is_word() { if token.is_word() {
let to_highlight = words_to_highlight.contains(token.text()); let to_highlight = matching_words.matches(token.text());
if to_highlight { string.push_str("<mark>") } if to_highlight { string.push_str("<mark>") }
string.push_str(word); string.push_str(word);
if to_highlight { string.push_str("</mark>") } if to_highlight { string.push_str("</mark>") }
@ -154,12 +154,12 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
}, },
Value::Array(values) => { Value::Array(values) => {
Value::Array(values.into_iter() Value::Array(values.into_iter()
.map(|v| self.highlight_value(v, words_to_highlight)) .map(|v| self.highlight_value(v, matching_words))
.collect()) .collect())
}, },
Value::Object(object) => { Value::Object(object) => {
Value::Object(object.into_iter() Value::Object(object.into_iter()
.map(|(k, v)| (k, self.highlight_value(v, words_to_highlight))) .map(|(k, v)| (k, self.highlight_value(v, matching_words)))
.collect()) .collect())
}, },
} }
@ -168,14 +168,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
fn highlight_record( fn highlight_record(
&self, &self,
object: &mut Map<String, Value>, object: &mut Map<String, Value>,
words_to_highlight: &HashSet<String>, matching_words: &MatchingWords,
attributes_to_highlight: &HashSet<String>, attributes_to_highlight: &HashSet<String>,
) { ) {
// TODO do we need to create a string for element that are not and needs to be highlight? // TODO do we need to create a string for element that are not and needs to be highlight?
for (key, value) in object.iter_mut() { for (key, value) in object.iter_mut() {
if attributes_to_highlight.contains(key) { if attributes_to_highlight.contains(key) {
let old_value = mem::take(value); let old_value = mem::take(value);
*value = self.highlight_value(old_value, words_to_highlight); *value = self.highlight_value(old_value, matching_words);
} }
} }
} }
@ -722,7 +722,7 @@ async fn main() -> anyhow::Result<()> {
search.facet_condition(condition); search.facet_condition(condition);
} }
let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap(); let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap();
let number_of_candidates = candidates.len(); let number_of_candidates = candidates.len();
let facets = if query.facet_distribution == Some(true) { let facets = if query.facet_distribution == Some(true) {
@ -748,7 +748,7 @@ async fn main() -> anyhow::Result<()> {
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
if !disable_highlighting { if !disable_highlighting {
highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight); highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight);
} }
documents.push(object); documents.push(object);

View File

@ -28,7 +28,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
pub use self::index::Index; pub use self::index::Index;
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult}; pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords};
pub use self::update_store::UpdateStore; pub use self::update_store::UpdateStore;
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;

View File

@ -1,8 +1,8 @@
use std::{borrow::Cow, collections::HashMap, mem::take}; use std::{borrow::Cow, collections::HashMap, mem::take};
use anyhow::bail; use anyhow::bail;
use roaring::RoaringBitmap;
use log::debug; use log::debug;
use roaring::RoaringBitmap;
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
use crate::search::word_derivations; use crate::search::word_derivations;

View File

@ -1,10 +1,9 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::HashSet;
use std::fmt; use std::fmt;
use std::time::Instant; use std::time::Instant;
use fst::{IntoStreamer, Streamer, Set}; use fst::{IntoStreamer, Streamer, Set};
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
use log::debug; use log::debug;
use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
@ -14,8 +13,9 @@ use crate::search::criteria::{Criterion, CriterionResult};
use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity}; use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity};
use crate::{Index, DocumentId}; use crate::{Index, DocumentId};
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
pub use self::facet::FacetIter; pub use self::facet::FacetIter;
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
pub use self::query_tree::MatchingWords;
use self::query_tree::QueryTreeBuilder; use self::query_tree::QueryTreeBuilder;
// Building these factories is not free. // Building these factories is not free.
@ -87,6 +87,11 @@ impl<'a> Search<'a> {
debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed());
let matching_words = match query_tree.as_ref() {
Some(query_tree) => MatchingWords::from_query_tree(&query_tree),
None => MatchingWords::default(),
};
// We are testing the typo criteria but there will be more of them soon. // We are testing the typo criteria but there will be more of them soon.
let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?;
let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?;
@ -128,8 +133,7 @@ impl<'a> Search<'a> {
if limit == 0 { break } if limit == 0 { break }
} }
let found_words = HashSet::new(); Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids })
Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids })
} }
} }
@ -147,26 +151,21 @@ impl fmt::Debug for Search<'_> {
#[derive(Default)] #[derive(Default)]
pub struct SearchResult { pub struct SearchResult {
pub found_words: HashSet<String>, pub matching_words: MatchingWords,
pub candidates: RoaringBitmap, pub candidates: RoaringBitmap,
// TODO those documents ids should be associated with their criteria scores. // TODO those documents ids should be associated with their criteria scores.
pub documents_ids: Vec<DocumentId>, pub documents_ids: Vec<DocumentId>,
} }
pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set<Cow<[u8]>>) -> anyhow::Result<Vec<(String, u8)>> { pub fn word_derivations(
let lev = match max_typo { word: &str,
0 => &LEVDIST0, is_prefix: bool,
1 => &LEVDIST1, max_typo: u8,
_ => &LEVDIST2, fst: &fst::Set<Cow<[u8]>>,
}; ) -> anyhow::Result<Vec<(String, u8)>>
{
let dfa = if is_prefix {
lev.build_prefix_dfa(&word)
} else {
lev.build_dfa(&word)
};
let mut derived_words = Vec::new(); let mut derived_words = Vec::new();
let dfa = build_dfa(word, max_typo, is_prefix);
let mut stream = fst.search_with_state(&dfa).into_stream(); let mut stream = fst.search_with_state(&dfa).into_stream();
while let Some((word, state)) = stream.next() { while let Some((word, state)) = stream.next() {
@ -177,3 +176,17 @@ pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Se
Ok(derived_words) Ok(derived_words)
} }
/// Builds the Levenshtein automaton used to match `word`.
///
/// `typos` selects which of the shared `LEVDIST0` / `LEVDIST1` / `LEVDIST2`
/// builders is used; any value of 2 or more falls back to `LEVDIST2`.
/// When `is_prefix` is true the prefix variant of the DFA is built,
/// otherwise the regular one.
pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
    // The builders are shared statics: pick one instead of creating a new one.
    let builder = match typos {
        0 => &LEVDIST0,
        1 => &LEVDIST1,
        _ => &LEVDIST2,
    };

    if !is_prefix {
        return builder.build_dfa(word);
    }

    builder.build_prefix_dfa(word)
}

View File

@ -1,12 +1,13 @@
use std::borrow::Cow; use std::collections::HashSet;
use std::collections::BTreeMap;
use std::{fmt, cmp, mem}; use std::{fmt, cmp, mem};
use levenshtein_automata::{DFA, Distance};
use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::Index; use crate::Index;
use super::build_dfa;
type IsOptionalWord = bool; type IsOptionalWord = bool;
type IsPrefix = bool; type IsPrefix = bool;
@ -113,6 +114,14 @@ impl QueryKind {
QueryKind::Tolerant { typo, word } QueryKind::Tolerant { typo, word }
} }
pub fn is_tolerant(&self) -> bool {
matches!(self, QueryKind::Tolerant { .. })
}
pub fn is_exact(&self) -> bool {
matches!(self, QueryKind::Exact { .. })
}
pub fn typo(&self) -> u8 { pub fn typo(&self) -> u8 {
match self { match self {
QueryKind::Tolerant { typo, .. } => *typo, QueryKind::Tolerant { typo, .. } => *typo,
@ -275,69 +284,45 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operat
} }
/// The query tree builder is the interface to build a query tree. /// The query tree builder is the interface to build a query tree.
#[derive(Default)]
pub struct MatchingWords { pub struct MatchingWords {
inner: BTreeMap<String, IsPrefix> dfas: Vec<(DFA, u8)>,
} }
impl MatchingWords { impl MatchingWords {
/// List all words which can be considered as a match for the query tree. /// List all words which can be considered as a match for the query tree.
pub fn from_query_tree(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> Self { pub fn from_query_tree(tree: &Operation) -> Self {
Self { inner: fetch_words(tree, fst).into_iter().collect() } Self {
dfas: fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t)).collect()
}
} }
/// Return true if the word match. /// Return true if the word match.
pub fn is_match(&self, word: &str) -> bool { pub fn matches(&self, word: &str) -> bool {
fn first_char(s: &str) -> Option<&str> { self.dfas.iter().any(|(dfa, typo)| match dfa.eval(word) {
s.chars().next().map(|c| &s[..c.len_utf8()]) Distance::Exact(t) => t <= *typo,
} Distance::AtLeast(_) => false,
})
match first_char(word) {
Some(first) => {
let left = first.to_owned();
let right = word.to_owned();
self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word)
},
None => false
} }
} }
}
type FetchedWords = Vec<(String, IsPrefix)>;
/// Lists all words which can be considered as a match for the query tree. /// Lists all words which can be considered as a match for the query tree.
fn fetch_words(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords { fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
fn resolve_branch(tree: &[Operation], fst: &fst::Set<Cow<[u8]>>) -> FetchedWords { fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect()
}
fn resolve_query(query: &Query, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
match query.kind.clone() {
QueryKind::Exact { word, .. } => vec![(word, query.prefix)],
QueryKind::Tolerant { typo, word } => {
if let Ok(words) = super::word_derivations(&word, query.prefix, typo, fst) {
words.into_iter().map(|(w, _)| (w, query.prefix)).collect()
} else {
vec![(word, query.prefix)]
}
}
}
}
fn resolve_ops(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
match tree { match tree {
Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
resolve_branch(ops.as_slice(), fst) ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
}, },
Operation::Query(ops) => { Operation::Query(Query { prefix, kind }) => {
resolve_query(ops, fst) let typo = if kind.is_exact() { 0 } else { kind.typo() };
out.insert((kind.word(), typo, *prefix));
}, },
} }
} }
let mut words = resolve_ops(tree, fst); let mut queries = HashSet::new();
words.sort_unstable(); resolve_ops(tree, &mut queries);
words.dedup(); queries
words
} }
/// Main function that creates the final query tree from the primitive query. /// Main function that creates the final query tree from the primitive query.
@ -559,7 +544,7 @@ mod test {
use std::collections::HashMap; use std::collections::HashMap;
use fst::Set; use fst::Set;
use maplit::hashmap; use maplit::{hashmap, hashset};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use rand::{Rng, SeedableRng, rngs::StdRng}; use rand::{Rng, SeedableRng, rngs::StdRng};
@ -970,26 +955,26 @@ mod test {
let context = TestContext::default(); let context = TestContext::default();
let query_tree = context.build(false, true, tokens).unwrap().unwrap(); let query_tree = context.build(false, true, tokens).unwrap().unwrap();
let expected = vec![ let expected = hashset!{
("city".to_string(), false), ("word", 0, false),
("earth".to_string(), false), ("nyc", 0, false),
("nature".to_string(), false), ("wordsplit", 2, false),
("new".to_string(), false), ("wordsplitnycworld", 2, true),
("nyc".to_string(), false), ("nature", 0, false),
("split".to_string(), false), ("new", 0, false),
("word".to_string(), false), ("city", 0, false),
("word".to_string(), true), ("world", 1, true),
("world".to_string(), true), ("york", 0, false),
("york".to_string(), false), ("split", 0, false),
("nycworld", 1, true),
]; ("earth", 0, false),
("wordsplitnyc", 2, false),
};
let mut keys = context.postings.keys().collect::<Vec<_>>(); let mut keys = context.postings.keys().collect::<Vec<_>>();
keys.sort_unstable(); keys.sort_unstable();
let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap();
let words = fetch_words(&query_tree, &set);
let words = fetch_queries(&query_tree);
assert_eq!(expected, words); assert_eq!(expected, words);
} }
} }