mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
Speed-up the MatchingWords highlighting struct
This commit is contained in:
parent
4510bbccca
commit
5af63c74e0
5 changed files with 91 additions and 93 deletions
|
@ -28,7 +28,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
|
|||
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
|
||||
pub use self::index::Index;
|
||||
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
|
||||
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords};
|
||||
pub use self::update_store::UpdateStore;
|
||||
|
||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
use std::{borrow::Cow, collections::HashMap, mem::take};
|
||||
|
||||
use anyhow::bail;
|
||||
use roaring::RoaringBitmap;
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
|
||||
use crate::search::word_derivations;
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
use std::borrow::Cow;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::time::Instant;
|
||||
|
||||
use fst::{IntoStreamer, Streamer, Set};
|
||||
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
|
||||
use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
|
||||
use log::debug;
|
||||
use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
|
||||
use once_cell::sync::Lazy;
|
||||
|
@ -14,8 +13,9 @@ use crate::search::criteria::{Criterion, CriterionResult};
|
|||
use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity};
|
||||
use crate::{Index, DocumentId};
|
||||
|
||||
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
|
||||
pub use self::facet::FacetIter;
|
||||
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
|
||||
pub use self::query_tree::MatchingWords;
|
||||
use self::query_tree::QueryTreeBuilder;
|
||||
|
||||
// Building these factories is not free.
|
||||
|
@ -87,6 +87,11 @@ impl<'a> Search<'a> {
|
|||
|
||||
debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed());
|
||||
|
||||
let matching_words = match query_tree.as_ref() {
|
||||
Some(query_tree) => MatchingWords::from_query_tree(&query_tree),
|
||||
None => MatchingWords::default(),
|
||||
};
|
||||
|
||||
// We are testing the typo criteria but there will be more of them soon.
|
||||
let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?;
|
||||
let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?;
|
||||
|
@ -128,8 +133,7 @@ impl<'a> Search<'a> {
|
|||
if limit == 0 { break }
|
||||
}
|
||||
|
||||
let found_words = HashSet::new();
|
||||
Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids })
|
||||
Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids })
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -147,26 +151,21 @@ impl fmt::Debug for Search<'_> {
|
|||
|
||||
#[derive(Default)]
|
||||
pub struct SearchResult {
|
||||
pub found_words: HashSet<String>,
|
||||
pub matching_words: MatchingWords,
|
||||
pub candidates: RoaringBitmap,
|
||||
// TODO those documents ids should be associated with their criteria scores.
|
||||
pub documents_ids: Vec<DocumentId>,
|
||||
}
|
||||
|
||||
pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set<Cow<[u8]>>) -> anyhow::Result<Vec<(String, u8)>> {
|
||||
let lev = match max_typo {
|
||||
0 => &LEVDIST0,
|
||||
1 => &LEVDIST1,
|
||||
_ => &LEVDIST2,
|
||||
};
|
||||
|
||||
let dfa = if is_prefix {
|
||||
lev.build_prefix_dfa(&word)
|
||||
} else {
|
||||
lev.build_dfa(&word)
|
||||
};
|
||||
|
||||
pub fn word_derivations(
|
||||
word: &str,
|
||||
is_prefix: bool,
|
||||
max_typo: u8,
|
||||
fst: &fst::Set<Cow<[u8]>>,
|
||||
) -> anyhow::Result<Vec<(String, u8)>>
|
||||
{
|
||||
let mut derived_words = Vec::new();
|
||||
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||
let mut stream = fst.search_with_state(&dfa).into_stream();
|
||||
|
||||
while let Some((word, state)) = stream.next() {
|
||||
|
@ -177,3 +176,17 @@ pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Se
|
|||
|
||||
Ok(derived_words)
|
||||
}
|
||||
|
||||
pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
|
||||
let lev = match typos {
|
||||
0 => &LEVDIST0,
|
||||
1 => &LEVDIST1,
|
||||
_ => &LEVDIST2,
|
||||
};
|
||||
|
||||
if is_prefix {
|
||||
lev.build_prefix_dfa(word)
|
||||
} else {
|
||||
lev.build_dfa(word)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
use std::borrow::Cow;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashSet;
|
||||
use std::{fmt, cmp, mem};
|
||||
|
||||
use levenshtein_automata::{DFA, Distance};
|
||||
use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
|
||||
use roaring::RoaringBitmap;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use crate::Index;
|
||||
use super::build_dfa;
|
||||
|
||||
type IsOptionalWord = bool;
|
||||
type IsPrefix = bool;
|
||||
|
@ -113,6 +114,14 @@ impl QueryKind {
|
|||
QueryKind::Tolerant { typo, word }
|
||||
}
|
||||
|
||||
pub fn is_tolerant(&self) -> bool {
|
||||
matches!(self, QueryKind::Tolerant { .. })
|
||||
}
|
||||
|
||||
pub fn is_exact(&self) -> bool {
|
||||
matches!(self, QueryKind::Exact { .. })
|
||||
}
|
||||
|
||||
pub fn typo(&self) -> u8 {
|
||||
match self {
|
||||
QueryKind::Tolerant { typo, .. } => *typo,
|
||||
|
@ -275,69 +284,45 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operat
|
|||
}
|
||||
|
||||
/// The matcher used to tell whether a word can be considered a match for a query tree.
|
||||
#[derive(Default)]
|
||||
pub struct MatchingWords {
|
||||
inner: BTreeMap<String, IsPrefix>
|
||||
dfas: Vec<(DFA, u8)>,
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
/// List all words which can be considered as a match for the query tree.
|
||||
pub fn from_query_tree(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> Self {
|
||||
Self { inner: fetch_words(tree, fst).into_iter().collect() }
|
||||
pub fn from_query_tree(tree: &Operation) -> Self {
|
||||
Self {
|
||||
dfas: fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t)).collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the word matches.
|
||||
pub fn is_match(&self, word: &str) -> bool {
|
||||
fn first_char(s: &str) -> Option<&str> {
|
||||
s.chars().next().map(|c| &s[..c.len_utf8()])
|
||||
}
|
||||
|
||||
match first_char(word) {
|
||||
Some(first) => {
|
||||
let left = first.to_owned();
|
||||
let right = word.to_owned();
|
||||
self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word)
|
||||
},
|
||||
None => false
|
||||
}
|
||||
pub fn matches(&self, word: &str) -> bool {
|
||||
self.dfas.iter().any(|(dfa, typo)| match dfa.eval(word) {
|
||||
Distance::Exact(t) => t <= *typo,
|
||||
Distance::AtLeast(_) => false,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
type FetchedWords = Vec<(String, IsPrefix)>;
|
||||
|
||||
/// Lists all words which can be considered as a match for the query tree.
|
||||
fn fetch_words(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
|
||||
fn resolve_branch(tree: &[Operation], fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
|
||||
tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect()
|
||||
}
|
||||
|
||||
fn resolve_query(query: &Query, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
|
||||
match query.kind.clone() {
|
||||
QueryKind::Exact { word, .. } => vec![(word, query.prefix)],
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
if let Ok(words) = super::word_derivations(&word, query.prefix, typo, fst) {
|
||||
words.into_iter().map(|(w, _)| (w, query.prefix)).collect()
|
||||
} else {
|
||||
vec![(word, query.prefix)]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_ops(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
|
||||
fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
|
||||
fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
|
||||
match tree {
|
||||
Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
|
||||
resolve_branch(ops.as_slice(), fst)
|
||||
ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
|
||||
},
|
||||
Operation::Query(ops) => {
|
||||
resolve_query(ops, fst)
|
||||
Operation::Query(Query { prefix, kind }) => {
|
||||
let typo = if kind.is_exact() { 0 } else { kind.typo() };
|
||||
out.insert((kind.word(), typo, *prefix));
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let mut words = resolve_ops(tree, fst);
|
||||
words.sort_unstable();
|
||||
words.dedup();
|
||||
words
|
||||
let mut queries = HashSet::new();
|
||||
resolve_ops(tree, &mut queries);
|
||||
queries
|
||||
}
|
||||
|
||||
/// Main function that creates the final query tree from the primitive query.
|
||||
|
@ -559,7 +544,7 @@ mod test {
|
|||
use std::collections::HashMap;
|
||||
|
||||
use fst::Set;
|
||||
use maplit::hashmap;
|
||||
use maplit::{hashmap, hashset};
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
|
||||
use rand::{Rng, SeedableRng, rngs::StdRng};
|
||||
|
||||
|
@ -970,26 +955,26 @@ mod test {
|
|||
let context = TestContext::default();
|
||||
let query_tree = context.build(false, true, tokens).unwrap().unwrap();
|
||||
|
||||
let expected = vec![
|
||||
("city".to_string(), false),
|
||||
("earth".to_string(), false),
|
||||
("nature".to_string(), false),
|
||||
("new".to_string(), false),
|
||||
("nyc".to_string(), false),
|
||||
("split".to_string(), false),
|
||||
("word".to_string(), false),
|
||||
("word".to_string(), true),
|
||||
("world".to_string(), true),
|
||||
("york".to_string(), false),
|
||||
|
||||
];
|
||||
let expected = hashset!{
|
||||
("word", 0, false),
|
||||
("nyc", 0, false),
|
||||
("wordsplit", 2, false),
|
||||
("wordsplitnycworld", 2, true),
|
||||
("nature", 0, false),
|
||||
("new", 0, false),
|
||||
("city", 0, false),
|
||||
("world", 1, true),
|
||||
("york", 0, false),
|
||||
("split", 0, false),
|
||||
("nycworld", 1, true),
|
||||
("earth", 0, false),
|
||||
("wordsplitnyc", 2, false),
|
||||
};
|
||||
|
||||
let mut keys = context.postings.keys().collect::<Vec<_>>();
|
||||
keys.sort_unstable();
|
||||
let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap();
|
||||
|
||||
let words = fetch_words(&query_tree, &set);
|
||||
|
||||
let words = fetch_queries(&query_tree);
|
||||
assert_eq!(expected, words);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue