Integrate the new Highlighter in the search

ManyTheFish 2023-04-06 13:58:56 +02:00
parent ebe23b04c9
commit 9c5f64769a
6 changed files with 77 additions and 41 deletions

View File

@@ -97,8 +97,8 @@ pub use self::heed_codec::{
 };
 pub use self::index::Index;
 pub use self::search::{
-    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord,
-    MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
+    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search,
+    SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };
 
 pub type Result<T> = std::result::Result<T, error::Error>;
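
The crate root no longer exports the per-word MatchingWord type, only the aggregate MatchingWords that the new matcher builds internally. For a hypothetical downstream crate, the import change would look roughly like this (caller code, not part of this commit):

    // Before: the old matcher also exposed the per-word type.
    // use milli::{MatcherBuilder, MatchingWord, MatchingWords};
    // After: only the names re-exported above remain available.
    use milli::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search, SearchResult};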

View File

@@ -289,8 +289,7 @@ mod tests {
     use charabia::TokenKind;
 
-    use super::*;
-    use crate::MatchingWords;
+    use super::{MatchingWords, *};
 
     #[test]
     fn test_bytes_to_highlight() {

View File

@@ -5,9 +5,7 @@ use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;
 
 pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
-pub use self::matches::{
-    FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
-};
+pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
 use crate::{
     execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
 };
@@ -109,9 +107,9 @@ impl<'a> Search<'a> {
     }
 
     pub fn execute(&self) -> Result<SearchResult> {
-        let mut ctx = SearchContext::new(self.index, self.rtxn);
+        let ctx = SearchContext::new(self.index, self.rtxn);
         execute_search(
-            &mut ctx,
+            ctx,
             &self.query,
             self.terms_matching_strategy,
             self.exhaustive_number_hits,
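
Search::execute now hands the SearchContext to execute_search by value instead of lending it out with &mut: the context must be consumable at the end of the search so MatchingWords::new can take it over (see the last file in this commit). A toy sketch of that move-then-consume pattern, with stand-in types rather than the real milli ones:

    // `Ctx` and `Words` stand in for SearchContext and MatchingWords.
    struct Ctx { /* interners, caches, ... */ }
    struct Words;

    impl Words {
        // Consumes the context, so the result no longer borrows the search.
        fn new(_ctx: Ctx) -> Self {
            Words
        }
    }

    fn execute(mut ctx: Ctx) -> Words {
        // The search phases borrow the context mutably...
        let _phases = &mut ctx;
        // ...and only once they are done is it moved into the result.
        Words::new(ctx)
    }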

View File

@@ -1,4 +1,5 @@
 use std::cmp::Reverse;
+use std::fmt;
 use std::ops::RangeInclusive;
 
 use charabia::Token;
@@ -23,6 +24,7 @@ pub struct LocatedMatchingWords {
 
 /// Structure created from a query tree
 /// referencing words that match the given query tree.
+#[derive(Default)]
 pub struct MatchingWords {
     word_interner: DedupInterner<String>,
     phrase_interner: DedupInterner<Phrase>,
@@ -240,6 +242,40 @@ impl<'a> PartialMatch<'a> {
     }
 }
 
+impl fmt::Debug for MatchingWords {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
+
+        let phrases: Vec<_> = phrases
+            .iter()
+            .map(|p| {
+                (
+                    phrase_interner
+                        .get(p.value)
+                        .words
+                        .iter()
+                        .map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
+                        .collect::<Vec<_>>()
+                        .join(" "),
+                    p.positions.clone(),
+                )
+            })
+            .collect();
+
+        let words: Vec<_> = words
+            .iter()
+            .flat_map(|w| {
+                w.value
+                    .iter()
+                    .map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
+                    .collect::<Vec<_>>()
+            })
+            .collect();
+
+        f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
+    }
+}
+
 #[cfg(test)]
 pub(crate) mod tests {
     use std::borrow::Cow;
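
A derived Debug would print raw interner ids, which is unreadable when inspecting test failures, so the new manual impl above resolves every id back to its string first. A toy version of the same idea, with a stand-in table instead of milli's DedupInterner:

    use std::fmt;

    // Stand-in interner: an id is an index into a string table.
    struct Interner(Vec<String>);

    impl Interner {
        fn get(&self, id: usize) -> &str {
            &self.0[id]
        }
    }

    struct Words {
        interner: Interner,
        ids: Vec<usize>,
    }

    impl fmt::Debug for Words {
        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
            // Resolve ids to strings so the output reads `words: ["the", "word"]`
            // instead of `ids: [0, 1]`.
            let words: Vec<&str> = self.ids.iter().map(|&id| self.interner.get(id)).collect();
            f.debug_struct("Words").field("words", &words).finish()
        }
    }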

View File

@@ -1,7 +1,8 @@
 use std::borrow::Cow;
 
 use charabia::{SeparatorKind, Token, Tokenizer};
-use matching_words::{MatchType, MatchingWords, PartialMatch, WordId};
+pub use matching_words::MatchingWords;
+use matching_words::{MatchType, PartialMatch, WordId};
 use serde::Serialize;
 
 use super::query_term::LocatedQueryTerm;
@@ -23,12 +24,7 @@ pub struct MatcherBuilder<'a, A> {
 }
 
 impl<'a, A> MatcherBuilder<'a, A> {
-    pub fn new(
-        ctx: SearchContext,
-        located_terms: Vec<LocatedQueryTerm>,
-        tokenizer: Tokenizer<'a, 'a, A>,
-    ) -> Self {
-        let matching_words = MatchingWords::new(ctx, located_terms);
+    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
         Self {
             matching_words,
             tokenizer,
@@ -514,7 +510,8 @@ mod tests {
         let tokenizer = TokenizerBuilder::new().build();
         let tokens = tokenizer.tokenize(query);
         let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
 
-        Self::new(ctx, query_terms, TokenizerBuilder::new().build())
+        let matching_words = MatchingWords::new(ctx, query_terms);
+        Self::new(matching_words, TokenizerBuilder::new().build())
     }
 }
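
Construction is now two explicit steps: build the MatchingWords (consuming the SearchContext and the located query terms), then hand the result to the builder. A sketch of the new call shape, using only the signatures visible in this diff; it assumes a ctx and query_terms already in scope, so it is not compilable on its own:

    // Step 1: consume the SearchContext and the located query terms.
    let matching_words = MatchingWords::new(ctx, query_terms);
    // Step 2: the builder only needs the finished MatchingWords and a tokenizer.
    let builder = MatcherBuilder::new(matching_words, TokenizerBuilder::new().build());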

View File

@@ -4,7 +4,7 @@ mod graph_based_ranking_rule;
 mod interner;
 mod limits;
 mod logger;
-mod matches;
+pub mod matches;
 mod query_graph;
 mod query_term;
 mod ranking_rule_graph;
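
The visibility bump is what makes the re-export in search/mod.rs legal: a path in a pub use may only traverse modules that are visible from the re-exporting module, and a private mod matches inside new is invisible to its grandparent search. A minimal illustration of the rule:

    mod outer {
        pub mod inner {          // with plain `mod inner`, the re-export below
            pub struct Thing;    // fails: `inner` is private to `outer`
        }
    }
    pub use outer::inner::Thing; // facade re-export, like the one in search/mod.rs
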
@@ -271,7 +271,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
 #[allow(clippy::too_many_arguments)]
 pub fn execute_search(
-    ctx: &mut SearchContext,
+    mut ctx: SearchContext,
     query: &Option<String>,
     terms_matching_strategy: TermsMatchingStrategy,
     exhaustive_number_hits: bool,
@@ -284,21 +284,22 @@ pub fn execute_search(
     query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
 ) -> Result<SearchResult> {
     let mut universe = if let Some(filters) = filters {
-        filters.evaluate(ctx.txn, ctx.index)?
+        filters.evaluate(&mut ctx.txn, &mut ctx.index)?
     } else {
-        ctx.index.documents_ids(ctx.txn)?
+        ctx.index.documents_ids(&mut ctx.txn)?
     };
 
+    let mut located_query_terms = None;
     let documents_ids = if let Some(query) = query {
         // We make sure that the analyzer is aware of the stop words
         // this ensures that the query builder is able to properly remove them.
         let mut tokbuilder = TokenizerBuilder::new();
-        let stop_words = ctx.index.stop_words(ctx.txn)?;
+        let stop_words = &mut ctx.index.stop_words(&mut ctx.txn)?;
         if let Some(ref stop_words) = stop_words {
             tokbuilder.stop_words(stop_words);
         }
 
-        let script_lang_map = ctx.index.script_language(ctx.txn)?;
+        let script_lang_map = &mut ctx.index.script_language(&mut ctx.txn)?;
         if !script_lang_map.is_empty() {
             tokbuilder.allow_list(&script_lang_map);
         }
@@ -306,27 +307,31 @@ pub fn execute_search(
         let tokenizer = tokbuilder.build();
         let tokens = tokenizer.tokenize(query);
-        let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
-        let graph = QueryGraph::from_query(ctx, &query_terms)?;
+        let query_terms = located_query_terms_from_string(&mut ctx, tokens, words_limit)?;
+        let graph = QueryGraph::from_query(&mut ctx, &query_terms)?;
+        located_query_terms = Some(query_terms);
 
-        check_sort_criteria(ctx, sort_criteria.as_ref())?;
+        check_sort_criteria(&mut ctx, sort_criteria.as_ref())?;
 
         universe = resolve_maximally_reduced_query_graph(
-            ctx,
+            &mut ctx,
             &universe,
             &graph,
             terms_matching_strategy,
             query_graph_logger,
         )?;
 
-        let ranking_rules =
-            get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?;
+        let ranking_rules = get_ranking_rules_for_query_graph_search(
+            &mut ctx,
+            sort_criteria,
+            terms_matching_strategy,
+        )?;
 
-        bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
+        bucket_sort(&mut ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
     } else {
-        let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?;
+        let ranking_rules = get_ranking_rules_for_placeholder_search(&mut ctx, sort_criteria)?;
 
         bucket_sort(
-            ctx,
+            &mut ctx,
             ranking_rules,
             &PlaceholderQuery,
             &universe,
@@ -340,19 +345,20 @@ pub fn execute_search(
     // is requested and a distinct attribute is set.
     let mut candidates = universe;
     if exhaustive_number_hits {
-        if let Some(f) = ctx.index.distinct_field(ctx.txn)? {
-            if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) {
-                candidates = apply_distinct_rule(ctx, distinct_fid, &candidates)?.remaining;
+        if let Some(f) = &mut ctx.index.distinct_field(&mut ctx.txn)? {
+            if let Some(distinct_fid) = ctx.index.fields_ids_map(&mut ctx.txn)?.id(f) {
+                candidates = apply_distinct_rule(&mut ctx, distinct_fid, &candidates)?.remaining;
             }
         }
     }
 
-    Ok(SearchResult {
-        // TODO: correct matching words
-        matching_words: MatchingWords::default(),
-        candidates,
-        documents_ids,
-    })
+    // consume context and located_query_terms to build MatchingWords.
+    let matching_words = match located_query_terms {
+        Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
+        None => MatchingWords::default(),
+    };
+
+    Ok(SearchResult { matching_words, candidates, documents_ids })
 }
 
 fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>>) -> Result<()> {
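
This removes the old "TODO: correct matching words" placeholder: the located query terms are stashed in an Option while the search phases still need the context, and only at the very end is the context consumed to build the real MatchingWords. A toy sketch of that capture-then-consume shape (stand-in types, not milli's):

    #[derive(Default)]
    struct Words;
    struct Ctx;

    impl Words {
        fn new(_ctx: Ctx, _terms: Vec<String>) -> Self {
            Words
        }
    }

    fn run(ctx: Ctx, query: Option<&str>) -> Words {
        let mut located_terms = None;
        if let Some(q) = query {
            // While searching, only record the terms; `ctx` is still in use.
            located_terms = Some(q.split_whitespace().map(String::from).collect());
        }
        // The search is done, so `ctx` is free to be moved.
        match located_terms {
            Some(terms) => Words::new(ctx, terms),
            None => Words::default(),
        }
    }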