mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Merge #584
584: Chores: Enhance smart-crop code comments r=curquiza a=ManyTheFish Enhance explanation around smart crop algorithms Co-authored-by: ManyTheFish <many@meilisearch.com> Co-authored-by: Many the fish <many@meilisearch.com>
This commit is contained in:
commit
5704235521
@ -11,6 +11,7 @@ const DEFAULT_CROP_MARKER: &'static str = "…";
|
|||||||
const DEFAULT_HIGHLIGHT_PREFIX: &'static str = "<em>";
|
const DEFAULT_HIGHLIGHT_PREFIX: &'static str = "<em>";
|
||||||
const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = "</em>";
|
const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = "</em>";
|
||||||
|
|
||||||
|
/// Structure used to build a Matcher, allowing customization of the formatting tags.
|
||||||
pub struct MatcherBuilder<'a, A> {
|
pub struct MatcherBuilder<'a, A> {
|
||||||
matching_words: MatchingWords,
|
matching_words: MatchingWords,
|
||||||
tokenizer: Tokenizer<'a, A>,
|
tokenizer: Tokenizer<'a, A>,
|
||||||
@ -100,6 +101,8 @@ pub struct MatchBounds {
|
|||||||
pub length: usize,
|
pub length: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Structure used to analyze a string, compute words that match,
|
||||||
|
/// and format the source string, returning a highlighted and cropped sub-string.
|
||||||
pub struct Matcher<'t, 'm, A> {
|
pub struct Matcher<'t, 'm, A> {
|
||||||
text: &'t str,
|
text: &'t str,
|
||||||
matching_words: &'m MatchingWords,
|
matching_words: &'m MatchingWords,
|
||||||
@ -113,6 +116,8 @@ pub struct Matcher<'t, 'm, A> {
|
|||||||
impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
||||||
/// Iterates over tokens and save any of them that matches the query.
|
/// Iterates over tokens and save any of them that matches the query.
|
||||||
fn compute_matches(&mut self) -> &mut Self {
|
fn compute_matches(&mut self) -> &mut Self {
|
||||||
|
/// some words are counted as matches only if they are close together and in the right order,
|
||||||
|
/// compute_partial_match peeks into the next words to validate if the match is complete.
|
||||||
fn compute_partial_match<'a>(
|
fn compute_partial_match<'a>(
|
||||||
mut partial: PartialMatch,
|
mut partial: PartialMatch,
|
||||||
token_position: usize,
|
token_position: usize,
|
||||||
@ -246,9 +251,14 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||||||
// matches needs to be counted in the crop len.
|
// matches needs to be counted in the crop len.
|
||||||
let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
|
let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
|
||||||
|
|
||||||
|
// create the initial state of the crop window: 2 iterators starting from the matches positions,
|
||||||
|
// a reverse iterator starting from the first match token position and going towards the beginning of the text,
|
||||||
let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
|
let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
|
||||||
|
// an iterator starting from the last match token position and going towards the end of the text.
|
||||||
let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
|
let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
|
||||||
|
|
||||||
|
// grow the crop window by peeking in both directions
|
||||||
|
// until the window contains the right number of words:
|
||||||
while remaining_words > 0 {
|
while remaining_words > 0 {
|
||||||
let before_token = before_tokens.peek().map(|t| t.separator_kind());
|
let before_token = before_tokens.peek().map(|t| t.separator_kind());
|
||||||
let after_token = after_tokens.peek().map(|t| t.separator_kind());
|
let after_token = after_tokens.peek().map(|t| t.separator_kind());
|
||||||
@ -315,6 +325,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// finally, keep the byte index of each bound of the crop window.
|
||||||
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
|
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
|
||||||
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
|
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
|
||||||
|
|
||||||
@ -353,7 +364,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||||||
(uniq_score, distance_score, order_score)
|
(uniq_score, distance_score, order_score)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the matches interval where the score computed by match_interval_score is maximal.
|
/// Returns the matches interval where the score computed by match_interval_score is the best.
|
||||||
fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
|
fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
|
||||||
// we compute the matches interval if we have at least 2 matches.
|
// we compute the matches interval if we have at least 2 matches.
|
||||||
if matches.len() > 1 {
|
if matches.len() > 1 {
|
||||||
@ -408,6 +419,8 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||||||
} else {
|
} else {
|
||||||
match &self.matches {
|
match &self.matches {
|
||||||
Some((tokens, matches)) => {
|
Some((tokens, matches)) => {
|
||||||
|
// If the text has to be cropped,
|
||||||
|
// compute the best interval to crop around.
|
||||||
let matches = match format_options.crop {
|
let matches = match format_options.crop {
|
||||||
Some(crop_size) if crop_size > 0 => {
|
Some(crop_size) if crop_size > 0 => {
|
||||||
self.find_best_match_interval(matches, crop_size)
|
self.find_best_match_interval(matches, crop_size)
|
||||||
@ -415,6 +428,8 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
|||||||
_ => matches,
|
_ => matches,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// If the text has to be cropped,
|
||||||
|
// crop around the best interval.
|
||||||
let (byte_start, byte_end) = match format_options.crop {
|
let (byte_start, byte_end) = match format_options.crop {
|
||||||
Some(crop_size) if crop_size > 0 => {
|
Some(crop_size) if crop_size > 0 => {
|
||||||
self.crop_bounds(tokens, matches, crop_size)
|
self.crop_bounds(tokens, matches, crop_size)
|
||||||
|
Loading…
Reference in New Issue
Block a user