2022-03-22 15:22:14 +01:00
use std ::borrow ::Cow ;
2022-06-02 17:59:04 +02:00
use charabia ::{ SeparatorKind , Token , Tokenizer } ;
2022-04-12 16:31:58 +02:00
use matching_words ::{ MatchType , PartialMatch , PrimitiveWordId } ;
pub use matching_words ::{ MatchingWord , MatchingWords } ;
2022-05-16 17:05:20 +02:00
use serde ::Serialize ;
2022-03-22 15:22:14 +01:00
2022-04-04 18:56:59 +02:00
pub mod matching_words ;
2022-03-22 15:22:14 +01:00
/// Marker inserted at a cropped edge of the text when the builder sets no custom marker.
const DEFAULT_CROP_MARKER: &str = " … ";
/// Text inserted before a highlighted match when the builder sets no custom prefix.
const DEFAULT_HIGHLIGHT_PREFIX: &str = " <em> ";
/// Text inserted after a highlighted match when the builder sets no custom suffix.
const DEFAULT_HIGHLIGHT_SUFFIX: &str = " </em> ";
2022-06-02 17:59:04 +02:00
pub struct MatcherBuilder < ' a , A > {
2022-03-22 15:22:14 +01:00
matching_words : MatchingWords ,
2022-06-02 17:59:04 +02:00
tokenizer : Tokenizer < ' a , A > ,
2022-03-22 15:22:14 +01:00
crop_marker : Option < String > ,
highlight_prefix : Option < String > ,
highlight_suffix : Option < String > ,
}
2022-06-02 17:59:04 +02:00
impl < ' a , A > MatcherBuilder < ' a , A > {
pub fn new ( matching_words : MatchingWords , tokenizer : Tokenizer < ' a , A > ) -> Self {
Self {
matching_words ,
tokenizer ,
crop_marker : None ,
highlight_prefix : None ,
highlight_suffix : None ,
}
2022-03-22 15:22:14 +01:00
}
pub fn crop_marker ( & mut self , marker : String ) -> & Self {
self . crop_marker = Some ( marker ) ;
self
}
pub fn highlight_prefix ( & mut self , prefix : String ) -> & Self {
self . highlight_prefix = Some ( prefix ) ;
self
}
pub fn highlight_suffix ( & mut self , suffix : String ) -> & Self {
self . highlight_suffix = Some ( suffix ) ;
self
}
2022-06-02 17:59:04 +02:00
pub fn build < ' t , ' m > ( & ' m self , text : & ' t str ) -> Matcher < ' t , ' m , A > {
2022-03-22 15:22:14 +01:00
let crop_marker = match & self . crop_marker {
Some ( marker ) = > marker . as_str ( ) ,
None = > & DEFAULT_CROP_MARKER ,
} ;
let highlight_prefix = match & self . highlight_prefix {
Some ( marker ) = > marker . as_str ( ) ,
None = > & DEFAULT_HIGHLIGHT_PREFIX ,
} ;
let highlight_suffix = match & self . highlight_suffix {
Some ( marker ) = > marker . as_str ( ) ,
None = > & DEFAULT_HIGHLIGHT_SUFFIX ,
} ;
Matcher {
text ,
matching_words : & self . matching_words ,
2022-06-02 17:59:04 +02:00
tokenizer : & self . tokenizer ,
2022-03-22 15:22:14 +01:00
crop_marker ,
highlight_prefix ,
highlight_suffix ,
matches : None ,
}
}
}
2022-04-12 13:42:14 +02:00
/// Formatting requested for a text: highlighting and/or cropping.
#[derive(Debug, Copy, Clone, Default, PartialEq, Eq)]
pub struct FormatOptions {
    /// Whether matches must be wrapped in the highlight prefix and suffix.
    pub highlight: bool,
    /// Size of the crop window, in words. `None` requests no cropping.
    pub crop: Option<usize>,
}

impl FormatOptions {
    /// Combines two sets of options.
    ///
    /// Highlighting is enabled as soon as one side enables it. For cropping, `self.crop`
    /// takes precedence and `other.crop` is only used when `self.crop` is `None`.
    pub fn merge(self, other: Self) -> Self {
        Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) }
    }
}
2022-03-28 18:17:50 +02:00
#[ derive(Clone, Debug) ]
2022-03-28 15:57:05 +02:00
pub struct Match {
2022-03-22 15:22:14 +01:00
match_len : usize ,
2022-04-04 18:56:59 +02:00
// ids of the query words that matches.
ids : Vec < PrimitiveWordId > ,
2022-03-22 15:22:14 +01:00
// position of the word in the whole text.
2022-03-28 15:57:05 +02:00
word_position : usize ,
// position of the token in the whole text.
token_position : usize ,
2022-03-22 15:22:14 +01:00
}
2022-05-16 17:05:20 +02:00
#[ derive(Serialize, Debug, Clone, PartialEq) ]
2022-03-22 15:22:14 +01:00
pub struct MatchBounds {
2022-03-30 15:22:18 +02:00
pub start : usize ,
pub length : usize ,
2022-03-22 15:22:14 +01:00
}
2022-06-02 17:59:04 +02:00
pub struct Matcher < ' t , ' m , A > {
2022-03-22 15:22:14 +01:00
text : & ' t str ,
matching_words : & ' m MatchingWords ,
2022-06-02 17:59:04 +02:00
tokenizer : & ' m Tokenizer < ' m , A > ,
2022-03-22 15:22:14 +01:00
crop_marker : & ' m str ,
highlight_prefix : & ' m str ,
highlight_suffix : & ' m str ,
2022-06-02 17:59:04 +02:00
matches : Option < ( Vec < Token < ' t > > , Vec < Match > ) > ,
2022-03-22 15:22:14 +01:00
}
2022-06-02 17:59:04 +02:00
impl < ' t , A : AsRef < [ u8 ] > > Matcher < ' t , '_ , A > {
2022-04-05 17:35:52 +02:00
/// Iterates over tokens and save any of them that matches the query.
2022-03-22 15:22:14 +01:00
fn compute_matches ( & mut self ) -> & mut Self {
2022-04-11 16:46:45 +02:00
fn compute_partial_match < ' a > (
2022-04-05 17:35:52 +02:00
mut partial : PartialMatch ,
2022-04-11 16:46:45 +02:00
token_position : usize ,
word_position : usize ,
words_positions : & mut impl Iterator < Item = ( usize , usize , & ' a Token < ' a > ) > ,
2022-04-05 17:35:52 +02:00
matches : & mut Vec < Match > ,
) -> bool {
2022-04-11 16:46:45 +02:00
let mut potential_matches = Vec ::new ( ) ;
// Add first match to potential matches.
potential_matches . push ( ( token_position , word_position , partial . char_len ( ) ) ) ;
for ( token_position , word_position , word ) in words_positions {
partial = match partial . match_token ( & word ) {
// token matches the partial match, but the match is not full,
// we temporarly save the current token then we try to match the next one.
Some ( MatchType ::Partial ( partial ) ) = > {
potential_matches . push ( ( token_position , word_position , partial . char_len ( ) ) ) ;
partial
}
// partial match is now full, we keep this matches and we advance positions
Some ( MatchType ::Full { char_len , ids } ) = > {
// save previously matched tokens as matches.
let iter = potential_matches . into_iter ( ) . map (
| ( token_position , word_position , match_len ) | Match {
match_len ,
2022-04-05 17:35:52 +02:00
ids : ids . to_vec ( ) ,
2022-04-11 16:46:45 +02:00
word_position ,
token_position ,
} ,
) ;
matches . extend ( iter ) ;
// save the token that closes the partial match as a match.
matches . push ( Match {
match_len : char_len ,
ids : ids . to_vec ( ) ,
word_position ,
token_position ,
} ) ;
// the match is complete, we return true.
return true ;
}
// no match, continue to next match.
None = > break ,
} ;
2022-04-05 17:35:52 +02:00
}
// the match is not complete, we return false.
false
}
2022-06-02 17:59:04 +02:00
let tokens : Vec < _ > = self . tokenizer . tokenize ( self . text ) . collect ( ) ;
2022-03-22 15:22:14 +01:00
let mut matches = Vec ::new ( ) ;
2022-04-11 16:46:45 +02:00
2022-06-02 17:59:04 +02:00
let mut words_positions = tokens
2022-04-11 16:46:45 +02:00
. iter ( )
. scan ( ( 0 , 0 ) , | ( token_position , word_position ) , token | {
let current_token_position = * token_position ;
let current_word_position = * word_position ;
* token_position + = 1 ;
2022-06-02 15:47:28 +02:00
if ! token . is_separator ( ) {
2022-04-11 16:46:45 +02:00
* word_position + = 1 ;
}
Some ( ( current_token_position , current_word_position , token ) )
} )
2022-06-02 15:47:28 +02:00
. filter ( | ( _ , _ , token ) | ! token . is_separator ( ) ) ;
2022-04-11 16:46:45 +02:00
while let Some ( ( token_position , word_position , word ) ) = words_positions . next ( ) {
for match_type in self . matching_words . match_token ( word ) {
match match_type {
// we match, we save the current token as a match,
// then we continue the rest of the tokens.
MatchType ::Full { char_len , ids } = > {
matches . push ( Match {
match_len : char_len ,
ids : ids . to_vec ( ) ,
word_position ,
token_position ,
} ) ;
break ;
}
// we match partially, iterate over next tokens to check if we can complete the match.
MatchType ::Partial ( partial ) = > {
// if match is completed, we break the matching loop over the current token,
2022-04-05 17:35:52 +02:00
// then we continue the rest of the tokens.
2022-04-11 16:46:45 +02:00
let mut wp = words_positions . clone ( ) ;
if compute_partial_match (
partial ,
token_position ,
word_position ,
& mut wp ,
& mut matches ,
) {
words_positions = wp ;
2022-04-04 18:56:59 +02:00
break ;
}
}
2022-03-22 15:22:14 +01:00
}
}
}
2022-06-02 17:59:04 +02:00
self . matches = Some ( ( tokens , matches ) ) ;
2022-03-22 15:22:14 +01:00
self
}
2022-04-05 17:35:52 +02:00
/// Returns boundaries of the words that match the query.
2022-03-22 15:22:14 +01:00
pub fn matches ( & mut self ) -> Vec < MatchBounds > {
match & self . matches {
None = > self . compute_matches ( ) . matches ( ) ,
2022-06-02 17:59:04 +02:00
Some ( ( tokens , matches ) ) = > matches
2022-03-28 15:57:05 +02:00
. iter ( )
. map ( | m | MatchBounds {
2022-06-02 17:59:04 +02:00
start : tokens [ m . token_position ] . byte_start ,
2022-03-28 15:57:05 +02:00
length : m . match_len ,
} )
. collect ( ) ,
}
}
2022-04-07 20:15:14 +02:00
/// Returns the bounds in byte index of the crop window.
2022-06-02 17:59:04 +02:00
fn crop_bounds ( & self , tokens : & [ Token ] , matches : & [ Match ] , crop_size : usize ) -> ( usize , usize ) {
2022-04-05 17:35:52 +02:00
// if there is no match, we start from the beginning of the string by default.
2022-03-28 15:57:05 +02:00
let first_match_word_position = matches . first ( ) . map ( | m | m . word_position ) . unwrap_or ( 0 ) ;
let first_match_token_position = matches . first ( ) . map ( | m | m . token_position ) . unwrap_or ( 0 ) ;
let last_match_word_position = matches . last ( ) . map ( | m | m . word_position ) . unwrap_or ( 0 ) ;
let last_match_token_position = matches . last ( ) . map ( | m | m . token_position ) . unwrap_or ( 0 ) ;
2022-04-05 17:35:52 +02:00
// matches needs to be counted in the crop len.
2022-04-12 13:42:14 +02:00
let mut remaining_words = crop_size + first_match_word_position - last_match_word_position ;
2022-04-05 17:35:52 +02:00
2022-06-02 17:59:04 +02:00
let mut before_tokens = tokens [ .. first_match_token_position ] . iter ( ) . rev ( ) . peekable ( ) ;
let mut after_tokens = tokens [ last_match_token_position .. ] . iter ( ) . peekable ( ) ;
2022-04-07 20:15:14 +02:00
2022-03-28 15:57:05 +02:00
while remaining_words > 0 {
2022-06-02 15:47:28 +02:00
let before_token = before_tokens . peek ( ) . map ( | t | t . separator_kind ( ) ) ;
let after_token = after_tokens . peek ( ) . map ( | t | t . separator_kind ( ) ) ;
2022-04-07 20:15:14 +02:00
match ( before_token , after_token ) {
2022-04-05 17:35:52 +02:00
// we can expand both sides.
2022-04-07 20:15:14 +02:00
( Some ( before_token ) , Some ( after_token ) ) = > {
match ( before_token , after_token ) {
2022-04-05 17:35:52 +02:00
// if they are both separators and are the same kind then advance both,
// or expand in the soft separator separator side.
2022-04-07 20:15:14 +02:00
( Some ( before_token_kind ) , Some ( after_token_kind ) ) = > {
if before_token_kind = = after_token_kind {
before_tokens . next ( ) ;
2022-06-16 18:23:57 +02:00
// this avoid having an ending separator before crop marker.
if remaining_words > 1 {
after_tokens . next ( ) ;
}
2022-04-07 20:15:14 +02:00
} else if before_token_kind = = SeparatorKind ::Hard {
after_tokens . next ( ) ;
2022-03-28 15:57:05 +02:00
} else {
2022-04-07 20:15:14 +02:00
before_tokens . next ( ) ;
2022-03-28 15:57:05 +02:00
}
}
2022-04-05 17:35:52 +02:00
// if one of the tokens is a word, we expend in the side of the word.
// left is a word, advance left.
2022-03-28 15:57:05 +02:00
( None , Some ( _ ) ) = > {
2022-04-07 20:15:14 +02:00
before_tokens . next ( ) ;
2022-03-28 15:57:05 +02:00
remaining_words - = 1 ;
}
2022-04-05 17:35:52 +02:00
// right is a word, advance right.
2022-03-28 15:57:05 +02:00
( Some ( _ ) , None ) = > {
2022-04-07 20:15:14 +02:00
after_tokens . next ( ) ;
2022-03-28 15:57:05 +02:00
remaining_words - = 1 ;
}
2022-04-05 17:35:52 +02:00
// both are words, advance left then right if remaining_word > 0.
2022-03-28 15:57:05 +02:00
( None , None ) = > {
2022-04-07 20:15:14 +02:00
before_tokens . next ( ) ;
2022-03-28 15:57:05 +02:00
remaining_words - = 1 ;
if remaining_words > 0 {
2022-04-07 20:15:14 +02:00
after_tokens . next ( ) ;
2022-03-28 15:57:05 +02:00
remaining_words - = 1 ;
}
}
}
}
2022-03-29 14:51:02 +02:00
// the end of the text is reached, advance left.
2022-04-07 20:15:14 +02:00
( Some ( before_token ) , None ) = > {
before_tokens . next ( ) ;
if before_token . is_none ( ) {
2022-03-28 15:57:05 +02:00
remaining_words - = 1 ;
}
}
2022-03-29 14:51:02 +02:00
// the start of the text is reached, advance right.
2022-04-07 20:15:14 +02:00
( None , Some ( after_token ) ) = > {
after_tokens . next ( ) ;
if after_token . is_none ( ) {
2022-03-28 15:57:05 +02:00
remaining_words - = 1 ;
}
}
2022-03-29 14:51:02 +02:00
// no more token to add.
2022-03-28 15:57:05 +02:00
( None , None ) = > break ,
}
}
2022-04-07 20:15:14 +02:00
let crop_byte_start = before_tokens . next ( ) . map_or ( 0 , | t | t . byte_end ) ;
let crop_byte_end = after_tokens . next ( ) . map_or ( self . text . len ( ) , | t | t . byte_start ) ;
( crop_byte_start , crop_byte_end )
2022-03-22 15:22:14 +01:00
}
2022-04-05 17:35:52 +02:00
/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
2022-03-28 18:17:50 +02:00
fn match_interval_score ( & self , matches : & [ Match ] ) -> ( i16 , i16 , i16 ) {
2022-04-04 18:56:59 +02:00
let mut ids : Vec < PrimitiveWordId > = Vec ::with_capacity ( matches . len ( ) ) ;
2022-03-28 18:17:50 +02:00
let mut order_score = 0 ;
let mut distance_score = 0 ;
let mut iter = matches . iter ( ) . peekable ( ) ;
while let Some ( m ) = iter . next ( ) {
if let Some ( next_match ) = iter . peek ( ) {
// if matches are ordered
2022-04-04 18:56:59 +02:00
if next_match . ids . iter ( ) . min ( ) > m . ids . iter ( ) . min ( ) {
2022-03-28 18:17:50 +02:00
order_score + = 1 ;
}
// compute distance between matches
distance_score - = ( next_match . word_position - m . word_position ) . min ( 7 ) as i16 ;
}
2022-04-04 18:56:59 +02:00
ids . extend ( m . ids . iter ( ) ) ;
2022-03-28 18:17:50 +02:00
}
ids . sort_unstable ( ) ;
ids . dedup ( ) ;
let uniq_score = ids . len ( ) as i16 ;
// rank by unique match count, then by distance between matches, then by ordered match count.
( uniq_score , distance_score , order_score )
}
2022-04-05 17:35:52 +02:00
/// Returns the matches interval where the score computed by match_interval_score is maximal.
2022-04-12 13:42:14 +02:00
fn find_best_match_interval < ' a > ( & self , matches : & ' a [ Match ] , crop_size : usize ) -> & ' a [ Match ] {
2022-04-05 17:35:52 +02:00
// we compute the matches interval if we have at least 2 matches.
2022-03-28 18:17:50 +02:00
if matches . len ( ) > 1 {
2022-04-05 17:35:52 +02:00
// positions of the first and the last match of the best matches interval in `matches`.
2022-03-29 14:51:02 +02:00
let mut best_interval = ( 0 , 0 ) ;
let mut best_interval_score = self . match_interval_score ( & matches [ 0 ..= 0 ] ) ;
2022-04-05 17:35:52 +02:00
// current interval positions.
2022-03-28 18:17:50 +02:00
let mut interval_first = 0 ;
2022-03-29 14:51:02 +02:00
let mut interval_last = 0 ;
for ( index , next_match ) in matches . iter ( ) . enumerate ( ) . skip ( 1 ) {
2022-04-05 17:35:52 +02:00
// if next match would make interval gross more than crop_size,
// we compare the current interval with the best one,
// then we increase `interval_first` until next match can be added.
2022-04-12 13:42:14 +02:00
if next_match . word_position - matches [ interval_first ] . word_position > = crop_size {
2022-03-28 18:17:50 +02:00
let interval_score =
self . match_interval_score ( & matches [ interval_first ..= interval_last ] ) ;
// keep interval if it's the best
if interval_score > best_interval_score {
best_interval = ( interval_first , interval_last ) ;
best_interval_score = interval_score ;
}
2022-04-05 17:35:52 +02:00
// advance start of the interval while interval is longer than crop_size.
2022-03-28 18:17:50 +02:00
while next_match . word_position - matches [ interval_first ] . word_position
2022-04-12 13:42:14 +02:00
> = crop_size
2022-03-28 18:17:50 +02:00
{
interval_first + = 1 ;
}
}
interval_last = index ;
}
2022-04-05 17:35:52 +02:00
// compute the last interval score and compare it to the best one.
2022-03-28 18:17:50 +02:00
let interval_score =
self . match_interval_score ( & matches [ interval_first ..= interval_last ] ) ;
if interval_score > best_interval_score {
best_interval = ( interval_first , interval_last ) ;
}
& matches [ best_interval . 0 ..= best_interval . 1 ]
} else {
matches
2022-03-28 15:57:05 +02:00
}
2022-03-22 15:22:14 +01:00
}
2022-04-05 17:35:52 +02:00
// Returns the formatted version of the original text.
2022-04-12 13:42:14 +02:00
pub fn format ( & mut self , format_options : FormatOptions ) -> Cow < ' t , str > {
if ! format_options . highlight & & format_options . crop . is_none ( ) {
2022-04-05 17:35:52 +02:00
// compute matches is not needed if no highlight nor crop is requested.
2022-03-22 15:22:14 +01:00
Cow ::Borrowed ( self . text )
} else {
match & self . matches {
2022-06-02 17:59:04 +02:00
Some ( ( tokens , matches ) ) = > {
2022-04-12 13:42:14 +02:00
let matches = match format_options . crop {
Some ( crop_size ) if crop_size > 0 = > {
self . find_best_match_interval ( matches , crop_size )
}
_ = > matches ,
} ;
2022-04-08 11:20:41 +02:00
2022-04-12 13:42:14 +02:00
let ( byte_start , byte_end ) = match format_options . crop {
2022-06-02 17:59:04 +02:00
Some ( crop_size ) if crop_size > 0 = > {
self . crop_bounds ( tokens , matches , crop_size )
}
2022-04-12 13:42:14 +02:00
_ = > ( 0 , self . text . len ( ) ) ,
} ;
2022-03-22 15:22:14 +01:00
let mut formatted = Vec ::new ( ) ;
// push crop marker if it's not the start of the text.
if byte_start > 0 & & ! self . crop_marker . is_empty ( ) {
formatted . push ( self . crop_marker ) ;
}
let mut byte_index = byte_start ;
2022-04-12 13:42:14 +02:00
if format_options . highlight {
2022-03-22 15:22:14 +01:00
// insert highlight markers around matches.
2022-04-08 11:20:41 +02:00
for m in matches {
2022-03-28 15:57:05 +02:00
let token = & tokens [ m . token_position ] ;
if byte_index < token . byte_start {
formatted . push ( & self . text [ byte_index .. token . byte_start ] ) ;
2022-03-22 15:22:14 +01:00
}
2022-03-30 15:43:49 +02:00
let highlight_byte_index = self . text [ token . byte_start .. ]
. char_indices ( )
. enumerate ( )
. find ( | ( i , _ ) | * i = = m . match_len )
2022-04-05 17:35:52 +02:00
. map_or ( token . byte_end , | ( _ , ( i , _ ) ) | i + token . byte_start ) ;
2022-03-22 15:22:14 +01:00
formatted . push ( self . highlight_prefix ) ;
2022-03-30 15:43:49 +02:00
formatted . push ( & self . text [ token . byte_start .. highlight_byte_index ] ) ;
2022-03-22 15:22:14 +01:00
formatted . push ( self . highlight_suffix ) ;
2022-04-05 17:35:52 +02:00
// if it's a prefix highlight, we put the end of the word after the highlight marker.
if highlight_byte_index < token . byte_end {
formatted . push ( & self . text [ highlight_byte_index .. token . byte_end ] ) ;
}
2022-03-22 15:22:14 +01:00
2022-03-28 15:57:05 +02:00
byte_index = token . byte_end ;
2022-03-22 15:22:14 +01:00
}
}
// push the rest of the text between last match and the end of crop.
if byte_index < byte_end {
formatted . push ( & self . text [ byte_index .. byte_end ] ) ;
}
// push crop marker if it's not the end of the text.
if byte_end < self . text . len ( ) & & ! self . crop_marker . is_empty ( ) {
formatted . push ( self . crop_marker ) ;
}
if formatted . len ( ) = = 1 {
// avoid concatenating if there is already 1 slice.
Cow ::Borrowed ( & self . text [ byte_start .. byte_end ] )
} else {
Cow ::Owned ( formatted . concat ( ) )
}
}
2022-04-12 13:42:14 +02:00
None = > self . compute_matches ( ) . format ( format_options ) ,
2022-03-22 15:22:14 +01:00
}
}
}
}
#[ cfg(test) ]
mod tests {
2022-06-02 17:59:04 +02:00
use charabia ::TokenizerBuilder ;
2022-03-30 15:43:49 +02:00
2022-03-22 15:22:14 +01:00
use super ::* ;
2022-04-04 18:56:59 +02:00
use crate ::search ::matches ::matching_words ::MatchingWord ;
fn matching_words ( ) -> MatchingWords {
let matching_words = vec! [
( vec! [ MatchingWord ::new ( " split " . to_string ( ) , 0 , false ) ] , vec! [ 0 ] ) ,
( vec! [ MatchingWord ::new ( " the " . to_string ( ) , 0 , false ) ] , vec! [ 1 ] ) ,
( vec! [ MatchingWord ::new ( " world " . to_string ( ) , 1 , true ) ] , vec! [ 2 ] ) ,
] ;
MatchingWords ::new ( matching_words )
2022-03-22 15:22:14 +01:00
}
2022-06-02 17:59:04 +02:00
impl MatcherBuilder<'_, Vec<u8>> {
    /// Test helper creating a builder for `matching_words` with a default tokenizer.
    pub fn from_matching_words(matching_words: MatchingWords) -> Self {
        let tokenizer = TokenizerBuilder::default().build();
        Self::new(matching_words, tokenizer)
    }
}
2022-03-22 15:22:14 +01:00
#[test]
fn format_identity() {
    let builder = MatcherBuilder::from_matching_words(matching_words());
    let format_options = FormatOptions { highlight: false, crop: None };

    let texts = [
        // Text without any match.
        " A quick brown fox can not jump 32 feet, right? Brr, it is cold! ",
        // Text containing all matches.
        " Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World. ",
        // Text containing some matches.
        " Natalie risk her future to build a world with the boy she loves. ",
    ];

    for &text in &texts {
        let mut matcher = builder.build(text);
        // no crop and no highlight should return complete text.
        assert_eq!(&matcher.format(format_options), &text);
    }
}
#[test]
fn format_highlight() {
    let builder = MatcherBuilder::from_matching_words(matching_words());
    let format_options = FormatOptions { highlight: true, crop: None };

    // (text, expected formatted text)
    let cases = [
        // empty text.
        (" ", " "),
        // text containing only separators.
        (" :-) ", " :-) "),
        // Text without any match:
        // no crop should return complete text, because there is no matches.
        (
            " A quick brown fox can not jump 32 feet, right? Brr, it is cold! ",
            " A quick brown fox can not jump 32 feet, right? Brr, it is cold! ",
        ),
        // Text containing all matches:
        // no crop should return complete text with highlighted matches.
        (
            " Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World. ",
            " Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>. ",
        ),
        // Text containing some matches:
        // no crop should return complete text with highlighted matches.
        (
            " Natalie risk her future to build a world with the boy she loves. ",
            " Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. ",
        ),
    ];

    for &(text, expected) in &cases {
        let mut matcher = builder.build(text);
        assert_eq!(&matcher.format(format_options), expected);
    }
}
#[test]
fn highlight_unicode() {
    let matching_words = MatchingWords::new(vec![
        (vec![MatchingWord::new(" wessfali ".to_string(), 1, true)], vec![0]),
        (vec![MatchingWord::new(" world ".to_string(), 1, true)], vec![1]),
    ]);
    let builder = MatcherBuilder::from_matching_words(matching_words);
    let format_options = FormatOptions { highlight: true, crop: None };

    // (text, expected formatted text):
    // no crop should return complete text with highlighted matches.
    let cases = [
        // Text containing prefix match.
        (" Ŵôřlḑôle ", " <em>Ŵôřlḑ</em>ôle "),
        // Text containing unicode match.
        (" Ŵôřlḑ ", " <em>Ŵôřlḑ</em> "),
        // Text containing unicode match.
        (" Westfália ", " <em>Westfáli</em>a "),
    ];

    for &(text, expected) in &cases {
        let mut matcher = builder.build(text);
        assert_eq!(&matcher.format(format_options), expected);
    }
}
#[test]
fn format_crop() {
    let builder = MatcherBuilder::from_matching_words(matching_words());
    let format_options = FormatOptions { highlight: false, crop: Some(10) };

    // (text, expected formatted text)
    let cases = [
        // empty text.
        (" ", " "),
        // text containing only separators.
        (" :-) ", " :-) "),
        // Text without any match:
        // no highlight should return 10 first words with a marker at the end.
        (
            " A quick brown fox can not jump 32 feet, right? Brr, it is cold! ",
            " A quick brown fox can not jump 32 feet, right… ",
        ),
        // Text without any match starting by a separator:
        // no highlight should return 10 first words with a marker at the end.
        (
            " (A quick brown fox can not jump 32 feet, right? Brr, it is cold!) ",
            " (A quick brown fox can not jump 32 feet, right… ",
        ),
        // Test phrase propagation:
        // should crop the phrase instead of cropping around the match.
        (
            " Natalie risk her future. Split The World is a book written by Emily Henry. I never read it. ",
            " … Split The World is a book written by Emily Henry… ",
        ),
        // Text containing some matches:
        // no highlight should return 10 last words with a marker at the start.
        (
            " Natalie risk her future to build a world with the boy she loves. ",
            " …future to build a world with the boy she loves… ",
        ),
        // Text containing all matches:
        // no highlight should return 10 last words with a marker at the start.
        (
            " Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World. ",
            " …she loves. Emily Henry: The Love That Split The World. ",
        ),
        // Text containing a match unordered and a match ordered:
        // crop should return 10 last words with a marker at the start.
        (
            " The world split void void void void void void void void void split the world void void ",
            " …void void void void void split the world void void ",
        ),
        // Text containing matches with different density:
        // crop should return 10 last words with a marker at the start.
        (
            " split void the void void world void void void void void void void void void void split the world void void ",
            " …void void void void void split the world void void ",
        ),
        // Text containing matches with same word:
        // crop should return 10 last words with a marker at the start.
        (
            " split split split split split split void void void void void void void void void void split the world void void ",
            " …void void void void void split the world void void ",
        ),
    ];

    for &(text, expected) in &cases {
        let mut matcher = builder.build(text);
        assert_eq!(&matcher.format(format_options), expected);
    }
}
#[test]
fn format_highlight_crop() {
    // Highlighting and cropping (10-word window) are both enabled for every case below.
    let builder = MatcherBuilder::from_matching_words(matching_words());
    let format_options = FormatOptions { highlight: true, crop: Some(10) };

    // Each entry is `(input text, expected formatted output)`; cases are checked in order.
    let cases = [
        // empty text.
        (" ", " "),
        // text containing only separators.
        (" :-) ", " :-) "),
        // Text without any match:
        // the first 10 words are kept, with a marker at the end.
        (
            " A quick brown fox can not jump 32 feet, right? Brr, it is cold! ",
            " A quick brown fox can not jump 32 feet, right… ",
        ),
        // Text containing some matches:
        // the window is cropped around the matches, which are highlighted.
        (
            " Natalie risk her future to build a world with the boy she loves. ",
            " …future to build a <em>world</em> with <em>the</em> boy she loves… ",
        ),
        // Text containing all matches:
        // the last 10 words are kept, with a marker at the start and highlighted matches.
        (
            " Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World. ",
            " …she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>. ",
        ),
        // Text containing a match unordered and a match ordered:
        // the crop keeps the last 10 words, with a marker at the start.
        (
            " The world split void void void void void void void void void split the world void void ",
            " …void void void void void <em>split</em> <em>the</em> <em>world</em> void void ",
        ),
    ];

    for &(text, expected) in &cases {
        let mut matcher = builder.build(text);
        assert_eq!(&matcher.format(format_options), expected);
    }
}
2022-03-29 14:51:02 +02:00
#[test]
fn smaller_crop_size() {
    //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
    let builder = MatcherBuilder::from_matching_words(matching_words());
    let text = " void void split the world void void. ";

    // Each entry is `(crop size, expected output)`; highlighting stays disabled throughout.
    let cases = [
        // crop size < query size: the matches are only partially formatted.
        (2, " …split the… "),
        // an even smaller crop size: still a partial format of the matches.
        (1, " …split… "),
        // crop size is 0: crop is ignored and the text is returned as is.
        (0, " void void split the world void void. "),
    ];

    for &(crop, expected) in &cases {
        let format_options = FormatOptions { highlight: false, crop: Some(crop) };
        let mut matcher = builder.build(text);
        assert_eq!(&matcher.format(format_options), expected);
    }
}
2022-04-11 16:46:45 +02:00
#[test]
fn partial_matches() {
    // Every matching word in this test is built with the same trailing arguments
    // (`0`, `false`); only the word itself varies.
    let word = |w: &str| MatchingWord::new(w.to_string(), 0, false);

    // Each group pairs a sequence of matching words with a list of ids:
    // "the" and "door" appear both whole and split into two parts, "do" appears alone.
    let word_groups = vec![
        (vec![word(" the ")], vec![0]),
        (vec![word(" t "), word(" he ")], vec![0]),
        (vec![word(" door ")], vec![1]),
        (vec![word(" do "), word(" or ")], vec![1]),
        (vec![word(" do ")], vec![2]),
    ];

    let mut builder = MatcherBuilder::from_matching_words(MatchingWords::new(word_groups));
    // Use a single-character marker pair so the highlighted spans are easy to read.
    builder.highlight_prefix(" _ ".to_string());
    builder.highlight_suffix(" _ ".to_string());

    // Highlight only, no cropping.
    let format_options = FormatOptions { highlight: true, crop: None };

    let text = " the do or die can't be he do and or isn't he ";
    let mut matcher = builder.build(text);
    let formatted = matcher.format(format_options);
    assert_eq!(
        &formatted,
        " _the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_ ",
        " matches: {:?} ",
        &matcher.matches
    );
}
2022-03-22 15:22:14 +01:00
}