2022-03-22 15:22:14 +01:00
use std ::borrow ::Cow ;
2022-03-29 14:57:21 +02:00
pub use matching_words ::MatchingWords ;
2022-04-04 18:56:59 +02:00
use matching_words ::{ MatchType , PrimitiveWordId } ;
2022-03-30 15:22:18 +02:00
use meilisearch_tokenizer ::token ::{ SeparatorKind , Token } ;
2022-03-22 15:22:14 +01:00
2022-04-04 18:56:59 +02:00
pub mod matching_words ;
2022-03-22 15:22:14 +01:00
/// Crop size, in words, given to a `MatcherBuilder` created by `from_matching_words`.
const DEFAULT_CROP_SIZE: usize = 10;
/// Crop marker used by `MatcherBuilder::build` when none was configured.
const DEFAULT_CROP_MARKER: &str = " … ";
/// Highlight prefix used by `MatcherBuilder::build` when none was configured.
const DEFAULT_HIGHLIGHT_PREFIX: &str = " <em> ";
/// Highlight suffix used by `MatcherBuilder::build` when none was configured.
const DEFAULT_HIGHLIGHT_SUFFIX: &str = " </em> ";
/// Holds the query words and the formatting options from which `Matcher`s are
/// built (see `MatcherBuilder::build`).
pub struct MatcherBuilder {
/// Query words handed, by reference, to every `Matcher` that is built.
matching_words : MatchingWords ,
/// Crop size, in words, copied into every `Matcher` that is built.
crop_size : usize ,
/// Custom crop marker; `None` means `DEFAULT_CROP_MARKER` is used.
crop_marker : Option < String > ,
/// Custom highlight prefix; `None` means `DEFAULT_HIGHLIGHT_PREFIX` is used.
highlight_prefix : Option < String > ,
/// Custom highlight suffix; `None` means `DEFAULT_HIGHLIGHT_SUFFIX` is used.
highlight_suffix : Option < String > ,
}
impl MatcherBuilder {
2022-03-30 10:50:23 +02:00
pub fn from_matching_words ( matching_words : MatchingWords ) -> Self {
Self {
matching_words ,
crop_size : DEFAULT_CROP_SIZE ,
crop_marker : None ,
highlight_prefix : None ,
highlight_suffix : None ,
}
}
2022-03-22 15:22:14 +01:00
pub fn crop_size ( & mut self , word_count : usize ) -> & Self {
self . crop_size = word_count ;
self
}
pub fn crop_marker ( & mut self , marker : String ) -> & Self {
self . crop_marker = Some ( marker ) ;
self
}
pub fn highlight_prefix ( & mut self , prefix : String ) -> & Self {
self . highlight_prefix = Some ( prefix ) ;
self
}
pub fn highlight_suffix ( & mut self , suffix : String ) -> & Self {
self . highlight_suffix = Some ( suffix ) ;
self
}
pub fn build < ' t , ' m > ( & ' m self , tokens : & ' t [ Token ] , text : & ' t str ) -> Matcher < ' t , ' m > {
let crop_marker = match & self . crop_marker {
Some ( marker ) = > marker . as_str ( ) ,
None = > & DEFAULT_CROP_MARKER ,
} ;
let highlight_prefix = match & self . highlight_prefix {
Some ( marker ) = > marker . as_str ( ) ,
None = > & DEFAULT_HIGHLIGHT_PREFIX ,
} ;
let highlight_suffix = match & self . highlight_suffix {
Some ( marker ) = > marker . as_str ( ) ,
None = > & DEFAULT_HIGHLIGHT_SUFFIX ,
} ;
Matcher {
text ,
tokens ,
matching_words : & self . matching_words ,
crop_size : self . crop_size ,
crop_marker ,
highlight_prefix ,
highlight_suffix ,
matches : None ,
}
}
}
2022-03-28 18:17:50 +02:00
#[ derive(Clone, Debug) ]
2022-03-28 15:57:05 +02:00
pub struct Match {
2022-03-22 15:22:14 +01:00
match_len : usize ,
2022-04-04 18:56:59 +02:00
// ids of the query words that matches.
ids : Vec < PrimitiveWordId > ,
2022-03-22 15:22:14 +01:00
// position of the word in the whole text.
2022-03-28 15:57:05 +02:00
word_position : usize ,
// position of the token in the whole text.
token_position : usize ,
2022-03-22 15:22:14 +01:00
}
2022-03-28 18:17:50 +02:00
/// Location of a match, as returned by `Matcher::matches`.
///
/// `PartialEq` and `Eq` are derived so that callers and tests can compare
/// bounds directly.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct MatchBounds {
    /// `byte_start` of the token that matched.
    pub start: usize,
    /// Length of the match, copied from `Match::match_len`.
    pub length: usize,
}
pub struct Matcher < ' t , ' m > {
text : & ' t str ,
tokens : & ' t [ Token < ' t > ] ,
matching_words : & ' m MatchingWords ,
crop_size : usize ,
crop_marker : & ' m str ,
highlight_prefix : & ' m str ,
highlight_suffix : & ' m str ,
2022-03-28 15:57:05 +02:00
matches : Option < Vec < Match > > ,
2022-03-22 15:22:14 +01:00
}
impl < ' t > Matcher < ' t , '_ > {
fn compute_matches ( & mut self ) -> & mut Self {
let mut matches = Vec ::new ( ) ;
2022-03-28 15:57:05 +02:00
let mut word_position = 0 ;
let mut token_position = 0 ;
2022-04-04 18:56:59 +02:00
while let Some ( token ) = self . tokens . get ( token_position ) {
2022-03-28 15:57:05 +02:00
if token . is_separator ( ) . is_none ( ) {
2022-04-04 18:56:59 +02:00
' matches : for match_type in self . matching_words . match_token ( & token ) {
match match_type {
MatchType ::Full { char_len , ids } = > {
matches . push ( Match {
match_len : char_len ,
ids : ids . to_vec ( ) ,
word_position ,
token_position ,
} ) ;
// stop on the first match
break ;
}
MatchType ::Partial ( mut partial ) = > {
let mut potential_matches =
vec! [ ( token_position , word_position , partial . char_len ( ) ) ] ;
let mut t_position = 1 ;
let mut w_position = 1 ;
' partials : for token in & self . tokens [ token_position + 1 .. ] {
if token . is_separator ( ) . is_none ( ) {
partial = match partial . match_token ( & token ) {
Some ( MatchType ::Partial ( partial ) ) = > {
potential_matches . push ( (
token_position + t_position ,
word_position + w_position ,
partial . char_len ( ) ,
) ) ;
partial
}
// partial match is now full, we keep this matches and we advance positions
Some ( MatchType ::Full { char_len , ids } ) = > {
let iter = potential_matches . into_iter ( ) . map (
| ( token_position , word_position , match_len ) | {
Match {
match_len ,
ids : ids . to_vec ( ) ,
word_position ,
token_position ,
}
} ,
) ;
matches . extend ( iter ) ;
word_position + = w_position ;
token_position + = t_position ;
matches . push ( Match {
match_len : char_len ,
ids : ids . to_vec ( ) ,
word_position ,
token_position ,
} ) ;
break 'matches ;
}
// no match, continue to next match.
None = > break 'partials ,
} ;
w_position + = 1 ;
}
t_position + = 1 ;
}
}
}
2022-03-22 15:22:14 +01:00
}
2022-03-28 15:57:05 +02:00
word_position + = 1 ;
2022-03-22 15:22:14 +01:00
}
2022-03-28 15:57:05 +02:00
token_position + = 1 ;
2022-03-22 15:22:14 +01:00
}
self . matches = Some ( matches ) ;
self
}
pub fn matches ( & mut self ) -> Vec < MatchBounds > {
match & self . matches {
None = > self . compute_matches ( ) . matches ( ) ,
2022-03-28 15:57:05 +02:00
Some ( matches ) = > matches
. iter ( )
. map ( | m | MatchBounds {
start : self . tokens [ m . token_position ] . byte_start ,
length : m . match_len ,
} )
. collect ( ) ,
}
}
2022-03-28 18:17:50 +02:00
fn token_crop_bounds ( & self , matches : & [ Match ] ) -> ( usize , usize ) {
2022-03-28 15:57:05 +02:00
let first_match_word_position = matches . first ( ) . map ( | m | m . word_position ) . unwrap_or ( 0 ) ;
let first_match_token_position = matches . first ( ) . map ( | m | m . token_position ) . unwrap_or ( 0 ) ;
let last_match_word_position = matches . last ( ) . map ( | m | m . word_position ) . unwrap_or ( 0 ) ;
let last_match_token_position = matches . last ( ) . map ( | m | m . token_position ) . unwrap_or ( 0 ) ;
2022-03-29 14:51:02 +02:00
// TODO: buggy if no match and first token is a sepparator
2022-03-28 15:57:05 +02:00
let mut remaining_words =
2022-03-29 14:51:02 +02:00
self . crop_size + first_match_word_position - last_match_word_position ;
// if first token is a word, then remove 1 to remaining_words.
if let Some ( None ) = self . tokens . get ( first_match_token_position ) . map ( | t | t . is_separator ( ) ) {
remaining_words - = 1 ;
}
2022-03-28 15:57:05 +02:00
let mut first_token_position = first_match_token_position ;
let mut last_token_position = last_match_token_position ;
while remaining_words > 0 {
match (
first_token_position . checked_sub ( 1 ) . and_then ( | i | self . tokens . get ( i ) ) ,
last_token_position . checked_add ( 1 ) . and_then ( | i | self . tokens . get ( i ) ) ,
) {
( Some ( ft ) , Some ( lt ) ) = > {
match ( ft . is_separator ( ) , lt . is_separator ( ) ) {
// if they are both separators and are the same kind then advance both
( Some ( f_kind ) , Some ( s_kind ) ) = > {
if f_kind = = s_kind {
first_token_position - = 1 ;
last_token_position + = 1 ;
} else if f_kind = = SeparatorKind ::Hard {
last_token_position + = 1 ;
} else {
first_token_position - = 1 ;
}
}
// left is a word, advance left
( None , Some ( _ ) ) = > {
first_token_position - = 1 ;
remaining_words - = 1 ;
}
// right is a word, advance right
( Some ( _ ) , None ) = > {
last_token_position + = 1 ;
remaining_words - = 1 ;
}
// both are words, advance left then right if remaining_word > 0
( None , None ) = > {
first_token_position - = 1 ;
remaining_words - = 1 ;
if remaining_words > 0 {
last_token_position + = 1 ;
remaining_words - = 1 ;
}
}
}
}
2022-03-29 14:51:02 +02:00
// the end of the text is reached, advance left.
2022-03-28 15:57:05 +02:00
( Some ( ft ) , None ) = > {
first_token_position - = 1 ;
if ft . is_separator ( ) . is_none ( ) {
remaining_words - = 1 ;
}
}
2022-03-29 14:51:02 +02:00
// the start of the text is reached, advance right.
2022-03-28 15:57:05 +02:00
( None , Some ( lt ) ) = > {
last_token_position + = 1 ;
if lt . is_separator ( ) . is_none ( ) {
remaining_words - = 1 ;
}
}
2022-03-29 14:51:02 +02:00
// no more token to add.
2022-03-28 15:57:05 +02:00
( None , None ) = > break ,
}
}
2022-03-28 18:17:50 +02:00
( first_token_position , last_token_position )
2022-03-22 15:22:14 +01:00
}
2022-03-28 18:17:50 +02:00
fn match_interval_score ( & self , matches : & [ Match ] ) -> ( i16 , i16 , i16 ) {
2022-04-04 18:56:59 +02:00
let mut ids : Vec < PrimitiveWordId > = Vec ::with_capacity ( matches . len ( ) ) ;
2022-03-28 18:17:50 +02:00
let mut order_score = 0 ;
let mut distance_score = 0 ;
let mut iter = matches . iter ( ) . peekable ( ) ;
while let Some ( m ) = iter . next ( ) {
if let Some ( next_match ) = iter . peek ( ) {
// if matches are ordered
2022-04-04 18:56:59 +02:00
if next_match . ids . iter ( ) . min ( ) > m . ids . iter ( ) . min ( ) {
2022-03-28 18:17:50 +02:00
order_score + = 1 ;
}
// compute distance between matches
distance_score - = ( next_match . word_position - m . word_position ) . min ( 7 ) as i16 ;
}
2022-04-04 18:56:59 +02:00
ids . extend ( m . ids . iter ( ) ) ;
2022-03-28 18:17:50 +02:00
}
ids . sort_unstable ( ) ;
ids . dedup ( ) ;
let uniq_score = ids . len ( ) as i16 ;
// rank by unique match count, then by distance between matches, then by ordered match count.
( uniq_score , distance_score , order_score )
}
fn find_best_match_interval < ' a > ( & self , matches : & ' a [ Match ] ) -> & ' a [ Match ] {
if matches . len ( ) > 1 {
2022-03-29 14:51:02 +02:00
let mut best_interval = ( 0 , 0 ) ;
let mut best_interval_score = self . match_interval_score ( & matches [ 0 ..= 0 ] ) ;
2022-03-28 18:17:50 +02:00
let mut interval_first = 0 ;
2022-03-29 14:51:02 +02:00
let mut interval_last = 0 ;
for ( index , next_match ) in matches . iter ( ) . enumerate ( ) . skip ( 1 ) {
2022-03-28 18:17:50 +02:00
// if next match would make interval gross more than crop_size
2022-03-29 14:51:02 +02:00
if next_match . word_position - matches [ interval_first ] . word_position
> = self . crop_size
2022-03-28 18:17:50 +02:00
{
let interval_score =
self . match_interval_score ( & matches [ interval_first ..= interval_last ] ) ;
// keep interval if it's the best
if interval_score > best_interval_score {
best_interval = ( interval_first , interval_last ) ;
best_interval_score = interval_score ;
}
// advance start of the interval while interval is longer than crop_size
while next_match . word_position - matches [ interval_first ] . word_position
2022-03-29 14:51:02 +02:00
> = self . crop_size
2022-03-28 18:17:50 +02:00
{
interval_first + = 1 ;
}
}
interval_last = index ;
}
let interval_score =
self . match_interval_score ( & matches [ interval_first ..= interval_last ] ) ;
if interval_score > best_interval_score {
best_interval = ( interval_first , interval_last ) ;
}
& matches [ best_interval . 0 ..= best_interval . 1 ]
} else {
matches
2022-03-28 15:57:05 +02:00
}
2022-03-22 15:22:14 +01:00
}
2022-03-28 18:17:50 +02:00
/// Returns the `(start, end)` byte offsets of the cropped text.
///
/// When the first token position does not exist the start is 0; when the
/// last one does not exist the end equals the start.
fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
    let interval = self.find_best_match_interval(matches);
    let (first, last) = self.token_crop_bounds(interval);

    let start = self.tokens.get(first).map_or(0, |token| token.byte_start);
    let end = self.tokens.get(last).map_or(start, |token| token.byte_end);
    (start, end)
}
2022-03-22 15:22:14 +01:00
pub fn format ( & mut self , highlight : bool , crop : bool ) -> Cow < ' t , str > {
2022-03-29 14:51:02 +02:00
// If 0 it will be considered null and thus not crop the field
// https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
let crop = crop & & self . crop_size > 0 ;
2022-03-22 15:22:14 +01:00
if ! highlight & & ! crop {
// compute matches is not needed if no highlight or crop is requested.
Cow ::Borrowed ( self . text )
} else {
match & self . matches {
Some ( matches ) = > {
let ( byte_start , byte_end ) =
if crop { self . crop_bounds ( matches ) } else { ( 0 , self . text . len ( ) ) } ;
let mut formatted = Vec ::new ( ) ;
// push crop marker if it's not the start of the text.
if byte_start > 0 & & ! self . crop_marker . is_empty ( ) {
formatted . push ( self . crop_marker ) ;
}
let mut byte_index = byte_start ;
if highlight {
// insert highlight markers around matches.
2022-03-28 15:57:05 +02:00
let tokens = self . tokens ;
2022-03-22 15:22:14 +01:00
for m in matches
. iter ( )
2022-03-28 15:57:05 +02:00
. skip_while ( | m | tokens [ m . token_position ] . byte_start < byte_start )
. take_while ( | m | tokens [ m . token_position ] . byte_start < byte_end )
2022-03-22 15:22:14 +01:00
{
2022-03-28 15:57:05 +02:00
let token = & tokens [ m . token_position ] ;
if byte_index < token . byte_start {
formatted . push ( & self . text [ byte_index .. token . byte_start ] ) ;
2022-03-22 15:22:14 +01:00
}
2022-03-30 15:43:49 +02:00
let highlight_byte_index = self . text [ token . byte_start .. ]
. char_indices ( )
. enumerate ( )
. find ( | ( i , _ ) | * i = = m . match_len )
2022-04-04 18:56:59 +02:00
. map_or ( token . byte_end , | ( _ , ( i , _ ) ) | i + token . byte_start )
. min ( token . byte_end ) ;
2022-03-22 15:22:14 +01:00
formatted . push ( self . highlight_prefix ) ;
2022-03-30 15:43:49 +02:00
formatted . push ( & self . text [ token . byte_start .. highlight_byte_index ] ) ;
2022-03-22 15:22:14 +01:00
formatted . push ( self . highlight_suffix ) ;
2022-03-30 15:43:49 +02:00
formatted . push ( & self . text [ highlight_byte_index .. token . byte_end ] ) ;
2022-03-22 15:22:14 +01:00
2022-03-28 15:57:05 +02:00
byte_index = token . byte_end ;
2022-03-22 15:22:14 +01:00
}
}
// push the rest of the text between last match and the end of crop.
if byte_index < byte_end {
formatted . push ( & self . text [ byte_index .. byte_end ] ) ;
}
// push crop marker if it's not the end of the text.
if byte_end < self . text . len ( ) & & ! self . crop_marker . is_empty ( ) {
formatted . push ( self . crop_marker ) ;
}
if formatted . len ( ) = = 1 {
// avoid concatenating if there is already 1 slice.
Cow ::Borrowed ( & self . text [ byte_start .. byte_end ] )
} else {
Cow ::Owned ( formatted . concat ( ) )
}
}
None = > self . compute_matches ( ) . format ( highlight , crop ) ,
}
}
}
}
#[ cfg(test) ]
mod tests {
2022-03-30 15:43:49 +02:00
use meilisearch_tokenizer ::{ Analyzer , AnalyzerConfig } ;
2022-03-22 15:22:14 +01:00
use super ::* ;
2022-04-04 18:56:59 +02:00
use crate ::search ::matches ::matching_words ::MatchingWord ;
/// Builds the `MatchingWords` fixture shared by most tests: the words
/// "split", "the" and "world", paired with `vec![0]`, `vec![1]` and `vec![2]`
/// (presumably the query word ids; confirm against `MatchingWords::new`).
fn matching_words ( ) -> MatchingWords {
let matching_words = vec! [
( vec! [ MatchingWord ::new ( " split " . to_string ( ) , 0 , false ) ] , vec! [ 0 ] ) ,
( vec! [ MatchingWord ::new ( " the " . to_string ( ) , 0 , false ) ] , vec! [ 1 ] ) ,
( vec! [ MatchingWord ::new ( " world " . to_string ( ) , 1 , true ) ] , vec! [ 2 ] ) ,
] ;
MatchingWords ::new ( matching_words )
2022-03-22 15:22:14 +01:00
}
/// With neither highlight nor crop requested, `format` must return the text
/// unchanged, whether or not it contains matches.
#[ test ]
fn format_identity ( ) {
2022-04-04 18:56:59 +02:00
let matching_words = matching_words ( ) ;
2022-03-22 15:22:14 +01:00
2022-04-04 18:56:59 +02:00
let builder = MatcherBuilder ::from_matching_words ( matching_words ) ;
2022-03-22 15:22:14 +01:00
let analyzer = Analyzer ::new ( AnalyzerConfig ::< Vec < u8 > > ::default ( ) ) ;
let highlight = false ;
let crop = false ;
// Text without any match.
let text = " A quick brown fox can not jump 32 feet, right? Brr, it is cold! " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no crop and no highlight should return complete text.
assert_eq! ( & matcher . format ( highlight , crop ) , & text ) ;
// Text containing all matches.
2022-03-28 15:57:05 +02:00
let text = " Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World. " ;
2022-03-22 15:22:14 +01:00
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no crop and no highlight should return complete text.
assert_eq! ( & matcher . format ( highlight , crop ) , & text ) ;
// Text containing some matches.
let text = " Natalie risk her future to build a world with the boy she loves. " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no crop and no highlight should return complete text.
assert_eq! ( & matcher . format ( highlight , crop ) , & text ) ;
}
/// With highlight only, `format` must return the whole text with every match
/// wrapped in the default highlight markers.
#[ test ]
fn format_highlight ( ) {
2022-04-04 18:56:59 +02:00
let matching_words = matching_words ( ) ;
2022-03-22 15:22:14 +01:00
2022-04-04 18:56:59 +02:00
let builder = MatcherBuilder ::from_matching_words ( matching_words ) ;
2022-03-22 15:22:14 +01:00
let analyzer = Analyzer ::new ( AnalyzerConfig ::< Vec < u8 > > ::default ( ) ) ;
let highlight = true ;
let crop = false ;
2022-03-29 14:51:02 +02:00
// empty text.
let text = " " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
assert_eq! ( & matcher . format ( highlight , crop ) , " " ) ;
// text containing only separators.
let text = " :-) " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
assert_eq! ( & matcher . format ( highlight , crop ) , " :-) " ) ;
2022-03-22 15:22:14 +01:00
// Text without any match.
let text = " A quick brown fox can not jump 32 feet, right? Brr, it is cold! " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no crop should return complete text, because there is no matches.
assert_eq! ( & matcher . format ( highlight , crop ) , & text ) ;
// Text containing all matches.
2022-03-28 15:57:05 +02:00
let text = " Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World. " ;
2022-03-22 15:22:14 +01:00
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no crop should return complete text with highlighted matches.
2022-03-28 15:57:05 +02:00
assert_eq! ( & matcher . format ( highlight , crop ) , " Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>. " ) ;
2022-03-22 15:22:14 +01:00
// Text containing some matches.
let text = " Natalie risk her future to build a world with the boy she loves. " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no crop should return complete text with highlighted matches.
assert_eq! (
& matcher . format ( highlight , crop ) ,
" Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. "
) ;
2022-03-30 15:43:49 +02:00
}
/// Highlighting must cut on character boundaries when the text contains
/// multi-byte characters, for whole-word and for partial-word matches.
#[ test ]
fn highlight_unicode ( ) {
2022-04-04 18:56:59 +02:00
let matching_words = vec! [
( vec! [ MatchingWord ::new ( " wessfali " . to_string ( ) , 1 , true ) ] , vec! [ 0 ] ) ,
( vec! [ MatchingWord ::new ( " world " . to_string ( ) , 1 , true ) ] , vec! [ 1 ] ) ,
] ;
let matching_words = MatchingWords ::new ( matching_words ) ;
2022-03-30 15:43:49 +02:00
2022-04-04 18:56:59 +02:00
let builder = MatcherBuilder ::from_matching_words ( matching_words ) ;
2022-03-30 15:43:49 +02:00
let analyzer = Analyzer ::new ( AnalyzerConfig ::< Vec < u8 > > ::default ( ) ) ;
2022-03-30 15:15:14 +02:00
2022-03-30 15:43:49 +02:00
let highlight = true ;
let crop = false ;
// Text containing prefix match.
let text = " Ŵôřlḑôle " ;
2022-03-30 15:15:14 +02:00
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no crop should return complete text with highlighted matches.
2022-03-30 15:43:49 +02:00
assert_eq! ( & matcher . format ( highlight , crop ) , " <em>Ŵôřlḑ</em>ôle " ) ;
// Text containing unicode match.
let text = " Ŵôřlḑ " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no crop should return complete text with highlighted matches.
assert_eq! ( & matcher . format ( highlight , crop ) , " <em>Ŵôřlḑ</em> " ) ;
// Text containing unicode match.
let text = " Westfália " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no crop should return complete text with highlighted matches.
2022-04-04 18:56:59 +02:00
assert_eq! ( & matcher . format ( highlight , crop ) , " <em>Westfáli</em>a " ) ;
2022-03-22 15:22:14 +01:00
}
/// With crop only, `format` must return a window of the default crop size
/// placed around the best group of matches, with crop markers where the text
/// was cut and no highlight markers.
#[ test ]
fn format_crop ( ) {
2022-04-04 18:56:59 +02:00
let matching_words = matching_words ( ) ;
2022-03-22 15:22:14 +01:00
2022-04-04 18:56:59 +02:00
let builder = MatcherBuilder ::from_matching_words ( matching_words ) ;
2022-03-22 15:22:14 +01:00
let analyzer = Analyzer ::new ( AnalyzerConfig ::< Vec < u8 > > ::default ( ) ) ;
let highlight = false ;
let crop = true ;
2022-03-29 14:51:02 +02:00
// empty text.
let text = " " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
assert_eq! ( & matcher . format ( highlight , crop ) , " " ) ;
// text containing only separators.
let text = " :-) " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
assert_eq! ( & matcher . format ( highlight , crop ) , " :-) " ) ;
2022-03-22 15:22:14 +01:00
// Text without any match.
let text = " A quick brown fox can not jump 32 feet, right? Brr, it is cold! " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no highlight should return 10 first words with a marker at the end.
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" A quick brown fox can not jump 32 feet, right… "
2022-03-22 15:22:14 +01:00
) ;
2022-03-29 14:51:02 +02:00
// Text without any match starting by a separator.
let text = " (A quick brown fox can not jump 32 feet, right? Brr, it is cold!) " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no highlight should return 10 first words with a marker at the end.
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" (A quick brown fox can not jump 32 feet, right… "
2022-03-29 14:51:02 +02:00
) ;
2022-03-28 15:57:05 +02:00
// Test phrase propagation
let text = " Natalie risk her future. Split The World is a book written by Emily Henry. I never read it. " ;
2022-03-22 15:22:14 +01:00
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
2022-03-28 15:57:05 +02:00
// should crop the phrase instead of cropping around the match.
2022-03-22 15:22:14 +01:00
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" …Split The World is a book written by Emily Henry… "
2022-03-22 15:22:14 +01:00
) ;
// Text containing some matches.
let text = " Natalie risk her future to build a world with the boy she loves. " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no highlight should return 10 last words with a marker at the start.
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" …future to build a world with the boy she loves… "
2022-03-22 15:22:14 +01:00
) ;
2022-03-28 15:57:05 +02:00
// Text containing all matches.
let text = " Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World. " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// no highlight should return 10 last words with a marker at the start.
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" …she loves. Emily Henry: The Love That Split The World. "
2022-03-28 15:57:05 +02:00
) ;
2022-03-22 15:22:14 +01:00
// Text containing a match unordered and a match ordered.
let text = " The world split void void void void void void void void void split the world void void " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// crop should return 10 last words with a marker at the start.
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" …void void void void void split the world void void "
2022-03-22 15:22:14 +01:00
) ;
2022-03-28 18:17:50 +02:00
// Text containing matches with different density.
let text = " split void the void void world void void void void void void void void void void split the world void void " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// crop should return 10 last words with a marker at the start.
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" …void void void void void split the world void void "
2022-03-28 18:17:50 +02:00
) ;
// Text containing matches with same word.
let text = " split split split split split split void void void void void void void void void void split the world void void " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// crop should return 10 last words with a marker at the start.
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" …void void void void void split the world void void "
2022-03-28 18:17:50 +02:00
) ;
2022-03-22 15:22:14 +01:00
}
/// With both highlight and crop, `format` must return the cropped window with
/// the matches it contains wrapped in the default highlight markers.
#[ test ]
fn format_highlight_crop ( ) {
2022-04-04 18:56:59 +02:00
let matching_words = matching_words ( ) ;
2022-03-22 15:22:14 +01:00
2022-04-04 18:56:59 +02:00
let builder = MatcherBuilder ::from_matching_words ( matching_words ) ;
2022-03-22 15:22:14 +01:00
let analyzer = Analyzer ::new ( AnalyzerConfig ::< Vec < u8 > > ::default ( ) ) ;
let highlight = true ;
let crop = true ;
2022-03-29 14:51:02 +02:00
// empty text.
let text = " " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
assert_eq! ( & matcher . format ( highlight , crop ) , " " ) ;
// text containing only separators.
let text = " :-) " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
assert_eq! ( & matcher . format ( highlight , crop ) , " :-) " ) ;
2022-03-22 15:22:14 +01:00
// Text without any match.
let text = " A quick brown fox can not jump 32 feet, right? Brr, it is cold! " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// both should return 10 first words with a marker at the end.
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" A quick brown fox can not jump 32 feet, right… "
2022-03-22 15:22:14 +01:00
) ;
// Text containing some matches.
let text = " Natalie risk her future to build a world with the boy she loves. " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// both should return 10 last words with a marker at the start and highlighted matches.
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" …future to build a <em>world</em> with <em>the</em> boy she loves… "
2022-03-22 15:22:14 +01:00
) ;
2022-03-28 15:57:05 +02:00
// Text containing all matches.
let text = " Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World. " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// both should return 10 last words with a marker at the start and highlighted matches.
2022-03-30 17:22:58 +02:00
assert_eq! ( & matcher . format ( highlight , crop ) , " …she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>. " ) ;
2022-03-28 15:57:05 +02:00
2022-03-22 15:22:14 +01:00
// Text containing a match unordered and a match ordered.
let text = " The world split void void void void void void void void void split the world void void " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// crop should return 10 last words with a marker at the start.
assert_eq! (
& matcher . format ( highlight , crop ) ,
2022-03-30 17:22:58 +02:00
" …void void void void void <em>split</em> <em>the</em> <em>world</em> void void "
2022-03-22 15:22:14 +01:00
) ;
}
2022-03-29 14:51:02 +02:00
/// Crop sizes smaller than the number of query words must keep only the
/// first matches, and a crop size of 0 must disable cropping altogether.
#[ test ]
fn smaller_crop_size ( ) {
//! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
2022-04-04 18:56:59 +02:00
let matching_words = matching_words ( ) ;
2022-03-29 14:51:02 +02:00
2022-04-04 18:56:59 +02:00
let mut builder = MatcherBuilder ::from_matching_words ( matching_words ) ;
2022-03-29 14:51:02 +02:00
let analyzer = Analyzer ::new ( AnalyzerConfig ::< Vec < u8 > > ::default ( ) ) ;
let highlight = false ;
let crop = true ;
let text = " void void split the world void void. " ;
let analyzed = analyzer . analyze ( & text ) ;
let tokens : Vec < _ > = analyzed . tokens ( ) . collect ( ) ;
// set a smaller crop size
builder . crop_size ( 2 ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// because crop size < query size, partially format matches.
2022-03-30 17:22:58 +02:00
assert_eq! ( & matcher . format ( highlight , crop ) , " …split the… " ) ;
2022-03-29 14:51:02 +02:00
// set a smaller crop size
builder . crop_size ( 1 ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// because crop size < query size, partially format matches.
2022-03-30 17:22:58 +02:00
assert_eq! ( & matcher . format ( highlight , crop ) , " …split… " ) ;
2022-03-29 14:51:02 +02:00
// set a smaller crop size
builder . crop_size ( 0 ) ;
let mut matcher = builder . build ( & tokens [ .. ] , text ) ;
// because crop size is 0, crop is ignored.
assert_eq! ( & matcher . format ( highlight , crop ) , " void void split the world void void. " ) ;
}
2022-03-22 15:22:14 +01:00
}