2019-10-31 15:00:36 +01:00
use std ::cmp ::Ordering ;
use std ::collections ::{ HashMap , HashSet } ;
2020-03-02 14:34:29 +01:00
use std ::hash ::{ Hash , Hasher } ;
2020-04-24 18:18:40 +02:00
use std ::time ::Instant ;
2019-10-31 15:00:36 +01:00
2020-01-23 11:30:18 +01:00
use indexmap ::IndexMap ;
use log ::error ;
2020-05-22 12:35:23 +02:00
use meilisearch_core ::{ Filter , MainReader } ;
2020-05-05 22:29:35 +02:00
use meilisearch_core ::facets ::FacetFilter ;
2020-01-23 11:30:18 +01:00
use meilisearch_core ::criterion ::* ;
use meilisearch_core ::settings ::RankingRule ;
2020-05-22 12:35:23 +02:00
use meilisearch_core ::{ Highlight , Index , RankedMap } ;
2020-01-23 11:30:18 +01:00
use meilisearch_schema ::{ FieldId , Schema } ;
2020-04-02 19:53:51 +02:00
use meilisearch_tokenizer ::is_cjk ;
2020-01-23 11:30:18 +01:00
use serde ::{ Deserialize , Serialize } ;
use serde_json ::Value ;
2020-03-02 14:34:29 +01:00
use siphasher ::sip ::SipHasher ;
2020-04-24 13:21:22 +02:00
use slice_group_by ::GroupBy ;
2020-01-23 11:30:18 +01:00
2020-05-22 12:03:57 +02:00
use crate ::error ::{ Error , ResponseError } ;
2020-01-29 18:30:21 +01:00
2019-10-31 15:00:36 +01:00
/// Extension trait that adds a search entry point to a type; this file
/// implements it for `Index`.
pub trait IndexSearchExt {
/// Creates a `SearchBuilder` for the given `query` string.
fn new_search ( & self , query : String ) -> SearchBuilder ;
}
impl IndexSearchExt for Index {
    /// Starts a search for `query` on this index.
    ///
    /// The returned builder begins at offset 0 with a limit of 20, with match
    /// reporting switched off and every optional setting (crop, retrieve,
    /// highlight, filters, facet filters, facets) left unset.
    fn new_search(&self, query: String) -> SearchBuilder {
        // Pagination window used until the caller overrides it.
        let (offset, limit) = (0, 20);

        SearchBuilder {
            index: self,
            query,
            offset,
            limit,
            matches: false,
            filters: None,
            facet_filters: None,
            facets: None,
            attributes_to_crop: None,
            attributes_to_retrieve: None,
            attributes_to_highlight: None,
        }
    }
}
/// Collects the parameters of one search against an index; `search` consumes
/// the builder and runs the query.
pub struct SearchBuilder < ' a > {
/// Index the search is run against.
index : & ' a Index ,
/// Query string handed to the query builder.
query : String ,
/// Start of the requested result range.
offset : usize ,
/// Length of the requested result range (the range is `offset..offset + limit`).
limit : usize ,
/// Attribute name -> crop context passed to `crop_text` for that attribute.
attributes_to_crop : Option < HashMap < String , usize > > ,
/// Attribute names kept in each returned document; `None` falls back to the
/// schema's displayed attributes.
attributes_to_retrieve : Option < HashSet < String > > ,
/// Attribute names whose matches are wrapped in highlight tags.
attributes_to_highlight : Option < HashSet < String > > ,
/// Filter expression, parsed with `Filter::parse` when the search runs.
filters : Option < String > ,
/// When true, every hit carries its match positions (`matches_info`).
matches : bool ,
2020-05-05 22:29:35 +02:00
/// Facet filter forwarded to the query builder.
facet_filters : Option < FacetFilter > ,
2020-05-12 12:19:44 +02:00
/// `(field id, name)` pairs forwarded to the query builder via `set_facets`.
facets : Option < Vec < ( FieldId , String ) > >
2019-10-31 15:00:36 +01:00
}
impl < ' a > SearchBuilder < ' a > {
pub fn offset ( & mut self , value : usize ) -> & SearchBuilder {
self . offset = value ;
self
}
pub fn limit ( & mut self , value : usize ) -> & SearchBuilder {
self . limit = value ;
self
}
pub fn attributes_to_crop ( & mut self , value : HashMap < String , usize > ) -> & SearchBuilder {
self . attributes_to_crop = Some ( value ) ;
self
}
pub fn attributes_to_retrieve ( & mut self , value : HashSet < String > ) -> & SearchBuilder {
self . attributes_to_retrieve = Some ( value ) ;
self
}
pub fn add_retrievable_field ( & mut self , value : String ) -> & SearchBuilder {
let attributes_to_retrieve = self . attributes_to_retrieve . get_or_insert ( HashSet ::new ( ) ) ;
attributes_to_retrieve . insert ( value ) ;
self
}
pub fn attributes_to_highlight ( & mut self , value : HashSet < String > ) -> & SearchBuilder {
self . attributes_to_highlight = Some ( value ) ;
self
}
2020-05-05 22:29:35 +02:00
pub fn add_facet_filters ( & mut self , filters : FacetFilter ) -> & SearchBuilder {
self . facet_filters = Some ( filters ) ;
self
}
2019-10-31 15:00:36 +01:00
pub fn filters ( & mut self , value : String ) -> & SearchBuilder {
self . filters = Some ( value ) ;
self
}
pub fn get_matches ( & mut self ) -> & SearchBuilder {
self . matches = true ;
self
}
2020-05-12 12:19:44 +02:00
pub fn add_facets ( & mut self , facets : Vec < ( FieldId , String ) > ) -> & SearchBuilder {
2020-05-07 19:25:18 +02:00
self . facets = Some ( facets ) ;
self
}
2020-05-22 12:35:23 +02:00
pub fn search ( self , reader : & MainReader ) -> Result < SearchResult , ResponseError > {
2020-04-17 14:52:13 +02:00
let schema = self
. index
. main
. schema ( reader ) ?
2020-05-19 18:20:29 +02:00
. ok_or ( Error ::internal ( " missing schema " ) ) ? ;
2019-10-31 15:00:36 +01:00
2020-04-17 14:52:13 +02:00
let ranked_map = self . index . main . ranked_map ( reader ) ? . unwrap_or_default ( ) ;
2019-10-31 15:00:36 +01:00
// Change criteria
let mut query_builder = match self . get_criteria ( reader , & ranked_map , & schema ) ? {
Some ( criteria ) = > self . index . query_builder_with_criteria ( criteria ) ,
None = > self . index . query_builder ( ) ,
} ;
2020-04-06 20:05:02 +02:00
if let Some ( filter_expression ) = & self . filters {
let filter = Filter ::parse ( filter_expression , & schema ) ? ;
2020-05-12 12:19:44 +02:00
let index = & self . index ;
2020-04-06 20:05:02 +02:00
query_builder . with_filter ( move | id | {
let reader = & reader ;
let filter = & filter ;
match filter . test ( reader , index , id ) {
Ok ( res ) = > res ,
Err ( e ) = > {
log ::warn! ( " unexpected error during filtering: {} " , e ) ;
false
}
2019-10-31 15:00:36 +01:00
}
2020-04-06 20:05:02 +02:00
} ) ;
2019-10-31 15:00:36 +01:00
}
2020-03-02 14:34:29 +01:00
if let Some ( field ) = self . index . main . distinct_attribute ( reader ) ? {
if let Some ( field_id ) = schema . id ( & field ) {
2020-05-12 12:19:44 +02:00
let index = & self . index ;
2020-03-02 14:34:29 +01:00
query_builder . with_distinct ( 1 , move | id | {
2020-05-12 12:19:44 +02:00
match index . document_attribute_bytes ( reader , id , field_id ) {
2020-03-02 14:34:29 +01:00
Ok ( Some ( bytes ) ) = > {
let mut s = SipHasher ::new ( ) ;
bytes . hash ( & mut s ) ;
Some ( s . finish ( ) )
}
_ = > None ,
}
} ) ;
}
}
2020-05-12 12:37:16 +02:00
query_builder . set_facet_filter ( self . facet_filters ) ;
2020-05-12 12:19:44 +02:00
query_builder . set_facets ( self . facets ) ;
2020-05-05 22:29:35 +02:00
2019-12-22 20:55:11 +01:00
let start = Instant ::now ( ) ;
2020-05-12 12:19:44 +02:00
let result = query_builder . query ( reader , & self . query , self . offset .. ( self . offset + self . limit ) ) ;
2020-05-19 18:20:29 +02:00
let search_result = result . map_err ( Error ::search_documents ) ? ;
2020-03-25 14:00:29 +01:00
let time_ms = start . elapsed ( ) . as_millis ( ) as usize ;
2019-10-31 15:00:36 +01:00
2020-04-02 19:53:51 +02:00
let mut all_attributes : HashSet < & str > = HashSet ::new ( ) ;
let mut all_formatted : HashSet < & str > = HashSet ::new ( ) ;
match & self . attributes_to_retrieve {
Some ( to_retrieve ) = > {
all_attributes . extend ( to_retrieve . iter ( ) . map ( String ::as_str ) ) ;
if let Some ( to_highlight ) = & self . attributes_to_highlight {
all_formatted . extend ( to_highlight . iter ( ) . map ( String ::as_str ) ) ;
2019-10-31 15:00:36 +01:00
}
2019-11-15 12:04:46 +01:00
2020-04-02 19:53:51 +02:00
if let Some ( to_crop ) = & self . attributes_to_crop {
all_formatted . extend ( to_crop . keys ( ) . map ( String ::as_str ) ) ;
}
all_attributes . extend ( & all_formatted ) ;
} ,
None = > {
all_attributes . extend ( schema . displayed_name ( ) ) ;
// If we specified at least one attribute to highlight or crop then
// all available attributes will be returned in the _formatted field.
if self . attributes_to_highlight . is_some ( ) | | self . attributes_to_crop . is_some ( ) {
all_formatted . extend ( all_attributes . iter ( ) . cloned ( ) ) ;
}
} ,
}
let mut hits = Vec ::with_capacity ( self . limit ) ;
2020-05-07 19:25:18 +02:00
for doc in search_result . documents {
2020-04-02 19:53:51 +02:00
let mut document : IndexMap < String , Value > = self
2019-10-31 15:00:36 +01:00
. index
2020-04-26 20:54:35 +02:00
. document ( reader , Some ( & all_attributes ) , doc . id )
2020-05-19 18:20:29 +02:00
. map_err ( | e | Error ::retrieve_document ( doc . id . 0 , e ) ) ?
. ok_or ( Error ::internal (
2020-04-24 18:18:40 +02:00
" Impossible to retrieve the document; Corrupted data " ,
2020-04-17 14:52:13 +02:00
) ) ? ;
2019-10-31 15:00:36 +01:00
2020-04-02 19:53:51 +02:00
let mut formatted = document . iter ( )
. filter ( | ( key , _ ) | all_formatted . contains ( key . as_str ( ) ) )
. map ( | ( k , v ) | ( k . clone ( ) , v . clone ( ) ) )
. collect ( ) ;
2019-12-12 16:36:42 +01:00
2019-10-31 15:00:36 +01:00
let mut matches = doc . highlights . clone ( ) ;
// Crops fields if needed
2019-11-15 12:04:46 +01:00
if let Some ( fields ) = & self . attributes_to_crop {
crop_document ( & mut formatted , & mut matches , & schema , fields ) ;
2019-10-31 15:00:36 +01:00
}
// Transform to readable matches
2020-01-24 11:29:08 +01:00
if let Some ( attributes_to_highlight ) = & self . attributes_to_highlight {
2020-04-02 19:53:51 +02:00
let matches = calculate_matches (
2020-05-12 16:49:13 +02:00
& matches ,
2020-04-02 19:53:51 +02:00
self . attributes_to_highlight . clone ( ) ,
& schema ,
) ;
2020-01-24 11:29:08 +01:00
formatted = calculate_highlights ( & formatted , & matches , attributes_to_highlight ) ;
2019-10-31 15:00:36 +01:00
}
2020-04-02 19:53:51 +02:00
let matches_info = if self . matches {
2020-05-12 16:49:13 +02:00
Some ( calculate_matches ( & matches , self . attributes_to_retrieve . clone ( ) , & schema ) )
2020-04-02 19:53:51 +02:00
} else {
None
} ;
if let Some ( attributes_to_retrieve ) = & self . attributes_to_retrieve {
document . retain ( | key , _ | attributes_to_retrieve . contains ( & key . to_string ( ) ) )
}
2019-10-31 15:00:36 +01:00
let hit = SearchHit {
2019-11-15 12:04:46 +01:00
document ,
formatted ,
2019-10-31 15:00:36 +01:00
matches_info ,
} ;
hits . push ( hit ) ;
}
let results = SearchResult {
hits ,
offset : self . offset ,
limit : self . limit ,
2020-05-07 19:25:18 +02:00
nb_hits : search_result . nb_hits ,
2020-05-12 14:36:28 +02:00
exhaustive_nb_hits : search_result . exhaustive_nb_hit ,
2019-10-31 15:00:36 +01:00
processing_time_ms : time_ms ,
query : self . query . to_string ( ) ,
2020-05-26 17:56:07 +02:00
facets_distribution : search_result . facets ,
2020-05-27 19:45:23 +02:00
exhaustive_facets_count : search_result . exhaustive_facets_count ,
2019-10-31 15:00:36 +01:00
} ;
Ok ( results )
}
pub fn get_criteria (
& self ,
2020-05-22 12:35:23 +02:00
reader : & MainReader ,
2019-10-31 15:00:36 +01:00
ranked_map : & ' a RankedMap ,
schema : & Schema ,
2020-05-22 12:03:57 +02:00
) -> Result < Option < Criteria < ' a > > , ResponseError > {
2020-01-29 18:30:21 +01:00
let ranking_rules = self . index . main . ranking_rules ( reader ) ? ;
2019-10-31 15:00:36 +01:00
if let Some ( ranking_rules ) = ranking_rules {
let mut builder = CriteriaBuilder ::with_capacity ( 7 + ranking_rules . len ( ) ) ;
2020-01-14 17:26:27 +01:00
for rule in ranking_rules {
match rule {
RankingRule ::Typo = > builder . push ( Typo ) ,
RankingRule ::Words = > builder . push ( Words ) ,
RankingRule ::Proximity = > builder . push ( Proximity ) ,
RankingRule ::Attribute = > builder . push ( Attribute ) ,
RankingRule ::WordsPosition = > builder . push ( WordsPosition ) ,
2020-01-31 11:45:57 +01:00
RankingRule ::Exactness = > builder . push ( Exactness ) ,
2020-01-29 18:30:21 +01:00
RankingRule ::Asc ( field ) = > {
match SortByAttr ::lower_is_better ( & ranked_map , & schema , & field ) {
Ok ( rule ) = > builder . push ( rule ) ,
Err ( err ) = > error! ( " Error during criteria builder; {:?} " , err ) ,
}
}
2020-03-02 17:13:23 +01:00
RankingRule ::Desc ( field ) = > {
2020-01-29 18:30:21 +01:00
match SortByAttr ::higher_is_better ( & ranked_map , & schema , & field ) {
Ok ( rule ) = > builder . push ( rule ) ,
Err ( err ) = > error! ( " Error during criteria builder; {:?} " , err ) ,
}
}
2020-02-13 10:25:37 +01:00
}
2019-10-31 15:00:36 +01:00
}
2020-01-14 17:26:27 +01:00
builder . push ( DocumentId ) ;
return Ok ( Some ( builder . build ( ) ) ) ;
2019-10-31 15:00:36 +01:00
}
Ok ( None )
}
}
/// Position of one match inside an attribute value. `calculate_matches` fills
/// it from a highlight's `char_index` and `char_length`.
#[ derive(Debug, Clone, Eq, PartialEq, PartialOrd, Serialize, Deserialize) ]
pub struct MatchPosition {
/// Index of the first matched character.
pub start : usize ,
/// Number of matched characters.
pub length : usize ,
}
impl Ord for MatchPosition {
fn cmp ( & self , other : & Self ) -> Ordering {
match self . start . cmp ( & other . start ) {
Ordering ::Equal = > self . length . cmp ( & other . length ) ,
_ = > self . start . cmp ( & other . start ) ,
}
}
}
/// Map from a `String` key to a JSON value.
pub type HighlightInfos = HashMap < String , Value > ;
/// Match positions grouped by attribute name, as produced by `calculate_matches`.
pub type MatchesInfos = HashMap < String , Vec < MatchPosition > > ;
// Disabled alias, kept as found.
// pub type RankingInfos = HashMap<String, u64>;
/// One search hit: the document plus its optional formatted view and match positions.
#[ derive(Debug, Clone, Serialize, Deserialize) ]
pub struct SearchHit {
/// Fields of the document; flattened into the hit when serialized.
#[ serde(flatten) ]
2019-11-15 12:04:46 +01:00
pub document : IndexMap < String , Value > ,
/// Cropped and/or highlighted copy of the selected fields; not serialized when empty.
#[ serde(rename = " _formatted " , skip_serializing_if = " IndexMap::is_empty " ) ]
pub formatted : IndexMap < String , Value > ,
2019-10-31 15:00:36 +01:00
/// Match positions per attribute; not serialized when `None`.
#[ serde(rename = " _matchesInfo " , skip_serializing_if = " Option::is_none " ) ]
pub matches_info : Option < MatchesInfos > ,
}
2020-03-25 13:59:15 +01:00
/// Response of a search, as assembled by `SearchBuilder::search`.
#[ derive(Debug, Clone, Serialize) ]
2019-10-31 15:00:36 +01:00
#[ serde(rename_all = " camelCase " ) ]
pub struct SearchResult {
/// Hits built from the documents returned by the query.
pub hits : Vec < SearchHit > ,
/// Offset the search was run with.
pub offset : usize ,
/// Limit the search was run with.
pub limit : usize ,
2020-03-25 12:11:37 +01:00
/// `nb_hits` value reported by the query result.
pub nb_hits : usize ,
/// `exhaustive_nb_hit` flag reported by the query result.
pub exhaustive_nb_hits : bool ,
2019-10-31 15:00:36 +01:00
/// Time spent in the query call, in milliseconds.
pub processing_time_ms : usize ,
/// Query string the search was run with.
pub query : String ,
2020-05-26 16:27:09 +02:00
/// Facet data reported by the query result; not serialized when `None`.
#[ serde(skip_serializing_if = " Option::is_none " ) ]
2020-05-26 17:56:07 +02:00
pub facets_distribution : Option < HashMap < String , HashMap < String , usize > > > ,
2020-05-27 19:45:23 +02:00
/// `exhaustive_facets_count` reported by the query result; not serialized when `None`.
#[ serde(skip_serializing_if = " Option::is_none " ) ]
pub exhaustive_facets_count : Option < bool > ,
2019-10-31 15:00:36 +01:00
}
2020-04-02 19:53:51 +02:00
/// returns the start index and the length on the crop.
2020-03-25 19:51:22 +01:00
fn aligned_crop ( text : & str , match_index : usize , context : usize ) -> ( usize , usize ) {
2020-03-26 11:34:50 +01:00
let is_word_component = | c : & char | c . is_alphanumeric ( ) & & ! is_cjk ( * c ) ;
2020-03-25 19:51:22 +01:00
let word_end_index = | mut index | {
2020-03-26 11:34:50 +01:00
if text . chars ( ) . nth ( index - 1 ) . map_or ( false , | c | is_word_component ( & c ) ) {
index + = text . chars ( ) . skip ( index ) . take_while ( is_word_component ) . count ( ) ;
2020-03-25 19:51:22 +01:00
}
index
} ;
2020-03-26 11:34:50 +01:00
if context = = 0 {
// count need to be at least 1 for cjk queries to return something
return ( match_index , 1 + text . chars ( ) . skip ( match_index ) . take_while ( is_word_component ) . count ( ) ) ;
}
2020-03-25 19:51:22 +01:00
let start = match match_index . saturating_sub ( context ) {
2020-05-13 15:57:58 +02:00
0 = > 0 ,
n = > {
let word_end_index = word_end_index ( n ) ;
// skip whitespaces if any
word_end_index + text . chars ( ) . skip ( word_end_index ) . take_while ( char ::is_ascii_whitespace ) . count ( )
}
2020-03-25 19:51:22 +01:00
} ;
2020-05-13 15:57:58 +02:00
let end = word_end_index ( match_index + context ) ;
2020-03-25 19:51:22 +01:00
( start , end - start )
}
2019-10-31 15:00:36 +01:00
fn crop_text (
text : & str ,
matches : impl IntoIterator < Item = Highlight > ,
context : usize ,
) -> ( String , Vec < Highlight > ) {
let mut matches = matches . into_iter ( ) . peekable ( ) ;
let char_index = matches . peek ( ) . map ( | m | m . char_index as usize ) . unwrap_or ( 0 ) ;
2020-03-25 19:51:22 +01:00
let ( start , count ) = aligned_crop ( text , char_index , context ) ;
2020-05-13 15:57:58 +02:00
// TODO do something about double allocation
let text = text
. chars ( )
. skip ( start )
. take ( count )
. collect ::< String > ( )
. trim ( )
. to_string ( ) ;
2019-10-31 15:00:36 +01:00
2020-03-26 11:34:50 +01:00
// update matches index to match the new cropped text
2019-10-31 15:00:36 +01:00
let matches = matches
2020-05-13 15:57:58 +02:00
. take_while ( | m | ( m . char_index as usize ) + ( m . char_length as usize ) < = start + count )
. map ( | m | Highlight {
char_index : m . char_index - start as u16 ,
.. m
2019-10-31 15:00:36 +01:00
} )
. collect ( ) ;
( text , matches )
}
fn crop_document (
document : & mut IndexMap < String , Value > ,
matches : & mut Vec < Highlight > ,
schema : & Schema ,
2019-11-15 12:04:46 +01:00
fields : & HashMap < String , usize > ,
) {
2019-10-31 15:00:36 +01:00
matches . sort_unstable_by_key ( | m | ( m . char_index , m . char_length ) ) ;
2019-11-15 12:04:46 +01:00
for ( field , length ) in fields {
2020-01-29 18:30:21 +01:00
let attribute = match schema . id ( field ) {
2019-11-15 12:04:46 +01:00
Some ( attribute ) = > attribute ,
None = > continue ,
} ;
let selected_matches = matches
. iter ( )
2020-01-14 17:26:27 +01:00
. filter ( | m | FieldId ::new ( m . attribute ) = = attribute )
2019-11-15 12:04:46 +01:00
. cloned ( ) ;
if let Some ( Value ::String ( ref mut original_text ) ) = document . get_mut ( field ) {
let ( cropped_text , cropped_matches ) =
crop_text ( original_text , selected_matches , * length ) ;
* original_text = cropped_text ;
2020-01-14 17:26:27 +01:00
matches . retain ( | m | FieldId ::new ( m . attribute ) ! = attribute ) ;
2019-11-15 12:04:46 +01:00
matches . extend_from_slice ( & cropped_matches ) ;
}
}
2019-10-31 15:00:36 +01:00
}
fn calculate_matches (
2020-05-12 16:49:13 +02:00
matches : & [ Highlight ] ,
2019-10-31 15:00:36 +01:00
attributes_to_retrieve : Option < HashSet < String > > ,
schema : & Schema ,
) -> MatchesInfos {
let mut matches_result : HashMap < String , Vec < MatchPosition > > = HashMap ::new ( ) ;
for m in matches . iter ( ) {
2020-01-29 18:30:21 +01:00
if let Some ( attribute ) = schema . name ( FieldId ::new ( m . attribute ) ) {
2020-05-13 15:57:58 +02:00
if let Some ( ref attributes_to_retrieve ) = attributes_to_retrieve {
2020-01-29 18:30:21 +01:00
if ! attributes_to_retrieve . contains ( attribute ) {
2020-01-14 17:26:27 +01:00
continue ;
}
2020-02-13 10:25:37 +01:00
}
2020-01-29 18:30:21 +01:00
if ! schema . displayed_name ( ) . contains ( attribute ) {
2020-01-27 18:25:42 +01:00
continue ;
}
2020-01-29 18:30:21 +01:00
if let Some ( pos ) = matches_result . get_mut ( attribute ) {
2020-01-14 17:26:27 +01:00
pos . push ( MatchPosition {
start : m . char_index as usize ,
length : m . char_length as usize ,
} ) ;
} else {
let mut positions = Vec ::new ( ) ;
positions . push ( MatchPosition {
start : m . char_index as usize ,
length : m . char_length as usize ,
} ) ;
2020-01-29 18:30:21 +01:00
matches_result . insert ( attribute . to_string ( ) , positions ) ;
2019-10-31 15:00:36 +01:00
}
}
}
for ( _ , val ) in matches_result . iter_mut ( ) {
val . sort_unstable ( ) ;
val . dedup ( ) ;
}
matches_result
}
fn calculate_highlights (
2019-11-15 12:04:46 +01:00
document : & IndexMap < String , Value > ,
matches : & MatchesInfos ,
attributes_to_highlight : & HashSet < String > ,
) -> IndexMap < String , Value > {
2019-12-12 16:36:42 +01:00
let mut highlight_result = document . clone ( ) ;
2019-11-15 12:04:46 +01:00
2019-10-31 15:00:36 +01:00
for ( attribute , matches ) in matches . iter ( ) {
2019-11-15 12:04:46 +01:00
if attributes_to_highlight . contains ( attribute ) {
2019-10-31 15:00:36 +01:00
if let Some ( Value ::String ( value ) ) = document . get ( attribute ) {
let value : Vec < _ > = value . chars ( ) . collect ( ) ;
let mut highlighted_value = String ::new ( ) ;
let mut index = 0 ;
2020-04-24 13:21:22 +02:00
let longest_matches = matches
. linear_group_by_key ( | m | m . start )
2020-05-13 15:57:58 +02:00
. map ( | group | group . last ( ) . unwrap ( ) )
. filter ( move | m | m . start > = index ) ;
2020-04-24 13:21:22 +02:00
for m in longest_matches {
2020-05-13 15:57:58 +02:00
let before = value . get ( index .. m . start ) ;
let highlighted = value . get ( m . start .. ( m . start + m . length ) ) ;
if let ( Some ( before ) , Some ( highlighted ) ) = ( before , highlighted ) {
highlighted_value . extend ( before ) ;
highlighted_value . push_str ( " <em> " ) ;
highlighted_value . extend ( highlighted ) ;
highlighted_value . push_str ( " </em> " ) ;
index = m . start + m . length ;
} else {
error! ( " value: {:?}; index: {:?}, match: {:?} " , value , index , m ) ;
2019-10-31 15:00:36 +01:00
}
}
highlighted_value . extend ( value [ index .. ] . iter ( ) ) ;
highlight_result . insert ( attribute . to_string ( ) , Value ::String ( highlighted_value ) ) ;
} ;
}
}
highlight_result
}
// Unit tests for the cropping, match-position and highlighting helpers above.
#[ cfg(test) ]
mod tests {
use super ::* ;
2020-03-26 14:44:03 +01:00
// `aligned_crop` on a French text and on a mixed latin/CJK text; each case
// applies the returned (start, length) to the text and checks the trimmed crop.
#[ test ]
fn aligned_crops ( ) {
let text = r # "En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la Fondation."# ;
// simple test
let ( start , length ) = aligned_crop ( & text , 6 , 2 ) ;
let cropped = text . chars ( ) . skip ( start ) . take ( length ) . collect ::< String > ( ) . trim ( ) . to_string ( ) ;
assert_eq! ( " début " , cropped ) ;
// first word test
let ( start , length ) = aligned_crop ( & text , 0 , 1 ) ;
let cropped = text . chars ( ) . skip ( start ) . take ( length ) . collect ::< String > ( ) . trim ( ) . to_string ( ) ;
assert_eq! ( " En " , cropped ) ;
// last word test
let ( start , length ) = aligned_crop ( & text , 510 , 2 ) ;
let cropped = text . chars ( ) . skip ( start ) . take ( length ) . collect ::< String > ( ) . trim ( ) . to_string ( ) ;
assert_eq! ( " Fondation " , cropped ) ;
// CJK tests
let text = " this isのス foo myタイリ test " ;
// mixed charset
let ( start , length ) = aligned_crop ( & text , 5 , 3 ) ;
let cropped = text . chars ( ) . skip ( start ) . take ( length ) . collect ::< String > ( ) . trim ( ) . to_string ( ) ;
2020-05-13 15:57:58 +02:00
assert_eq! ( " isの " , cropped ) ;
2020-04-02 19:53:51 +02:00
// split regular word / CJK word, no space
2020-03-26 14:44:03 +01:00
let ( start , length ) = aligned_crop ( & text , 7 , 1 ) ;
let cropped = text . chars ( ) . skip ( start ) . take ( length ) . collect ::< String > ( ) . trim ( ) . to_string ( ) ;
2020-05-13 15:57:58 +02:00
assert_eq! ( " の " , cropped ) ;
2020-03-26 14:44:03 +01:00
}
2020-04-27 19:10:40 +02:00
// Two highlights on the same attribute with the same start: the expected
// positions are ordered by length (2 before 3).
#[ test ]
fn calculate_matches ( ) {
let mut matches = Vec ::new ( ) ;
matches . push ( Highlight { attribute : 0 , char_index : 0 , char_length : 3 } ) ;
matches . push ( Highlight { attribute : 0 , char_index : 0 , char_length : 2 } ) ;
let mut attributes_to_retrieve : HashSet < String > = HashSet ::new ( ) ;
attributes_to_retrieve . insert ( " title " . to_string ( ) ) ;
let schema = Schema ::with_primary_key ( " title " ) ;
2020-05-12 16:49:13 +02:00
let matches_result = super ::calculate_matches ( & matches , Some ( attributes_to_retrieve ) , & schema ) ;
2020-04-27 19:10:40 +02:00
let mut matches_result_expected : HashMap < String , Vec < MatchPosition > > = HashMap ::new ( ) ;
let mut positions = Vec ::new ( ) ;
positions . push ( MatchPosition {
start : 0 ,
length : 2 ,
} ) ;
positions . push ( MatchPosition {
start : 0 ,
length : 3 ,
} ) ;
matches_result_expected . insert ( " title " . to_string ( ) , positions ) ;
assert_eq! ( matches_result , matches_result_expected ) ;
}
2019-10-31 15:00:36 +01:00
// One match per attribute: a match at the start of the title and a match at
// character 510 of the description.
#[ test ]
fn calculate_highlights ( ) {
let data = r #" {
" title " : " Fondation (Isaac ASIMOV) " ,
" description " : " En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la Fondation. "
} " #;
let document : IndexMap < String , Value > = serde_json ::from_str ( data ) . unwrap ( ) ;
let mut attributes_to_highlight = HashSet ::new ( ) ;
2019-11-15 12:04:46 +01:00
attributes_to_highlight . insert ( " title " . to_string ( ) ) ;
attributes_to_highlight . insert ( " description " . to_string ( ) ) ;
2019-10-31 15:00:36 +01:00
2019-11-15 12:04:46 +01:00
let mut matches = HashMap ::new ( ) ;
2019-10-31 15:00:36 +01:00
let mut m = Vec ::new ( ) ;
m . push ( MatchPosition {
start : 0 ,
length : 9 ,
} ) ;
matches . insert ( " title " . to_string ( ) , m ) ;
let mut m = Vec ::new ( ) ;
m . push ( MatchPosition {
start : 510 ,
length : 9 ,
} ) ;
matches . insert ( " description " . to_string ( ) , m ) ;
2019-11-15 12:04:46 +01:00
let result = super ::calculate_highlights ( & document , & matches , & attributes_to_highlight ) ;
2019-10-31 15:00:36 +01:00
2019-11-15 12:04:46 +01:00
let mut result_expected = IndexMap ::new ( ) ;
2019-10-31 15:00:36 +01:00
result_expected . insert (
" title " . to_string ( ) ,
Value ::String ( " <em>Fondation</em> (Isaac ASIMOV) " . to_string ( ) ) ,
) ;
result_expected . insert ( " description " . to_string ( ) , Value ::String ( " En ce début de trentième millénaire, l'Empire n'a jamais été aussi puissant, aussi étendu à travers toute la galaxie. C'est dans sa capitale, Trantor, que l'éminent savant Hari Seldon invente la psychohistoire, une science toute nouvelle, à base de psychologie et de mathématiques, qui lui permet de prédire l'avenir... C'est-à-dire l'effondrement de l'Empire d'ici cinq siècles et au-delà, trente mille années de chaos et de ténèbres. Pour empêcher cette catastrophe et sauver la civilisation, Seldon crée la <em>Fondation</em>. " . to_string ( ) ) ) ;
assert_eq! ( result , result_expected ) ;
}
2020-04-27 19:10:40 +02:00
// Two matches sharing the same start: only the longest one is highlighted.
#[ test ]
fn highlight_longest_match ( ) {
let data = r #" {
" title " : " Ice "
} " #;
let document : IndexMap < String , Value > = serde_json ::from_str ( data ) . unwrap ( ) ;
let mut attributes_to_highlight = HashSet ::new ( ) ;
attributes_to_highlight . insert ( " title " . to_string ( ) ) ;
let mut matches = HashMap ::new ( ) ;
let mut m = Vec ::new ( ) ;
m . push ( MatchPosition {
start : 0 ,
length : 2 ,
} ) ;
m . push ( MatchPosition {
start : 0 ,
length : 3 ,
} ) ;
matches . insert ( " title " . to_string ( ) , m ) ;
let result = super ::calculate_highlights ( & document , & matches , & attributes_to_highlight ) ;
let mut result_expected = IndexMap ::new ( ) ;
result_expected . insert (
" title " . to_string ( ) ,
Value ::String ( " <em>Ice</em> " . to_string ( ) ) ,
) ;
assert_eq! ( result , result_expected ) ;
}
2019-10-31 15:00:36 +01:00
}