542: Refactor matching word r=Kerollmops a=ManyTheFish

Simplify MatchingWords API


Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
bors[bot] 2022-06-02 16:23:41 +00:00 committed by GitHub
commit 78f76c841d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 89 additions and 121 deletions

View File

@ -1,5 +1,5 @@
use criterion::{criterion_group, criterion_main}; use criterion::{criterion_group, criterion_main};
use milli::tokenizer::Tokenize; use milli::tokenizer::TokenizerBuilder;
use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords};
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
@ -9,7 +9,7 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
struct Conf<'a> { struct Conf<'a> {
name: &'a str, name: &'a str,
text: &'a str, text: &'a str,
matching_words: MatcherBuilder, matching_words: MatcherBuilder<'a, Vec<u8>>,
} }
fn bench_formatting(c: &mut criterion::Criterion) { fn bench_formatting(c: &mut criterion::Criterion) {
@ -18,7 +18,7 @@ fn bench_formatting(c: &mut criterion::Criterion) {
Conf { Conf {
name: "'the door d'", name: "'the door d'",
text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#, text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#,
matching_words: MatcherBuilder::from_matching_words(MatchingWords::new(vec![ matching_words: MatcherBuilder::new(MatchingWords::new(vec![
(vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]), (vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]),
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]),
(vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]), (vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]),
@ -27,7 +27,8 @@ fn bench_formatting(c: &mut criterion::Criterion) {
(vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]), (vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]),
(vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]), (vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]),
(vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]), (vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]),
])), ]
), TokenizerBuilder::default().build()),
}, },
]; ];
@ -52,8 +53,7 @@ fn bench_formatting(c: &mut criterion::Criterion) {
for conf in confs { for conf in confs {
group.bench_function(conf.name, |b| { group.bench_function(conf.name, |b| {
b.iter(|| { b.iter(|| {
let tokens: Vec<_> = conf.text.tokenize().collect(); let mut matcher = conf.matching_words.build(conf.text);
let mut matcher = conf.matching_words.build(&tokens[..], conf.text);
matcher.format(option.clone()); matcher.format(option.clone());
}) })
}); });

View File

@ -19,7 +19,7 @@ use flate2::read::GzDecoder;
use futures::{stream, FutureExt, StreamExt}; use futures::{stream, FutureExt, StreamExt};
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use milli::documents::DocumentBatchReader; use milli::documents::DocumentBatchReader;
use milli::tokenizer::{Tokenizer, TokenizerBuilder}; use milli::tokenizer::TokenizerBuilder;
use milli::update::UpdateIndexingStep::*; use milli::update::UpdateIndexingStep::*;
use milli::update::{ use milli::update::{
ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
@ -140,38 +140,31 @@ pub struct IndexerOpt {
} }
struct Highlighter<'s, A> { struct Highlighter<'s, A> {
tokenizer: Tokenizer<'s, A>, matcher_builder: MatcherBuilder<'s, A>,
} }
impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> { impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> {
fn new(stop_words: &'s fst::Set<A>) -> Self { fn new(matcher_builder: MatcherBuilder<'s, A>) -> Self {
let mut builder = TokenizerBuilder::new(); Self { matcher_builder }
builder.stop_words(stop_words);
Self { tokenizer: builder.build() }
} }
fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value { fn highlight_value(&self, value: Value) -> Value {
match value { match value {
Value::Null => Value::Null, Value::Null => Value::Null,
Value::Bool(boolean) => Value::Bool(boolean), Value::Bool(boolean) => Value::Bool(boolean),
Value::Number(number) => Value::Number(number), Value::Number(number) => Value::Number(number),
Value::String(old_string) => { Value::String(old_string) => {
let tokens: Vec<_> = self.tokenizer.tokenize(&old_string).collect(); let mut matcher = self.matcher_builder.build(&old_string);
let mut matcher = matcher_builder.build(&tokens[..], &old_string);
let format_options = FormatOptions { highlight: true, crop: Some(10) }; let format_options = FormatOptions { highlight: true, crop: Some(10) };
Value::String(matcher.format(format_options).to_string()) Value::String(matcher.format(format_options).to_string())
} }
Value::Array(values) => Value::Array( Value::Array(values) => {
values.into_iter().map(|v| self.highlight_value(v, matcher_builder)).collect(), Value::Array(values.into_iter().map(|v| self.highlight_value(v)).collect())
), }
Value::Object(object) => Value::Object( Value::Object(object) => Value::Object(
object object.into_iter().map(|(k, v)| (k, self.highlight_value(v))).collect(),
.into_iter()
.map(|(k, v)| (k, self.highlight_value(v, matcher_builder)))
.collect(),
), ),
} }
} }
@ -179,14 +172,13 @@ impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> {
fn highlight_record( fn highlight_record(
&self, &self,
object: &mut Map<String, Value>, object: &mut Map<String, Value>,
matcher_builder: &MatcherBuilder,
attributes_to_highlight: &HashSet<String>, attributes_to_highlight: &HashSet<String>,
) { ) {
// TODO do we need to create a string for element that are not and needs to be highlight? // TODO do we need to create a string for element that are not and needs to be highlight?
for (key, value) in object.iter_mut() { for (key, value) in object.iter_mut() {
if attributes_to_highlight.contains(key) { if attributes_to_highlight.contains(key) {
let old_value = mem::take(value); let old_value = mem::take(value);
*value = self.highlight_value(old_value, matcher_builder); *value = self.highlight_value(old_value);
} }
} }
} }
@ -798,20 +790,15 @@ async fn main() -> anyhow::Result<()> {
None => fields_ids_map.iter().map(|(_, name)| name).map(String::from).collect(), None => fields_ids_map.iter().map(|(_, name)| name).map(String::from).collect(),
}; };
let stop_words = fst::Set::default(); let mut matcher_builder =
let highlighter = Highlighter::new(&stop_words); MatcherBuilder::new(matching_words, TokenizerBuilder::default().build());
let mut matcher_builder = MatcherBuilder::from_matching_words(matching_words);
matcher_builder.highlight_prefix("<mark>".to_string()); matcher_builder.highlight_prefix("<mark>".to_string());
matcher_builder.highlight_suffix("</mark>".to_string()); matcher_builder.highlight_suffix("</mark>".to_string());
let highlighter = Highlighter::new(matcher_builder);
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
if !disable_highlighting { if !disable_highlighting {
highlighter.highlight_record( highlighter.highlight_record(&mut object, &attributes_to_highlight);
&mut object,
&matcher_builder,
&attributes_to_highlight,
);
} }
documents.push(object); documents.push(object);

View File

@ -1,6 +1,6 @@
use std::borrow::Cow; use std::borrow::Cow;
use charabia::{SeparatorKind, Token}; use charabia::{SeparatorKind, Token, Tokenizer};
use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; use matching_words::{MatchType, PartialMatch, PrimitiveWordId};
pub use matching_words::{MatchingWord, MatchingWords}; pub use matching_words::{MatchingWord, MatchingWords};
use serde::Serialize; use serde::Serialize;
@ -11,16 +11,23 @@ const DEFAULT_CROP_MARKER: &'static str = "…";
const DEFAULT_HIGHLIGHT_PREFIX: &'static str = "<em>"; const DEFAULT_HIGHLIGHT_PREFIX: &'static str = "<em>";
const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = "</em>"; const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = "</em>";
pub struct MatcherBuilder { pub struct MatcherBuilder<'a, A> {
matching_words: MatchingWords, matching_words: MatchingWords,
tokenizer: Tokenizer<'a, A>,
crop_marker: Option<String>, crop_marker: Option<String>,
highlight_prefix: Option<String>, highlight_prefix: Option<String>,
highlight_suffix: Option<String>, highlight_suffix: Option<String>,
} }
impl MatcherBuilder { impl<'a, A> MatcherBuilder<'a, A> {
pub fn from_matching_words(matching_words: MatchingWords) -> Self { pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, A>) -> Self {
Self { matching_words, crop_marker: None, highlight_prefix: None, highlight_suffix: None } Self {
matching_words,
tokenizer,
crop_marker: None,
highlight_prefix: None,
highlight_suffix: None,
}
} }
pub fn crop_marker(&mut self, marker: String) -> &Self { pub fn crop_marker(&mut self, marker: String) -> &Self {
@ -38,7 +45,7 @@ impl MatcherBuilder {
self self
} }
pub fn build<'t, 'm>(&'m self, tokens: &'t [Token], text: &'t str) -> Matcher<'t, 'm> { pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> {
let crop_marker = match &self.crop_marker { let crop_marker = match &self.crop_marker {
Some(marker) => marker.as_str(), Some(marker) => marker.as_str(),
None => &DEFAULT_CROP_MARKER, None => &DEFAULT_CROP_MARKER,
@ -54,8 +61,8 @@ impl MatcherBuilder {
}; };
Matcher { Matcher {
text, text,
tokens,
matching_words: &self.matching_words, matching_words: &self.matching_words,
tokenizer: &self.tokenizer,
crop_marker, crop_marker,
highlight_prefix, highlight_prefix,
highlight_suffix, highlight_suffix,
@ -93,17 +100,17 @@ pub struct MatchBounds {
pub length: usize, pub length: usize,
} }
pub struct Matcher<'t, 'm> { pub struct Matcher<'t, 'm, A> {
text: &'t str, text: &'t str,
tokens: &'t [Token<'t>],
matching_words: &'m MatchingWords, matching_words: &'m MatchingWords,
tokenizer: &'m Tokenizer<'m, A>,
crop_marker: &'m str, crop_marker: &'m str,
highlight_prefix: &'m str, highlight_prefix: &'m str,
highlight_suffix: &'m str, highlight_suffix: &'m str,
matches: Option<Vec<Match>>, matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
} }
impl<'t> Matcher<'t, '_> { impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
/// Iterates over tokens and save any of them that matches the query. /// Iterates over tokens and save any of them that matches the query.
fn compute_matches(&mut self) -> &mut Self { fn compute_matches(&mut self) -> &mut Self {
fn compute_partial_match<'a>( fn compute_partial_match<'a>(
@ -159,10 +166,10 @@ impl<'t> Matcher<'t, '_> {
false false
} }
let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect();
let mut matches = Vec::new(); let mut matches = Vec::new();
let mut words_positions = self let mut words_positions = tokens
.tokens
.iter() .iter()
.scan((0, 0), |(token_position, word_position), token| { .scan((0, 0), |(token_position, word_position), token| {
let current_token_position = *token_position; let current_token_position = *token_position;
@ -210,7 +217,7 @@ impl<'t> Matcher<'t, '_> {
} }
} }
self.matches = Some(matches); self.matches = Some((tokens, matches));
self self
} }
@ -218,10 +225,10 @@ impl<'t> Matcher<'t, '_> {
pub fn matches(&mut self) -> Vec<MatchBounds> { pub fn matches(&mut self) -> Vec<MatchBounds> {
match &self.matches { match &self.matches {
None => self.compute_matches().matches(), None => self.compute_matches().matches(),
Some(matches) => matches Some((tokens, matches)) => matches
.iter() .iter()
.map(|m| MatchBounds { .map(|m| MatchBounds {
start: self.tokens[m.token_position].byte_start, start: tokens[m.token_position].byte_start,
length: m.match_len, length: m.match_len,
}) })
.collect(), .collect(),
@ -229,7 +236,7 @@ impl<'t> Matcher<'t, '_> {
} }
/// Returns the bounds in byte index of the crop window. /// Returns the bounds in byte index of the crop window.
fn crop_bounds(&self, matches: &[Match], crop_size: usize) -> (usize, usize) { fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) {
// if there is no match, we start from the beginning of the string by default. // if there is no match, we start from the beginning of the string by default.
let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
@ -239,8 +246,8 @@ impl<'t> Matcher<'t, '_> {
// matches needs to be counted in the crop len. // matches needs to be counted in the crop len.
let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
let mut before_tokens = self.tokens[..first_match_token_position].iter().rev().peekable(); let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable(); let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
while remaining_words > 0 { while remaining_words > 0 {
let before_token = before_tokens.peek().map(|t| t.separator_kind()); let before_token = before_tokens.peek().map(|t| t.separator_kind());
@ -396,7 +403,7 @@ impl<'t> Matcher<'t, '_> {
Cow::Borrowed(self.text) Cow::Borrowed(self.text)
} else { } else {
match &self.matches { match &self.matches {
Some(matches) => { Some((tokens, matches)) => {
let matches = match format_options.crop { let matches = match format_options.crop {
Some(crop_size) if crop_size > 0 => { Some(crop_size) if crop_size > 0 => {
self.find_best_match_interval(matches, crop_size) self.find_best_match_interval(matches, crop_size)
@ -405,7 +412,9 @@ impl<'t> Matcher<'t, '_> {
}; };
let (byte_start, byte_end) = match format_options.crop { let (byte_start, byte_end) = match format_options.crop {
Some(crop_size) if crop_size > 0 => self.crop_bounds(matches, crop_size), Some(crop_size) if crop_size > 0 => {
self.crop_bounds(tokens, matches, crop_size)
}
_ => (0, self.text.len()), _ => (0, self.text.len()),
}; };
@ -420,7 +429,6 @@ impl<'t> Matcher<'t, '_> {
if format_options.highlight { if format_options.highlight {
// insert highlight markers around matches. // insert highlight markers around matches.
let tokens = self.tokens;
for m in matches { for m in matches {
let token = &tokens[m.token_position]; let token = &tokens[m.token_position];
@ -470,7 +478,7 @@ impl<'t> Matcher<'t, '_> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use charabia::Tokenize; use charabia::TokenizerBuilder;
use super::*; use super::*;
use crate::search::matches::matching_words::MatchingWord; use crate::search::matches::matching_words::MatchingWord;
@ -485,6 +493,12 @@ mod tests {
MatchingWords::new(matching_words) MatchingWords::new(matching_words)
} }
impl MatcherBuilder<'_, Vec<u8>> {
pub fn from_matching_words(matching_words: MatchingWords) -> Self {
Self::new(matching_words, TokenizerBuilder::default().build())
}
}
#[test] #[test]
fn format_identity() { fn format_identity() {
let matching_words = matching_words(); let matching_words = matching_words();
@ -495,22 +509,19 @@ mod tests {
// Text without any match. // Text without any match.
let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no crop and no highlight should return complete text. // no crop and no highlight should return complete text.
assert_eq!(&matcher.format(format_options), &text); assert_eq!(&matcher.format(format_options), &text);
// Text containing all matches. // Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no crop and no highlight should return complete text. // no crop and no highlight should return complete text.
assert_eq!(&matcher.format(format_options), &text); assert_eq!(&matcher.format(format_options), &text);
// Text containing some matches. // Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves."; let text = "Natalie risk her future to build a world with the boy she loves.";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no crop and no highlight should return complete text. // no crop and no highlight should return complete text.
assert_eq!(&matcher.format(format_options), &text); assert_eq!(&matcher.format(format_options), &text);
} }
@ -525,34 +536,29 @@ mod tests {
// empty text. // empty text.
let text = ""; let text = "";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
assert_eq!(&matcher.format(format_options), ""); assert_eq!(&matcher.format(format_options), "");
// text containing only separators. // text containing only separators.
let text = ":-)"; let text = ":-)";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
assert_eq!(&matcher.format(format_options), ":-)"); assert_eq!(&matcher.format(format_options), ":-)");
// Text without any match. // Text without any match.
let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no crop should return complete text, because there is no matches. // no crop should return complete text, because there is no matches.
assert_eq!(&matcher.format(format_options), &text); assert_eq!(&matcher.format(format_options), &text);
// Text containing all matches. // Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no crop should return complete text with highlighted matches. // no crop should return complete text with highlighted matches.
assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."); assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
// Text containing some matches. // Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves."; let text = "Natalie risk her future to build a world with the boy she loves.";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no crop should return complete text with highlighted matches. // no crop should return complete text with highlighted matches.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -575,22 +581,19 @@ mod tests {
// Text containing prefix match. // Text containing prefix match.
let text = "Ŵôřlḑôle"; let text = "Ŵôřlḑôle";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no crop should return complete text with highlighted matches. // no crop should return complete text with highlighted matches.
assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>ôle"); assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>ôle");
// Text containing unicode match. // Text containing unicode match.
let text = "Ŵôřlḑ"; let text = "Ŵôřlḑ";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no crop should return complete text with highlighted matches. // no crop should return complete text with highlighted matches.
assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>"); assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>");
// Text containing unicode match. // Text containing unicode match.
let text = "Westfália"; let text = "Westfália";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no crop should return complete text with highlighted matches. // no crop should return complete text with highlighted matches.
assert_eq!(&matcher.format(format_options), "<em>Westfáli</em>a"); assert_eq!(&matcher.format(format_options), "<em>Westfáli</em>a");
} }
@ -605,20 +608,17 @@ mod tests {
// empty text. // empty text.
let text = ""; let text = "";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
assert_eq!(&matcher.format(format_options), ""); assert_eq!(&matcher.format(format_options), "");
// text containing only separators. // text containing only separators.
let text = ":-)"; let text = ":-)";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
assert_eq!(&matcher.format(format_options), ":-)"); assert_eq!(&matcher.format(format_options), ":-)");
// Text without any match. // Text without any match.
let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no highlight should return 10 first words with a marker at the end. // no highlight should return 10 first words with a marker at the end.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -627,8 +627,7 @@ mod tests {
// Text without any match starting by a separator. // Text without any match starting by a separator.
let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no highlight should return 10 first words with a marker at the end. // no highlight should return 10 first words with a marker at the end.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -637,8 +636,7 @@ mod tests {
// Test phrase propagation // Test phrase propagation
let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// should crop the phrase instead of croping around the match. // should crop the phrase instead of croping around the match.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -647,8 +645,7 @@ mod tests {
// Text containing some matches. // Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves."; let text = "Natalie risk her future to build a world with the boy she loves.";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no highlight should return 10 last words with a marker at the start. // no highlight should return 10 last words with a marker at the start.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -657,8 +654,7 @@ mod tests {
// Text containing all matches. // Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// no highlight should return 10 last words with a marker at the start. // no highlight should return 10 last words with a marker at the start.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -667,8 +663,7 @@ mod tests {
// Text containing a match unordered and a match ordered. // Text containing a match unordered and a match ordered.
let text = "The world split void void void void void void void void void split the world void void"; let text = "The world split void void void void void void void void void split the world void void";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// crop should return 10 last words with a marker at the start. // crop should return 10 last words with a marker at the start.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -677,8 +672,7 @@ mod tests {
// Text containing matches with diferent density. // Text containing matches with diferent density.
let text = "split void the void void world void void void void void void void void void void split the world void void"; let text = "split void the void void world void void void void void void void void void void split the world void void";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// crop should return 10 last words with a marker at the start. // crop should return 10 last words with a marker at the start.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -687,8 +681,7 @@ mod tests {
// Text containing matches with same word. // Text containing matches with same word.
let text = "split split split split split split void void void void void void void void void void split the world void void"; let text = "split split split split split split void void void void void void void void void void split the world void void";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// crop should return 10 last words with a marker at the start. // crop should return 10 last words with a marker at the start.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -706,20 +699,17 @@ mod tests {
// empty text. // empty text.
let text = ""; let text = "";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
assert_eq!(&matcher.format(format_options), ""); assert_eq!(&matcher.format(format_options), "");
// text containing only separators. // text containing only separators.
let text = ":-)"; let text = ":-)";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
assert_eq!(&matcher.format(format_options), ":-)"); assert_eq!(&matcher.format(format_options), ":-)");
// Text without any match. // Text without any match.
let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// both should return 10 first words with a marker at the end. // both should return 10 first words with a marker at the end.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -728,8 +718,7 @@ mod tests {
// Text containing some matches. // Text containing some matches.
let text = "Natalie risk her future to build a world with the boy she loves."; let text = "Natalie risk her future to build a world with the boy she loves.";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// both should return 10 last words with a marker at the start and highlighted matches. // both should return 10 last words with a marker at the start and highlighted matches.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -738,15 +727,13 @@ mod tests {
// Text containing all matches. // Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// both should return 10 last words with a marker at the start and highlighted matches. // both should return 10 last words with a marker at the start and highlighted matches.
assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."); assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
// Text containing a match unordered and a match ordered. // Text containing a match unordered and a match ordered.
let text = "The world split void void void void void void void void void split the world void void"; let text = "The world split void void void void void void void void void split the world void void";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// crop should return 10 last words with a marker at the start. // crop should return 10 last words with a marker at the start.
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
@ -762,26 +749,22 @@ mod tests {
let builder = MatcherBuilder::from_matching_words(matching_words); let builder = MatcherBuilder::from_matching_words(matching_words);
let text = "void void split the world void void."; let text = "void void split the world void void.";
let tokens: Vec<_> = text.tokenize().collect();
// set a smaller crop size // set a smaller crop size
let format_options = FormatOptions { highlight: false, crop: Some(2) }; let format_options = FormatOptions { highlight: false, crop: Some(2) };
let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// because crop size < query size, partially format matches. // because crop size < query size, partially format matches.
assert_eq!(&matcher.format(format_options), "…split the…"); assert_eq!(&matcher.format(format_options), "…split the…");
// set a smaller crop size // set a smaller crop size
let format_options = FormatOptions { highlight: false, crop: Some(1) }; let format_options = FormatOptions { highlight: false, crop: Some(1) };
let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// because crop size < query size, partially format matches. // because crop size < query size, partially format matches.
assert_eq!(&matcher.format(format_options), "…split…"); assert_eq!(&matcher.format(format_options), "…split…");
// set crop size to 0 // set crop size to 0
let format_options = FormatOptions { highlight: false, crop: Some(0) }; let format_options = FormatOptions { highlight: false, crop: Some(0) };
let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
// because crop size is 0, crop is ignored. // because crop size is 0, crop is ignored.
assert_eq!(&matcher.format(format_options), "void void split the world void void."); assert_eq!(&matcher.format(format_options), "void void split the world void void.");
} }
@ -817,9 +800,7 @@ mod tests {
let format_options = FormatOptions { highlight: true, crop: None }; let format_options = FormatOptions { highlight: true, crop: None };
let text = "the do or die can't be he do and or isn't he"; let text = "the do or die can't be he do and or isn't he";
let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(text);
let mut matcher = builder.build(&tokens[..], text);
assert_eq!( assert_eq!(
&matcher.format(format_options), &matcher.format(format_options),
"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_", "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_",