From 827cedcd15b9943d52194d8ea17bfc154dfeebf4 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 12 Apr 2022 13:42:14 +0200 Subject: [PATCH] Add format option structure --- http-ui/src/main.rs | 8 +- milli/src/lib.rs | 3 +- milli/src/search/matches/mod.rs | 163 +++++++++++++++----------------- milli/src/search/mod.rs | 2 +- 4 files changed, 85 insertions(+), 91 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index fdfc04af9..adf7f1788 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -25,8 +25,8 @@ use milli::update::{ ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, }; use milli::{ - obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatcherBuilder, - SearchResult, SortError, + obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index, + MatcherBuilder, SearchResult, SortError, }; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; @@ -162,7 +162,9 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed: Vec<_> = analyzed.tokens().collect(); let mut matcher = matcher_builder.build(&analyzed[..], &old_string); - Value::String(matcher.format(true, true).to_string()) + let format_options = FormatOptions { highlight: true, crop: Some(10) }; + + Value::String(matcher.format(format_options).to_string()) } Value::Array(values) => Value::Array( values.into_iter().map(|v| self.highlight_value(v, matcher_builder)).collect(), diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 6cbb9f126..793079563 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,7 +37,8 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, MatchBounds, MatcherBuilder, MatchingWords, Search, SearchResult, + FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search, + SearchResult, }; pub type Result = std::result::Result; diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 04e552c8d..65ff0a255 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -8,14 +8,12 @@ use crate::search::matches::matching_words::PartialMatch; pub mod matching_words; -const DEFAULT_CROP_SIZE: usize = 10; const DEFAULT_CROP_MARKER: &'static str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &'static str = ""; const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = ""; pub struct MatcherBuilder { matching_words: MatchingWords, - crop_size: usize, crop_marker: Option, highlight_prefix: Option, highlight_suffix: Option, @@ -23,18 +21,7 @@ pub struct MatcherBuilder { impl MatcherBuilder { pub fn from_matching_words(matching_words: MatchingWords) -> Self { - Self { - matching_words, - crop_size: DEFAULT_CROP_SIZE, - crop_marker: None, - highlight_prefix: None, - highlight_suffix: None, - } - } - - pub fn crop_size(&mut self, word_count: usize) -> &Self { - self.crop_size = word_count; - self + Self { matching_words, crop_marker: None, highlight_prefix: None, highlight_suffix: None } } pub fn crop_marker(&mut self, marker: String) -> &Self { @@ -70,7 +57,6 @@ impl MatcherBuilder { text, tokens, matching_words: &self.matching_words, - crop_size: self.crop_size, crop_marker, highlight_prefix, highlight_suffix, @@ -79,6 +65,18 @@ impl MatcherBuilder { } } +#[derive(Copy, Clone, Default)] +pub struct FormatOptions { + pub highlight: bool, + pub crop: Option, +} + +impl FormatOptions { + pub fn merge(self, other: Self) -> Self { + Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } + } +} + #[derive(Clone, Debug)] pub struct Match { match_len: usize, @@ -100,7 +98,6 @@ pub struct Matcher<'t, 'm> { text: &'t str, tokens: &'t [Token<'t>], matching_words: &'m MatchingWords, - crop_size: usize, crop_marker: &'m str, highlight_prefix: &'m str, highlight_suffix: &'m str, @@ -233,7 +230,7 @@ impl<'t> Matcher<'t, '_> { } /// Returns the bounds in byte index of the crop window. - fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { + fn crop_bounds(&self, matches: &[Match], crop_size: usize) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); @@ -241,8 +238,7 @@ impl<'t> Matcher<'t, '_> { let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); // matches needs to be counted in the crop len. - let mut remaining_words = - self.crop_size + first_match_word_position - last_match_word_position; + let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; let mut before_tokens = self.tokens[..first_match_token_position].iter().rev().peekable(); let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable(); @@ -348,7 +344,7 @@ impl<'t> Matcher<'t, '_> { } /// Returns the matches interval where the score computed by match_interval_score is maximal. - fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] { + fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { // we compute the matches interval if we have at least 2 matches. if matches.len() > 1 { // positions of the first and the last match of the best matches interval in `matches`. @@ -361,9 +357,7 @@ impl<'t> Matcher<'t, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - if next_match.word_position - matches[interval_first].word_position - >= self.crop_size - { + if next_match.word_position - matches[interval_first].word_position >= crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -375,7 +369,7 @@ impl<'t> Matcher<'t, '_> { // advance start of the interval while interval is longer than crop_size. while next_match.word_position - matches[interval_first].word_position - >= self.crop_size + >= crop_size { interval_first += 1; } @@ -397,21 +391,24 @@ impl<'t> Matcher<'t, '_> { } // Returns the formatted version of the original text. - pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { - // If 0 it will be considered null and thus not crop the field - // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 - let crop = crop && self.crop_size > 0; - if !highlight && !crop { + pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { + if !format_options.highlight && format_options.crop.is_none() { // compute matches is not needed if no highlight nor crop is requested. Cow::Borrowed(self.text) } else { match &self.matches { Some(matches) => { - let matches = - if crop { self.find_best_match_interval(matches) } else { matches }; + let matches = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.find_best_match_interval(matches, crop_size) + } + _ => matches, + }; - let (byte_start, byte_end) = - if crop { self.crop_bounds(matches) } else { (0, self.text.len()) }; + let (byte_start, byte_end) = match format_options.crop { + Some(crop_size) if crop_size > 0 => self.crop_bounds(matches, crop_size), + _ => (0, self.text.len()), + }; let mut formatted = Vec::new(); @@ -422,7 +419,7 @@ impl<'t> Matcher<'t, '_> { let mut byte_index = byte_start; - if highlight { + if format_options.highlight { // insert highlight markers around matches. let tokens = self.tokens; for m in matches { @@ -466,7 +463,7 @@ impl<'t> Matcher<'t, '_> { Cow::Owned(formatted.concat()) } } - None => self.compute_matches().format(highlight, crop), + None => self.compute_matches().format(format_options), } } } @@ -496,8 +493,7 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = false; - let crop = false; + let format_options = FormatOptions { highlight: false, crop: None }; // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -505,7 +501,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -513,7 +509,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -521,7 +517,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); } #[test] @@ -531,22 +527,21 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = false; + let format_options = FormatOptions { highlight: true, crop: None }; // empty text. let text = ""; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ""); + assert_eq!(&matcher.format(format_options.clone()), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ":-)"); + assert_eq!(&matcher.format(format_options.clone()), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -554,7 +549,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text, because there is no matches. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -562,7 +557,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(format_options.clone()), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -571,7 +566,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "Natalie risk her future to build a world with the boy she loves." ); } @@ -588,8 +583,7 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = false; + let format_options = FormatOptions { highlight: true, crop: None }; // Text containing prefix match. let text = "Ŵôřlḑôle"; @@ -597,7 +591,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Ŵôřlḑôle"); + assert_eq!(&matcher.format(format_options.clone()), "Ŵôřlḑôle"); // Text containing unicode match. let text = "Ŵôřlḑ"; @@ -605,7 +599,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Ŵôřlḑ"); + assert_eq!(&matcher.format(format_options.clone()), "Ŵôřlḑ"); // Text containing unicode match. let text = "Westfália"; @@ -613,7 +607,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Westfália"); + assert_eq!(&matcher.format(format_options.clone()), "Westfália"); } #[test] @@ -623,22 +617,21 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = false; - let crop = true; + let format_options = FormatOptions { highlight: false, crop: Some(10) }; // empty text. let text = ""; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ""); + assert_eq!(&matcher.format(format_options.clone()), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ":-)"); + assert_eq!(&matcher.format(format_options.clone()), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -647,7 +640,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "A quick brown fox can not jump 32 feet, right…" ); @@ -658,7 +651,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "(A quick brown fox can not jump 32 feet, right…" ); @@ -669,7 +662,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // should crop the phrase instead of croping around the match. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…Split The World is a book written by Emily Henry…" ); @@ -680,7 +673,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…future to build a world with the boy she loves…" ); @@ -691,7 +684,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…she loves. Emily Henry: The Love That Split The World." ); @@ -702,7 +695,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); @@ -713,7 +706,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); @@ -724,7 +717,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); } @@ -736,22 +729,21 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = true; + let format_options = FormatOptions { highlight: true, crop: Some(10) }; // empty text. let text = ""; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ""); + assert_eq!(&matcher.format(format_options.clone()), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ":-)"); + assert_eq!(&matcher.format(format_options.clone()), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -760,7 +752,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // both should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "A quick brown fox can not jump 32 feet, right…" ); @@ -771,7 +763,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…future to build a world with the boy she loves…" ); @@ -781,7 +773,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(format_options.clone()), "…she loves. Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; @@ -790,7 +782,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); } @@ -800,33 +792,33 @@ mod tests { //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 let matching_words = matching_words(); - let mut builder = MatcherBuilder::from_matching_words(matching_words); + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = false; - let crop = true; - let text = "void void split the world void void."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); // set a smaller crop size - builder.crop_size(2); + let format_options = FormatOptions { highlight: false, crop: Some(2) }; + let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "…split the…"); + assert_eq!(&matcher.format(format_options), "…split the…"); // set a smaller crop size - builder.crop_size(1); + let format_options = FormatOptions { highlight: false, crop: Some(1) }; + let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "…split…"); + assert_eq!(&matcher.format(format_options), "…split…"); + + // set crop size to 0 + let format_options = FormatOptions { highlight: false, crop: Some(0) }; - // set a smaller crop size - builder.crop_size(0); let mut matcher = builder.build(&tokens[..], text); // because crop size is 0, crop is ignored. - assert_eq!(&matcher.format(highlight, crop), "void void split the world void void."); + assert_eq!(&matcher.format(format_options), "void void split the world void void."); } #[test] @@ -858,8 +850,7 @@ mod tests { builder.highlight_suffix("_".to_string()); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = false; + let format_options = FormatOptions { highlight: true, crop: None }; let text = "the do or die can't be he do and or isn't he"; let analyzed = analyzer.analyze(&text); @@ -867,7 +858,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options), "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_", "matches: {:?}", &matcher.matches diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 2b025f269..a9712d261 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matches::{MatchBounds, Matcher, MatcherBuilder, MatchingWords}; +pub use self::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords}; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult};