From 7473cc6e27fe2658ee48f90afe4656042a6826ac Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Tue, 11 May 2021 18:30:55 +0200 Subject: [PATCH] implement crop around --- meilisearch-http/src/index/search.rs | 126 +++++++++++++++++---------- 1 file changed, 79 insertions(+), 47 deletions(-) diff --git a/meilisearch-http/src/index/search.rs b/meilisearch-http/src/index/search.rs index ce9338d5f..760357b9a 100644 --- a/meilisearch-http/src/index/search.rs +++ b/meilisearch-http/src/index/search.rs @@ -1,6 +1,6 @@ -use std::borrow::Cow; -use std::collections::{BTreeMap, HashSet}; +use std::collections::{BTreeMap, HashSet, VecDeque}; use std::time::Instant; +use std::{borrow::Cow, collections::HashMap}; use anyhow::bail; use either::Either; @@ -157,7 +157,12 @@ impl Index { let stop_words = fst::Set::default(); let highlighter = - Highlighter::new(&stop_words, (String::from(""), String::from(""))); + Formatter::new(&stop_words, (String::from(""), String::from(""))); + + let to_crop = to_crop_ids + .into_iter() + .map(|id| (id, query.crop_length)) + .collect::>(); for (_id, obkv) in self.documents(&rtxn, documents_ids)? { let document = make_document(&all_attributes, &fields_ids_map, obkv)?; @@ -168,7 +173,7 @@ impl Index { &matching_words, all_formatted.as_ref().as_slice(), &to_highlight_ids, - &to_crop_ids, + &to_crop, )?; let hit = SearchHit { document, @@ -230,11 +235,11 @@ fn make_document( fn compute_formatted>( field_ids_map: &FieldsIdsMap, obkv: obkv::KvReader, - highlighter: &Highlighter, + highlighter: &Formatter, matching_words: &impl Matcher, all_formatted: &[FieldId], to_highlight_fields: &HashSet, - to_crop_fields: &HashSet, + to_crop_fields: &HashMap>, ) -> anyhow::Result { let mut document = Document::new(); @@ -242,15 +247,12 @@ fn compute_formatted>( if let Some(value) = obkv.get(*field) { let mut value: Value = serde_json::from_slice(value)?; - let need_to_crop = if to_crop_fields.contains(field) { - Some(200) // TO CHANGE - } else { - None - }; - - if to_highlight_fields.contains(field) { - value = highlighter.format_value(value, matching_words, need_to_crop, to_highlight_fields.contains(field)); - } + value = highlighter.format_value( + value, + matching_words, + to_crop_fields.get(field).copied().flatten(), + to_highlight_fields.contains(field), + ); // This unwrap must be safe since we got the ids from the fields_ids_map just // before. @@ -284,12 +286,12 @@ impl Matcher for MatchingWords { } } -struct Highlighter<'a, A> { +struct Formatter<'a, A> { analyzer: Analyzer<'a, A>, marks: (String, String), } -impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { +impl<'a, A: AsRef<[u8]>> Formatter<'a, A> { pub fn new(stop_words: &'a fst::Set, marks: (String, String)) -> Self { let mut config = AnalyzerConfig::default(); config.stop_words(stop_words); @@ -305,10 +307,11 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { matcher: &impl Matcher, need_to_crop: Option, need_to_highlight: bool, - ) -> Value { + ) -> Value { match value { Value::String(old_string) => { - let value = self.format_string(old_string, matcher, need_to_crop, need_to_highlight); + let value = + self.format_string(old_string, matcher, need_to_crop, need_to_highlight); Value::String(value) } Value::Array(values) => Value::Array( @@ -326,41 +329,67 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { value => value, } } - fn format_string(&self, s: String, matcher: &impl Matcher, need_to_crop: Option, need_to_highlight: bool) -> String { + fn format_string( + &self, + s: String, + matcher: &impl Matcher, + need_to_crop: Option, + need_to_highlight: bool, + ) -> String { let analyzed = self.analyzer.analyze(&s); - let tokens: Box> = match need_to_crop { + let tokens: Box> = match need_to_crop { Some(crop_len) => { - let mut taken = 0; - let iter = analyzed - .reconstruct() - .skip_while(|(_, token)| !matcher.matches(token.text())) + let mut buffer = VecDeque::new(); + let mut tokens = analyzed.reconstruct().peekable(); + let mut taken_before = 0; + while let Some((word, token)) = tokens.next_if(|(_, token)| !matcher.matches(token.text())) { + buffer.push_back((word, token)); + taken_before += word.chars().count(); + while taken_before > crop_len { + if let Some((word, _)) = buffer.pop_front() { + taken_before -= word.chars().count(); + } + } + } + + if let Some(token) = tokens.next() { + buffer.push_back(token); + } + + let mut taken_after = 0; + + let after_iter = tokens .take_while(move |(word, _)| { - let take = taken < crop_len; - taken += word.chars().count(); + let take = taken_after <= crop_len; + taken_after += word.chars().count(); take }); + let iter = buffer + .into_iter() + .chain(after_iter); + Box::new(iter) - }, + } None => Box::new(analyzed.reconstruct()), }; - tokens.map(|(word, token)| { - if need_to_highlight && token.is_word() && matcher.matches(token.text()){ - let mut new_word = String::new(); - new_word.push_str(&self.marks.0); - new_word.push_str(&word); - new_word.push_str(&self.marks.1); - new_word - } else { - word.to_string() - } - }) - .collect::() + tokens + .map(|(word, token)| { + if need_to_highlight && token.is_word() && matcher.matches(token.text()) { + let mut new_word = String::new(); + new_word.push_str(&self.marks.0); + new_word.push_str(&word); + new_word.push_str(&self.marks.1); + new_word + } else { + word.to_string() + } + }) + .collect::() } } - fn parse_facets( facets: &Value, index: &Index, @@ -412,7 +441,7 @@ mod test { fn no_formatted() { let stop_words = fst::Set::default(); let highlighter = - Highlighter::new(&stop_words, (String::from(""), String::from(""))); + Formatter::new(&stop_words, (String::from(""), String::from(""))); let mut fields = FieldsIdsMap::new(); let id = fields.insert("test").unwrap(); @@ -439,7 +468,8 @@ mod test { &all_formatted, &to_highlight_ids, &to_crop_ids, - ).unwrap(); + ) + .unwrap(); assert!(value.is_empty()); } @@ -448,7 +478,7 @@ mod test { fn formatted_no_highlight() { let stop_words = fst::Set::default(); let highlighter = - Highlighter::new(&stop_words, (String::from(""), String::from(""))); + Formatter::new(&stop_words, (String::from(""), String::from(""))); let mut fields = FieldsIdsMap::new(); let id = fields.insert("test").unwrap(); @@ -475,7 +505,8 @@ mod test { &all_formatted, &to_highlight_ids, &to_crop_ids, - ).unwrap(); + ) + .unwrap(); assert_eq!(value["test"], "hello"); } @@ -484,7 +515,7 @@ mod test { fn formatted_with_highlight() { let stop_words = fst::Set::default(); let highlighter = - Highlighter::new(&stop_words, (String::from(""), String::from(""))); + Formatter::new(&stop_words, (String::from(""), String::from(""))); let mut fields = FieldsIdsMap::new(); let id = fields.insert("test").unwrap(); @@ -511,7 +542,8 @@ mod test { &all_formatted, &to_highlight_ids, &to_crop_ids, - ).unwrap(); + ) + .unwrap(); assert_eq!(value["test"], "hello"); }