From 0250ea91579ddbcfbd5d7a4d1bbf7c42c39c706e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 18 May 2022 10:26:52 +0200 Subject: [PATCH 1/3] Intergrate smart crop in Meilisearch --- Cargo.lock | 25 +- meilisearch-auth/Cargo.toml | 2 +- meilisearch-http/tests/search/formatted.rs | 31 +- meilisearch-lib/Cargo.toml | 2 +- meilisearch-lib/src/index/search.rs | 1185 +++----------------- 5 files changed, 182 insertions(+), 1063 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 64b683481..7b0897571 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1092,8 +1092,8 @@ dependencies = [ [[package]] name = "filter-parser" -version = "0.26.4" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.26.5#1f6dc31e2f8ee02cdda255a856d15f253daf17ec" +version = "0.28.0" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" dependencies = [ "nom", "nom_locate", @@ -1119,8 +1119,8 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "0.26.4" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.26.5#1f6dc31e2f8ee02cdda255a856d15f253daf17ec" +version = "0.28.0" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" dependencies = [ "serde_json", ] @@ -1622,8 +1622,8 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "0.26.4" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.26.5#1f6dc31e2f8ee02cdda255a856d15f253daf17ec" +version = "0.28.0" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" dependencies = [ "serde_json", ] @@ -2151,8 +2151,8 @@ dependencies = [ [[package]] name = "milli" -version = "0.26.4" -source = "git+https://github.com/meilisearch/milli.git?tag=v0.26.5#1f6dc31e2f8ee02cdda255a856d15f253daf17ec" +version = "0.28.0" +source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969" dependencies = [ "bimap", "bincode", @@ -2189,6 +2189,7 @@ dependencies = [ "smallvec", "smartstring", "tempfile", + "thiserror", "time 0.3.9", "uuid", ] @@ -3360,18 +3361,18 @@ checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" [[package]] name = "thiserror" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" +checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" +checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a" dependencies = [ "proc-macro2 1.0.37", "quote 1.0.17", diff --git a/meilisearch-auth/Cargo.toml b/meilisearch-auth/Cargo.toml index 2d9f229f0..dd12b5b63 100644 --- a/meilisearch-auth/Cargo.toml +++ b/meilisearch-auth/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] enum-iterator = "0.7.0" meilisearch-error = { path = "../meilisearch-error" } -milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.26.5" } +milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" } rand = "0.8.4" serde = { version = "1.0.136", features = ["derive"] } serde_json = { version = "1.0.79", features = ["preserve_order"] } diff --git a/meilisearch-http/tests/search/formatted.rs b/meilisearch-http/tests/search/formatted.rs index 13b8a07d8..19387bdc5 100644 --- a/meilisearch-http/tests/search/formatted.rs +++ b/meilisearch-http/tests/search/formatted.rs @@ -16,7 +16,7 @@ async fn formatted_contain_wildcard() { index.wait_task(1).await; let (response, code) = index - .search_post(json!({ "q": "pesti", "attributesToRetrieve": ["father", "mother"], "attributesToHighlight": ["father", "mother", "*"], "attributesToCrop": ["doggos"] })) + .search_post(json!({ "q": "pesti", "attributesToRetrieve": ["father", "mother"], "attributesToHighlight": ["father", "mother", "*"], "attributesToCrop": ["doggos"], "matches": true })) .await; assert_eq!(code, 200, "{}", response); assert_eq!( @@ -25,7 +25,8 @@ async fn formatted_contain_wildcard() { "_formatted": { "id": "852", "cattos": "pesti", - } + }, + "_matchesInfo": {"cattos": [{"start": 0, "length": 5}]}, }) ); @@ -43,7 +44,7 @@ async fn formatted_contain_wildcard() { let (response, code) = index .search_post( - json!({ "q": "pesti", "attributesToRetrieve": ["*"], "attributesToHighlight": ["id"] }), + json!({ "q": "pesti", "attributesToRetrieve": ["*"], "attributesToHighlight": ["id"], "matches": true }), ) .await; assert_eq!(code, 200, "{}", response); @@ -55,7 +56,8 @@ async fn formatted_contain_wildcard() { "_formatted": { "id": "852", "cattos": "pesti", - } + }, + "_matchesInfo": {"cattos": [{"start": 0, "length": 5}]}, }) ); @@ -141,6 +143,27 @@ async fn format_nested() { }) ); + let (response, code) = index + .search_post( + json!({ "q": "bobby", "attributesToRetrieve": ["doggos.name"], "matches": true }), + ) + .await; + assert_eq!(code, 200, "{}", response); + assert_eq!( + response["hits"][0], + json!({ + "doggos": [ + { + "name": "bobby", + }, + { + "name": "buddy", + }, + ], + "_matchesInfo": {"doggos.name": [{"start": 0, "length": 5}]}, + }) + ); + let (response, code) = index .search_post(json!({ "q": "pesti", "attributesToRetrieve": [], "attributesToHighlight": ["doggos.name"] })) .await; diff --git a/meilisearch-lib/Cargo.toml b/meilisearch-lib/Cargo.toml index 0b6596ffd..85ae49f64 100644 --- a/meilisearch-lib/Cargo.toml +++ b/meilisearch-lib/Cargo.toml @@ -30,7 +30,7 @@ lazy_static = "1.4.0" log = "0.4.14" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-error = { path = "../meilisearch-error" } -milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.26.5" } +milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" } mime = "0.3.16" num_cpus = "1.13.1" obkv = "0.2.0" diff --git a/meilisearch-lib/src/index/search.rs b/meilisearch-lib/src/index/search.rs index 7c12f985e..327cf173a 100644 --- a/meilisearch-lib/src/index/search.rs +++ b/meilisearch-lib/src/index/search.rs @@ -4,8 +4,10 @@ use std::str::FromStr; use std::time::Instant; use either::Either; -use milli::tokenizer::{Analyzer, AnalyzerConfig, Token}; -use milli::{AscDesc, FieldId, FieldsIdsMap, Filter, MatchingWords, SortError}; +use milli::tokenizer::{Analyzer, AnalyzerConfig}; +use milli::{ + AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, MatchBounds, MatcherBuilder, SortError, +}; use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; @@ -16,13 +18,7 @@ use super::error::{IndexError, Result}; use super::index::Index; pub type Document = serde_json::Map; -type MatchesInfo = BTreeMap>; - -#[derive(Serialize, Debug, Clone, PartialEq)] -pub struct MatchInfo { - start: usize, - length: usize, -} +type MatchesInfo = BTreeMap>; pub const DEFAULT_SEARCH_LIMIT: usize = 20; const fn default_search_limit() -> usize { @@ -105,21 +101,6 @@ pub struct SearchResult { pub exhaustive_facets_count: Option, } -#[derive(Copy, Clone, Default)] -struct FormatOptions { - highlight: bool, - crop: Option, -} - -impl FormatOptions { - pub fn merge(self, other: Self) -> Self { - Self { - highlight: self.highlight || other.highlight, - crop: self.crop.or(other.crop), - } - } -} - impl Index { pub fn perform_search(&self, query: SearchQuery) -> Result { let before_search = Instant::now(); @@ -221,11 +202,10 @@ impl Index { config.stop_words(&stop_words); let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (query.highlight_pre_tag, query.highlight_post_tag), - query.crop_marker, - ); + let mut formatter_builder = MatcherBuilder::from_matching_words(matching_words); + formatter_builder.crop_marker(query.crop_marker); + formatter_builder.highlight_prefix(query.highlight_pre_tag); + formatter_builder.highlight_suffix(query.highlight_post_tag); let mut documents = Vec::new(); @@ -242,16 +222,14 @@ impl Index { let mut document = permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); - let matches_info = query - .matches - .then(|| compute_matches(&matching_words, &document, &analyzer)); - - let formatted = format_fields( + let (matches_info, formatted) = format_fields( &displayed_document, &fields_ids_map, - &formatter, - &matching_words, + &formatter_builder, + &analyzer, &formatted_options, + query.matches, + &displayed_ids, )?; if let Some(sort) = query.sort.as_ref() { @@ -317,56 +295,6 @@ fn insert_geo_distance(sorts: &[String], document: &mut Document) { } } -fn compute_matches>( - matcher: &impl Matcher, - document: &Document, - analyzer: &Analyzer, -) -> MatchesInfo { - let mut matches = BTreeMap::new(); - - for (key, value) in document { - let mut infos = Vec::new(); - compute_value_matches(&mut infos, value, matcher, analyzer); - if !infos.is_empty() { - matches.insert(key.clone(), infos); - } - } - matches -} - -fn compute_value_matches<'a, A: AsRef<[u8]>>( - infos: &mut Vec, - value: &Value, - matcher: &impl Matcher, - analyzer: &Analyzer<'a, A>, -) { - match value { - Value::String(s) => { - let analyzed = analyzer.analyze(s); - let mut start = 0; - for (word, token) in analyzed.reconstruct() { - if token.is_word() { - if let Some(length) = matcher.matches(&token) { - infos.push(MatchInfo { start, length }); - } - } - - start += word.len(); - } - } - Value::Array(vals) => vals - .iter() - .for_each(|val| compute_value_matches(infos, val, matcher, analyzer)), - Value::Object(vals) => vals - .values() - .for_each(|val| compute_value_matches(infos, val, matcher, analyzer)), - Value::Number(number) => { - compute_value_matches(infos, &Value::String(number.to_string()), matcher, analyzer) - } - _ => (), - } -} - fn compute_formatted_options( attr_to_highlight: &HashSet, attr_to_crop: &[String], @@ -509,22 +437,23 @@ fn make_document( Ok(document) } -fn format_fields>( +fn format_fields<'a, A: AsRef<[u8]>>( document: &Document, field_ids_map: &FieldsIdsMap, - formatter: &Formatter, - matching_words: &impl Matcher, + builder: &MatcherBuilder, + analyzer: &'a Analyzer<'a, A>, formatted_options: &BTreeMap, -) -> Result { - let selectors: Vec<_> = formatted_options - .keys() - // This unwrap must be safe since we got the ids from the fields_ids_map just - // before. - .map(|&fid| field_ids_map.name(fid).unwrap()) - .collect(); - let mut document = permissive_json_pointer::select_values(document, selectors.iter().copied()); + compute_matches: bool, + displayable_ids: &BTreeSet, +) -> Result<(Option, Document)> { + let mut matches = compute_matches.then(|| BTreeMap::new()); + let mut document = document.clone(); - permissive_json_pointer::map_leaf_values(&mut document, selectors, |key, value| { + // select the attributes to retrieve + let displayable_names = displayable_ids + .iter() + .map(|&fid| field_ids_map.name(fid).expect("Missing field name")); + permissive_json_pointer::map_leaf_values(&mut document, displayable_names, |key, value| { // To get the formatting option of each key we need to see all the rules that applies // to the value and merge them together. eg. If a user said he wanted to highlight `doggo` // and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only @@ -535,235 +464,124 @@ fn format_fields>( let name = field_ids_map.name(**field).unwrap(); milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name) }) - .fold(FormatOptions::default(), |acc, (_, option)| { - acc.merge(*option) - }); - *value = formatter.format_value(std::mem::take(value), matching_words, format); + .map(|(_, option)| *option) + .reduce(|acc, option| acc.merge(option)); + let mut infos = Vec::new(); + + *value = format_value( + std::mem::take(value), + builder, + format, + analyzer, + &mut infos, + compute_matches, + ); + + if let Some(matches) = matches.as_mut() { + if !infos.is_empty() { + matches.insert(key.to_owned(), infos); + } + } }); - Ok(document) + let selectors = formatted_options + .keys() + // This unwrap must be safe since we got the ids from the fields_ids_map just + // before. + .map(|&fid| field_ids_map.name(fid).unwrap()); + let document = permissive_json_pointer::select_values(&document, selectors); + + Ok((matches, document)) } -/// trait to allow unit testing of `format_fields` -trait Matcher { - fn matches(&self, w: &Token) -> Option; -} - -#[cfg(test)] -impl Matcher for BTreeMap<&str, Option> { - fn matches(&self, w: &Token) -> Option { - self.get(w.text()).cloned().flatten() - } -} - -impl Matcher for MatchingWords { - fn matches(&self, w: &Token) -> Option { - self.matching_bytes(w) - } -} - -struct Formatter<'a, A> { +fn format_value<'a, A: AsRef<[u8]>>( + value: Value, + builder: &MatcherBuilder, + format_options: Option, analyzer: &'a Analyzer<'a, A>, - highlight_tags: (String, String), - crop_marker: String, -} + infos: &mut Vec, + compute_matches: bool, +) -> Value { + match value { + Value::String(old_string) => { + // this will be removed with charabia + let analyzed = analyzer.analyze(&old_string); + let tokens: Vec<_> = analyzed.tokens().collect(); -impl<'a, A: AsRef<[u8]>> Formatter<'a, A> { - pub fn new( - analyzer: &'a Analyzer<'a, A>, - highlight_tags: (String, String), - crop_marker: String, - ) -> Self { - Self { - analyzer, - highlight_tags, - crop_marker, - } - } - - fn format_value( - &self, - value: Value, - matcher: &impl Matcher, - format_options: FormatOptions, - ) -> Value { - match value { - Value::String(old_string) => { - let value = self.format_string(old_string, matcher, format_options); - Value::String(value) + let mut matcher = builder.build(&tokens[..], &old_string); + if compute_matches { + let matches = matcher.matches(); + infos.extend_from_slice(&matches[..]); } - Value::Array(values) => Value::Array( - values - .into_iter() - .map(|v| { - self.format_value( + + match format_options { + Some(format_options) => { + let value = matcher.format(format_options); + Value::String(value.into_owned()) + } + None => Value::String(old_string), + } + } + Value::Array(values) => Value::Array( + values + .into_iter() + .map(|v| { + format_value( + v, + builder, + format_options.map(|format_options| FormatOptions { + highlight: format_options.highlight, + crop: None, + }), + analyzer, + infos, + compute_matches, + ) + }) + .collect(), + ), + Value::Object(object) => Value::Object( + object + .into_iter() + .map(|(k, v)| { + ( + k, + format_value( v, - matcher, - FormatOptions { + builder, + format_options.map(|format_options| FormatOptions { highlight: format_options.highlight, crop: None, - }, - ) - }) - .collect(), - ), - Value::Object(object) => Value::Object( - object - .into_iter() - .map(|(k, v)| { - ( - k, - self.format_value( - v, - matcher, - FormatOptions { - highlight: format_options.highlight, - crop: None, - }, - ), - ) - }) - .collect(), - ), - Value::Number(number) => { - let number_string_value = - self.format_string(number.to_string(), matcher, format_options); - Value::String(number_string_value) + }), + analyzer, + infos, + compute_matches, + ), + ) + }) + .collect(), + ), + Value::Number(number) => { + // this will be removed with charabia + let s = number.to_string(); + let analyzed = analyzer.analyze(&s); + let tokens: Vec<_> = analyzed.tokens().collect(); + + let mut matcher = builder.build(&tokens[..], &s); + if compute_matches { + let matches = matcher.matches(); + infos.extend_from_slice(&matches[..]); + } + + match format_options { + Some(format_options) => { + let value = matcher.format(format_options); + Value::String(value.into_owned()) + } + None => Value::Number(number), } - value => value, } - } - - fn format_string( - &self, - s: String, - matcher: &impl Matcher, - format_options: FormatOptions, - ) -> String { - let analyzed = self.analyzer.analyze(&s); - - let mut tokens = analyzed.reconstruct(); - let mut crop_marker_before = false; - - let tokens_interval: Box> = match format_options.crop { - Some(crop_len) if crop_len > 0 => { - let mut buffer = Vec::new(); - let mut tokens = tokens.by_ref().peekable(); - - while let Some((word, token)) = - tokens.next_if(|(_, token)| matcher.matches(token).is_none()) - { - buffer.push((word, token)); - } - - match tokens.next() { - Some(token) => { - let mut total_count: usize = buffer - .iter() - .filter(|(_, token)| token.is_separator().is_none()) - .count(); - - let crop_len_before = crop_len / 2; - // check if start will be cropped. - crop_marker_before = total_count > crop_len_before; - - let before_iter = buffer.into_iter().skip_while(move |(_, token)| { - if token.is_separator().is_none() { - total_count -= 1; - } - total_count >= crop_len_before - }); - - // rebalance remaining word count after the match. - let crop_len_after = if crop_marker_before { - crop_len.saturating_sub(crop_len_before + 1) - } else { - crop_len.saturating_sub(total_count + 1) - }; - - let mut taken_after = 0; - let after_iter = tokens.take_while(move |(_, token)| { - let take = taken_after < crop_len_after; - if token.is_separator().is_none() { - taken_after += 1; - } - take - }); - - let iter = before_iter.chain(Some(token)).chain(after_iter); - - Box::new(iter) - } - // If no word matches in the attribute - None => { - let mut count = 0; - let mut tokens = buffer.into_iter(); - let mut out: String = tokens - .by_ref() - .take_while(move |(_, token)| { - let take = count < crop_len; - if token.is_separator().is_none() { - count += 1; - } - take - }) - .map(|(word, _)| word) - .collect(); - - // if there are remaining tokens after formatted interval, - // put a crop marker at the end. - if tokens.next().is_some() { - out.push_str(&self.crop_marker); - } - - return out; - } - } - } - _ => Box::new(tokens.by_ref()), - }; - - let out = if crop_marker_before { - self.crop_marker.clone() - } else { - String::new() - }; - - let mut out = tokens_interval.fold(out, |mut out, (word, token)| { - // Check if we need to do highlighting or computed matches before calling - // Matcher::match since the call is expensive. - if format_options.highlight && token.is_word() { - if let Some(length) = matcher.matches(&token) { - match word.get(..length).zip(word.get(length..)) { - Some((head, tail)) => { - out.push_str(&self.highlight_tags.0); - out.push_str(head); - out.push_str(&self.highlight_tags.1); - out.push_str(tail); - } - // if we are in the middle of a character - // or if all the word should be highlighted, - // we highlight the complete word. - None => { - out.push_str(&self.highlight_tags.0); - out.push_str(word); - out.push_str(&self.highlight_tags.1); - } - } - return out; - } - } - out.push_str(word); - out - }); - - // if there are remaining tokens after formatted interval, - // put a crop marker at the end. - if tokens.next().is_some() { - out.push_str(&self.crop_marker); - } - - out + value => value, } } @@ -810,740 +628,17 @@ fn parse_filter_array(arr: &[Value]) -> Result> { mod test { use super::*; - #[test] - fn no_ids_no_formatted() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - fields.insert("test").unwrap(); - - let document: serde_json::Value = json!({ - "test": "hello", - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let formatted_options = BTreeMap::new(); - - let matching_words = MatchingWords::default(); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert!(value.is_empty()); - } - - #[test] - fn formatted_with_highlight_in_word() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - let title = fields.insert("title").unwrap(); - let author = fields.insert("author").unwrap(); - - let document: serde_json::Value = json!({ - "title": "The Hobbit", - "author": "J. R. R. Tolkien", - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let mut formatted_options = BTreeMap::new(); - formatted_options.insert( - title, - FormatOptions { - highlight: true, - crop: None, - }, - ); - formatted_options.insert( - author, - FormatOptions { - highlight: false, - crop: None, - }, - ); - - let mut matching_words = BTreeMap::new(); - matching_words.insert("hobbit", Some(3)); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert_eq!(value["title"], "The Hobbit"); - assert_eq!(value["author"], "J. R. R. Tolkien"); - } - - #[test] - fn formatted_with_highlight_in_number() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - let title = fields.insert("title").unwrap(); - let author = fields.insert("author").unwrap(); - let publication_year = fields.insert("publication_year").unwrap(); - - let document: serde_json::Value = json!({ - "title": "The Hobbit", - "author": "J. R. R. Tolkien", - "publication_year": 1937, - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let mut formatted_options = BTreeMap::new(); - formatted_options.insert( - title, - FormatOptions { - highlight: false, - crop: None, - }, - ); - formatted_options.insert( - author, - FormatOptions { - highlight: false, - crop: None, - }, - ); - formatted_options.insert( - publication_year, - FormatOptions { - highlight: true, - crop: None, - }, - ); - - let mut matching_words = BTreeMap::new(); - matching_words.insert("1937", Some(4)); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert_eq!(value["title"], "The Hobbit"); - assert_eq!(value["author"], "J. R. R. Tolkien"); - assert_eq!(value["publication_year"], "1937"); - } - - /// https://github.com/meilisearch/meilisearch/issues/1368 - #[test] - fn formatted_with_highlight_emoji() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - let title = fields.insert("title").unwrap(); - let author = fields.insert("author").unwrap(); - - let document: serde_json::Value = json!({ - "title": "Go💼od luck.", - "author": "JacobLey", - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let mut formatted_options = BTreeMap::new(); - formatted_options.insert( - title, - FormatOptions { - highlight: true, - crop: None, - }, - ); - formatted_options.insert( - author, - FormatOptions { - highlight: false, - crop: None, - }, - ); - - let mut matching_words = BTreeMap::new(); - // emojis are deunicoded during tokenization - // TODO Tokenizer should remove spaces after deunicode - matching_words.insert("gobriefcase od", Some(11)); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert_eq!(value["title"], "Go💼od luck."); - assert_eq!(value["author"], "JacobLey"); - } - - #[test] - fn formatted_with_highlight_in_unicode_word() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - let title = fields.insert("title").unwrap(); - let author = fields.insert("author").unwrap(); - - let document: serde_json::Value = json!({ - "title": "étoile", - "author": "J. R. R. Tolkien", - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let mut formatted_options = BTreeMap::new(); - formatted_options.insert( - title, - FormatOptions { - highlight: true, - crop: None, - }, - ); - formatted_options.insert( - author, - FormatOptions { - highlight: false, - crop: None, - }, - ); - - let mut matching_words = BTreeMap::new(); - matching_words.insert("etoile", Some(1)); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert_eq!(value["title"], "étoile"); - assert_eq!(value["author"], "J. R. R. Tolkien"); - } - - #[test] - fn formatted_with_crop_2() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - let title = fields.insert("title").unwrap(); - let author = fields.insert("author").unwrap(); - - let document: serde_json::Value = json!({ - "title": "Harry Potter and the Half-Blood Prince", - "author": "J. K. Rowling", - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let mut formatted_options = BTreeMap::new(); - formatted_options.insert( - title, - FormatOptions { - highlight: false, - crop: Some(2), - }, - ); - formatted_options.insert( - author, - FormatOptions { - highlight: false, - crop: None, - }, - ); - - let mut matching_words = BTreeMap::new(); - matching_words.insert("potter", Some(3)); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert_eq!(value["title"], "Harry Potter…"); - assert_eq!(value["author"], "J. K. Rowling"); - } - - #[test] - fn formatted_with_crop_5() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - let title = fields.insert("title").unwrap(); - let author = fields.insert("author").unwrap(); - - let document: serde_json::Value = json!({ - "title": "Harry Potter and the Half-Blood Prince", - "author": "J. K. Rowling", - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let mut formatted_options = BTreeMap::new(); - formatted_options.insert( - title, - FormatOptions { - highlight: false, - crop: Some(5), - }, - ); - formatted_options.insert( - author, - FormatOptions { - highlight: false, - crop: None, - }, - ); - - let mut matching_words = BTreeMap::new(); - matching_words.insert("potter", Some(5)); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert_eq!(value["title"], "Harry Potter and the Half…"); - assert_eq!(value["author"], "J. K. Rowling"); - } - - #[test] - fn formatted_with_crop_0() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - let title = fields.insert("title").unwrap(); - let author = fields.insert("author").unwrap(); - - let document: serde_json::Value = json!({ - "title": "Harry Potter and the Half-Blood Prince", - "author": "J. K. Rowling", - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let mut formatted_options = BTreeMap::new(); - formatted_options.insert( - title, - FormatOptions { - highlight: false, - crop: Some(0), - }, - ); - formatted_options.insert( - author, - FormatOptions { - highlight: false, - crop: None, - }, - ); - - let mut matching_words = BTreeMap::new(); - matching_words.insert("potter", Some(6)); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert_eq!(value["title"], "Harry Potter and the Half-Blood Prince"); - assert_eq!(value["author"], "J. K. Rowling"); - } - - #[test] - fn formatted_with_crop_and_no_match() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - let title = fields.insert("title").unwrap(); - let author = fields.insert("author").unwrap(); - - let document: serde_json::Value = json!({ - "title": "Harry Potter and the Half-Blood Prince", - "author": "J. K. Rowling", - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let mut formatted_options = BTreeMap::new(); - formatted_options.insert( - title, - FormatOptions { - highlight: false, - crop: Some(1), - }, - ); - formatted_options.insert( - author, - FormatOptions { - highlight: false, - crop: Some(20), - }, - ); - - let mut matching_words = BTreeMap::new(); - matching_words.insert("rowling", Some(3)); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert_eq!(value["title"], "Harry…"); - assert_eq!(value["author"], "J. K. Rowling"); - } - - #[test] - fn formatted_with_crop_and_highlight() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - let title = fields.insert("title").unwrap(); - let author = fields.insert("author").unwrap(); - - let document: serde_json::Value = json!({ - "title": "Harry Potter and the Half-Blood Prince", - "author": "J. K. Rowling", - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let mut formatted_options = BTreeMap::new(); - formatted_options.insert( - title, - FormatOptions { - highlight: true, - crop: Some(1), - }, - ); - formatted_options.insert( - author, - FormatOptions { - highlight: false, - crop: None, - }, - ); - - let mut matching_words = BTreeMap::new(); - matching_words.insert("and", Some(3)); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert_eq!(value["title"], "…and…"); - assert_eq!(value["author"], "J. K. Rowling"); - } - - #[test] - fn formatted_with_crop_and_highlight_in_word() { - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - let formatter = Formatter::new( - &analyzer, - (String::from(""), String::from("")), - String::from("…"), - ); - - let mut fields = FieldsIdsMap::new(); - let title = fields.insert("title").unwrap(); - let author = fields.insert("author").unwrap(); - - let document: serde_json::Value = json!({ - "title": "Harry Potter and the Half-Blood Prince", - "author": "J. K. Rowling", - }); - - // we need to convert the `serde_json::Map` into an `IndexMap`. - let document = document - .as_object() - .unwrap() - .into_iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect(); - - let mut formatted_options = BTreeMap::new(); - formatted_options.insert( - title, - FormatOptions { - highlight: true, - crop: Some(4), - }, - ); - formatted_options.insert( - author, - FormatOptions { - highlight: false, - crop: None, - }, - ); - - let mut matching_words = BTreeMap::new(); - matching_words.insert("blood", Some(3)); - - let value = format_fields( - &document, - &fields, - &formatter, - &matching_words, - &formatted_options, - ) - .unwrap(); - - assert_eq!(value["title"], "…the Half-Blood Prince"); - assert_eq!(value["author"], "J. K. Rowling"); - } - - #[test] - fn test_compute_value_matches() { - let text = "Call me Ishmael. Some years ago—never mind how long precisely—having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world."; - let value = serde_json::json!(text); - - let mut matcher = BTreeMap::new(); - matcher.insert("ishmael", Some(3)); - matcher.insert("little", Some(6)); - matcher.insert("particular", Some(1)); - - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - - let mut infos = Vec::new(); - - compute_value_matches(&mut infos, &value, &matcher, &analyzer); - - let mut infos = infos.into_iter(); - let crop = |info: MatchInfo| &text[info.start..info.start + info.length]; - - assert_eq!(crop(infos.next().unwrap()), "Ish"); - assert_eq!(crop(infos.next().unwrap()), "little"); - assert_eq!(crop(infos.next().unwrap()), "p"); - assert_eq!(crop(infos.next().unwrap()), "little"); - assert!(infos.next().is_none()); - } - - #[test] - fn test_compute_match() { - let value = serde_json::from_str(r#"{ - "color": "Green", - "name": "Lucas Hess", - "gender": "male", - "price": 3.5, - "address": "412 Losee Terrace, Blairstown, Georgia, 2825", - "about": "Mollit ad in exercitation quis Laboris . Anim est ut consequat fugiat duis magna aliquip velit nisi. Commodo eiusmod est consequat proident consectetur aliqua enim fugiat. Aliqua adipisicing laboris elit proident enim veniam laboris mollit. Incididunt fugiat minim ad nostrud deserunt tempor in. Id irure officia labore qui est labore nulla nisi. Magna sit quis tempor esse consectetur amet labore duis aliqua consequat.\r\n" - }"#).unwrap(); - let mut matcher = BTreeMap::new(); - matcher.insert("green", Some(5)); - matcher.insert("mollit", Some(6)); - matcher.insert("laboris", Some(7)); - matcher.insert("3", Some(1)); - - let stop_words = fst::Set::default(); - let mut config = AnalyzerConfig::default(); - config.stop_words(&stop_words); - let analyzer = Analyzer::new(config); - - let matches = compute_matches(&matcher, &value, &analyzer); - assert_eq!( - format!("{:?}", matches), - r##"{"about": [MatchInfo { start: 0, length: 6 }, MatchInfo { start: 31, length: 7 }, MatchInfo { start: 191, length: 7 }, MatchInfo { start: 225, length: 7 }, MatchInfo { start: 233, length: 6 }], "color": [MatchInfo { start: 0, length: 5 }], "price": [MatchInfo { start: 0, length: 1 }]}"## - ); - } - #[test] fn test_insert_geo_distance() { let value: Document = serde_json::from_str( r#"{ - "_geo": { - "lat": 50.629973371633746, - "lng": 3.0569447399419567 - }, - "city": "Lille", - "id": "1" - }"#, + "_geo": { + "lat": 50.629973371633746, + "lng": 3.0569447399419567 + }, + "city": "Lille", + "id": "1" + }"#, ) .unwrap(); From 3517eae47feb17b89d278df0fc26354773765ae9 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 18 May 2022 18:45:53 +0200 Subject: [PATCH 2/3] Fix tests --- meilisearch-http/tests/documents/add_documents.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/meilisearch-http/tests/documents/add_documents.rs b/meilisearch-http/tests/documents/add_documents.rs index 911cfd312..0ac0436dc 100644 --- a/meilisearch-http/tests/documents/add_documents.rs +++ b/meilisearch-http/tests/documents/add_documents.rs @@ -868,7 +868,12 @@ async fn error_add_documents_bad_document_id() { let (response, code) = index.get_task(1).await; assert_eq!(code, 200); assert_eq!(response["status"], json!("failed")); - assert_eq!(response["error"]["message"], json!("Document identifier `foo & bar` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).")); + assert_eq!( + response["error"]["message"], + json!( + r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)."# + ) + ); assert_eq!(response["error"]["code"], json!("invalid_document_id")); assert_eq!(response["error"]["type"], json!("invalid_request")); assert_eq!( @@ -891,7 +896,12 @@ async fn error_update_documents_bad_document_id() { index.update_documents(documents, None).await; let response = index.wait_task(1).await; assert_eq!(response["status"], json!("failed")); - assert_eq!(response["error"]["message"], json!("Document identifier `foo & bar` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).")); + assert_eq!( + response["error"]["message"], + json!( + r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)."# + ) + ); assert_eq!(response["error"]["code"], json!("invalid_document_id")); assert_eq!(response["error"]["type"], json!("invalid_request")); assert_eq!( From 50763aac82365a8bddc635c8599f34c28a98140f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 19 May 2022 11:23:22 +0200 Subject: [PATCH 3/3] Fix clippy --- meilisearch-lib/src/index/search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch-lib/src/index/search.rs b/meilisearch-lib/src/index/search.rs index 327cf173a..bf543b377 100644 --- a/meilisearch-lib/src/index/search.rs +++ b/meilisearch-lib/src/index/search.rs @@ -446,7 +446,7 @@ fn format_fields<'a, A: AsRef<[u8]>>( compute_matches: bool, displayable_ids: &BTreeSet, ) -> Result<(Option, Document)> { - let mut matches = compute_matches.then(|| BTreeMap::new()); + let mut matches = compute_matches.then(BTreeMap::new); let mut document = document.clone(); // select the attributes to retrieve