From c6bb36efa5bdf689fab0985d46c47862722ec2cb Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Mon, 19 Apr 2021 16:22:41 +0200 Subject: [PATCH] implement _formated --- Cargo.lock | 9 +- meilisearch-http/Cargo.toml | 1 + meilisearch-http/src/index/mod.rs | 6 +- meilisearch-http/src/index/search.rs | 148 ++++++++++++++++++++------ meilisearch-http/src/routes/search.rs | 4 +- 5 files changed, 124 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 79e013392..1e5809b17 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -345,9 +345,9 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" [[package]] name = "backtrace" -version = "0.3.56" +version = "0.3.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d117600f438b1707d4e4ae15d3595657288f8235a0eb593e80ecc98ab34e1bc" +checksum = "78ed203b9ba68b242c62b3fb7480f589dd49829be1edb3fe8fc8b4ffda2dcb8d" dependencies = [ "addr2line", "cfg-if 1.0.0", @@ -1661,6 +1661,7 @@ dependencies = [ "milli", "mime", "mockall", + "obkv", "once_cell", "oxidized-json-checker", "parking_lot", @@ -2837,9 +2838,9 @@ dependencies = [ [[package]] name = "slab" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" +checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527" [[package]] name = "slice-group-by" diff --git a/meilisearch-http/Cargo.toml b/meilisearch-http/Cargo.toml index 5ba3acd2b..e7a55d28e 100644 --- a/meilisearch-http/Cargo.toml +++ b/meilisearch-http/Cargo.toml @@ -63,6 +63,7 @@ tokio = { version = "1", features = ["full"] } uuid = "0.8.2" oxidized-json-checker = "0.3.2" walkdir = "2.3.2" +obkv = "0.1.1" [dependencies.sentry] default-features = false diff --git a/meilisearch-http/src/index/mod.rs b/meilisearch-http/src/index/mod.rs index ab5fc6fb5..57a939aaa 100644 --- a/meilisearch-http/src/index/mod.rs +++ b/meilisearch-http/src/index/mod.rs @@ -83,7 +83,7 @@ impl Index { let fields_ids_map = self.fields_ids_map(&txn)?; let fields_to_display = - self.fields_to_display(&txn, attributes_to_retrieve, &fields_ids_map)?; + self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?; let iter = self.documents.range(&txn, &(..))?.skip(offset).take(limit); @@ -108,7 +108,7 @@ impl Index { let fields_ids_map = self.fields_ids_map(&txn)?; let fields_to_display = - self.fields_to_display(&txn, attributes_to_retrieve, &fields_ids_map)?; + self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?; let internal_id = self .external_documents_ids(&txn)? @@ -134,7 +134,7 @@ impl Index { fn fields_to_display>( &self, txn: &heed::RoTxn, - attributes_to_retrieve: Option>, + attributes_to_retrieve: &Option>, fields_ids_map: &milli::FieldsIdsMap, ) -> anyhow::Result> { let mut displayed_fields_ids = match self.displayed_fields_ids(&txn)? { diff --git a/meilisearch-http/src/index/search.rs b/meilisearch-http/src/index/search.rs index 28eb7af98..9e0c5a5cc 100644 --- a/meilisearch-http/src/index/search.rs +++ b/meilisearch-http/src/index/search.rs @@ -1,17 +1,19 @@ +use std::borrow::Cow; use std::collections::{BTreeMap, HashSet}; -use std::mem; use std::time::Instant; use anyhow::bail; use either::Either; use heed::RoTxn; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; -use milli::{facet::FacetValue, FacetCondition, MatchingWords}; +use milli::{facet::FacetValue, FacetCondition, FieldId, FieldsIdsMap, MatchingWords}; use serde::{Deserialize, Serialize}; use serde_json::{Map, Value}; use super::Index; +pub type Document = Map; + pub const DEFAULT_SEARCH_LIMIT: usize = 20; const fn default_search_limit() -> usize { @@ -25,8 +27,8 @@ pub struct SearchQuery { pub offset: Option, #[serde(default = "default_search_limit")] pub limit: usize, - pub attributes_to_retrieve: Option>, - pub attributes_to_crop: Option>, + pub attributes_to_retrieve: Option>, + pub attributes_to_crop: Option>, pub crop_length: Option, pub attributes_to_highlight: Option>, pub filters: Option, @@ -38,9 +40,9 @@ pub struct SearchQuery { #[derive(Debug, Clone, Serialize)] pub struct SearchHit { #[serde(flatten)] - pub document: Map, + pub document: Document, #[serde(rename = "_formatted", skip_serializing_if = "Map::is_empty")] - pub formatted: Map, + pub formatted: Document, } #[derive(Serialize)] @@ -86,21 +88,79 @@ impl Index { let mut documents = Vec::new(); let fields_ids_map = self.fields_ids_map(&rtxn).unwrap(); - let fields_to_display = - self.fields_to_display(&rtxn, query.attributes_to_retrieve, &fields_ids_map)?; + let fids = |attrs: &HashSet| { + attrs + .iter() + .filter_map(|name| fields_ids_map.id(name)) + .collect() + }; + + let displayed_ids: HashSet = fields_ids_map.iter().map(|(id, _)| id).collect(); + + let to_retrieve_ids = query + .attributes_to_retrieve + .as_ref() + .map(fids) + .unwrap_or_else(|| displayed_ids.clone()); + + let to_highlight_ids = query + .attributes_to_highlight + .as_ref() + .map(fids) + .unwrap_or_default(); + + let to_crop_ids = query + .attributes_to_crop + .as_ref() + .map(fids) + .unwrap_or_default(); + + // The attributes to retrieve are: + // - the ones explicitly marked as to retrieve that are also in the displayed attributes + let all_attributes: Vec<_> = to_retrieve_ids + .intersection(&displayed_ids) + .cloned() + .collect(); + + // The formatted attributes are: + // - The one in either highlighted attributes or cropped attributes if there are attributes + // to retrieve + // - All the attributes to retrieve if there are either highlighted or cropped attributes + // the request specified that all attributes are to retrieve (i.e attributes to retrieve is + // empty in the query) + let all_formatted = if query.attributes_to_retrieve.is_none() { + if query.attributes_to_highlight.is_some() || query.attributes_to_crop.is_some() { + Cow::Borrowed(&all_attributes) + } else { + Cow::Owned(Vec::new()) + } + } else { + let attrs = (&to_crop_ids | &to_highlight_ids) + .intersection(&displayed_ids) + .cloned() + .collect::>(); + Cow::Owned(attrs) + }; let stop_words = fst::Set::default(); - let highlighter = Highlighter::new(&stop_words); + let highlighter = Highlighter::new( + &stop_words, + (String::from(""), String::from("")), + ); for (_id, obkv) in self.documents(&rtxn, documents_ids)? { - let mut object = - milli::obkv_to_json(&fields_to_display, &fields_ids_map, obkv).unwrap(); - if let Some(ref attributes_to_highlight) = query.attributes_to_highlight { - highlighter.highlight_record(&mut object, &matching_words, attributes_to_highlight); - } + let document = milli::obkv_to_json(&all_attributes, &fields_ids_map, obkv.clone())?; + let formatted = compute_formatted( + &fields_ids_map, + obkv, + &highlighter, + &matching_words, + all_formatted.as_ref().as_slice(), + &to_highlight_ids, + )?; let hit = SearchHit { - document: object, - formatted: Map::new(), + document, + formatted, }; documents.push(hit); } @@ -132,6 +192,38 @@ impl Index { } } +fn compute_formatted>( + field_ids_map: &FieldsIdsMap, + obkv: obkv::KvReader, + highlighter: &Highlighter, + matching_words: &MatchingWords, + all_formatted: &[FieldId], + to_highlight_ids: &HashSet, +) -> anyhow::Result { + let mut document = Document::new(); + + for field in all_formatted { + if let Some(value) = obkv.get(*field) { + let mut value: Value = serde_json::from_slice(value)?; + + if to_highlight_ids.contains(field) { + value = highlighter.highlight_value(value, matching_words); + } + + // This unwrap must be safe since we got the ids from the fields_ids_map just + // before. + let key = field_ids_map + .name(*field) + .expect("Missing field name") + .to_string(); + + document.insert(key, value); + } + } + + Ok(document) +} + fn parse_facets_array( txn: &RoTxn, index: &Index, @@ -163,16 +255,17 @@ fn parse_facets_array( pub struct Highlighter<'a, A> { analyzer: Analyzer<'a, A>, + marks: (String, String), } impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { - pub fn new(stop_words: &'a fst::Set) -> Self { + pub fn new(stop_words: &'a fst::Set, marks: (String, String)) -> Self { let mut config = AnalyzerConfig::default(); config.stop_words(stop_words); let analyzer = Analyzer::new(config); - Self { analyzer } + Self { analyzer, marks } } pub fn highlight_value(&self, value: Value, words_to_highlight: &MatchingWords) -> Value { @@ -187,11 +280,11 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { if token.is_word() { let to_highlight = words_to_highlight.matches(token.text()); if to_highlight { - string.push_str("") + string.push_str(&self.marks.0) } string.push_str(word); if to_highlight { - string.push_str("") + string.push_str(&self.marks.1) } } else { string.push_str(word); @@ -213,21 +306,6 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { ), } } - - pub fn highlight_record( - &self, - object: &mut Map, - words_to_highlight: &MatchingWords, - attributes_to_highlight: &HashSet, - ) { - // TODO do we need to create a string for element that are not and needs to be highlight? - for (key, value) in object.iter_mut() { - if attributes_to_highlight.contains(key) { - let old_value = mem::take(value); - *value = self.highlight_value(old_value, words_to_highlight); - } - } - } } fn parse_facets( diff --git a/meilisearch-http/src/routes/search.rs b/meilisearch-http/src/routes/search.rs index ae2caeb30..86beb2750 100644 --- a/meilisearch-http/src/routes/search.rs +++ b/meilisearch-http/src/routes/search.rs @@ -36,11 +36,11 @@ impl TryFrom for SearchQuery { fn try_from(other: SearchQueryGet) -> anyhow::Result { let attributes_to_retrieve = other .attributes_to_retrieve - .map(|attrs| attrs.split(',').map(String::from).collect::>()); + .map(|attrs| attrs.split(',').map(String::from).collect::>()); let attributes_to_crop = other .attributes_to_crop - .map(|attrs| attrs.split(',').map(String::from).collect::>()); + .map(|attrs| attrs.split(',').map(String::from).collect::>()); let attributes_to_highlight = other .attributes_to_highlight