diff --git a/Cargo.lock b/Cargo.lock index 79e013392..1e5809b17 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -345,9 +345,9 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" [[package]] name = "backtrace" -version = "0.3.56" +version = "0.3.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d117600f438b1707d4e4ae15d3595657288f8235a0eb593e80ecc98ab34e1bc" +checksum = "78ed203b9ba68b242c62b3fb7480f589dd49829be1edb3fe8fc8b4ffda2dcb8d" dependencies = [ "addr2line", "cfg-if 1.0.0", @@ -1661,6 +1661,7 @@ dependencies = [ "milli", "mime", "mockall", + "obkv", "once_cell", "oxidized-json-checker", "parking_lot", @@ -2837,9 +2838,9 @@ dependencies = [ [[package]] name = "slab" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" +checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527" [[package]] name = "slice-group-by" diff --git a/meilisearch-http/Cargo.toml b/meilisearch-http/Cargo.toml index 5ba3acd2b..e7a55d28e 100644 --- a/meilisearch-http/Cargo.toml +++ b/meilisearch-http/Cargo.toml @@ -63,6 +63,7 @@ tokio = { version = "1", features = ["full"] } uuid = "0.8.2" oxidized-json-checker = "0.3.2" walkdir = "2.3.2" +obkv = "0.1.1" [dependencies.sentry] default-features = false diff --git a/meilisearch-http/src/index/mod.rs b/meilisearch-http/src/index/mod.rs index ab5fc6fb5..57a939aaa 100644 --- a/meilisearch-http/src/index/mod.rs +++ b/meilisearch-http/src/index/mod.rs @@ -83,7 +83,7 @@ impl Index { let fields_ids_map = self.fields_ids_map(&txn)?; let fields_to_display = - self.fields_to_display(&txn, attributes_to_retrieve, &fields_ids_map)?; + self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?; let iter = self.documents.range(&txn, &(..))?.skip(offset).take(limit); @@ -108,7 +108,7 @@ impl Index { let fields_ids_map = self.fields_ids_map(&txn)?; let fields_to_display = - self.fields_to_display(&txn, attributes_to_retrieve, &fields_ids_map)?; + self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?; let internal_id = self .external_documents_ids(&txn)? @@ -134,7 +134,7 @@ impl Index { fn fields_to_display>( &self, txn: &heed::RoTxn, - attributes_to_retrieve: Option>, + attributes_to_retrieve: &Option>, fields_ids_map: &milli::FieldsIdsMap, ) -> anyhow::Result> { let mut displayed_fields_ids = match self.displayed_fields_ids(&txn)? { diff --git a/meilisearch-http/src/index/search.rs b/meilisearch-http/src/index/search.rs index 4b9753b82..63e8b7cd9 100644 --- a/meilisearch-http/src/index/search.rs +++ b/meilisearch-http/src/index/search.rs @@ -1,17 +1,21 @@ +use std::borrow::Cow; use std::collections::{BTreeMap, HashSet}; -use std::mem; use std::time::Instant; use anyhow::bail; use either::Either; use heed::RoTxn; +use indexmap::IndexMap; +use itertools::Itertools; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; -use milli::{facet::FacetValue, FacetCondition, MatchingWords}; +use milli::{facet::FacetValue, FacetCondition, FieldId, FieldsIdsMap, MatchingWords}; use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; +use serde_json::Value; use super::Index; +pub type Document = IndexMap; + pub const DEFAULT_SEARCH_LIMIT: usize = 20; const fn default_search_limit() -> usize { @@ -25,8 +29,8 @@ pub struct SearchQuery { pub offset: Option, #[serde(default = "default_search_limit")] pub limit: usize, - pub attributes_to_retrieve: Option>, - pub attributes_to_crop: Option>, + pub attributes_to_retrieve: Option>, + pub attributes_to_crop: Option>, pub crop_length: Option, pub attributes_to_highlight: Option>, pub filters: Option, @@ -35,10 +39,18 @@ pub struct SearchQuery { pub facet_distributions: Option>, } +#[derive(Debug, Clone, Serialize)] +pub struct SearchHit { + #[serde(flatten)] + pub document: Document, + #[serde(rename = "_formatted", skip_serializing_if = "Document::is_empty")] + pub formatted: Document, +} + #[derive(Serialize)] #[serde(rename_all = "camelCase")] pub struct SearchResult { - pub hits: Vec>, + pub hits: Vec, pub nb_hits: u64, pub exhaustive_nb_hits: bool, pub query: String, @@ -78,19 +90,92 @@ impl Index { let mut documents = Vec::new(); let fields_ids_map = self.fields_ids_map(&rtxn).unwrap(); - let fields_to_display = - self.fields_to_display(&rtxn, query.attributes_to_retrieve, &fields_ids_map)?; + let displayed_ids = self.displayed_fields_ids(&rtxn)? + .map(|fields| fields.into_iter().collect::>()) + .unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); + + let fids = |attrs: &HashSet| { + let mut ids = HashSet::new(); + for attr in attrs { + if attr == "*" { + ids = displayed_ids.clone(); + break; + } + + if let Some(id) = fields_ids_map.id(attr) { + ids.insert(id); + } + } + ids + }; + + let to_retrieve_ids = query + .attributes_to_retrieve + .as_ref() + .map(fids) + .unwrap_or_else(|| displayed_ids.clone()); + + let to_highlight_ids = query + .attributes_to_highlight + .as_ref() + .map(fids) + .unwrap_or_default(); + + let to_crop_ids = query + .attributes_to_crop + .as_ref() + .map(fids) + .unwrap_or_default(); + + // The attributes to retrieve are: + // - the ones explicitly marked as to retrieve that are also in the displayed attributes + let all_attributes: Vec<_> = to_retrieve_ids + .intersection(&displayed_ids) + .cloned() + .sorted() + .collect(); + + // The formatted attributes are: + // - The one in either highlighted attributes or cropped attributes if there are attributes + // to retrieve + // - All the attributes to retrieve if there are either highlighted or cropped attributes + // the request specified that all attributes are to retrieve (i.e attributes to retrieve is + // empty in the query) + let all_formatted = if query.attributes_to_retrieve.is_none() { + if query.attributes_to_highlight.is_some() || query.attributes_to_crop.is_some() { + Cow::Borrowed(&all_attributes) + } else { + Cow::Owned(Vec::new()) + } + } else { + let attrs = (&to_crop_ids | &to_highlight_ids) + .intersection(&displayed_ids) + .cloned() + .collect::>(); + Cow::Owned(attrs) + }; let stop_words = fst::Set::default(); - let highlighter = Highlighter::new(&stop_words); + let highlighter = Highlighter::new( + &stop_words, + (String::from(""), String::from("")), + ); for (_id, obkv) in self.documents(&rtxn, documents_ids)? { - let mut object = - milli::obkv_to_json(&fields_to_display, &fields_ids_map, obkv).unwrap(); - if let Some(ref attributes_to_highlight) = query.attributes_to_highlight { - highlighter.highlight_record(&mut object, &matching_words, attributes_to_highlight); - } - documents.push(object); + let document = make_document(&all_attributes, &fields_ids_map, obkv.clone())?; + let formatted = compute_formatted( + &fields_ids_map, + obkv, + &highlighter, + &matching_words, + all_formatted.as_ref().as_slice(), + &to_highlight_ids, + )?; + let hit = SearchHit { + document, + formatted, + }; + documents.push(hit); } let nb_hits = candidates.len(); @@ -120,6 +205,79 @@ impl Index { } } +fn make_document( + attributes_to_retrieve: &[FieldId], + field_ids_map: &FieldsIdsMap, + obkv: obkv::KvReader, +) -> anyhow::Result { + let mut document = Document::new(); + for attr in attributes_to_retrieve { + if let Some(value) = obkv.get(*attr) { + let value = serde_json::from_slice(value)?; + + // This unwrap must be safe since we got the ids from the fields_ids_map just + // before. + let key = field_ids_map + .name(*attr) + .expect("Missing field name") + .to_string(); + + document.insert(key, value); + } + } + Ok(document) +} + +fn compute_formatted>( + field_ids_map: &FieldsIdsMap, + obkv: obkv::KvReader, + highlighter: &Highlighter, + matching_words: &impl Matcher, + all_formatted: &[FieldId], + to_highlight_ids: &HashSet, +) -> anyhow::Result { + let mut document = Document::new(); + + for field in all_formatted { + if let Some(value) = obkv.get(*field) { + let mut value: Value = serde_json::from_slice(value)?; + + if to_highlight_ids.contains(field) { + value = highlighter.highlight_value(value, matching_words); + } + + // This unwrap must be safe since we got the ids from the fields_ids_map just + // before. + let key = field_ids_map + .name(*field) + .expect("Missing field name") + .to_string(); + + document.insert(key, value); + } + } + + Ok(document) +} + +/// trait to allow unit testing of `compute_formated` +trait Matcher { + fn matches(&self, w: &str) -> bool; +} + +#[cfg(test)] +impl Matcher for HashSet { + fn matches(&self, w: &str) -> bool { + self.contains(w) + } +} + +impl Matcher for MatchingWords { + fn matches(&self, w: &str) -> bool { + self.matches(w) + } +} + fn parse_facets_array( txn: &RoTxn, index: &Index, @@ -149,21 +307,22 @@ fn parse_facets_array( FacetCondition::from_array(txn, &index.0, ands) } -pub struct Highlighter<'a, A> { +struct Highlighter<'a, A> { analyzer: Analyzer<'a, A>, + marks: (String, String), } impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { - pub fn new(stop_words: &'a fst::Set) -> Self { + pub fn new(stop_words: &'a fst::Set, marks: (String, String)) -> Self { let mut config = AnalyzerConfig::default(); config.stop_words(stop_words); let analyzer = Analyzer::new(config); - Self { analyzer } + Self { analyzer, marks } } - pub fn highlight_value(&self, value: Value, words_to_highlight: &MatchingWords) -> Value { + fn highlight_value(&self, value: Value, words_to_highlight: &impl Matcher) -> Value { match value { Value::Null => Value::Null, Value::Bool(boolean) => Value::Bool(boolean), @@ -175,11 +334,11 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { if token.is_word() { let to_highlight = words_to_highlight.matches(token.text()); if to_highlight { - string.push_str("") + string.push_str(&self.marks.0) } string.push_str(word); if to_highlight { - string.push_str("") + string.push_str(&self.marks.1) } } else { string.push_str(word); @@ -201,21 +360,6 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { ), } } - - pub fn highlight_record( - &self, - object: &mut Map, - words_to_highlight: &MatchingWords, - attributes_to_highlight: &HashSet, - ) { - // TODO do we need to create a string for element that are not and needs to be highlight? - for (key, value) in object.iter_mut() { - if attributes_to_highlight.contains(key) { - let old_value = mem::take(value); - *value = self.highlight_value(old_value, words_to_highlight); - } - } - } } fn parse_facets( @@ -230,3 +374,115 @@ fn parse_facets( v => bail!("Invalid facet expression, expected Array, found: {:?}", v), } } + +#[cfg(test)] +mod test { + use std::iter::FromIterator; + + use super::*; + + #[test] + fn no_formatted() { + let stop_words = fst::Set::default(); + let highlighter = Highlighter::new( + &stop_words, + (String::from(""), String::from("")), + ); + + let mut fields = FieldsIdsMap::new(); + let id = fields.insert("test").unwrap(); + + let mut buf = Vec::new(); + let mut obkv = obkv::KvWriter::new(&mut buf); + obkv.insert(id, Value::String("hello".into()).to_string().as_bytes()).unwrap(); + obkv.finish().unwrap(); + + let obkv = obkv::KvReader::new(&buf); + + let all_formatted = Vec::new(); + let to_highlight_ids = HashSet::new(); + + let matching_words = MatchingWords::default(); + + let value = compute_formatted( + &fields, + obkv, + &highlighter, + &matching_words, + &all_formatted, + &to_highlight_ids + ).unwrap(); + + assert!(value.is_empty()); + } + + #[test] + fn formatted_no_highlight() { + let stop_words = fst::Set::default(); + let highlighter = Highlighter::new( + &stop_words, + (String::from(""), String::from("")), + ); + + let mut fields = FieldsIdsMap::new(); + let id = fields.insert("test").unwrap(); + + let mut buf = Vec::new(); + let mut obkv = obkv::KvWriter::new(&mut buf); + obkv.insert(id, Value::String("hello".into()).to_string().as_bytes()).unwrap(); + obkv.finish().unwrap(); + + let obkv = obkv::KvReader::new(&buf); + + let all_formatted = vec![id]; + let to_highlight_ids = HashSet::new(); + + let matching_words = MatchingWords::default(); + + let value = compute_formatted( + &fields, + obkv, + &highlighter, + &matching_words, + &all_formatted, + &to_highlight_ids + ).unwrap(); + + assert_eq!(value["test"], "hello"); + } + + #[test] + fn formatted_with_highlight() { + let stop_words = fst::Set::default(); + let highlighter = Highlighter::new( + &stop_words, + (String::from(""), String::from("")), + ); + + let mut fields = FieldsIdsMap::new(); + let id = fields.insert("test").unwrap(); + + let mut buf = Vec::new(); + let mut obkv = obkv::KvWriter::new(&mut buf); + obkv.insert(id, Value::String("hello".into()).to_string().as_bytes()).unwrap(); + obkv.finish().unwrap(); + + let obkv = obkv::KvReader::new(&buf); + + let all_formatted = vec![id]; + let to_highlight_ids = HashSet::from_iter(Some(id)); + + let matching_words = HashSet::from_iter(Some(String::from("hello"))); + + let value = compute_formatted( + &fields, + obkv, + &highlighter, + &matching_words, + &all_formatted, + &to_highlight_ids + ).unwrap(); + + assert_eq!(value["test"], "hello"); + } +} diff --git a/meilisearch-http/src/routes/index.rs b/meilisearch-http/src/routes/index.rs index d5fac5170..4424c8cfe 100644 --- a/meilisearch-http/src/routes/index.rs +++ b/meilisearch-http/src/routes/index.rs @@ -1,7 +1,6 @@ use actix_web::{delete, get, post, put}; use actix_web::{web, HttpResponse}; -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use crate::error::ResponseError; use crate::helpers::Authentication; @@ -69,16 +68,6 @@ struct UpdateIndexRequest { primary_key: Option, } -#[derive(Debug, Serialize)] -#[serde(rename_all = "camelCase")] -struct UpdateIndexResponse { - name: String, - uid: String, - created_at: DateTime, - updated_at: DateTime, - primary_key: Option, -} - #[put("/indexes/{index_uid}", wrap = "Authentication::Private")] async fn update_index( data: web::Data, diff --git a/meilisearch-http/src/routes/search.rs b/meilisearch-http/src/routes/search.rs index ae2caeb30..86beb2750 100644 --- a/meilisearch-http/src/routes/search.rs +++ b/meilisearch-http/src/routes/search.rs @@ -36,11 +36,11 @@ impl TryFrom for SearchQuery { fn try_from(other: SearchQueryGet) -> anyhow::Result { let attributes_to_retrieve = other .attributes_to_retrieve - .map(|attrs| attrs.split(',').map(String::from).collect::>()); + .map(|attrs| attrs.split(',').map(String::from).collect::>()); let attributes_to_crop = other .attributes_to_crop - .map(|attrs| attrs.split(',').map(String::from).collect::>()); + .map(|attrs| attrs.split(',').map(String::from).collect::>()); let attributes_to_highlight = other .attributes_to_highlight