Rework query highlight/crop parameters

This commit is contained in:
Quentin de Quelen 2020-04-02 19:53:51 +02:00
parent c418abe92d
commit 500eeca3fb
2 changed files with 104 additions and 58 deletions

View File

@ -12,8 +12,8 @@ use meilisearch_core::Filter;
use meilisearch_core::criterion::*;
use meilisearch_core::settings::RankingRule;
use meilisearch_core::{Highlight, Index, MainT, RankedMap};
use meilisearch_tokenizer::is_cjk;
use meilisearch_schema::{FieldId, Schema};
use meilisearch_tokenizer::is_cjk;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use siphasher::sip::SipHasher;
@ -220,36 +220,51 @@ impl<'a> SearchBuilder<'a> {
}
let start = Instant::now();
let result = query_builder.query(reader, &self.query, self.offset..(self.offset + self.limit));
let result =
query_builder.query(reader, &self.query, self.offset..(self.offset + self.limit));
let (docs, nb_hits) = result.map_err(|e| Error::SearchDocuments(e.to_string()))?;
let time_ms = start.elapsed().as_millis() as usize;
let mut all_attributes: HashSet<&str> = HashSet::new();
let mut all_formatted: HashSet<&str> = HashSet::new();
match &self.attributes_to_retrieve {
Some(to_retrieve) => {
all_attributes.extend(to_retrieve.iter().map(String::as_str));
if let Some(to_highlight) = &self.attributes_to_highlight {
all_formatted.extend(to_highlight.iter().map(String::as_str));
}
if let Some(to_crop) = &self.attributes_to_crop {
all_formatted.extend(to_crop.keys().map(String::as_str));
}
all_attributes.extend(&all_formatted);
},
None => {
all_attributes.extend(schema.displayed_name());
// If we specified at least one attribute to highlight or crop then
// all available attributes will be returned in the _formatted field.
if self.attributes_to_highlight.is_some() || self.attributes_to_crop.is_some() {
all_formatted.extend(all_attributes.iter().cloned());
}
},
}
let mut hits = Vec::with_capacity(self.limit);
for doc in docs {
// retrieve the content of document in kv store
let mut fields: Option<HashSet<&str>> = None;
if let Some(attributes_to_retrieve) = &self.attributes_to_retrieve {
let mut set = HashSet::new();
for field in attributes_to_retrieve {
set.insert(field.as_str());
}
fields = Some(set);
}
let document: IndexMap<String, Value> = self
let mut document: IndexMap<String, Value> = self
.index
.document(reader, fields.as_ref(), doc.id)
.document(reader, Some(&all_attributes), doc.id)
.map_err(|e| Error::RetrieveDocument(doc.id.0, e.to_string()))?
.ok_or(Error::DocumentNotFound(doc.id.0))?;
let has_attributes_to_highlight = self.attributes_to_highlight.is_some();
let has_attributes_to_crop = self.attributes_to_crop.is_some();
let mut formatted = document.iter()
.filter(|(key, _)| all_formatted.contains(key.as_str()))
.map(|(k, v)| (k.clone(), v.clone()))
.collect();
let mut formatted = if has_attributes_to_highlight || has_attributes_to_crop {
document.clone()
} else {
IndexMap::new()
};
let mut matches = doc.highlights.clone();
// Crops fields if needed
@ -258,13 +273,24 @@ impl<'a> SearchBuilder<'a> {
}
// Transform to readable matches
let matches = calculate_matches(matches, self.attributes_to_retrieve.clone(), &schema);
if let Some(attributes_to_highlight) = &self.attributes_to_highlight {
let matches = calculate_matches(
matches.clone(),
self.attributes_to_highlight.clone(),
&schema,
);
formatted = calculate_highlights(&formatted, &matches, attributes_to_highlight);
}
let matches_info = if self.matches { Some(matches) } else { None };
let matches_info = if self.matches {
Some(calculate_matches(matches, self.attributes_to_retrieve.clone(), &schema))
} else {
None
};
if let Some(attributes_to_retrieve) = &self.attributes_to_retrieve {
document.retain(|key, _| attributes_to_retrieve.contains(&key.to_string()))
}
let hit = SearchHit {
document,
@ -369,7 +395,7 @@ pub struct SearchResult {
pub query: String,
}
/// Returns the start index and the length of the crop.
/// Returns the start index and the length of the crop.
fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) {
let is_word_component = |c: &char| c.is_alphanumeric() && !is_cjk(*c);
@ -553,8 +579,8 @@ mod tests {
let (start, length) = aligned_crop(&text, 5, 3);
let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string();
assert_eq!("isのス", cropped);
// split regular word / CJK word, no space
// split regular word / CJK word, no space
let (start, length) = aligned_crop(&text, 7, 1);
let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string();
assert_eq!("のス", cropped);

View File

@ -2,6 +2,7 @@ use std::collections::HashMap;
use std::collections::HashSet;
use std::time::Duration;
use log::warn;
use meilisearch_core::Index;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use serde::{Deserialize, Serialize};
@ -53,45 +54,64 @@ pub async fn search_with_url_query(ctx: Request<Data>) -> SResult<Response> {
search_builder.limit(limit);
}
if let Some(attributes_to_retrieve) = query.attributes_to_retrieve {
for attr in attributes_to_retrieve.split(',') {
search_builder.add_retrievable_field(attr.to_string());
let available_attributes = schema.displayed_name();
let mut restricted_attributes: HashSet<&str>;
match &query.attributes_to_retrieve {
Some(attributes_to_retrieve) => {
restricted_attributes = attributes_to_retrieve.split(',').collect();
restricted_attributes.retain(|attr| available_attributes.contains(attr));
},
None => {
restricted_attributes = available_attributes.clone();
}
}
if let Some(attributes_to_crop) = query.attributes_to_crop {
let crop_length = query.crop_length.unwrap_or(200);
if attributes_to_crop == "*" {
let attributes_to_crop = schema
.displayed_name()
.iter()
.map(|attr| (attr.to_string(), crop_length))
.collect();
search_builder.attributes_to_crop(attributes_to_crop);
} else {
let attributes_to_crop = attributes_to_crop
.split(',')
.map(|r| (r.to_string(), crop_length))
.collect();
search_builder.attributes_to_crop(attributes_to_crop);
let default_length = query.crop_length.unwrap_or(200);
let mut final_attributes: HashMap<String, usize> = HashMap::new();
for attribute in attributes_to_crop.split(',') {
let mut attribute = attribute.split(':');
let attr = attribute.next();
let length = attribute.next().and_then(|s| s.parse().ok()).unwrap_or(default_length);
match attr {
Some("*") => {
for attr in &restricted_attributes {
final_attributes.insert(attr.to_string(), length);
}
},
Some(attr) => {
if available_attributes.contains(attr) {
final_attributes.insert(attr.to_string(), length);
} else {
warn!("The attributes {:?} present in attributesToCrop parameter doesn't exist", attr);
}
},
None => (),
}
}
search_builder.attributes_to_crop(final_attributes);
}
if let Some(attributes_to_highlight) = query.attributes_to_highlight {
let attributes_to_highlight = if attributes_to_highlight == "*" {
schema
.displayed_name()
.iter()
.map(|s| s.to_string())
.collect()
} else {
attributes_to_highlight
.split(',')
.map(|s| s.to_string())
.collect()
};
if let Some(inline_attributes) = query.attributes_to_highlight {
let mut final_attributes: HashSet<String> = HashSet::new();
search_builder.attributes_to_highlight(attributes_to_highlight);
for attribute in inline_attributes.split(',') {
if attribute == "*" {
for attr in &restricted_attributes {
final_attributes.insert(attr.to_string());
}
} else {
if available_attributes.contains(attribute) {
final_attributes.insert(attribute.to_string());
} else {
warn!("The attributes {:?} present in attributesToHighlight parameter doesn't exist", attribute);
}
}
}
search_builder.attributes_to_highlight(final_attributes);
}
if let Some(filters) = query.filters {