Merge pull request #679 from MarinPostma/highlight-align-fix

Highlight align fix
This commit is contained in:
Clément Renault 2020-05-14 14:57:54 +02:00 committed by GitHub
commit e2b71b0e57
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 103 additions and 31 deletions

View File

@ -1,6 +1,7 @@
## v0.10.2 ## v0.10.2
- Disable sentry in debug (#681) - Disable sentry in debug (#681)
- Fix highlight misalignment (#679)
- Add support for facet count (#676) - Add support for facet count (#676)
- Add support for faceted search (#631) - Add support for faceted search (#631)
- Add support for configuring the lmdb map size (#646, #647) - Add support for configuring the lmdb map size (#646, #647)

View File

@ -214,7 +214,7 @@ impl<'a> SearchBuilder<'a> {
// Transform to readable matches // Transform to readable matches
if let Some(attributes_to_highlight) = &self.attributes_to_highlight { if let Some(attributes_to_highlight) = &self.attributes_to_highlight {
let matches = calculate_matches( let matches = calculate_matches(
matches.clone(), &matches,
self.attributes_to_highlight.clone(), self.attributes_to_highlight.clone(),
&schema, &schema,
); );
@ -222,7 +222,7 @@ impl<'a> SearchBuilder<'a> {
} }
let matches_info = if self.matches { let matches_info = if self.matches {
Some(calculate_matches(matches, self.attributes_to_retrieve.clone(), &schema)) Some(calculate_matches(&matches, self.attributes_to_retrieve.clone(), &schema))
} else { } else {
None None
}; };
@ -352,10 +352,14 @@ fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize
return (match_index, 1 + text.chars().skip(match_index).take_while(is_word_component).count()); return (match_index, 1 + text.chars().skip(match_index).take_while(is_word_component).count());
} }
let start = match match_index.saturating_sub(context) { let start = match match_index.saturating_sub(context) {
n if n == 0 => n, 0 => 0,
n => word_end_index(n) n => {
let word_end_index = word_end_index(n);
// skip whitespaces if any
word_end_index + text.chars().skip(word_end_index).take_while(char::is_ascii_whitespace).count()
}
}; };
let end = word_end_index(start + 2 * context); let end = word_end_index(match_index + context);
(start, end - start) (start, end - start)
} }
@ -370,15 +374,21 @@ fn crop_text(
let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0); let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
let (start, count) = aligned_crop(text, char_index, context); let (start, count) = aligned_crop(text, char_index, context);
//TODO do something about the double allocation // TODO do something about double allocation
let text = text.chars().skip(start).take(count).collect::<String>().trim().to_string(); let text = text
.chars()
.skip(start)
.take(count)
.collect::<String>()
.trim()
.to_string();
// update matches index to match the new cropped text // update matches index to match the new cropped text
let matches = matches let matches = matches
.take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)) .take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + count)
.map(|match_| Highlight { .map(|m| Highlight {
char_index: match_.char_index - start as u16, char_index: m.char_index - start as u16,
..match_ ..m
}) })
.collect(); .collect();
@ -417,14 +427,14 @@ fn crop_document(
} }
fn calculate_matches( fn calculate_matches(
matches: Vec<Highlight>, matches: &[Highlight],
attributes_to_retrieve: Option<HashSet<String>>, attributes_to_retrieve: Option<HashSet<String>>,
schema: &Schema, schema: &Schema,
) -> MatchesInfos { ) -> MatchesInfos {
let mut matches_result: HashMap<String, Vec<MatchPosition>> = HashMap::new(); let mut matches_result: HashMap<String, Vec<MatchPosition>> = HashMap::new();
for m in matches.iter() { for m in matches.iter() {
if let Some(attribute) = schema.name(FieldId::new(m.attribute)) { if let Some(attribute) = schema.name(FieldId::new(m.attribute)) {
if let Some(attributes_to_retrieve) = attributes_to_retrieve.clone() { if let Some(ref attributes_to_retrieve) = attributes_to_retrieve {
if !attributes_to_retrieve.contains(attribute) { if !attributes_to_retrieve.contains(attribute) {
continue; continue;
} }
@ -470,21 +480,20 @@ fn calculate_highlights(
let longest_matches = matches let longest_matches = matches
.linear_group_by_key(|m| m.start) .linear_group_by_key(|m| m.start)
.map(|group| group.last().unwrap()); .map(|group| group.last().unwrap())
.filter(move |m| m.start >= index);
for m in longest_matches { for m in longest_matches {
if m.start >= index { let before = value.get(index..m.start);
let before = value.get(index..m.start); let highlighted = value.get(m.start..(m.start + m.length));
let highlighted = value.get(m.start..(m.start + m.length)); if let (Some(before), Some(highlighted)) = (before, highlighted) {
if let (Some(before), Some(highlighted)) = (before, highlighted) { highlighted_value.extend(before);
highlighted_value.extend(before); highlighted_value.push_str("<em>");
highlighted_value.push_str("<em>"); highlighted_value.extend(highlighted);
highlighted_value.extend(highlighted); highlighted_value.push_str("</em>");
highlighted_value.push_str("</em>"); index = m.start + m.length;
index = m.start + m.length; } else {
} else { error!("value: {:?}; index: {:?}, match: {:?}", value, index, m);
error!("value: {:?}; index: {:?}, match: {:?}", value, index, m);
}
} }
} }
highlighted_value.extend(value[index..].iter()); highlighted_value.extend(value[index..].iter());
@ -492,7 +501,6 @@ fn calculate_highlights(
}; };
} }
} }
highlight_result highlight_result
} }
@ -524,13 +532,12 @@ mod tests {
// mixed charset // mixed charset
let (start, length) = aligned_crop(&text, 5, 3); let (start, length) = aligned_crop(&text, 5, 3);
let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string(); let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string();
assert_eq!("isの", cropped); assert_eq!("isの", cropped);
// split regular word / CJK word, no space // split regular word / CJK word, no space
let (start, length) = aligned_crop(&text, 7, 1); let (start, length) = aligned_crop(&text, 7, 1);
let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string(); let cropped = text.chars().skip(start).take(length).collect::<String>().trim().to_string();
assert_eq!("のス", cropped); assert_eq!("", cropped);
} }
#[test] #[test]
@ -544,7 +551,7 @@ mod tests {
let schema = Schema::with_primary_key("title"); let schema = Schema::with_primary_key("title");
let matches_result = super::calculate_matches(matches, Some(attributes_to_retrieve), &schema); let matches_result = super::calculate_matches(&matches, Some(attributes_to_retrieve), &schema);
let mut matches_result_expected: HashMap<String, Vec<MatchPosition>> = HashMap::new(); let mut matches_result_expected: HashMap<String, Vec<MatchPosition>> = HashMap::new();

View File

@ -1343,3 +1343,67 @@ async fn test_facet_count() {
let (_response, status_code) = server.search(query).await; let (_response, status_code) = server.search(query).await;
assert_eq!(status_code, 400); assert_eq!(status_code, 400);
} }
#[actix_rt::test]
async fn highlight_cropped_text() {
let mut server = common::Server::with_uid("test");
let body = json!({
"uid": "test",
"primaryKey": "id",
});
server.create_index(body).await;
let doc = json!([
{
"id": 1,
"body": r##"well, it may not work like that, try the following:
1. insert your trip
2. google your `searchQuery`
3. find a solution
> say hello"##
}
]);
server.add_or_replace_multiple_documents(doc).await;
// tests from #680
let query = "q=insert&attributesToHighlight=*&attributesToCrop=body&cropLength=30";
let (response, _status_code) = server.search(query).await;
let expected_response = "that, try the following: \n1. <em>insert</em> your trip\n2. google your";
assert_eq!(response
.get("hits")
.unwrap()
.as_array()
.unwrap()
.get(0)
.unwrap()
.as_object()
.unwrap()
.get("_formatted")
.unwrap()
.as_object()
.unwrap()
.get("body")
.unwrap()
, &Value::String(expected_response.to_owned()));
let query = "q=insert&attributesToHighlight=*&attributesToCrop=body&cropLength=80";
let (response, _status_code) = server.search(query).await;
let expected_response = "well, it may not work like that, try the following: \n1. <em>insert</em> your trip\n2. google your `searchQuery`\n3. find a solution \n> say hello";
assert_eq!(response
.get("hits")
.unwrap()
.as_array()
.unwrap()
.get(0)
.unwrap()
.as_object()
.unwrap()
.get("_formatted")
.unwrap()
.as_object()
.unwrap()
.get("body")
.unwrap()
, &Value::String(expected_response.to_owned()));
}