Hotfix panic for unicode characters

When the highlight bound falls in the middle of a character,
or is out of bounds, we highlight the complete matching word.

note: we should enhance the tokenizer and the Highlighter to match char indices.

Fix #1368
This commit is contained in:
many 2021-07-01 14:49:22 +02:00
parent c11c909bad
commit acfe31151e
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA

View File

@ -580,13 +580,23 @@ impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
// Matcher::match since the call is expensive. // Matcher::match since the call is expensive.
if format_options.highlight && token.is_word() { if format_options.highlight && token.is_word() {
if let Some(length) = matcher.matches(token.text()) { if let Some(length) = matcher.matches(token.text()) {
if format_options.highlight { match word.get(..length).zip(word.get(length..)) {
out.push_str(&self.marks.0); Some((head, tail)) => {
out.push_str(&word[..length]); out.push_str(&self.marks.0);
out.push_str(&self.marks.1); out.push_str(head);
out.push_str(&word[length..]); out.push_str(&self.marks.1);
return out; out.push_str(tail);
}
// if we are in the middle of a character
// or if all the word should be highlighted,
// we highlight the complete word.
None => {
out.push_str(&self.marks.0);
out.push_str(&word);
out.push_str(&self.marks.1);
}
} }
return out;
} }
} }
out.push_str(word); out.push_str(word);
@ -741,6 +751,71 @@ mod test {
assert_eq!(value["author"], "J. R. R. Tolkien"); assert_eq!(value["author"], "J. R. R. Tolkien");
} }
/// Regression test for <https://github.com/meilisearch/MeiliSearch/issues/1368>.
///
/// The title holds an emoji in the middle of a word, and the match length
/// registered for the matching word (11) is larger than the byte length of
/// the original word `Go💼od` (8). The expected output wraps the complete
/// word in the highlight marks.
#[test]
fn formatted_with_highlight_in_unicode_word() {
    // Analyzer without any stop word, and a formatter using `<em>` marks.
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let marks = (String::from("<em>"), String::from("</em>"));
    let formatter = Formatter::new(&analyzer, marks);

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();

    // Both fields are stored as JSON strings in the same buffer,
    // each one written through its own writer.
    let title_json = Value::String("Go💼od luck.".into()).to_string();
    let author_json = Value::String("JacobLey".into()).to_string();
    let mut buf = Vec::new();

    let mut writer = obkv::KvWriter::new(&mut buf);
    writer.insert(title, title_json.as_bytes()).unwrap();
    writer.finish().unwrap();

    let mut writer = obkv::KvWriter::new(&mut buf);
    writer.insert(author, author_json.as_bytes()).unwrap();
    writer.finish().unwrap();

    let document = obkv::KvReader::new(&buf);

    // Only the title is highlighted; neither field is cropped.
    let mut formatted_options = BTreeMap::new();
    for (field, highlight) in vec![(title, true), (author, false)] {
        formatted_options.insert(
            field,
            FormatOptions {
                highlight,
                crop: None,
            },
        );
    }

    // emojis are deunicoded during tokenization, hence the spelling of the
    // matching word below.
    // TODO Tokenizer should remove spaces after deunicode
    let mut matching_words = BTreeMap::new();
    matching_words.insert("gobriefcase od", Some(11));

    let value = format_fields(
        &fields,
        document,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], "<em>Go💼od</em> luck.");
    assert_eq!(value["author"], "JacobLey");
}
#[test] #[test]
fn formatted_with_crop_2() { fn formatted_with_crop_2() {
let stop_words = fst::Set::default(); let stop_words = fst::Set::default();