diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 593dba3e5..f45a85753 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -10,7 +10,7 @@ anyhow = "1.0.38"
byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
crossbeam-channel = "0.5.0"
heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
memmap2 = "0.5.0"
milli = { path = "../milli" }
once_cell = "1.5.2"
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 75a9012c6..6502bf83a 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -160,13 +160,19 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
let analyzed = self.analyzer.analyze(&old_string);
for (word, token) in analyzed.reconstruct() {
if token.is_word() {
- let to_highlight = matching_words.matching_bytes(token.text()).is_some();
- if to_highlight {
- string.push_str("<mark>")
- }
- string.push_str(word);
- if to_highlight {
- string.push_str("</mark>")
+ match matching_words.matching_bytes(&token) {
+ Some(chars_to_highlight) => {
+ let mut chars = word.chars();
+
+ string.push_str("<mark>");
+ // push the part to highlight
+ string.extend(chars.by_ref().take(chars_to_highlight));
+ string.push_str("</mark>");
+ // push the suffix after highlight
+ string.extend(chars);
+ }
+ // no highlight
+ None => string.push_str(word),
}
} else {
string.push_str(word);
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index a3d8cf627..3d77654eb 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -22,7 +22,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe
human_format = "1.0.3"
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
linked-hash-map = "0.5.4"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
memmap2 = "0.5.0"
obkv = "0.2.0"
once_cell = "1.5.2"
diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs
index 37754a782..67bdefb37 100644
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet};
use std::ops::{Index, IndexMut};
use levenshtein_automata::{Distance, DFA};
+use meilisearch_tokenizer::Token;
use super::build_dfa;
use crate::search::query_tree::{Operation, Query};
@@ -33,15 +34,15 @@ impl MatchingWords {
}
/// Returns the number of matching bytes if the word matches one of the query words.
- pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> {
+ pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
- match dfa.eval(word_to_highlight) {
+ match dfa.eval(word_to_highlight.text()) {
Distance::Exact(t) if t <= *typo => {
if *is_prefix {
- let len = bytes_to_highlight(word_to_highlight, query_word);
- Some(len)
+ let len = bytes_to_highlight(word_to_highlight.text(), query_word);
+ Some(word_to_highlight.num_chars_from_bytes(len))
} else {
- Some(word_to_highlight.len())
+ Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))
}
}
_otherwise => None,
@@ -178,8 +179,11 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize {
#[cfg(test)]
mod tests {
+ use std::borrow::Cow;
use std::str::from_utf8;
+ use meilisearch_tokenizer::TokenKind;
+
use super::*;
use crate::search::query_tree::{Operation, Query, QueryKind};
use crate::MatchingWords;
@@ -269,12 +273,82 @@ mod tests {
let matching_words = MatchingWords::from_query_tree(&query_tree);
- assert_eq!(matching_words.matching_bytes("word"), Some(3));
- assert_eq!(matching_words.matching_bytes("nyc"), None);
- assert_eq!(matching_words.matching_bytes("world"), Some(5));
- assert_eq!(matching_words.matching_bytes("splitted"), Some(5));
- assert_eq!(matching_words.matching_bytes("thisnew"), None);
- assert_eq!(matching_words.matching_bytes("borld"), Some(5));
- assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4));
+ assert_eq!(
+ matching_words.matching_bytes(&Token {
+ kind: TokenKind::Word,
+ word: Cow::Borrowed("word"),
+ byte_start: 0,
+ char_index: 0,
+ byte_end: "word".len(),
+ char_map: None,
+ }),
+ Some(3)
+ );
+ assert_eq!(
+ matching_words.matching_bytes(&Token {
+ kind: TokenKind::Word,
+ word: Cow::Borrowed("nyc"),
+ byte_start: 0,
+ char_index: 0,
+ byte_end: "nyc".len(),
+ char_map: None,
+ }),
+ None
+ );
+ assert_eq!(
+ matching_words.matching_bytes(&Token {
+ kind: TokenKind::Word,
+ word: Cow::Borrowed("world"),
+ byte_start: 0,
+ char_index: 0,
+ byte_end: "world".len(),
+ char_map: None,
+ }),
+ Some(5)
+ );
+ assert_eq!(
+ matching_words.matching_bytes(&Token {
+ kind: TokenKind::Word,
+ word: Cow::Borrowed("splitted"),
+ byte_start: 0,
+ char_index: 0,
+ byte_end: "splitted".len(),
+ char_map: None,
+ }),
+ Some(5)
+ );
+ assert_eq!(
+ matching_words.matching_bytes(&Token {
+ kind: TokenKind::Word,
+ word: Cow::Borrowed("thisnew"),
+ byte_start: 0,
+ char_index: 0,
+ byte_end: "thisnew".len(),
+ char_map: None,
+ }),
+ None
+ );
+ assert_eq!(
+ matching_words.matching_bytes(&Token {
+ kind: TokenKind::Word,
+ word: Cow::Borrowed("borld"),
+ byte_start: 0,
+ char_index: 0,
+ byte_end: "borld".len(),
+ char_map: None,
+ }),
+ Some(5)
+ );
+ assert_eq!(
+ matching_words.matching_bytes(&Token {
+ kind: TokenKind::Word,
+ word: Cow::Borrowed("wordsplit"),
+ byte_start: 0,
+ char_index: 0,
+ byte_end: "wordsplit".len(),
+ char_map: None,
+ }),
+ Some(4)
+ );
}
}