From 30247d70cd3da8f85c960d14f5af291257328a2e Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Fri, 17 Dec 2021 22:53:34 +0530
Subject: [PATCH 1/6] Fix search highlight for non-unicode chars

The `matching_bytes` function takes a `&Token` now and:
- gets the number of bytes to highlight (unchanged).
- uses `Token.num_graphemes_from_bytes` to get the number of grapheme
  clusters to highlight.

In essence, the `matching_bytes` function returns the number of matching
grapheme clusters instead of bytes. Should this function be renamed then?

Added proper highlighting in the HTTP UI:
- requires dependency on `unicode-segmentation` to extract grapheme
  clusters from tokens
- `<mark>` tag is put around only the matched part
- before this change, the entire word was highlighted even if only a part
  of it matched
---
 http-ui/Cargo.toml                 |  1 +
 http-ui/src/main.rs                | 23 ++++++++++++++++-------
 milli/src/search/matching_words.rs | 14 +++++++++-----
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 593dba3e5..79c784fdd 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -17,6 +17,7 @@ once_cell = "1.5.2"
 rayon = "1.5.0"
 structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] }
 tempfile = "3.2.0"
+unicode-segmentation = "1.6.0"
 
 # http server
 askama = "0.10.5"
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 75a9012c6..386f10cb4 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -34,6 +34,7 @@ use structopt::StructOpt;
 use tokio::fs::File as TFile;
 use tokio::io::AsyncWriteExt;
 use tokio::sync::broadcast;
+use unicode_segmentation::UnicodeSegmentation;
 use warp::filters::ws::Message;
 use warp::http::Response;
 use warp::Filter;
@@ -160,13 +161,21 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
         let analyzed = self.analyzer.analyze(&old_string);
         for (word, token) in analyzed.reconstruct() {
             if token.is_word() {
-                let to_highlight = matching_words.matching_bytes(token.text()).is_some();
-                if to_highlight {
-                    string.push_str("<mark>")
-                }
-                string.push_str(word);
-                if to_highlight {
-                    string.push_str("</mark>")
+                let chars_to_highlight = matching_words.matching_bytes(&token).unwrap_or(0);
+                if chars_to_highlight > 0 {
+                    let graphemes = word.graphemes(true);
+                    let chars = graphemes.clone().into_iter();
+
+                    string.push_str("<mark>");
+                    string.push_str(
+                        chars.take(chars_to_highlight).collect::<String>().as_str(),
+                    );
+                    string.push_str("</mark>");
+
+                    let chars = graphemes.into_iter().skip(chars_to_highlight);
+                    string.push_str(chars.collect::<String>().as_str());
+                } else {
+                    string.push_str(word);
                 }
             } else {
                 string.push_str(word);
diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs
index 37754a782..b22335658 100644
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet};
 use std::ops::{Index, IndexMut};
 
 use levenshtein_automata::{Distance, DFA};
+use meilisearch_tokenizer::Token;
 
 use super::build_dfa;
 use crate::search::query_tree::{Operation, Query};
@@ -33,15 +34,18 @@ impl MatchingWords {
     }
 
     /// Returns the number of matching bytes if the word matches one of the query words.
-    pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> {
+    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
         self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
-            match dfa.eval(word_to_highlight) {
+            match dfa.eval(word_to_highlight.text()) {
                 Distance::Exact(t) if t <= *typo => {
                     if *is_prefix {
-                        let len = bytes_to_highlight(word_to_highlight, query_word);
-                        Some(len)
+                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
+                        Some(word_to_highlight.num_graphemes_from_bytes(len))
                     } else {
-                        Some(word_to_highlight.len())
+                        Some(
+                            word_to_highlight
+                                .num_graphemes_from_bytes(word_to_highlight.text().len()),
+                        )
                     }
                 }
                 _otherwise => None,

From e752bd06f7f5a2e0221a484029d31b1f705155dc Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Fri, 17 Dec 2021 23:26:06 +0530
Subject: [PATCH 2/6] Fix matching_words tests to compile successfully

The tests still fail due to a bug in
https://github.com/meilisearch/tokenizer/pull/59
---
 milli/src/search/matching_words.rs | 66 ++++++++++++++++++++++++++----
 1 file changed, 59 insertions(+), 7 deletions(-)

diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs
index b22335658..6df2e0121 100644
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -182,8 +182,11 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize {
 
 #[cfg(test)]
 mod tests {
+    use std::borrow::Cow;
     use std::str::from_utf8;
 
+    use meilisearch_tokenizer::TokenKind;
+
     use super::*;
     use crate::search::query_tree::{Operation, Query, QueryKind};
     use crate::MatchingWords;
@@ -273,12 +276,61 @@ mod tests {
 
         let matching_words = MatchingWords::from_query_tree(&query_tree);
 
-        assert_eq!(matching_words.matching_bytes("word"), Some(3));
-        assert_eq!(matching_words.matching_bytes("nyc"), None);
-        assert_eq!(matching_words.matching_bytes("world"), Some(5));
-        assert_eq!(matching_words.matching_bytes("splitted"), Some(5));
-        assert_eq!(matching_words.matching_bytes("thisnew"), None);
-        assert_eq!(matching_words.matching_bytes("borld"), Some(5));
-        assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4));
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("word"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "word".len(),
+            char_map: None,
+        }), Some(3));
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("nyc"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "nyc".len(),
+            char_map: None,
+        }), None);
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("world"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "world".len(),
+            char_map: None,
+        }), Some(5));
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("splitted"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "splitted".len(),
+            char_map: None,
+        }), Some(5));
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("thisnew"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "thisnew".len(),
+            char_map: None,
+        }), None);
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("borld"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "borld".len(),
+            char_map: None,
+        }), Some(5));
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("wordsplit"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "wordsplit".len(),
+            char_map: None,
+        }), Some(4));
     }
 }

From c10f58b7bdbf7ccf958c24e1cb628963af060972 Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Mon, 17 Jan 2022 13:02:00 +0530
Subject: [PATCH 3/6] Update tokenizer to v0.2.7

---
 http-ui/Cargo.toml | 2 +-
 milli/Cargo.toml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 79c784fdd..7406a1c1b 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -10,7 +10,7 @@ anyhow = "1.0.38"
 byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
 crossbeam-channel = "0.5.0"
 heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 milli = { path = "../milli" }
 once_cell = "1.5.2"
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index a3d8cf627..3d77654eb 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -22,7 +22,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe
 human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.4"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 obkv = "0.2.0"
 once_cell = "1.5.2"

From 5ab505be33256c1d338d9abeef2f78633c4cd89a Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Mon, 17 Jan 2022 13:02:55 +0530
Subject: [PATCH 4/6] Fix highlight by replacing num_graphemes_from_bytes

num_graphemes_from_bytes has been renamed in the tokenizer to
num_chars_from_bytes.

Highlight now works correctly!
---
 milli/src/search/matching_words.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs
index 6df2e0121..74ff14382 100644
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -40,11 +40,11 @@ impl MatchingWords {
                 Distance::Exact(t) if t <= *typo => {
                     if *is_prefix {
                         let len = bytes_to_highlight(word_to_highlight.text(), query_word);
-                        Some(word_to_highlight.num_graphemes_from_bytes(len))
+                        Some(word_to_highlight.num_chars_from_bytes(len))
                     } else {
                         Some(
                             word_to_highlight
-                                .num_graphemes_from_bytes(word_to_highlight.text().len()),
+                                .num_chars_from_bytes(word_to_highlight.text().len()),
                         )
                     }
                 }

From 2d7607734eaf8baf343da07290170d0eadec4e53 Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Mon, 17 Jan 2022 13:04:33 +0530
Subject: [PATCH 5/6] Run cargo fmt on matching_words.rs

---
 milli/src/search/matching_words.rs | 138 ++++++++++++++++-------------
 1 file changed, 78 insertions(+), 60 deletions(-)

diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs
index 74ff14382..67bdefb37 100644
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -42,10 +42,7 @@ impl MatchingWords {
                         let len = bytes_to_highlight(word_to_highlight.text(), query_word);
                         Some(word_to_highlight.num_chars_from_bytes(len))
                     } else {
-                        Some(
-                            word_to_highlight
-                                .num_chars_from_bytes(word_to_highlight.text().len()),
-                        )
+                        Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))
                     }
                 }
                 _otherwise => None,
@@ -276,61 +273,82 @@ mod tests {
 
         let matching_words = MatchingWords::from_query_tree(&query_tree);
 
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("word"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "word".len(),
-            char_map: None,
-        }), Some(3));
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("nyc"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "nyc".len(),
-            char_map: None,
-        }), None);
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("world"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "world".len(),
-            char_map: None,
-        }), Some(5));
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("splitted"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "splitted".len(),
-            char_map: None,
-        }), Some(5));
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("thisnew"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "thisnew".len(),
-            char_map: None,
-        }), None);
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("borld"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "borld".len(),
-            char_map: None,
-        }), Some(5));
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("wordsplit"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "wordsplit".len(),
-            char_map: None,
-        }), Some(4));
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("word"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "word".len(),
+                char_map: None,
+            }),
+            Some(3)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("nyc"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "nyc".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("world"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "world".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("splitted"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "splitted".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("thisnew"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "thisnew".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("borld"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "borld".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("wordsplit"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "wordsplit".len(),
+                char_map: None,
+            }),
+            Some(4)
+        );
     }
 }

From c0313f3026cb8577c4035fd72394772842dec4b4 Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Mon, 17 Jan 2022 13:10:44 +0530
Subject: [PATCH 6/6] Use chars for highlight instead of graphemes

Tokenizer v0.2.7 uses chars instead of graphemes for matching bytes.
`unicode-segmentation` dependency isn't needed anymore.

Also, oxidised the highlight code :)

Co-authored-by: many
---
 http-ui/Cargo.toml  |  1 -
 http-ui/src/main.rs | 27 ++++++++++++---------------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 7406a1c1b..f45a85753 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -17,7 +17,6 @@ once_cell = "1.5.2"
 rayon = "1.5.0"
 structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] }
 tempfile = "3.2.0"
-unicode-segmentation = "1.6.0"
 
 # http server
 askama = "0.10.5"
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 386f10cb4..6502bf83a 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -34,7 +34,6 @@ use structopt::StructOpt;
 use tokio::fs::File as TFile;
 use tokio::io::AsyncWriteExt;
 use tokio::sync::broadcast;
-use unicode_segmentation::UnicodeSegmentation;
 use warp::filters::ws::Message;
 use warp::http::Response;
 use warp::Filter;
@@ -161,21 +160,19 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
         let analyzed = self.analyzer.analyze(&old_string);
         for (word, token) in analyzed.reconstruct() {
             if token.is_word() {
-                let chars_to_highlight = matching_words.matching_bytes(&token).unwrap_or(0);
-                if chars_to_highlight > 0 {
-                    let graphemes = word.graphemes(true);
-                    let chars = graphemes.clone().into_iter();
+                match matching_words.matching_bytes(&token) {
+                    Some(chars_to_highlight) => {
+                        let mut chars = word.chars();
 
-                    string.push_str("<mark>");
-                    string.push_str(
-                        chars.take(chars_to_highlight).collect::<String>().as_str(),
-                    );
-                    string.push_str("</mark>");
-
-                    let chars = graphemes.into_iter().skip(chars_to_highlight);
-                    string.push_str(chars.collect::<String>().as_str());
-                } else {
-                    string.push_str(word);
+                        string.push_str("<mark>");
+                        // push the part to highlight
+                        string.extend(chars.by_ref().take(chars_to_highlight));
+                        string.push_str("</mark>");
+                        // push the suffix after highlight
+                        string.extend(chars);
+                    }
+                    // no highlight
+                    None => string.push_str(word),
                 }
             } else {
                 string.push_str(word);
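
For reference, a minimal standalone sketch of the char-based highlighting
strategy that PATCH 6 lands on. The `highlight` helper and the sample inputs
are illustrative only, not part of the patches; `chars_to_highlight` stands in
for the value returned by `MatchingWords::matching_bytes`, which after PATCH 4
counts chars rather than bytes. Splitting with `str::chars()` keeps multi-byte
characters intact, where a byte offset such as `&word[..n]` could panic on a
non-ASCII boundary.

// Hypothetical helper mirroring the PATCH 6 logic outside the Highlighter.
fn highlight(word: &str, chars_to_highlight: Option<usize>) -> String {
    let mut string = String::new();
    match chars_to_highlight {
        Some(n) => {
            let mut chars = word.chars();
            string.push_str("<mark>");
            // push the matched prefix, advancing the iterator in place
            string.extend(chars.by_ref().take(n));
            string.push_str("</mark>");
            // push the unmatched suffix
            string.extend(chars);
        }
        // no match: push the word unchanged
        None => string.push_str(word),
    }
    string
}

fn main() {
    // A 2-char match on "héllo": char counting keeps the 2-byte "é" whole,
    // while slicing at byte offset 2 would split it and panic.
    assert_eq!(highlight("héllo", Some(2)), "<mark>hé</mark>llo");
    assert_eq!(highlight("hello", None), "hello");
}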