From 30247d70cd3da8f85c960d14f5af291257328a2e Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Fri, 17 Dec 2021 22:53:34 +0530
Subject: [PATCH 1/6] Fix search highlight for non-unicode chars

The `matching_bytes` function takes a `&Token` now and:
- gets the number of bytes to highlight (unchanged).
- uses `Token.num_graphemes_from_bytes` to get the number of grapheme
  clusters to highlight.

In essence, the `matching_bytes` function returns the number of matching
grapheme clusters instead of bytes. Should this function be renamed then?

Added proper highlighting in the HTTP UI:
- requires dependency on `unicode-segmentation` to extract grapheme
  clusters from tokens
- `<mark>` tag is put around only the matched part
- before this change, the entire word was highlighted even if only a part
  of it matched
---
 http-ui/Cargo.toml                 |  1 +
 http-ui/src/main.rs                | 23 ++++++++++++++++-------
 milli/src/search/matching_words.rs | 14 +++++++++-----
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 593dba3e5..79c784fdd 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -17,6 +17,7 @@ once_cell = "1.5.2"
 rayon = "1.5.0"
 structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] }
 tempfile = "3.2.0"
+unicode-segmentation = "1.6.0"
 
 # http server
 askama = "0.10.5"
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 75a9012c6..386f10cb4 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -34,6 +34,7 @@ use structopt::StructOpt;
 use tokio::fs::File as TFile;
 use tokio::io::AsyncWriteExt;
 use tokio::sync::broadcast;
+use unicode_segmentation::UnicodeSegmentation;
 use warp::filters::ws::Message;
 use warp::http::Response;
 use warp::Filter;
@@ -160,13 +161,21 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
         let analyzed = self.analyzer.analyze(&old_string);
         for (word, token) in analyzed.reconstruct() {
             if token.is_word() {
-                let to_highlight = matching_words.matching_bytes(token.text()).is_some();
-                if to_highlight {
-                    string.push_str("<mark>")
-                }
-                string.push_str(word);
-                if to_highlight {
-                    string.push_str("</mark>")
+                let chars_to_highlight = matching_words.matching_bytes(&token).unwrap_or(0);
+                if chars_to_highlight > 0 {
+                    let graphemes = word.graphemes(true);
+                    let chars = graphemes.clone().into_iter();
+
+                    string.push_str("<mark>");
+                    string.push_str(
+                        chars.take(chars_to_highlight).collect::<String>().as_str(),
+                    );
+                    string.push_str("</mark>");
+
+                    let chars = graphemes.into_iter().skip(chars_to_highlight);
+                    string.push_str(chars.collect::<String>().as_str());
+                } else {
+                    string.push_str(word);
                 }
             } else {
                 string.push_str(word);
diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs
index 37754a782..b22335658 100644
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet};
 use std::ops::{Index, IndexMut};
 
 use levenshtein_automata::{Distance, DFA};
+use meilisearch_tokenizer::Token;
 
 use super::build_dfa;
 use crate::search::query_tree::{Operation, Query};
@@ -33,15 +34,18 @@ impl MatchingWords {
     }
 
     /// Returns the number of matching bytes if the word matches one of the query words.
-    pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> {
+    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
         self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
-            match dfa.eval(word_to_highlight) {
+            match dfa.eval(word_to_highlight.text()) {
                 Distance::Exact(t) if t <= *typo => {
                     if *is_prefix {
-                        let len = bytes_to_highlight(word_to_highlight, query_word);
-                        Some(len)
+                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
+                        Some(word_to_highlight.num_graphemes_from_bytes(len))
                     } else {
-                        Some(word_to_highlight.len())
+                        Some(
+                            word_to_highlight
+                                .num_graphemes_from_bytes(word_to_highlight.text().len()),
+                        )
                     }
                 }
                 _otherwise => None,

From e752bd06f7f5a2e0221a484029d31b1f705155dc Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Fri, 17 Dec 2021 23:26:06 +0530
Subject: [PATCH 2/6] Fix matching_words tests to compile successfully

The tests still fail due to a bug in
https://github.com/meilisearch/tokenizer/pull/59
---
 milli/src/search/matching_words.rs | 66 ++++++++++++++++++++++++++----
 1 file changed, 59 insertions(+), 7 deletions(-)

diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs
index b22335658..6df2e0121 100644
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -182,8 +182,11 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize {
 
 #[cfg(test)]
 mod tests {
+    use std::borrow::Cow;
     use std::str::from_utf8;
 
+    use meilisearch_tokenizer::TokenKind;
+
     use super::*;
     use crate::search::query_tree::{Operation, Query, QueryKind};
     use crate::MatchingWords;
@@ -273,12 +276,61 @@ mod tests {
 
         let matching_words = MatchingWords::from_query_tree(&query_tree);
 
-        assert_eq!(matching_words.matching_bytes("word"), Some(3));
-        assert_eq!(matching_words.matching_bytes("nyc"), None);
-        assert_eq!(matching_words.matching_bytes("world"), Some(5));
-        assert_eq!(matching_words.matching_bytes("splitted"), Some(5));
-        assert_eq!(matching_words.matching_bytes("thisnew"), None);
-        assert_eq!(matching_words.matching_bytes("borld"), Some(5));
-        assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4));
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("word"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "word".len(),
+            char_map: None,
+        }), Some(3));
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("nyc"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "nyc".len(),
+            char_map: None,
+        }), None);
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("world"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "world".len(),
+            char_map: None,
+        }), Some(5));
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("splitted"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "splitted".len(),
+            char_map: None,
+        }), Some(5));
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("thisnew"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "thisnew".len(),
+            char_map: None,
+        }), None);
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("borld"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "borld".len(),
+            char_map: None,
+        }), Some(5));
+        assert_eq!(matching_words.matching_bytes(&Token{
+            kind: TokenKind::Word,
+            word: Cow::Borrowed("wordsplit"),
+            byte_start: 0,
+            char_index: 0,
+            byte_end: "wordsplit".len(),
+            char_map: None,
+        }), Some(4));
     }
 }

From c10f58b7bdbf7ccf958c24e1cb628963af060972 Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Mon, 17 Jan 2022 13:02:00 +0530
Subject: [PATCH 3/6] Update tokenizer to v0.2.7

---
 http-ui/Cargo.toml | 2 +-
 milli/Cargo.toml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 79c784fdd..7406a1c1b 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -10,7 +10,7 @@ anyhow = "1.0.38"
 byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
 crossbeam-channel = "0.5.0"
 heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 milli = { path = "../milli" }
 once_cell = "1.5.2"
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index a3d8cf627..3d77654eb 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -22,7 +22,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe
 human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.4"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 obkv = "0.2.0"
 once_cell = "1.5.2"

From 5ab505be33256c1d338d9abeef2f78633c4cd89a Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Mon, 17 Jan 2022 13:02:55 +0530
Subject: [PATCH 4/6] Fix highlight by replacing num_graphemes_from_bytes

num_graphemes_from_bytes has been renamed in the tokenizer to
num_chars_from_bytes.

Highlight now works correctly!
---
 milli/src/search/matching_words.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs
index 6df2e0121..74ff14382 100644
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -40,11 +40,11 @@ impl MatchingWords {
                 Distance::Exact(t) if t <= *typo => {
                     if *is_prefix {
                         let len = bytes_to_highlight(word_to_highlight.text(), query_word);
-                        Some(word_to_highlight.num_graphemes_from_bytes(len))
+                        Some(word_to_highlight.num_chars_from_bytes(len))
                     } else {
                         Some(
                             word_to_highlight
-                                .num_graphemes_from_bytes(word_to_highlight.text().len()),
+                                .num_chars_from_bytes(word_to_highlight.text().len()),
                         )
                     }
                 }

From 2d7607734eaf8baf343da07290170d0eadec4e53 Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Mon, 17 Jan 2022 13:04:33 +0530
Subject: [PATCH 5/6] Run cargo fmt on matching_words.rs

---
 milli/src/search/matching_words.rs | 138 ++++++++++++++++-------------
 1 file changed, 78 insertions(+), 60 deletions(-)

diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs
index 74ff14382..67bdefb37 100644
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -42,10 +42,7 @@ impl MatchingWords {
                         let len = bytes_to_highlight(word_to_highlight.text(), query_word);
                         Some(word_to_highlight.num_chars_from_bytes(len))
                     } else {
-                        Some(
-                            word_to_highlight
-                                .num_chars_from_bytes(word_to_highlight.text().len()),
-                        )
+                        Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))
                     }
                 }
                 _otherwise => None,
@@ -276,61 +273,82 @@ mod tests {
 
         let matching_words = MatchingWords::from_query_tree(&query_tree);
 
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("word"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "word".len(),
-            char_map: None,
-        }), Some(3));
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("nyc"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "nyc".len(),
-            char_map: None,
-        }), None);
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("world"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "world".len(),
-            char_map: None,
-        }), Some(5));
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("splitted"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "splitted".len(),
-            char_map: None,
-        }), Some(5));
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("thisnew"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "thisnew".len(),
-            char_map: None,
-        }), None);
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("borld"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "borld".len(),
-            char_map: None,
-        }), Some(5));
-        assert_eq!(matching_words.matching_bytes(&Token{
-            kind: TokenKind::Word,
-            word: Cow::Borrowed("wordsplit"),
-            byte_start: 0,
-            char_index: 0,
-            byte_end: "wordsplit".len(),
-            char_map: None,
-        }), Some(4));
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("word"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "word".len(),
+                char_map: None,
+            }),
+            Some(3)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("nyc"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "nyc".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("world"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "world".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("splitted"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "splitted".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("thisnew"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "thisnew".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("borld"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "borld".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("wordsplit"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "wordsplit".len(),
+                char_map: None,
+            }),
+            Some(4)
+        );
     }
 }

From c0313f3026cb8577c4035fd72394772842dec4b4 Mon Sep 17 00:00:00 2001
From: Samyak S Sarnayak
Date: Mon, 17 Jan 2022 13:10:44 +0530
Subject: [PATCH 6/6] Use chars for highlight instead of graphemes

Tokenizer v0.2.7 uses chars instead of graphemes for matching bytes.
`unicode-segmentation` dependency isn't needed anymore.

Also, oxidised the highlight code :)

Co-authored-by: many
---
 http-ui/Cargo.toml  |  1 -
 http-ui/src/main.rs | 27 ++++++++++++---------------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 7406a1c1b..f45a85753 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -17,7 +17,6 @@ once_cell = "1.5.2"
 rayon = "1.5.0"
 structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] }
 tempfile = "3.2.0"
-unicode-segmentation = "1.6.0"
 
 # http server
 askama = "0.10.5"
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 386f10cb4..6502bf83a 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -34,7 +34,6 @@ use structopt::StructOpt;
 use tokio::fs::File as TFile;
 use tokio::io::AsyncWriteExt;
 use tokio::sync::broadcast;
-use unicode_segmentation::UnicodeSegmentation;
 use warp::filters::ws::Message;
 use warp::http::Response;
 use warp::Filter;
@@ -161,21 +160,19 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
         let analyzed = self.analyzer.analyze(&old_string);
         for (word, token) in analyzed.reconstruct() {
             if token.is_word() {
-                let chars_to_highlight = matching_words.matching_bytes(&token).unwrap_or(0);
-                if chars_to_highlight > 0 {
-                    let graphemes = word.graphemes(true);
-                    let chars = graphemes.clone().into_iter();
+                match matching_words.matching_bytes(&token) {
+                    Some(chars_to_highlight) => {
+                        let mut chars = word.chars();
 
-                    string.push_str("<mark>");
-                    string.push_str(
-                        chars.take(chars_to_highlight).collect::<String>().as_str(),
-                    );
-                    string.push_str("</mark>");
-
-                    let chars = graphemes.into_iter().skip(chars_to_highlight);
-                    string.push_str(chars.collect::<String>().as_str());
-                } else {
-                    string.push_str(word);
+                        string.push_str("<mark>");
+                        // push the part to highlight
+                        string.extend(chars.by_ref().take(chars_to_highlight));
+                        string.push_str("</mark>");
+                        // push the suffix after highlight
+                        string.extend(chars);
+                    }
+                    // no highlight
+                    None => string.push_str(word),
                 }
             } else {
                 string.push_str(word);
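
For reference, a minimal standalone sketch of the char-based highlighting
strategy that PATCH 6 lands on. The `highlight` helper and the sample inputs
are illustrative only, not part of the patches; `chars_to_highlight` stands in
for the value returned by `MatchingWords::matching_bytes`, which after PATCH 4
counts chars rather than bytes. Splitting with `str::chars()` keeps multi-byte
characters intact, where a byte offset such as `&word[..n]` could panic on a
non-ASCII boundary.

// Hypothetical helper mirroring the PATCH 6 logic outside the Highlighter.
fn highlight(word: &str, chars_to_highlight: Option<usize>) -> String {
    let mut string = String::new();
    match chars_to_highlight {
        Some(n) => {
            let mut chars = word.chars();
            string.push_str("<mark>");
            // push the matched prefix, advancing the iterator in place
            string.extend(chars.by_ref().take(n));
            string.push_str("</mark>");
            // push the unmatched suffix
            string.extend(chars);
        }
        // no match: push the word unchanged
        None => string.push_str(word),
    }
    string
}

fn main() {
    // A 2-char match on "héllo": char counting keeps the 2-byte "é" whole,
    // while slicing at byte offset 2 would split it and panic.
    assert_eq!(highlight("héllo", Some(2)), "<mark>hé</mark>llo");
    assert_eq!(highlight("hello", None), "hello");
}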