Mirror of https://github.com/meilisearch/MeiliSearch, synced 2024-12-27 15:10:05 +01:00
Merge #426

426: Fix search highlight for non-unicode chars r=ManyTheFish a=Samyak2

# Pull Request

## What does this PR do?

Fixes https://github.com/meilisearch/MeiliSearch/issues/1480

## PR checklist

Please check if your PR fulfills the following requirements:

- [x] Does this PR fix an existing issue?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

## Changes

The `matching_bytes` function now takes a `&Token` and:

- gets the number of bytes to highlight (unchanged);
- uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight.

In essence, `matching_bytes` now returns the number of matching grapheme clusters instead of bytes.

Proper highlighting was also added in the HTTP UI:

- it requires a dependency on `unicode-segmentation` to extract grapheme clusters from tokens;
- the `<mark>` tag is put around only the matched part; before this change, the entire word was highlighted even if only a part of it matched.

## Questions

Since `matching_bytes` no longer returns a number of bytes but of grapheme clusters, should it be renamed to something like `matching_chars` or `matching_graphemes`? Will this break the API?

Thank you very much `@ManyTheFish` for helping 😄

Co-authored-by: Samyak S Sarnayak <samyak201@gmail.com>
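To make the bug concrete, here is a minimal standalone sketch (mine, not from the PR) of why a raw byte count breaks highlighting on multi-byte UTF-8 text, and why converting it to a character count is safe:

```rust
fn main() {
    let word = "Привет"; // 6 chars, 12 bytes in UTF-8
    let matched_bytes = 6; // suppose the query matched the first 6 bytes: "При"

    // Slicing by bytes (&word[..n]) panics whenever n is not a char
    // boundary, so a byte count cannot be used directly for markup.
    // Counting the characters that start within those bytes is always safe:
    let matched_chars = word
        .char_indices()
        .take_while(|(i, _)| *i < matched_bytes)
        .count();

    let mut chars = word.chars();
    let prefix: String = chars.by_ref().take(matched_chars).collect();
    let suffix: String = chars.collect();
    assert_eq!(format!("<mark>{prefix}</mark>{suffix}"), "<mark>При</mark>вет");
}
```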
This commit is contained in: commit `4c516c00da`
First, the tokenizer dependency is bumped from v0.2.6 to v0.2.7 (this hunk appears to come from the http-ui crate's `Cargo.toml`, judging by the `milli = { path = "../milli" }` entry):

```diff
@@ -10,7 +10,7 @@ anyhow = "1.0.38"
 byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
 crossbeam-channel = "0.5.0"
 heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 milli = { path = "../milli" }
 once_cell = "1.5.2"
```
In the HTTP UI's `Highlighter`, the `<mark>` tag now wraps only the matched character prefix instead of the whole word:

```diff
@@ -160,13 +160,19 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
         let analyzed = self.analyzer.analyze(&old_string);
         for (word, token) in analyzed.reconstruct() {
             if token.is_word() {
-                let to_highlight = matching_words.matching_bytes(token.text()).is_some();
-                if to_highlight {
-                    string.push_str("<mark>")
-                }
-                string.push_str(word);
-                if to_highlight {
-                    string.push_str("</mark>")
+                match matching_words.matching_bytes(&token) {
+                    Some(chars_to_highlight) => {
+                        let mut chars = word.chars();
+
+                        string.push_str("<mark>");
+                        // push the part to highlight
+                        string.extend(chars.by_ref().take(chars_to_highlight));
+                        string.push_str("</mark>");
+                        // push the suffix after highlight
+                        string.extend(chars);
+                    }
+                    // no highlight
+                    None => string.push_str(word),
                 }
             } else {
                 string.push_str(word);
```
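As a standalone illustration of the pattern used above (illustrative names, not the project's API): `chars.by_ref().take(n)` consumes only the first `n` characters from the iterator, so the untouched remainder can be appended after the closing tag without rescanning the string.

```rust
// Illustrative helper, not from the codebase: wrap the first
// `chars_to_highlight` characters of `word` in a <mark> tag.
fn highlight(word: &str, chars_to_highlight: usize) -> String {
    let mut out = String::new();
    let mut chars = word.chars();
    out.push_str("<mark>");
    // take() on the by_ref iterator consumes only the matched prefix
    out.extend(chars.by_ref().take(chars_to_highlight));
    out.push_str("</mark>");
    // the same iterator then yields the remaining suffix
    out.extend(chars);
    out
}

fn main() {
    assert_eq!(highlight("wordsplit", 4), "<mark>word</mark>split");
}
```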
The same tokenizer bump in what appears to be the milli crate's `Cargo.toml`:

```diff
@@ -22,7 +22,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe
 human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.4"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 obkv = "0.2.0"
 once_cell = "1.5.2"
```
A new import brings the tokenizer's `Token` type into scope:

```diff
@@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet};
 use std::ops::{Index, IndexMut};
 
 use levenshtein_automata::{Distance, DFA};
+use meilisearch_tokenizer::Token;
 
 use super::build_dfa;
 use crate::search::query_tree::{Operation, Query};
```
`matching_bytes` now takes a `&Token` and converts the matched byte length into a count of characters:

```diff
@@ -33,15 +34,15 @@ impl MatchingWords {
     }
 
     /// Returns the number of matching bytes if the word matches one of the query words.
-    pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> {
+    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
         self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
-            match dfa.eval(word_to_highlight) {
+            match dfa.eval(word_to_highlight.text()) {
                 Distance::Exact(t) if t <= *typo => {
                     if *is_prefix {
-                        let len = bytes_to_highlight(word_to_highlight, query_word);
-                        Some(len)
+                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
+                        Some(word_to_highlight.num_chars_from_bytes(len))
                     } else {
-                        Some(word_to_highlight.len())
+                        Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))
                     }
                 }
                 _otherwise => None,
```
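`num_chars_from_bytes` is a method on the tokenizer's `Token`; as a rough sketch of the conversion it performs, here is my own equivalent helper over a plain `&str` (not the tokenizer's implementation):

```rust
// Hypothetical equivalent (not the tokenizer's code): count how many
// characters of `text` start within the first `byte_len` bytes.
fn num_chars_from_bytes(text: &str, byte_len: usize) -> usize {
    text.char_indices().take_while(|(i, _)| *i < byte_len).count()
}

fn main() {
    // "Déjà" is 4 chars but 6 bytes; a 3-byte match covers "Dé", i.e. 2 chars.
    assert_eq!(num_chars_from_bytes("Déjà", 3), 2);
}
```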
The test module imports gain `Cow` and `TokenKind` for constructing tokens:

```diff
@@ -178,8 +179,11 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize {
 
 #[cfg(test)]
 mod tests {
+    use std::borrow::Cow;
     use std::str::from_utf8;
 
+    use meilisearch_tokenizer::TokenKind;
+
     use super::*;
     use crate::search::query_tree::{Operation, Query, QueryKind};
     use crate::MatchingWords;
```
The tests now build full `Token` values instead of passing plain strings:

```diff
@@ -269,12 +273,82 @@ mod tests {
 
         let matching_words = MatchingWords::from_query_tree(&query_tree);
 
-        assert_eq!(matching_words.matching_bytes("word"), Some(3));
-        assert_eq!(matching_words.matching_bytes("nyc"), None);
-        assert_eq!(matching_words.matching_bytes("world"), Some(5));
-        assert_eq!(matching_words.matching_bytes("splitted"), Some(5));
-        assert_eq!(matching_words.matching_bytes("thisnew"), None);
-        assert_eq!(matching_words.matching_bytes("borld"), Some(5));
-        assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4));
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("word"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "word".len(),
+                char_map: None,
+            }),
+            Some(3)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("nyc"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "nyc".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("world"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "world".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("splitted"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "splitted".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("thisnew"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "thisnew".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("borld"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "borld".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("wordsplit"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "wordsplit".len(),
+                char_map: None,
+            }),
+            Some(4)
+        );
     }
 }
```