Mirror of https://github.com/meilisearch/MeiliSearch
Fix search highlight for non-unicode chars
The `matching_bytes` function takes a `&Token` now and:
- gets the number of bytes to highlight (unchanged).
- uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight.

In essence, the `matching_bytes` function returns the number of matching grapheme clusters instead of bytes. Should this function be renamed then?

Added proper highlighting in the HTTP UI:
- requires dependency on `unicode-segmentation` to extract grapheme clusters from tokens
- `<mark>` tag is put around only the matched part
- before this change, the entire word was highlighted even if only a part of it matched
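To see why the unit of measurement matters here, a minimal illustration in plain Rust (the word is a made-up example): byte offsets can land inside a multi-byte UTF-8 character, so highlighting by byte count could split a character, while grapheme clusters always align with user-perceived characters.

```rust
fn main() {
    let word = "héllo"; // 'é' spans bytes 1..3 in UTF-8

    // A byte offset inside 'é' is not a valid slice point:
    assert!(word.is_char_boundary(1));
    assert!(!word.is_char_boundary(2));
    // &word[..2] would panic here; counting grapheme clusters instead,
    // as this commit does, keeps every highlight boundary valid.
}
```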
commit 30247d70cd
parent 559e019de1
@@ -17,6 +17,7 @@ once_cell = "1.5.2"
 rayon = "1.5.0"
 structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] }
 tempfile = "3.2.0"
+unicode-segmentation = "1.6.0"
 
 # http server
 askama = "0.10.5"
@@ -34,6 +34,7 @@ use structopt::StructOpt;
 use tokio::fs::File as TFile;
 use tokio::io::AsyncWriteExt;
 use tokio::sync::broadcast;
+use unicode_segmentation::UnicodeSegmentation;
 use warp::filters::ws::Message;
 use warp::http::Response;
 use warp::Filter;
@@ -160,13 +161,21 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
                 let analyzed = self.analyzer.analyze(&old_string);
                 for (word, token) in analyzed.reconstruct() {
                     if token.is_word() {
-                        let to_highlight = matching_words.matching_bytes(token.text()).is_some();
-                        if to_highlight {
-                            string.push_str("<mark>")
-                        }
-                        string.push_str(word);
-                        if to_highlight {
-                            string.push_str("</mark>")
-                        }
+                        let chars_to_highlight = matching_words.matching_bytes(&token).unwrap_or(0);
+                        if chars_to_highlight > 0 {
+                            let graphemes = word.graphemes(true);
+                            let chars = graphemes.clone().into_iter();
+
+                            string.push_str("<mark>");
+                            string.push_str(
+                                chars.take(chars_to_highlight).collect::<String>().as_str(),
+                            );
+                            string.push_str("</mark>");
+
+                            let chars = graphemes.into_iter().skip(chars_to_highlight);
+                            string.push_str(chars.collect::<String>().as_str());
+                        } else {
+                            string.push_str(word);
+                        }
                     } else {
                         string.push_str(word);
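The hunk above splits the word into grapheme clusters and marks only the matched prefix. A self-contained sketch of the same idea, using only the `unicode-segmentation` crate (the `highlight` helper and its inputs are hypothetical, standing in for the tokenizer and matcher wiring):

```rust
use unicode_segmentation::UnicodeSegmentation;

// Hypothetical helper: wrap the first `chars_to_highlight` grapheme
// clusters of `word` in <mark> tags and append the rest unchanged.
fn highlight(word: &str, chars_to_highlight: usize) -> String {
    let graphemes = word.graphemes(true);
    let mut out = String::new();

    out.push_str("<mark>");
    // The iterator is cloned because `take` consumes it and the
    // remainder is still needed below for `skip`.
    out.push_str(&graphemes.clone().take(chars_to_highlight).collect::<String>());
    out.push_str("</mark>");
    out.push_str(&graphemes.skip(chars_to_highlight).collect::<String>());
    out
}

fn main() {
    // Only the matched part is marked, and the two-byte 'é' stays intact.
    assert_eq!(highlight("héllo", 2), "<mark>hé</mark>llo");
}
```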
@@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet};
 use std::ops::{Index, IndexMut};
 
 use levenshtein_automata::{Distance, DFA};
+use meilisearch_tokenizer::Token;
 
 use super::build_dfa;
 use crate::search::query_tree::{Operation, Query};
@@ -33,15 +34,18 @@ impl MatchingWords {
     }
 
     /// Returns the number of matching bytes if the word matches one of the query words.
-    pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> {
+    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
         self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
-            match dfa.eval(word_to_highlight) {
+            match dfa.eval(word_to_highlight.text()) {
                 Distance::Exact(t) if t <= *typo => {
                     if *is_prefix {
-                        let len = bytes_to_highlight(word_to_highlight, query_word);
-                        Some(len)
+                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
+                        Some(word_to_highlight.num_graphemes_from_bytes(len))
                     } else {
-                        Some(word_to_highlight.len())
+                        Some(
+                            word_to_highlight
+                                .num_graphemes_from_bytes(word_to_highlight.text().len()),
+                        )
                     }
                 }
                 _otherwise => None,
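`num_graphemes_from_bytes` above comes from the `meilisearch_tokenizer` `Token`; as a rough, assumed-equivalent sketch of what such a conversion does, one can count the grapheme clusters that start within the matched byte range using `unicode-segmentation`:

```rust
use unicode_segmentation::UnicodeSegmentation;

// Illustrative reimplementation (assumed semantics, not the actual
// meilisearch_tokenizer API): count the grapheme clusters of `s` that
// begin within its first `num_bytes` bytes.
fn num_graphemes_from_bytes(s: &str, num_bytes: usize) -> usize {
    s.grapheme_indices(true)
        .take_while(|(byte_index, _)| *byte_index < num_bytes)
        .count()
}

fn main() {
    // In "héllo", a 3-byte match covers "hé": two grapheme clusters.
    assert_eq!(num_graphemes_from_bytes("héllo", 3), 2);
}
```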