Fix search highlight for non-ASCII chars

The `matching_bytes` function now takes a `&Token` and:
- computes the number of bytes to highlight (unchanged).
- uses `Token.num_graphemes_from_bytes` to convert that byte count into
  the number of grapheme clusters to highlight.

In essence, the `matching_bytes` function returns the number of matching
grapheme clusters instead of bytes. Should this function be renamed
then?

Added proper highlighting in the HTTP UI:
- requires a dependency on `unicode-segmentation` to extract grapheme
  clusters from tokens
- the `<mark>` tag is now put around only the matched part
    - before this change, the entire word was highlighted even if only
      a part of it matched
This commit is contained in:
Samyak S Sarnayak 2021-12-17 22:53:34 +05:30
parent 559e019de1
commit 30247d70cd
No known key found for this signature in database
GPG key ID: 365873F2F0C6153B
3 changed files with 26 additions and 12 deletions

View file

@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet};
use std::ops::{Index, IndexMut};
use levenshtein_automata::{Distance, DFA};
use meilisearch_tokenizer::Token;
use super::build_dfa;
use crate::search::query_tree::{Operation, Query};
@ -33,15 +34,18 @@ impl MatchingWords {
}
/// Returns the number of matching bytes if the word matches one of the query words.
pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> {
pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
match dfa.eval(word_to_highlight) {
match dfa.eval(word_to_highlight.text()) {
Distance::Exact(t) if t <= *typo => {
if *is_prefix {
let len = bytes_to_highlight(word_to_highlight, query_word);
Some(len)
let len = bytes_to_highlight(word_to_highlight.text(), query_word);
Some(word_to_highlight.num_graphemes_from_bytes(len))
} else {
Some(word_to_highlight.len())
Some(
word_to_highlight
.num_graphemes_from_bytes(word_to_highlight.text().len()),
)
}
}
_otherwise => None,