mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
Highlight partially cropped matches too
This commit is contained in:
parent
6c226a4580
commit
e51e6f902a
@ -18,7 +18,7 @@ pub enum MatchPosition {
|
|||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct Match {
|
pub struct Match {
|
||||||
pub match_len: usize,
|
pub char_count: usize,
|
||||||
// ids of the query words that matches.
|
// ids of the query words that matches.
|
||||||
pub ids: Vec<WordId>,
|
pub ids: Vec<WordId>,
|
||||||
pub position: MatchPosition,
|
pub position: MatchPosition,
|
||||||
|
@ -86,14 +86,17 @@ impl MatchingWords {
|
|||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
let prefix_length = char_index + c.len_utf8();
|
let prefix_length = char_index + c.len_utf8();
|
||||||
let char_len = token.original_lengths(prefix_length).0;
|
let (char_count, byte_len) = token.original_lengths(prefix_length);
|
||||||
let ids = &located_words.positions;
|
let ids = &located_words.positions;
|
||||||
return Some(MatchType::Full { char_len, ids });
|
return Some(MatchType::Full { ids, char_count, byte_len });
|
||||||
// else we exact match the token.
|
// else we exact match the token.
|
||||||
} else if token.lemma() == word {
|
} else if token.lemma() == word {
|
||||||
let char_len = token.char_end - token.char_start;
|
|
||||||
let ids = &located_words.positions;
|
let ids = &located_words.positions;
|
||||||
return Some(MatchType::Full { char_len, ids });
|
return Some(MatchType::Full {
|
||||||
|
char_count: token.char_end - token.char_start,
|
||||||
|
byte_len: token.byte_end - token.byte_start,
|
||||||
|
ids,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -149,7 +152,7 @@ pub type WordId = u16;
|
|||||||
/// In these cases we need to match consecutively several tokens to consider that the match is full.
|
/// In these cases we need to match consecutively several tokens to consider that the match is full.
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum MatchType<'a> {
|
pub enum MatchType<'a> {
|
||||||
Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
|
Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
|
||||||
Partial(PartialMatch<'a>),
|
Partial(PartialMatch<'a>),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -183,7 +186,11 @@ impl<'a> PartialMatch<'a> {
|
|||||||
// if there is no remaining word to match in the phrase and the current token is matching,
|
// if there is no remaining word to match in the phrase and the current token is matching,
|
||||||
// return a Full match.
|
// return a Full match.
|
||||||
} else if is_matching {
|
} else if is_matching {
|
||||||
Some(MatchType::Full { char_len: token.char_end - token.char_start, ids })
|
Some(MatchType::Full {
|
||||||
|
char_count: token.char_end - token.char_start,
|
||||||
|
byte_len: token.byte_end - token.byte_start,
|
||||||
|
ids,
|
||||||
|
})
|
||||||
// if the current token doesn't match, return None to break the match sequence.
|
// if the current token doesn't match, return None to break the match sequence.
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
@ -270,7 +277,7 @@ pub(crate) mod tests {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
})
|
})
|
||||||
.next(),
|
.next(),
|
||||||
Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
|
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) })
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
matching_words
|
matching_words
|
||||||
@ -294,7 +301,7 @@ pub(crate) mod tests {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
})
|
})
|
||||||
.next(),
|
.next(),
|
||||||
Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
|
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
matching_words
|
matching_words
|
||||||
@ -306,7 +313,7 @@ pub(crate) mod tests {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
})
|
})
|
||||||
.next(),
|
.next(),
|
||||||
Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
|
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
matching_words
|
matching_words
|
||||||
|
@ -10,7 +10,10 @@ use matching_words::{MatchType, PartialMatch};
|
|||||||
use r#match::{Match, MatchPosition};
|
use r#match::{Match, MatchPosition};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use simple_token_kind::SimpleTokenKind;
|
use simple_token_kind::SimpleTokenKind;
|
||||||
use std::borrow::Cow;
|
use std::{
|
||||||
|
borrow::Cow,
|
||||||
|
cmp::{max, min},
|
||||||
|
};
|
||||||
|
|
||||||
const DEFAULT_CROP_MARKER: &str = "…";
|
const DEFAULT_CROP_MARKER: &str = "…";
|
||||||
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
|
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
|
||||||
@ -139,7 +142,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
Some(MatchType::Full { ids, .. }) => {
|
Some(MatchType::Full { ids, .. }) => {
|
||||||
// save the token that closes the partial match as a match.
|
// save the token that closes the partial match as a match.
|
||||||
matches.push(Match {
|
matches.push(Match {
|
||||||
match_len: word.char_end - *first_word_char_start,
|
char_count: word.char_end - *first_word_char_start,
|
||||||
ids: ids.clone().collect(),
|
ids: ids.clone().collect(),
|
||||||
position: MatchPosition::Phrase {
|
position: MatchPosition::Phrase {
|
||||||
word_positions: [first_word_position, word_position],
|
word_positions: [first_word_position, word_position],
|
||||||
@ -182,10 +185,10 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
match match_type {
|
match match_type {
|
||||||
// we match, we save the current token as a match,
|
// we match, we save the current token as a match,
|
||||||
// then we continue the rest of the tokens.
|
// then we continue the rest of the tokens.
|
||||||
MatchType::Full { char_len, ids } => {
|
MatchType::Full { ids, char_count, .. } => {
|
||||||
let ids: Vec<_> = ids.clone().collect();
|
let ids: Vec<_> = ids.clone().collect();
|
||||||
matches.push(Match {
|
matches.push(Match {
|
||||||
match_len: char_len,
|
char_count,
|
||||||
ids,
|
ids,
|
||||||
position: MatchPosition::Word { word_position, token_position },
|
position: MatchPosition::Word { word_position, token_position },
|
||||||
});
|
});
|
||||||
@ -224,19 +227,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
.iter()
|
.iter()
|
||||||
.map(|m| MatchBounds {
|
.map(|m| MatchBounds {
|
||||||
start: tokens[m.get_first_token_pos()].byte_start,
|
start: tokens[m.get_first_token_pos()].byte_start,
|
||||||
length: m.match_len,
|
// TODO: Why is this in chars, while start is in bytes?
|
||||||
|
length: m.char_count,
|
||||||
})
|
})
|
||||||
.collect(),
|
.collect(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the bounds in byte index of the crop window.
|
/// Returns the bounds in byte index of the crop window.
|
||||||
fn crop_bounds(
|
fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
|
||||||
&self,
|
|
||||||
tokens: &[Token<'_>],
|
|
||||||
matches: &[Match],
|
|
||||||
crop_size: usize,
|
|
||||||
) -> (usize, usize) {
|
|
||||||
let (
|
let (
|
||||||
mut remaining_words,
|
mut remaining_words,
|
||||||
is_iterating_forward,
|
is_iterating_forward,
|
||||||
@ -371,7 +370,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
|
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
|
||||||
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
|
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
|
||||||
|
|
||||||
(crop_byte_start, crop_byte_end)
|
[crop_byte_start, crop_byte_end]
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the formatted version of the original text.
|
// Returns the formatted version of the original text.
|
||||||
@ -382,78 +381,87 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
} else {
|
} else {
|
||||||
match &self.matches {
|
match &self.matches {
|
||||||
Some((tokens, matches)) => {
|
Some((tokens, matches)) => {
|
||||||
// If the text has to be cropped,
|
// If the text has to be cropped, crop around the best interval.
|
||||||
// crop around the best interval.
|
let [crop_byte_start, crop_byte_end] = match format_options.crop {
|
||||||
let (byte_start, byte_end) = match format_options.crop {
|
|
||||||
Some(crop_size) if crop_size > 0 => {
|
Some(crop_size) if crop_size > 0 => {
|
||||||
self.crop_bounds(tokens, matches, crop_size)
|
self.crop_bounds(tokens, matches, crop_size)
|
||||||
}
|
}
|
||||||
_ => (0, self.text.len()),
|
_ => [0, self.text.len()],
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut formatted = Vec::new();
|
let mut formatted = Vec::new();
|
||||||
|
|
||||||
// push crop marker if it's not the start of the text.
|
// push crop marker if it's not the start of the text.
|
||||||
if byte_start > 0 && !self.crop_marker.is_empty() {
|
if crop_byte_start > 0 && !self.crop_marker.is_empty() {
|
||||||
formatted.push(self.crop_marker);
|
formatted.push(self.crop_marker);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut byte_index = byte_start;
|
let mut byte_index = crop_byte_start;
|
||||||
|
|
||||||
if format_options.highlight {
|
if format_options.highlight {
|
||||||
// insert highlight markers around matches.
|
// insert highlight markers around matches.
|
||||||
for m in matches {
|
for m in matches {
|
||||||
let (current_byte_start, current_byte_end) = match m.position {
|
let [m_byte_start, m_byte_end] = match m.position {
|
||||||
MatchPosition::Word { token_position, .. } => {
|
MatchPosition::Word { token_position, .. } => {
|
||||||
let token = &tokens[token_position];
|
let token = &tokens[token_position];
|
||||||
(&token.byte_start, &token.byte_end)
|
[&token.byte_start, &token.byte_end]
|
||||||
}
|
}
|
||||||
MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
|
MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
|
||||||
(&tokens[ftp].byte_start, &tokens[ltp].byte_end)
|
[&tokens[ftp].byte_start, &tokens[ltp].byte_end]
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// skip matches out of the crop window.
|
// skip matches out of the crop window
|
||||||
if *current_byte_start < byte_start || *current_byte_end > byte_end {
|
if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if byte_index < *current_byte_start {
|
// adjust start and end to the crop window size
|
||||||
formatted.push(&self.text[byte_index..*current_byte_start]);
|
let [m_byte_start, m_byte_end] = [
|
||||||
|
max(m_byte_start, &crop_byte_start),
|
||||||
|
min(m_byte_end, &crop_byte_end),
|
||||||
|
];
|
||||||
|
|
||||||
|
// push text that is positioned before our matches
|
||||||
|
if byte_index < *m_byte_start {
|
||||||
|
formatted.push(&self.text[byte_index..*m_byte_start]);
|
||||||
}
|
}
|
||||||
|
|
||||||
let highlight_byte_index = self.text[*current_byte_start..]
|
|
||||||
.char_indices()
|
|
||||||
.enumerate()
|
|
||||||
.find(|(i, _)| *i == m.match_len)
|
|
||||||
.map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start);
|
|
||||||
|
|
||||||
formatted.push(self.highlight_prefix);
|
formatted.push(self.highlight_prefix);
|
||||||
formatted.push(&self.text[*current_byte_start..highlight_byte_index]);
|
|
||||||
|
// TODO: This is additional work done, charabia::token::Token byte_len
|
||||||
|
// should already get us the original byte length, however, that doesn't work as
|
||||||
|
// it's supposed to, investigate why
|
||||||
|
let highlight_byte_index = self.text[*m_byte_start..]
|
||||||
|
.char_indices()
|
||||||
|
.nth(m.char_count)
|
||||||
|
.map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end));
|
||||||
|
formatted.push(&self.text[*m_byte_start..highlight_byte_index]);
|
||||||
|
|
||||||
formatted.push(self.highlight_suffix);
|
formatted.push(self.highlight_suffix);
|
||||||
|
|
||||||
// if it's a prefix highlight, we put the end of the word after the highlight marker.
|
// if it's a prefix highlight, we put the end of the word after the highlight marker.
|
||||||
if highlight_byte_index < *current_byte_end {
|
if highlight_byte_index < *m_byte_end {
|
||||||
formatted.push(&self.text[highlight_byte_index..*current_byte_end]);
|
formatted.push(&self.text[highlight_byte_index..*m_byte_end]);
|
||||||
}
|
}
|
||||||
|
|
||||||
byte_index = *current_byte_end;
|
byte_index = *m_byte_end;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// push the rest of the text between last match and the end of crop.
|
// push the rest of the text between last match and the end of crop.
|
||||||
if byte_index < byte_end {
|
if byte_index < crop_byte_end {
|
||||||
formatted.push(&self.text[byte_index..byte_end]);
|
formatted.push(&self.text[byte_index..crop_byte_end]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// push crop marker if it's not the end of the text.
|
// push crop marker if it's not the end of the text.
|
||||||
if byte_end < self.text.len() && !self.crop_marker.is_empty() {
|
if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() {
|
||||||
formatted.push(self.crop_marker);
|
formatted.push(self.crop_marker);
|
||||||
}
|
}
|
||||||
|
|
||||||
if formatted.len() == 1 {
|
if formatted.len() == 1 {
|
||||||
// avoid concatenating if there is already 1 slice.
|
// avoid concatenating if there is already 1 slice.
|
||||||
Cow::Borrowed(&self.text[byte_start..byte_end])
|
Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end])
|
||||||
} else {
|
} else {
|
||||||
Cow::Owned(formatted.concat())
|
Cow::Owned(formatted.concat())
|
||||||
}
|
}
|
||||||
@ -825,8 +833,7 @@ mod tests {
|
|||||||
let mut matcher = builder.build(text, None);
|
let mut matcher = builder.build(text, None);
|
||||||
insta::assert_snapshot!(
|
insta::assert_snapshot!(
|
||||||
matcher.format(format_options),
|
matcher.format(format_options),
|
||||||
// @TODO: Should probably highlight it all, even if it didn't fit the whole phrase
|
@"<em>The groundbreaking invention had the power to split the world</em>…"
|
||||||
@"The groundbreaking invention had the power to split the world…"
|
|
||||||
);
|
);
|
||||||
|
|
||||||
let builder = MatcherBuilder::new_test(
|
let builder = MatcherBuilder::new_test(
|
||||||
@ -837,7 +844,7 @@ mod tests {
|
|||||||
let mut matcher = builder.build(text, None);
|
let mut matcher = builder.build(text, None);
|
||||||
insta::assert_snapshot!(
|
insta::assert_snapshot!(
|
||||||
matcher.format(format_options),
|
matcher.format(format_options),
|
||||||
// @TODO: Should probably include end of string in this case?
|
// TODO: Should include exclamation mark without crop markers
|
||||||
@"…between those who <em>embraced progress and those who resisted change</em>…"
|
@"…between those who <em>embraced progress and those who resisted change</em>…"
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -860,8 +867,7 @@ mod tests {
|
|||||||
let mut matcher = builder.build(text, None);
|
let mut matcher = builder.build(text, None);
|
||||||
insta::assert_snapshot!(
|
insta::assert_snapshot!(
|
||||||
matcher.format(format_options),
|
matcher.format(format_options),
|
||||||
// @TODO: "invention" should be highlighted as well
|
@"…<em>invention</em> <em>had the power to split the world between those</em>…"
|
||||||
@"…invention <em>had the power to split the world between those</em>…"
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user