mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-12-03 18:15:46 +01:00
Add crop algorithm with naive match algorithm
This commit is contained in:
parent
d96e72e5dc
commit
3be1790803
@ -92,13 +92,15 @@ impl MatcherBuilder {
|
|||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
|
||||||
pub struct Match<'t> {
|
#[derive(Clone)]
|
||||||
token: &'t Token<'t>,
|
pub struct Match {
|
||||||
match_len: usize,
|
match_len: usize,
|
||||||
// id of the query word that matches.
|
// id of the query word that matches.
|
||||||
id: usize,
|
id: usize,
|
||||||
// position of the word in the whole text.
|
// position of the word in the whole text.
|
||||||
position: usize,
|
word_position: usize,
|
||||||
|
// position of the token in the whole text.
|
||||||
|
token_position: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct MatchBounds {
|
pub struct MatchBounds {
|
||||||
@ -106,12 +108,6 @@ pub struct MatchBounds {
|
|||||||
length: usize,
|
length: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> From<&Match<'t>> for MatchBounds {
|
|
||||||
fn from(m: &Match) -> Self {
|
|
||||||
MatchBounds { start: m.token.byte_start, length: m.match_len }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct Matcher<'t, 'm> {
|
pub struct Matcher<'t, 'm> {
|
||||||
text: &'t str,
|
text: &'t str,
|
||||||
tokens: &'t [Token<'t>],
|
tokens: &'t [Token<'t>],
|
||||||
@ -120,26 +116,22 @@ pub struct Matcher<'t, 'm> {
|
|||||||
crop_marker: &'m str,
|
crop_marker: &'m str,
|
||||||
highlight_prefix: &'m str,
|
highlight_prefix: &'m str,
|
||||||
highlight_suffix: &'m str,
|
highlight_suffix: &'m str,
|
||||||
matches: Option<Vec<Match<'t>>>,
|
matches: Option<Vec<Match>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Matcher<'t, '_> {
|
impl<'t> Matcher<'t, '_> {
|
||||||
fn compute_matches(&mut self) -> &mut Self {
|
fn compute_matches(&mut self) -> &mut Self {
|
||||||
let mut matches = Vec::new();
|
let mut matches = Vec::new();
|
||||||
let mut position = 0;
|
let mut word_position = 0;
|
||||||
|
let mut token_position = 0;
|
||||||
for token in self.tokens {
|
for token in self.tokens {
|
||||||
match token.is_separator() {
|
if token.is_separator().is_none() {
|
||||||
Some(SeparatorKind::Hard) => position += 7,
|
if let Some((match_len, id)) = self.matching_words.matching_bytes_with_id(&token) {
|
||||||
None => {
|
matches.push(Match { match_len, id, word_position, token_position });
|
||||||
if let Some((match_len, id)) =
|
|
||||||
self.matching_words.matching_bytes_with_id(&token)
|
|
||||||
{
|
|
||||||
matches.push(Match { token, match_len, id, position });
|
|
||||||
}
|
|
||||||
position += 1;
|
|
||||||
}
|
}
|
||||||
_otherwise => {}
|
word_position += 1;
|
||||||
}
|
}
|
||||||
|
token_position += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
self.matches = Some(matches);
|
self.matches = Some(matches);
|
||||||
@ -149,21 +141,104 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
pub fn matches(&mut self) -> Vec<MatchBounds> {
|
pub fn matches(&mut self) -> Vec<MatchBounds> {
|
||||||
match &self.matches {
|
match &self.matches {
|
||||||
None => self.compute_matches().matches(),
|
None => self.compute_matches().matches(),
|
||||||
Some(matches) => matches.iter().map(MatchBounds::from).collect(),
|
Some(matches) => matches
|
||||||
|
.iter()
|
||||||
|
.map(|m| MatchBounds {
|
||||||
|
start: self.tokens[m.token_position].byte_start,
|
||||||
|
length: m.match_len,
|
||||||
|
})
|
||||||
|
.collect(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn crop_bounds(&self, matches: &[Match<'t>]) -> (usize, usize) {
|
fn crop_around(&self, matches: &[Match]) -> (usize, usize) {
|
||||||
let byte_end = self
|
let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
|
||||||
.tokens
|
let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
|
||||||
.iter()
|
let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
|
||||||
.filter(|t| t.is_separator().is_none())
|
let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
|
||||||
.enumerate()
|
|
||||||
.take_while(|(i, _)| *i < self.crop_size)
|
|
||||||
.last()
|
|
||||||
.map_or(self.text.len(), |(_, t)| t.byte_end);
|
|
||||||
|
|
||||||
(0, byte_end)
|
// TODO: buggy if no match and first token is a separator
|
||||||
|
let mut remaining_words =
|
||||||
|
self.crop_size + first_match_word_position - last_match_word_position - 1;
|
||||||
|
let mut first_token_position = first_match_token_position;
|
||||||
|
let mut last_token_position = last_match_token_position;
|
||||||
|
|
||||||
|
while remaining_words > 0 {
|
||||||
|
match (
|
||||||
|
first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)),
|
||||||
|
last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)),
|
||||||
|
) {
|
||||||
|
(Some(ft), Some(lt)) => {
|
||||||
|
match (ft.is_separator(), lt.is_separator()) {
|
||||||
|
// if they are both separators and are the same kind then advance both
|
||||||
|
(Some(f_kind), Some(s_kind)) => {
|
||||||
|
if f_kind == s_kind {
|
||||||
|
first_token_position -= 1;
|
||||||
|
last_token_position += 1;
|
||||||
|
} else if f_kind == SeparatorKind::Hard {
|
||||||
|
last_token_position += 1;
|
||||||
|
} else {
|
||||||
|
first_token_position -= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// left is a word, advance left
|
||||||
|
(None, Some(_)) => {
|
||||||
|
first_token_position -= 1;
|
||||||
|
remaining_words -= 1;
|
||||||
|
}
|
||||||
|
// right is a word, advance right
|
||||||
|
(Some(_), None) => {
|
||||||
|
last_token_position += 1;
|
||||||
|
remaining_words -= 1;
|
||||||
|
}
|
||||||
|
// both are words, advance left then right if remaining_word > 0
|
||||||
|
(None, None) => {
|
||||||
|
first_token_position -= 1;
|
||||||
|
remaining_words -= 1;
|
||||||
|
|
||||||
|
if remaining_words > 0 {
|
||||||
|
last_token_position += 1;
|
||||||
|
remaining_words -= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(Some(ft), None) => {
|
||||||
|
first_token_position -= 1;
|
||||||
|
if ft.is_separator().is_none() {
|
||||||
|
remaining_words -= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(None, Some(lt)) => {
|
||||||
|
last_token_position += 1;
|
||||||
|
if lt.is_separator().is_none() {
|
||||||
|
remaining_words -= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
(None, None) => break,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if tokens after the end of the window are separators,
|
||||||
|
// then add them to the window in order to keep context in cropped text.
|
||||||
|
while let Some(_separator_kind) = last_token_position
|
||||||
|
.checked_add(1)
|
||||||
|
.and_then(|i| self.tokens.get(i))
|
||||||
|
.and_then(|t| t.is_separator())
|
||||||
|
{
|
||||||
|
last_token_position += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
(self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
|
||||||
|
match matches {
|
||||||
|
// at least 2 matches
|
||||||
|
[first, last, ..] => self.crop_around(&[first.clone()][..]),
|
||||||
|
// less than 2 matches
|
||||||
|
_ => self.crop_around(matches),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
|
pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
|
||||||
@ -187,20 +262,23 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
|
|
||||||
if highlight {
|
if highlight {
|
||||||
// insert highlight markers around matches.
|
// insert highlight markers around matches.
|
||||||
|
let tokens = self.tokens;
|
||||||
for m in matches
|
for m in matches
|
||||||
.iter()
|
.iter()
|
||||||
.skip_while(|m| m.token.byte_start < byte_start)
|
.skip_while(|m| tokens[m.token_position].byte_start < byte_start)
|
||||||
.take_while(|m| m.token.byte_start < byte_end)
|
.take_while(|m| tokens[m.token_position].byte_start < byte_end)
|
||||||
{
|
{
|
||||||
if byte_index < m.token.byte_start {
|
let token = &tokens[m.token_position];
|
||||||
formatted.push(&self.text[byte_index..m.token.byte_start]);
|
|
||||||
|
if byte_index < token.byte_start {
|
||||||
|
formatted.push(&self.text[byte_index..token.byte_start]);
|
||||||
}
|
}
|
||||||
|
|
||||||
formatted.push(self.highlight_prefix);
|
formatted.push(self.highlight_prefix);
|
||||||
formatted.push(&self.text[m.token.byte_start..m.token.byte_end]);
|
formatted.push(&self.text[token.byte_start..token.byte_end]);
|
||||||
formatted.push(self.highlight_suffix);
|
formatted.push(self.highlight_suffix);
|
||||||
|
|
||||||
byte_index = m.token.byte_end;
|
byte_index = token.byte_end;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -271,7 +349,7 @@ mod tests {
|
|||||||
assert_eq!(&matcher.format(highlight, crop), &text);
|
assert_eq!(&matcher.format(highlight, crop), &text);
|
||||||
|
|
||||||
// Text containing all matches.
|
// Text containing all matches.
|
||||||
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
|
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
|
||||||
let analyzed = analyzer.analyze(&text);
|
let analyzed = analyzer.analyze(&text);
|
||||||
let tokens: Vec<_> = analyzed.tokens().collect();
|
let tokens: Vec<_> = analyzed.tokens().collect();
|
||||||
let mut matcher = builder.build(&tokens[..], text);
|
let mut matcher = builder.build(&tokens[..], text);
|
||||||
@ -306,12 +384,12 @@ mod tests {
|
|||||||
assert_eq!(&matcher.format(highlight, crop), &text);
|
assert_eq!(&matcher.format(highlight, crop), &text);
|
||||||
|
|
||||||
// Text containing all matches.
|
// Text containing all matches.
|
||||||
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
|
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
|
||||||
let analyzed = analyzer.analyze(&text);
|
let analyzed = analyzer.analyze(&text);
|
||||||
let tokens: Vec<_> = analyzed.tokens().collect();
|
let tokens: Vec<_> = analyzed.tokens().collect();
|
||||||
let mut matcher = builder.build(&tokens[..], text);
|
let mut matcher = builder.build(&tokens[..], text);
|
||||||
// no crop should return complete text with highlighted matches.
|
// no crop should return complete text with highlighted matches.
|
||||||
assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>");
|
assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
|
||||||
|
|
||||||
// Text containing some matches.
|
// Text containing some matches.
|
||||||
let text = "Natalie risk her future to build a world with the boy she loves.";
|
let text = "Natalie risk her future to build a world with the boy she loves.";
|
||||||
@ -343,18 +421,18 @@ mod tests {
|
|||||||
// no highlight should return 10 first words with a marker at the end.
|
// no highlight should return 10 first words with a marker at the end.
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&matcher.format(highlight, crop),
|
&matcher.format(highlight, crop),
|
||||||
"A quick brown fox can not jump 32 feet, right…"
|
"A quick brown fox can not jump 32 feet, right? …"
|
||||||
);
|
);
|
||||||
|
|
||||||
// Text containing all matches.
|
// Test phrase propagation
|
||||||
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
|
let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
|
||||||
let analyzed = analyzer.analyze(&text);
|
let analyzed = analyzer.analyze(&text);
|
||||||
let tokens: Vec<_> = analyzed.tokens().collect();
|
let tokens: Vec<_> = analyzed.tokens().collect();
|
||||||
let mut matcher = builder.build(&tokens[..], text);
|
let mut matcher = builder.build(&tokens[..], text);
|
||||||
// no highlight should return 10 last words with a marker at the start.
|
// should crop the phrase instead of cropping around the match.
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&matcher.format(highlight, crop),
|
&matcher.format(highlight, crop),
|
||||||
"…she loves. Emily Henry: The Love That Split The World"
|
"…Split The World is a book written by Emily Henry. …"
|
||||||
);
|
);
|
||||||
|
|
||||||
// Text containing some matches.
|
// Text containing some matches.
|
||||||
@ -368,6 +446,17 @@ mod tests {
|
|||||||
"…future to build a world with the boy she loves."
|
"…future to build a world with the boy she loves."
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Text containing all matches.
|
||||||
|
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
|
||||||
|
let analyzed = analyzer.analyze(&text);
|
||||||
|
let tokens: Vec<_> = analyzed.tokens().collect();
|
||||||
|
let mut matcher = builder.build(&tokens[..], text);
|
||||||
|
// no highlight should return 10 last words with a marker at the start.
|
||||||
|
assert_eq!(
|
||||||
|
&matcher.format(highlight, crop),
|
||||||
|
"…she loves. Emily Henry: The Love That Split The World."
|
||||||
|
);
|
||||||
|
|
||||||
// Text containing a match unordered and a match ordered.
|
// Text containing a match unordered and a match ordered.
|
||||||
let text = "The world split void void void void void void void void void split the world void void";
|
let text = "The world split void void void void void void void void void split the world void void";
|
||||||
let analyzed = analyzer.analyze(&text);
|
let analyzed = analyzer.analyze(&text);
|
||||||
@ -398,17 +487,9 @@ mod tests {
|
|||||||
// both should return 10 first words with a marker at the end.
|
// both should return 10 first words with a marker at the end.
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&matcher.format(highlight, crop),
|
&matcher.format(highlight, crop),
|
||||||
"A quick brown fox can not jump 32 feet, right…"
|
"A quick brown fox can not jump 32 feet, right? …"
|
||||||
);
|
);
|
||||||
|
|
||||||
// Text containing all matches.
|
|
||||||
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
|
|
||||||
let analyzed = analyzer.analyze(&text);
|
|
||||||
let tokens: Vec<_> = analyzed.tokens().collect();
|
|
||||||
let mut matcher = builder.build(&tokens[..], text);
|
|
||||||
// both should return 10 last words with a marker at the start and highlighted matches.
|
|
||||||
assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>");
|
|
||||||
|
|
||||||
// Text containing some matches.
|
// Text containing some matches.
|
||||||
let text = "Natalie risk her future to build a world with the boy she loves.";
|
let text = "Natalie risk her future to build a world with the boy she loves.";
|
||||||
let analyzed = analyzer.analyze(&text);
|
let analyzed = analyzer.analyze(&text);
|
||||||
@ -420,6 +501,14 @@ mod tests {
|
|||||||
"…future to build a <em>world</em> with <em>the</em> boy she loves."
|
"…future to build a <em>world</em> with <em>the</em> boy she loves."
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Text containing all matches.
|
||||||
|
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
|
||||||
|
let analyzed = analyzer.analyze(&text);
|
||||||
|
let tokens: Vec<_> = analyzed.tokens().collect();
|
||||||
|
let mut matcher = builder.build(&tokens[..], text);
|
||||||
|
// both should return 10 last words with a marker at the start and highlighted matches.
|
||||||
|
assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
|
||||||
|
|
||||||
// Text containing a match unordered and a match ordered.
|
// Text containing a match unordered and a match ordered.
|
||||||
let text = "The world split void void void void void void void void void split the world void void";
|
let text = "The world split void void void void void void void void void split the world void void";
|
||||||
let analyzed = analyzer.analyze(&text);
|
let analyzed = analyzer.analyze(&text);
|
||||||
|
Loading…
Reference in New Issue
Block a user