mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
Fix and refactor crop_bounds
This commit is contained in:
parent
37a9d64c44
commit
40336ce87d
@ -1,6 +1,7 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
|
||||||
use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer};
|
use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer};
|
||||||
|
use either::Either;
|
||||||
pub use matching_words::MatchingWords;
|
pub use matching_words::MatchingWords;
|
||||||
use matching_words::{MatchType, PartialMatch, WordId};
|
use matching_words::{MatchType, PartialMatch, WordId};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
@ -450,147 +451,141 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
crop_size: usize,
|
crop_size: usize,
|
||||||
) -> (usize, usize) {
|
) -> (usize, usize) {
|
||||||
// if there is no match, we start from the beginning of the string by default.
|
// if there is no match, we start from the beginning of the string by default.
|
||||||
let (matches_size, first_match_first_token_position, last_match_last_token_position) =
|
let (
|
||||||
if !matches.is_empty() {
|
mut remaining_words,
|
||||||
let matches_first = matches.first().unwrap();
|
is_iterating_forward,
|
||||||
let matches_last = matches.last().unwrap();
|
before_tokens_starting_index,
|
||||||
|
after_tokens_starting_index,
|
||||||
|
) = if !matches.is_empty() {
|
||||||
|
let matches_first = matches.first().unwrap();
|
||||||
|
let matches_last = matches.last().unwrap();
|
||||||
|
|
||||||
(
|
let matches_size =
|
||||||
matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1,
|
matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1;
|
||||||
matches_first.get_first_token_pos(),
|
|
||||||
matches_last.get_last_token_pos(),
|
let is_crop_size_gte_match_size = crop_size >= matches_size;
|
||||||
)
|
let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size;
|
||||||
|
|
||||||
|
let remaining_words = if is_crop_size_gte_match_size {
|
||||||
|
crop_size - matches_size
|
||||||
} else {
|
} else {
|
||||||
(0, 0, 0)
|
// in case matches size is greater than crop size, which implies there's only one match,
|
||||||
|
// we count words backwards, because we have to remove words, as they're extra words outside of
|
||||||
|
// crop window
|
||||||
|
matches_size - crop_size
|
||||||
};
|
};
|
||||||
|
|
||||||
if crop_size >= matches_size {
|
|
||||||
// matches needs to be counted in the crop len.
|
|
||||||
let mut remaining_words = crop_size - matches_size;
|
|
||||||
|
|
||||||
let last_match_last_token_position_plus_one = last_match_last_token_position + 1;
|
|
||||||
let after_tokens_starting_index = if matches_size == 0 {
|
let after_tokens_starting_index = if matches_size == 0 {
|
||||||
0
|
0
|
||||||
} else if last_match_last_token_position_plus_one < tokens.len() {
|
|
||||||
last_match_last_token_position_plus_one
|
|
||||||
} else {
|
} else {
|
||||||
tokens.len()
|
let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1;
|
||||||
|
if last_match_last_token_position_plus_one < tokens.len() {
|
||||||
|
last_match_last_token_position_plus_one
|
||||||
|
} else {
|
||||||
|
// we have matched the end of possible tokens, there's nothing to advance
|
||||||
|
tokens.len() - 1
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// create the initial state of the crop window: 2 iterators starting from the matches positions,
|
(
|
||||||
// a reverse iterator starting from the first match token position and going towards the beginning of the text,
|
remaining_words,
|
||||||
let mut before_tokens =
|
is_iterating_forward,
|
||||||
tokens[..first_match_first_token_position].iter().rev().peekable();
|
if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 },
|
||||||
// an iterator starting from the last match token position and going towards the end of the text.
|
after_tokens_starting_index,
|
||||||
let mut after_tokens = tokens[after_tokens_starting_index..].iter().peekable();
|
)
|
||||||
|
} else {
|
||||||
|
(crop_size, true, 0, 0)
|
||||||
|
};
|
||||||
|
|
||||||
// grows the crop window peeking in both directions
|
// create the initial state of the crop window: 2 iterators starting from the matches positions,
|
||||||
// until the window contains the good number of words:
|
// a reverse iterator starting from the first match token position and going towards the beginning of the text,
|
||||||
while remaining_words > 0 {
|
let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable();
|
||||||
let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
|
// an iterator ...
|
||||||
let after_token_kind = after_tokens.peek().map(SimpleTokenKind::new);
|
let mut after_tokens = if is_iterating_forward {
|
||||||
|
// ... starting from the last match token position and going towards the end of the text.
|
||||||
|
Either::Left(tokens[after_tokens_starting_index..].iter().peekable())
|
||||||
|
} else {
|
||||||
|
// ... starting from the last match token position and going towards the start of the text.
|
||||||
|
Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable())
|
||||||
|
};
|
||||||
|
|
||||||
match (before_token_kind, after_token_kind) {
|
// grows the crop window peeking in both directions
|
||||||
// we can expand both sides.
|
// until the window contains the good number of words:
|
||||||
(Some(before_token_kind), Some(after_token_kind)) => {
|
while remaining_words > 0 {
|
||||||
match (before_token_kind, after_token_kind) {
|
let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
|
||||||
// if they are both separators and are the same kind then advance both,
|
let after_token_kind =
|
||||||
// or expand in the soft separator separator side.
|
after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new);
|
||||||
(
|
|
||||||
SimpleTokenKind::Separator(before_token_separator_kind),
|
|
||||||
SimpleTokenKind::Separator(after_token_separator_kind),
|
|
||||||
) => {
|
|
||||||
if before_token_separator_kind == after_token_separator_kind {
|
|
||||||
before_tokens.next();
|
|
||||||
|
|
||||||
// this avoid having an ending separator before crop marker.
|
match (before_token_kind, after_token_kind) {
|
||||||
if remaining_words > 1 {
|
// we can expand both sides.
|
||||||
after_tokens.next();
|
(Some(before_token_kind), Some(after_token_kind)) => {
|
||||||
}
|
match (before_token_kind, after_token_kind) {
|
||||||
} else if matches!(before_token_separator_kind, SeparatorKind::Hard)
|
// if they are both separators and are the same kind then advance both,
|
||||||
{
|
// or expand in the soft separator separator side.
|
||||||
after_tokens.next();
|
(
|
||||||
} else {
|
SimpleTokenKind::Separator(before_token_separator_kind),
|
||||||
before_tokens.next();
|
SimpleTokenKind::Separator(after_token_separator_kind),
|
||||||
}
|
) => {
|
||||||
}
|
if before_token_separator_kind == after_token_separator_kind {
|
||||||
// if one of the tokens is a word, we expend in the side of the word.
|
before_tokens.next();
|
||||||
// left is a word, advance left.
|
|
||||||
(SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
|
// this avoid having an ending separator before crop marker.
|
||||||
|
if remaining_words > 1 {
|
||||||
|
after_tokens.next();
|
||||||
|
}
|
||||||
|
} else if matches!(before_token_separator_kind, SeparatorKind::Hard) {
|
||||||
|
after_tokens.next();
|
||||||
|
} else {
|
||||||
before_tokens.next();
|
before_tokens.next();
|
||||||
remaining_words -= 1;
|
|
||||||
}
|
}
|
||||||
// right is a word, advance right.
|
}
|
||||||
(SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
|
// if one of the tokens is a word, we expend in the side of the word.
|
||||||
|
// left is a word, advance left.
|
||||||
|
(SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
|
||||||
|
before_tokens.next();
|
||||||
|
remaining_words -= 1;
|
||||||
|
}
|
||||||
|
// right is a word, advance right.
|
||||||
|
(SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
|
||||||
|
after_tokens.next();
|
||||||
|
remaining_words -= 1;
|
||||||
|
}
|
||||||
|
// both are words, advance left then right if remaining_word > 0.
|
||||||
|
(SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
|
||||||
|
before_tokens.next();
|
||||||
|
remaining_words -= 1;
|
||||||
|
|
||||||
|
if remaining_words > 0 {
|
||||||
after_tokens.next();
|
after_tokens.next();
|
||||||
remaining_words -= 1;
|
remaining_words -= 1;
|
||||||
}
|
}
|
||||||
// both are words, advance left then right if remaining_word > 0.
|
|
||||||
(SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
|
|
||||||
before_tokens.next();
|
|
||||||
remaining_words -= 1;
|
|
||||||
|
|
||||||
if remaining_words > 0 {
|
|
||||||
after_tokens.next();
|
|
||||||
remaining_words -= 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// the end of the text is reached, advance left.
|
|
||||||
(Some(before_token_kind), None) => {
|
|
||||||
before_tokens.next();
|
|
||||||
if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
|
|
||||||
remaining_words -= 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// the start of the text is reached, advance right.
|
|
||||||
(None, Some(after_token_kind)) => {
|
|
||||||
after_tokens.next();
|
|
||||||
if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
|
|
||||||
remaining_words -= 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// no more token to add.
|
|
||||||
(None, None) => break,
|
|
||||||
}
|
}
|
||||||
}
|
// the end of the text is reached, advance left.
|
||||||
|
(Some(before_token_kind), None) => {
|
||||||
// finally, keep the byte index of each bound of the crop window.
|
before_tokens.next();
|
||||||
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
|
if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
|
||||||
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
|
remaining_words -= 1;
|
||||||
|
}
|
||||||
(crop_byte_start, crop_byte_end)
|
|
||||||
} else {
|
|
||||||
// there's one match and it's longer than the crop window, so we have to advance inward
|
|
||||||
let mut remaining_extra_words = matches_size - crop_size;
|
|
||||||
let mut tokens_from_end =
|
|
||||||
tokens[..=last_match_last_token_position].iter().rev().peekable();
|
|
||||||
|
|
||||||
while remaining_extra_words > 0 {
|
|
||||||
let token_from_end_kind = tokens_from_end
|
|
||||||
.peek()
|
|
||||||
.map(SimpleTokenKind::new)
|
|
||||||
.expect("Expected iterator to not reach end");
|
|
||||||
if matches!(token_from_end_kind, SimpleTokenKind::NotSeparator) {
|
|
||||||
remaining_extra_words -= 1;
|
|
||||||
}
|
}
|
||||||
|
// the start of the text is reached, advance right.
|
||||||
tokens_from_end.next();
|
(None, Some(after_token_kind)) => {
|
||||||
|
after_tokens.next();
|
||||||
|
if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
|
||||||
|
remaining_words -= 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// no more token to add.
|
||||||
|
(None, None) => break,
|
||||||
}
|
}
|
||||||
|
|
||||||
let crop_byte_start = if first_match_first_token_position > 0 {
|
|
||||||
&tokens[first_match_first_token_position - 1].byte_end
|
|
||||||
} else {
|
|
||||||
&0
|
|
||||||
};
|
|
||||||
let crop_byte_end = tokens_from_end
|
|
||||||
.next()
|
|
||||||
.map(|t| t.byte_start)
|
|
||||||
.expect("Expected iterator to not reach end");
|
|
||||||
|
|
||||||
(*crop_byte_start, crop_byte_end)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// finally, keep the byte index of each bound of the crop window.
|
||||||
|
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
|
||||||
|
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
|
||||||
|
|
||||||
|
(crop_byte_start, crop_byte_end)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the formatted version of the original text.
|
// Returns the formatted version of the original text.
|
||||||
|
Loading…
Reference in New Issue
Block a user