Refactor matches, change behaviour of showMatchesPosition

F. Levi 2025-06-07 11:45:01 +03:00
parent 97aeb6db4d
commit 24f213c343
13 changed files with 1504 additions and 1395 deletions

View file

@ -1551,9 +1551,10 @@ fn retrieve_documents<S: AsRef<str>>(
Ok(match &attributes_to_retrieve {
Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
&document?,
attributes_to_retrieve.iter().map(|s| s.as_ref()).chain(
(retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"),
),
attributes_to_retrieve
.iter()
.map(|s| s.as_ref())
.chain(retrieve_vectors.should_retrieve().then_some("_vectors")),
),
None => document?,
})
@ -1586,7 +1587,7 @@ fn retrieve_document<S: AsRef<str>>(
attributes_to_retrieve
.iter()
.map(|s| s.as_ref())
.chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")),
.chain(retrieve_vectors.should_retrieve().then_some("_vectors")),
),
None => document,
};
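
A note on the `should_retrieve()` chaining above: `bool::then_some` yields either `Some("_vectors")` or `None`, so `chain` appends at most one extra attribute. A minimal standalone sketch of the pattern, with a plain `Vec<&str>` standing in for the real attribute iterator:

fn selected<'a>(attrs: &[&'a str], retrieve_vectors: bool) -> Vec<&'a str> {
    attrs
        .iter()
        .copied()
        // `then_some` yields `Some("_vectors")` only when the flag is set,
        // so `chain` appends either one extra attribute or nothing at all.
        .chain(retrieve_vectors.then_some("_vectors"))
        .collect()
}

fn main() {
    assert_eq!(selected(&["title"], true), vec!["title", "_vectors"]);
    assert_eq!(selected(&["title"], false), vec!["title"]);
}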

View file

@ -815,7 +815,8 @@ impl SearchByIndex {
let (result, _semantic_hit_count) =
super::super::search_from_kind(index_uid.to_string(), search_kind, search)?;
let format = AttributesFormat {
let attributes_format = AttributesFormat {
attributes_to_retrieve: query.attributes_to_retrieve,
retrieve_vectors,
attributes_to_highlight: query.attributes_to_highlight,
@ -846,12 +847,11 @@ impl SearchByIndex {
let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref());
let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer);
let hit_maker =
HitMaker::new(&index, &rtxn, format, formatter_builder).map_err(|e| {
MeilisearchHttpError::from_milli(e, Some(index_uid.to_string()))
})?;
HitMaker::new(matching_words, tokenizer, attributes_format, &index, &rtxn)
.map_err(|e| {
MeilisearchHttpError::from_milli(e, Some(index_uid.to_string()))
})?;
results_by_query.push(SearchResultByQuery {
weight,

View file

@ -1,4 +1,5 @@
use core::fmt;
use std::borrow::Cow;
use std::cmp::min;
use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::str::FromStr;
@ -27,11 +28,11 @@ use meilisearch_types::{milli, Document};
use milli::tokenizer::{Language, TokenizerBuilder};
use milli::{
AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, LocalizedAttributesRule,
MatchBounds, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
MarkerOptions, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
};
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use serde_json::{json, Map, Value};
#[cfg(test)]
mod mod_test;
use utoipa::ToSchema;
@ -46,7 +47,9 @@ pub use federated::{
mod ranking_rules;
type MatchesPosition = BTreeMap<String, Vec<MatchBounds>>;
// TODO: Adapt this type to support cropping
// { "_matchesPosition": { "overview": { first: false, highlighted: [[0,4,6,11,5,234,6,241,5]] } } }
// type MatchesPosition = BTreeMap<String, Vec<MatchBounds>>;
pub const DEFAULT_SEARCH_OFFSET: fn() -> usize = || 0;
pub const DEFAULT_SEARCH_LIMIT: fn() -> usize = || 20;
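
For reference, the shape sketched in the TODO above would make a hit's matches position look roughly like the following. This is a hypothetical sketch only; the commented-out alias shows the old type and the final layout is still marked TODO:

use serde_json::json;

fn main() {
    // Hypothetical per-field entry: a crop flag plus flattened byte indices,
    // mirroring the TODO comment above. Not a final API.
    let hit_fragment = json!({
        "_matchesPosition": {
            "overview": {
                "first": false,
                "highlighted": [[0, 4, 6, 11, 5, 234, 6, 241, 5]]
            }
        }
    });
    println!("{hit_fragment}");
}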
@ -742,11 +745,9 @@ pub struct SearchHit {
#[serde(flatten)]
#[schema(additional_properties, inline, value_type = HashMap<String, Value>)]
pub document: Document,
#[serde(default, rename = "_formatted", skip_serializing_if = "Document::is_empty")]
#[serde(default, rename = "_formatted", skip_serializing_if = "Option::is_none")]
#[schema(additional_properties, value_type = HashMap<String, Value>)]
pub formatted: Document,
#[serde(default, rename = "_matchesPosition", skip_serializing_if = "Option::is_none")]
pub matches_position: Option<MatchesPosition>,
pub formatted: Option<Document>,
#[serde(default, rename = "_rankingScore", skip_serializing_if = "Option::is_none")]
pub ranking_score: Option<f64>,
#[serde(default, rename = "_rankingScoreDetails", skip_serializing_if = "Option::is_none")]
@ -1223,6 +1224,7 @@ struct AttributesFormat {
crop_marker: String,
highlight_pre_tag: String,
highlight_post_tag: String,
// TODO: Might want to rename this to signify that this will not yield _formatted anymore, only positions
show_matches_position: bool,
sort: Option<Vec<String>>,
show_ranking_score: bool,
@ -1230,7 +1232,7 @@ struct AttributesFormat {
locales: Option<Vec<Language>>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[derive(Debug, Clone, Copy)]
pub enum RetrieveVectors {
/// Remove the `_vectors` field
///
@ -1250,6 +1252,10 @@ impl RetrieveVectors {
Self::Hide
}
}
pub fn should_retrieve(&self) -> bool {
matches!(self, Self::Retrieve)
}
}
struct HitMaker<'a> {
@ -1261,7 +1267,7 @@ struct HitMaker<'a> {
retrieve_vectors: RetrieveVectors,
to_retrieve_ids: BTreeSet<FieldId>,
embedding_configs: Vec<milli::index::IndexEmbeddingConfig>,
formatter_builder: MatcherBuilder<'a>,
matcher_builder: MatcherBuilder<'a>,
formatted_options: BTreeMap<FieldId, FormatOptions>,
show_ranking_score: bool,
show_ranking_score_details: bool,
@ -1289,24 +1295,20 @@ impl<'a> HitMaker<'a> {
tokenizer_builder.into_tokenizer()
}
pub fn formatter_builder(
matching_words: milli::MatchingWords,
tokenizer: milli::tokenizer::Tokenizer<'_>,
) -> MatcherBuilder<'_> {
let formatter_builder = MatcherBuilder::new(matching_words, tokenizer);
formatter_builder
}
pub fn new(
matching_words: milli::MatchingWords,
tokenizer: milli::tokenizer::Tokenizer<'a>,
attr_fmt: AttributesFormat,
index: &'a Index,
rtxn: &'a RoTxn<'a>,
format: AttributesFormat,
mut formatter_builder: MatcherBuilder<'a>,
) -> milli::Result<Self> {
formatter_builder.crop_marker(format.crop_marker);
formatter_builder.highlight_prefix(format.highlight_pre_tag);
formatter_builder.highlight_suffix(format.highlight_post_tag);
let AttributesFormat { highlight_pre_tag, highlight_post_tag, crop_marker, .. } = attr_fmt;
let matcher_builder = MatcherBuilder::new(
matching_words,
tokenizer,
MarkerOptions { highlight_pre_tag, highlight_post_tag, crop_marker },
);
let fields_ids_map = index.fields_ids_map(rtxn)?;
let displayed_ids = index
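
`HitMaker::new` now builds the matcher directly: the three setter calls were folded into a `MarkerOptions` struct passed to `MatcherBuilder::new`. A sketch of the same constructor-absorbs-options pattern, with hypothetical local stand-ins for milli's types:

// Hypothetical stand-ins, only to illustrate the constructor change.
struct MarkerOptions {
    highlight_pre_tag: String,
    highlight_post_tag: String,
    crop_marker: String,
}

struct MatcherBuilder {
    options: MarkerOptions,
}

impl MatcherBuilder {
    // Before: three setter calls after construction. After: one options
    // struct handed over up front, so a half-configured builder cannot exist.
    fn new(options: MarkerOptions) -> Self {
        Self { options }
    }
}

fn main() {
    let builder = MatcherBuilder::new(MarkerOptions {
        highlight_pre_tag: "<em>".into(),
        highlight_post_tag: "</em>".into(),
        crop_marker: "…".into(),
    });
    assert_eq!(builder.options.crop_marker, "…");
}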
@ -1324,21 +1326,21 @@ impl<'a> HitMaker<'a> {
let displayed_names = index.displayed_fields(rtxn)?.unwrap();
!displayed_names.contains(&milli::constants::RESERVED_VECTORS_FIELD_NAME)
}
// displayed_ids is a finit list, so hide if `_vectors` is not part of it
// displayed_ids is a finite list, so hide if `_vectors` is not part of it
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
};
let displayed_ids =
displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors {
let retrieve_vectors = if let RetrieveVectors::Retrieve = attr_fmt.retrieve_vectors {
if vectors_is_hidden {
RetrieveVectors::Hide
} else {
RetrieveVectors::Retrieve
}
} else {
format.retrieve_vectors
attr_fmt.retrieve_vectors
};
let fids = |attrs: &BTreeSet<String>| {
@ -1355,7 +1357,7 @@ impl<'a> HitMaker<'a> {
}
ids
};
let to_retrieve_ids: BTreeSet<_> = format
let to_retrieve_ids: BTreeSet<_> = attr_fmt
.attributes_to_retrieve
.as_ref()
.map(fids)
@ -1364,12 +1366,12 @@ impl<'a> HitMaker<'a> {
.cloned()
.collect();
let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
let attr_to_highlight = attr_fmt.attributes_to_highlight.unwrap_or_default();
let attr_to_crop = attr_fmt.attributes_to_crop.unwrap_or_default();
let formatted_options = compute_formatted_options(
&attr_to_highlight,
&attr_to_crop,
format.crop_length,
attr_fmt.crop_length,
&to_retrieve_ids,
&fields_ids_map,
&displayed_ids,
@ -1386,51 +1388,53 @@ impl<'a> HitMaker<'a> {
retrieve_vectors,
to_retrieve_ids,
embedding_configs,
formatter_builder,
matcher_builder,
formatted_options,
show_ranking_score: format.show_ranking_score,
show_ranking_score_details: format.show_ranking_score_details,
show_matches_position: format.show_matches_position,
sort: format.sort,
locales: format.locales,
show_ranking_score: attr_fmt.show_ranking_score,
show_ranking_score_details: attr_fmt.show_ranking_score_details,
show_matches_position: attr_fmt.show_matches_position,
sort: attr_fmt.sort,
locales: attr_fmt.locales,
})
}
pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> {
let (_, obkv) =
self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?;
fn make_document(&self, obkv: &obkv::KvReaderU16) -> milli::Result<Document> {
let mut document = serde_json::Map::new();
// First generate a document with all the displayed fields
let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?;
let add_vectors_fid =
self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve);
// select the attributes to retrieve
let attributes_to_retrieve = self
.to_retrieve_ids
.iter()
// skip the vectors_fid if RetrieveVectors::Hide
.filter(|fid| match self.vectors_fid {
Some(vectors_fid) => {
!(self.retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid)
// recreate JSON with appropriate attributes
for (key, value) in obkv.iter() {
if self.vectors_fid.is_some_and(|vectors_fid| vectors_fid == key) {
// (vectors aren't considered in `displayedAttributes` and `attributesToRetrieve`, but rather with `retrieveVectors`)
if !self.retrieve_vectors.should_retrieve() {
continue;
}
None => true,
})
// need to retrieve the existing `_vectors` field if `RetrieveVectors::Retrieve` is set
.chain(add_vectors_fid.iter())
.map(|&fid| self.fields_ids_map.name(fid).expect("Missing field name"));
} else if !self.to_retrieve_ids.contains(&key) || !self.displayed_ids.contains(&key) {
// https://www.meilisearch.com/docs/reference/api/settings#displayed-attributes
// https://www.meilisearch.com/docs/reference/api/search#attributes-to-retrieve
continue;
}
let mut document =
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
let key = self.fields_ids_map.name(key).expect("Missing field name").to_string();
if self.retrieve_vectors == RetrieveVectors::Retrieve {
// Clippy is wrong
document.insert(key, value);
}
Ok(document)
}
pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> {
let obkv = self.index.document(self.rtxn, id)?;
let mut document = self.make_document(obkv)?;
if self.retrieve_vectors.should_retrieve() {
#[allow(clippy::manual_unwrap_or_default)]
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in self.index.embeddings(self.rtxn, id)? {
let user_provided = self
.embedding_configs
@ -1439,6 +1443,7 @@ impl<'a> HitMaker<'a> {
.is_some_and(|conf| conf.user_provided.contains(id));
let embeddings =
ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
vectors.insert(
name,
serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?,
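
The loop above assembles one `_vectors` entry per embedder; serialized through `ExplicitVectors`, an entry plausibly looks like the following sketch (the embedder name "default" and the values are assumptions):

use serde_json::json;

fn main() {
    let user_provided = true;
    // One object per embedder: the raw embeddings plus a `regenerate` flag
    // derived from whether the vector was user-provided.
    let vectors = json!({
        "default": {
            "embeddings": [[0.1, 0.2, 0.3]],
            "regenerate": !user_provided,
        }
    });
    println!("{vectors}");
}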
@ -1450,10 +1455,10 @@ impl<'a> HitMaker<'a> {
let localized_attributes =
self.index.localized_attributes_rules(self.rtxn)?.unwrap_or_default();
let (matches_position, formatted) = format_fields(
&displayed_document,
let formatted = format_fields(
&mut document,
&self.fields_ids_map,
&self.formatter_builder,
&self.matcher_builder,
&self.formatted_options,
self.show_matches_position,
&self.displayed_ids,
@ -1470,13 +1475,7 @@ impl<'a> HitMaker<'a> {
let ranking_score_details =
self.show_ranking_score_details.then(|| ScoreDetails::to_json_map(score.iter()));
let hit = SearchHit {
document,
formatted,
matches_position,
ranking_score_details,
ranking_score,
};
let hit = SearchHit { document, formatted, ranking_score_details, ranking_score };
Ok(hit)
}
@ -1485,7 +1484,7 @@ impl<'a> HitMaker<'a> {
fn make_hits<'a>(
index: &Index,
rtxn: &RoTxn<'_>,
format: AttributesFormat,
attributes_format: AttributesFormat,
matching_words: milli::MatchingWords,
documents_ids_scores: impl Iterator<Item = (u32, &'a Vec<ScoreDetails>)> + 'a,
) -> milli::Result<Vec<SearchHit>> {
@ -1500,9 +1499,7 @@ fn make_hits<'a>(
let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref());
let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer);
let hit_maker = HitMaker::new(index, rtxn, format, formatter_builder)?;
let hit_maker = HitMaker::new(matching_words, tokenizer, attributes_format, index, rtxn)?;
for (id, score) in documents_ids_scores {
documents.push(hit_maker.make_hit(id, score)?);
@ -1818,59 +1815,100 @@ fn add_non_formatted_ids_to_formatted_options(
}
}
fn make_document(
displayed_attributes: &BTreeSet<FieldId>,
field_ids_map: &FieldsIdsMap,
obkv: &obkv::KvReaderU16,
) -> milli::Result<Document> {
let mut document = serde_json::Map::new();
// recreate the original json
for (key, value) in obkv.iter() {
let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
let key = field_ids_map.name(key).expect("Missing field name").to_string();
document.insert(key, value);
}
// select the attributes to retrieve
let displayed_attributes = displayed_attributes
.iter()
.map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
let document = permissive_json_pointer::select_values(&document, displayed_attributes);
Ok(document)
}
#[allow(clippy::too_many_arguments)]
fn format_fields(
document: &Document,
document: &mut Document,
field_ids_map: &FieldsIdsMap,
builder: &MatcherBuilder<'_>,
matcher_builder: &MatcherBuilder<'_>,
formatted_options: &BTreeMap<FieldId, FormatOptions>,
compute_matches: bool,
show_matches_position: bool,
displayable_ids: &BTreeSet<FieldId>,
locales: Option<&[Language]>,
localized_attributes: &[LocalizedAttributesRule],
) -> milli::Result<(Option<MatchesPosition>, Document)> {
let mut matches_position = compute_matches.then(BTreeMap::new);
let mut document = document.clone();
) -> milli::Result<Option<Document>> {
// reduce the formatted option list to the attributes that should be formatted,
// instead of all the attributes to display.
let formatting_fields_options: Vec<_> = formatted_options
let formatting_fields_options = formatted_options
.iter()
.filter(|(_, option)| option.should_format())
.map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option))
.collect();
.collect::<Vec<_>>();
// select the attributes to retrieve
let displayable_names =
displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
let get_format_options = |key: Cow<'_, str>| {
formatting_fields_options
.iter()
.filter(|(name, ..)| {
milli::is_faceted_by(name, &key) || milli::is_faceted_by(&key, name)
})
.map(|(_, option)| **option)
.reduce(|acc, option| acc.merge(option))
};
let get_locales = |key: Cow<'_, str>| {
// TODO: Should this be recomputed every time?
// if no locales have been provided, we try to find the locales in the localized_attributes.
locales.or_else(|| {
localized_attributes
.iter()
.find(|rule| matches!(rule.match_str(&key), PatternMatch::Match))
.map(LocalizedAttributesRule::locales)
})
};
fn get_text(value: &mut Value) -> Option<Cow<'_, String>> {
match value {
Value::String(text) => Some(Cow::Borrowed(text)),
Value::Number(number) => Some(Cow::Owned(number.to_string())),
// booleans and nulls cannot be matched by meili and cannot be formatted,
// and arrays or objects are never yielded by `permissive_json_pointer::map_leaf_values`
_ => None,
}
}
if show_matches_position {
permissive_json_pointer::map_leaf_values(document, displayable_names, |key, _, value| {
let Some(text) = get_text(value) else {
*value = Value::Object(Map::from_iter(std::iter::once((
"value".to_string(),
value.take(),
))));
return;
};
let locales = get_locales(Cow::from(key));
let mut matcher = matcher_builder.build(&text, locales);
let format_options = get_format_options(Cow::from(key));
let match_bounds = matcher.get_match_bounds(format_options);
let value_iter = std::iter::once(("value".to_string(), value.take()));
// do not include `matches` in case there is nothing to format
let json_map = if let Some(mb) = match_bounds {
let matches_iter = std::iter::once((
"matches".to_string(),
serde_json::to_value(mb).expect("TODO"),
));
Map::from_iter(value_iter.chain(matches_iter))
} else {
Map::from_iter(value_iter)
};
*value = Value::Object(json_map);
});
return Ok(None);
}
let mut formatted_document = document.clone();
permissive_json_pointer::map_leaf_values(
&mut document,
&mut formatted_document,
displayable_names,
|key, array_indices, value| {
|key, _, value| {
// To get the formatting option of each key we need to see all the rules that apply
// to the value and merge them together. e.g. if a user wants to highlight `doggo`
// and crop `doggo.name`, `doggo.name` needs to be highlighted + cropped while `doggo.age` is only highlighted.
@ -1878,37 +1916,22 @@ fn format_fields(
// Warn: The time to compute the format list scales with the number of fields to format;
// cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity:
// d*f where d is the total number of fields to display and f is the total number of fields to format.
let format = formatting_fields_options
.iter()
.filter(|(name, _option)| {
milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name)
})
.map(|(_, option)| **option)
.reduce(|acc, option| acc.merge(option));
let mut infos = Vec::new();
let Some(text) = get_text(value) else {
return;
};
// if no locales have been provided, we try to find the locales in the localized_attributes.
let locales = locales.or_else(|| {
localized_attributes
.iter()
.find(|rule| rule.match_str(key) == PatternMatch::Match)
.map(LocalizedAttributesRule::locales)
});
let format_options = get_format_options(Cow::from(key));
*value = format_value(
std::mem::take(value),
builder,
format,
&mut infos,
compute_matches,
array_indices,
locales,
);
// there's nothing to format
if !format_options.is_some_and(|v| v.should_format()) {
return;
}
if let Some(matches) = matches_position.as_mut() {
if !infos.is_empty() {
matches.insert(key.to_owned(), infos);
}
let locales = get_locales(Cow::from(key));
let mut matcher = matcher_builder.build(&text, locales);
if let Some(formatted_text) = matcher.get_formatted_text(format_options) {
*value = Value::String(formatted_text);
}
},
);
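
The quadratic merge described in the comment above boils down to `FormatOptions::merge` being folded over every rule that matches a leaf. A standalone sketch with a simplified `FormatOptions` (field names assumed to mirror milli's):

#[derive(Clone, Copy, Debug, PartialEq)]
struct FormatOptions {
    highlight: bool,
    crop: Option<usize>,
}

impl FormatOptions {
    fn merge(self, other: Self) -> Self {
        Self {
            highlight: self.highlight || other.highlight,
            crop: self.crop.or(other.crop),
        }
    }
}

fn main() {
    // Rules matching the leaf `doggo.name`: `doggo` is highlighted and
    // `doggo.name` is cropped, so the merged options carry both.
    let from_doggo = FormatOptions { highlight: true, crop: None };
    let from_doggo_name = FormatOptions { highlight: false, crop: Some(10) };
    let merged = [from_doggo, from_doggo_name]
        .into_iter()
        .reduce(FormatOptions::merge)
        .unwrap();
    assert_eq!(merged, FormatOptions { highlight: true, crop: Some(10) });
}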
@ -1918,58 +1941,9 @@ fn format_fields(
// This unwrap must be safe since we got the ids from the fields_ids_map just
// before.
.map(|&fid| field_ids_map.name(fid).unwrap());
let document = permissive_json_pointer::select_values(&document, selectors);
let formatted_document = permissive_json_pointer::select_values(&formatted_document, selectors);
Ok((matches_position, document))
}
fn format_value(
value: Value,
builder: &MatcherBuilder<'_>,
format_options: Option<FormatOptions>,
infos: &mut Vec<MatchBounds>,
compute_matches: bool,
array_indices: &[usize],
locales: Option<&[Language]>,
) -> Value {
match value {
Value::String(old_string) => {
let mut matcher = builder.build(&old_string, locales);
if compute_matches {
let matches = matcher.matches(array_indices);
infos.extend_from_slice(&matches[..]);
}
match format_options {
Some(format_options) => {
let value = matcher.format(format_options);
Value::String(value.into_owned())
}
None => Value::String(old_string),
}
}
// `map_leaf_values` makes sure this is only called for leaf fields
Value::Array(_) => unreachable!(),
Value::Object(_) => unreachable!(),
Value::Number(number) => {
let s = number.to_string();
let mut matcher = builder.build(&s, locales);
if compute_matches {
let matches = matcher.matches(array_indices);
infos.extend_from_slice(&matches[..]);
}
match format_options {
Some(format_options) => {
let value = matcher.format(format_options);
Value::String(value.into_owned())
}
None => Value::String(s),
}
}
value => value,
}
Ok(Some(formatted_document))
}
pub(crate) fn parse_filter(

View file

@ -79,8 +79,9 @@ pub use self::localized_attributes_rules::LocalizedAttributesRule;
pub use self::search::facet::{FacetValueHit, SearchForFacetValues};
pub use self::search::similar::Similar;
pub use self::search::{
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
FacetDistribution, Filter, FormatOptions, MarkerOptions, MatchBounds, MatcherBuilder,
MatchingWords, OrderBy, Search, SearchResult, SemanticSearch, TermsMatchingStrategy,
DEFAULT_VALUES_PER_FACET,
};
pub use self::update::ChannelCongestion;

View file

@ -7,7 +7,9 @@ use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
pub use self::new::matches::{
FormatOptions, MarkerOptions, MatchBounds, MatcherBuilder, MatchingWords,
};
use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats};
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
use crate::score_details::{ScoreDetails, ScoringStrategy};
@ -277,7 +279,7 @@ impl<'a> Search<'a> {
// consume context and located_query_terms to build MatchingWords.
let matching_words = match located_query_terms {
Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
Some(located_query_terms) => MatchingWords::new(ctx, &located_query_terms),
None => MatchingWords::default(),
};

View file

@ -0,0 +1,204 @@
use std::cmp::Ordering;
use charabia::{SeparatorKind, Token, TokenKind};
enum SimpleTokenKind {
Separator(SeparatorKind),
NonSeparator,
Done,
}
impl SimpleTokenKind {
fn new(token: &Token) -> Self {
match token.kind {
TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
_ => Self::NonSeparator,
}
}
}
struct CropBoundsHelper<'a> {
tokens: &'a [Token<'a>],
index_backward: usize,
backward_token_kind: SimpleTokenKind,
index_forward: usize,
forward_token_kind: SimpleTokenKind,
}
impl CropBoundsHelper<'_> {
fn advance_backward(&mut self) {
if matches!(self.backward_token_kind, SimpleTokenKind::Done) {
return;
}
if self.index_backward != 0 {
self.index_backward -= 1;
self.backward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_backward]);
} else {
self.backward_token_kind = SimpleTokenKind::Done;
}
}
fn advance_forward(&mut self) {
if matches!(self.forward_token_kind, SimpleTokenKind::Done) {
return;
}
if self.index_forward != self.tokens.len() - 1 {
self.index_forward += 1;
self.forward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_forward]);
} else {
self.forward_token_kind = SimpleTokenKind::Done;
}
}
}
fn get_adjusted_indices_for_too_few_words(
tokens: &[Token],
index_backward: usize,
index_forward: usize,
mut words_count: usize,
crop_size: usize,
) -> [usize; 2] {
let crop_size = crop_size + 2;
let mut cbh = CropBoundsHelper {
tokens,
index_backward,
backward_token_kind: SimpleTokenKind::new(&tokens[index_backward]),
index_forward,
forward_token_kind: SimpleTokenKind::new(&tokens[index_forward]),
};
loop {
match [&cbh.backward_token_kind, &cbh.forward_token_kind] {
// if they are both separators and are the same kind then advance both,
// or expand in the soft separator side
[SimpleTokenKind::Separator(backward_sk), SimpleTokenKind::Separator(forward_sk)] => {
if backward_sk == forward_sk {
cbh.advance_backward();
// this avoids having an ending separator before crop marker
if words_count < crop_size - 1 {
cbh.advance_forward();
}
} else if matches!(backward_sk, SeparatorKind::Hard) {
cbh.advance_forward();
} else {
cbh.advance_backward();
}
}
// both are words, advance left then right if we haven't reached `crop_size`
[SimpleTokenKind::NonSeparator, SimpleTokenKind::NonSeparator] => {
cbh.advance_backward();
words_count += 1;
if words_count != crop_size {
cbh.advance_forward();
words_count += 1;
}
}
[SimpleTokenKind::Done, SimpleTokenKind::Done] => break,
// if one of the tokens is non-separator and the other a separator, we expand in the non-separator side
// if one of the sides reached the end, we expand in the opposite direction
[backward_stk, SimpleTokenKind::Done]
| [backward_stk @ SimpleTokenKind::NonSeparator, SimpleTokenKind::Separator(_)] => {
if matches!(backward_stk, SimpleTokenKind::NonSeparator) {
words_count += 1;
}
cbh.advance_backward();
}
[SimpleTokenKind::Done, forward_stk]
| [SimpleTokenKind::Separator(_), forward_stk @ SimpleTokenKind::NonSeparator] => {
if matches!(forward_stk, SimpleTokenKind::NonSeparator) {
words_count += 1;
}
cbh.advance_forward();
}
}
if words_count == crop_size {
break;
}
}
[cbh.index_backward, cbh.index_forward]
}
fn get_adjusted_index_forward_for_too_many_words(
tokens: &[Token],
mut index_forward: usize,
mut words_count: usize,
crop_size: usize,
) -> usize {
while index_forward != 0 {
if matches!(SimpleTokenKind::new(&tokens[index_forward]), SimpleTokenKind::NonSeparator) {
words_count -= 1;
if words_count == crop_size {
break;
}
}
index_forward -= 1;
}
if index_forward == 0 {
return index_forward;
}
index_forward - 1
}
pub fn get_adjusted_indices_for_highlights_and_crop_size(
tokens: &[Token],
index_backward: usize,
index_forward: usize,
words_count: usize,
crop_size: usize,
) -> [usize; 2] {
match words_count.cmp(&crop_size) {
Ordering::Less => get_adjusted_indices_for_too_few_words(
tokens,
index_backward,
index_forward,
words_count,
crop_size,
),
Ordering::Equal => [
if index_backward != 0 { index_backward - 1 } else { index_backward },
if index_forward != tokens.len() - 1 { index_forward + 1 } else { index_forward },
],
Ordering::Greater => [
index_backward,
get_adjusted_index_forward_for_too_many_words(
tokens,
index_forward,
words_count,
crop_size,
),
],
}
}
pub fn get_adjusted_index_forward_for_crop_size(tokens: &[Token], crop_size: usize) -> usize {
let mut words_count = 0;
let mut index = 0;
while index != tokens.len() - 1 {
if matches!(SimpleTokenKind::new(&tokens[index]), SimpleTokenKind::NonSeparator) {
words_count += 1;
if words_count == crop_size {
break;
}
}
index += 1;
}
if index == tokens.len() - 1 {
return index;
}
index + 1
}

View file

@ -1,139 +0,0 @@
use super::matching_words::WordId;
use super::{Match, MatchPosition};
struct MatchIntervalWithScore {
interval: [usize; 2],
score: [i16; 3],
}
// count score for phrases
fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
let words_in_phrase_minus_one = (lwp - fwp) as i16;
// will always be ordered, so +1 for each space between words
*order_score += words_in_phrase_minus_one;
// distance will always be 1, so -1 for each space between words
*distance_score -= words_in_phrase_minus_one;
}
/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn get_interval_score(matches: &[Match]) -> [i16; 3] {
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
let mut order_score = 0;
let mut distance_score = 0;
let mut iter = matches.iter().peekable();
while let Some(m) = iter.next() {
if let Some(next_match) = iter.peek() {
// if matches are ordered
if next_match.ids.iter().min() > m.ids.iter().min() {
order_score += 1;
}
let m_last_word_pos = match m.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
lwp
}
};
let next_match_first_word_pos = next_match.get_first_word_pos();
// compute distance between matches
distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
} else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
// in case last match is a phrase, count score for its words
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
}
ids.extend(m.ids.iter());
}
ids.sort_unstable();
ids.dedup();
let uniq_score = ids.len() as i16;
// rank by unique match count, then by distance between matches, then by ordered match count.
[uniq_score, distance_score, order_score]
}
/// Returns the first and last match where the score computed by match_interval_score is the best.
pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
if matches.is_empty() {
panic!("`matches` should not be empty at this point");
}
// positions of the first and the last match of the best matches interval in `matches`.
let mut best_interval: Option<MatchIntervalWithScore> = None;
let mut save_best_interval = |interval_first, interval_last| {
let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
let is_interval_score_better = &best_interval
.as_ref()
.is_none_or(|MatchIntervalWithScore { score, .. }| interval_score > *score);
if *is_interval_score_better {
best_interval = Some(MatchIntervalWithScore {
interval: [interval_first, interval_last],
score: interval_score,
});
}
};
// we compute the matches interval if we have at least 2 matches.
// current interval positions.
let mut interval_first = 0;
let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
for (index, next_match) in matches.iter().enumerate() {
// if the next match would make the interval grow past crop_size,
// we compare the current interval with the best one,
// then we increase `interval_first` until next match can be added.
let next_match_last_word_pos = next_match.get_last_word_pos();
// if the next match would mean that we pass the crop size window,
// we take the last valid match that didn't pass this boundary, which is `index` - 1,
// and calculate a score for it, and check if it's better than our best so far
if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
// if index is 0 there is no last viable match
if index != 0 {
let interval_last = index - 1;
// keep interval if it's the best
save_best_interval(interval_first, interval_last);
}
// advance start of the interval while interval is longer than crop_size.
loop {
interval_first += 1;
if interval_first == matches.len() {
interval_first -= 1;
break;
}
interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
if interval_first_match_first_word_pos > next_match_last_word_pos
|| next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
{
break;
}
}
}
}
// compute the last interval score and compare it to the best one.
let interval_last = matches.len() - 1;
// if the interval is a single match, make sure it's not a phrase longer than the crop window
if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
save_best_interval(interval_first, interval_last);
}
// if none of the matches fit the criteria above, default to the first one
best_interval.map_or(
[&matches[0], &matches[0]],
|MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
)
}

View file

@ -0,0 +1,169 @@
use std::cell::Cell;
use crate::search::new::matches::matching_words::QueryPosition;
use super::r#match::{Match, MatchPosition};
struct MatchesIndexRangeWithScore {
matches_index_range: [usize; 2],
score: [i16; 3],
}
/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn get_score(
matches: &[Match],
query_positions: &[QueryPosition],
index_first: usize,
index_last: usize,
) -> [i16; 3] {
let order_score = Cell::new(0);
let distance_score = Cell::new(0);
let mut iter = (index_first..=index_last)
.filter_map(|index| {
query_positions.iter().find_map(move |v| (v.index == index).then(|| v.range[0]))
})
.peekable();
while let (Some(range_first), Some(next_range_first)) = (iter.next(), iter.peek()) {
if range_first < *next_range_first {
order_score.set(order_score.get() + 1);
}
}
// count score for phrases
let tally_phrase_scores = |fwp, lwp| {
let words_in_phrase_minus_one = (lwp - fwp) as i16;
// will always be in the order of query, so +1 for each space between words
order_score.set(order_score.get() + words_in_phrase_minus_one);
// distance will always be 1, so -1 for each space between words
distance_score.set(distance_score.get() - words_in_phrase_minus_one);
};
let mut iter = matches[index_first..=index_last].iter().peekable();
while let Some(r#match) = iter.next() {
if let Some(next_match) = iter.peek() {
let match_last_word_pos = match r#match.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } => {
tally_phrase_scores(fwp, lwp);
lwp
}
};
let next_match_first_word_pos = next_match.get_first_word_pos();
// compute distance between matches
distance_score.set(
distance_score.get()
- (next_match_first_word_pos - match_last_word_pos).min(7) as i16,
);
} else if let MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } =
r#match.position
{
// in case last match is a phrase, count score for its words
tally_phrase_scores(fwp, lwp);
}
}
let mut uniqueness_score = 0i16;
let mut current_range: Option<super::matching_words::UserQueryPositionRange> = None;
for qp in query_positions.iter().filter(|v| v.index >= index_first && v.index <= index_last) {
match current_range.as_mut() {
Some([saved_range_start, saved_range_end]) => {
let [range_start, range_end] = qp.range;
if range_start > *saved_range_start {
uniqueness_score += (*saved_range_end - *saved_range_start) as i16 + 1;
*saved_range_start = range_start;
*saved_range_end = range_end;
} else if range_end > *saved_range_end {
*saved_range_end = range_end;
}
}
None => current_range = Some(qp.range),
}
}
if let Some([saved_range_start, saved_range_end]) = current_range {
uniqueness_score += (saved_range_end - saved_range_start) as i16 + 1;
}
// rank by unique match count, then by distance between matches, then by ordered match count.
[uniqueness_score, distance_score.into_inner(), order_score.into_inner()]
}
/// Returns the first and last match where the score computed by match_interval_score is the best.
pub fn get_best_match_index_range(
matches: &[Match],
query_positions: &[QueryPosition],
crop_size: usize,
) -> [usize; 2] {
// positions of the first and the last match of the best matches index range in `matches`.
let mut best_matches_index_range: Option<MatchesIndexRangeWithScore> = None;
let mut save_best_matches_index_range = |index_first, index_last| {
let score = get_score(matches, query_positions, index_first, index_last);
let is_score_better = best_matches_index_range.as_ref().is_none_or(|v| score > v.score);
if is_score_better {
best_matches_index_range = Some(MatchesIndexRangeWithScore {
matches_index_range: [index_first, index_last],
score,
});
}
};
// we compute the matches index range if we have at least 2 matches.
let mut index_first = 0;
let mut first_match_first_word_pos = matches[index_first].get_first_word_pos();
for (index, next_match) in matches.iter().enumerate() {
// if the next match would make the index range grow past crop_size,
// we compare the current index range with the best one,
// then we increase `index_first` until next match can be added.
let next_match_last_word_pos = next_match.get_last_word_pos();
// if the next match would mean that we pass the crop size window,
// we take the last valid match that didn't pass this boundary, which is `index` - 1,
// and calculate a score for it, and check if it's better than our best so far
if next_match_last_word_pos - first_match_first_word_pos + 1 > crop_size {
// if index is 0 there is no previous viable match
if index != 0 {
// keep index range if it's the best
save_best_matches_index_range(index_first, index - 1);
}
// advance `index_first` while index range is longer than crop_size.
loop {
if index_first == matches.len() - 1 {
break;
}
index_first += 1;
first_match_first_word_pos = matches[index_first].get_first_word_pos();
// also make sure that subtracting won't cause a panic
if next_match_last_word_pos < first_match_first_word_pos
|| next_match_last_word_pos - first_match_first_word_pos + 1 < crop_size
{
break;
}
}
}
}
// compute the last index range score and compare it to the best one.
let index_last = matches.len() - 1;
// if the index range is a single match, make sure it's not a phrase longer than the crop window
if index_first != index_last || matches[index_first].get_word_count() < crop_size {
save_best_matches_index_range(index_first, index_last);
}
// if none of the matches fit the criteria above, default to the first one
best_matches_index_range.map_or([0, 0], |v| v.matches_index_range)
}
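
The `[i16; 3]` scores above are compared lexicographically (arrays of `Ord` elements derive `Ord`), so uniqueness dominates, distance breaks uniqueness ties, and order breaks both. A tiny illustration:

fn main() {
    // [uniqueness, distance, order]: distance penalties are negative, so
    // tighter intervals score higher at equal uniqueness.
    let two_unique_far_apart: [i16; 3] = [2, -7, 1];
    let two_unique_adjacent: [i16; 3] = [2, -1, 1];
    let one_unique_adjacent: [i16; 3] = [1, 0, 0];
    assert!(two_unique_adjacent > two_unique_far_apart);
    assert!(two_unique_far_apart > one_unique_adjacent);
}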

View file

@ -1,62 +1,49 @@
use super::matching_words::WordId;
#[derive(Clone, Debug)]
#[derive(Debug, PartialEq)]
pub enum MatchPosition {
Word {
// position of the word in the whole text.
word_position: usize,
// position of the token in the whole text.
token_position: usize,
},
Phrase {
// position of the first and last word in the phrase in the whole text.
word_positions: [usize; 2],
// position of the first and last token in the phrase in the whole text.
token_positions: [usize; 2],
},
Word { word_position: usize, token_position: usize },
Phrase { word_position_range: [usize; 2], token_position_range: [usize; 2] },
}
#[derive(Clone, Debug)]
#[derive(Debug, PartialEq)]
pub struct Match {
pub char_count: usize,
// ids of the query words that matches.
pub ids: Vec<WordId>,
pub byte_len: usize,
pub position: MatchPosition,
}
impl Match {
pub(super) fn get_first_word_pos(&self) -> usize {
pub fn get_first_word_pos(&self) -> usize {
match self.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
MatchPosition::Phrase { word_position_range: [fwp, _], .. } => fwp,
}
}
pub(super) fn get_last_word_pos(&self) -> usize {
pub fn get_last_word_pos(&self) -> usize {
match self.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
MatchPosition::Phrase { word_position_range: [_, lwp], .. } => lwp,
}
}
pub(super) fn get_first_token_pos(&self) -> usize {
pub fn get_first_token_pos(&self) -> usize {
match self.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
MatchPosition::Phrase { token_position_range: [ftp, _], .. } => ftp,
}
}
pub(super) fn get_last_token_pos(&self) -> usize {
pub fn get_last_token_pos(&self) -> usize {
match self.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
MatchPosition::Phrase { token_position_range: [_, ltp], .. } => ltp,
}
}
pub(super) fn get_word_count(&self) -> usize {
pub fn get_word_count(&self) -> usize {
match self.position {
MatchPosition::Word { .. } => 1,
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } => lwp - fwp + 1,
}
}
}
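
A quick sketch of the renamed inclusive ranges, using a local copy of the enum (the real type lives in this module):

#[derive(Debug, PartialEq)]
enum MatchPosition {
    Word { word_position: usize, token_position: usize },
    Phrase { word_position_range: [usize; 2], token_position_range: [usize; 2] },
}

fn word_count(position: &MatchPosition) -> usize {
    match position {
        MatchPosition::Word { .. } => 1,
        // both ends of the range are inclusive, hence the +1
        MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } => lwp - fwp + 1,
    }
}

fn main() {
    let phrase = MatchPosition::Phrase {
        word_position_range: [3, 5],
        token_position_range: [6, 10],
    };
    assert_eq!(word_count(&phrase), 3); // words 3, 4 and 5
}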

View file

@ -0,0 +1,272 @@
use std::cmp::{max, min};
use super::{
matching_words::QueryPosition,
r#match::{Match, MatchPosition},
};
use super::adjust_indices::{
get_adjusted_index_forward_for_crop_size, get_adjusted_indices_for_highlights_and_crop_size,
};
use charabia::Token;
use serde::Serialize;
use utoipa::ToSchema;
use super::FormatOptions;
#[derive(Serialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct MatchBounds {
pub highlight_toggle: bool,
pub indices: Vec<usize>,
}
struct MatchBoundsHelper<'a> {
tokens: &'a [Token<'a>],
matches: &'a [Match],
query_positions: &'a [QueryPosition],
}
struct MatchesAndCropIndices {
matches_first_index: usize,
matches_last_index: usize,
crop_byte_start: usize,
crop_byte_end: usize,
}
enum CropThing {
Last(usize),
First(usize),
}
impl MatchBoundsHelper<'_> {
fn get_match_byte_position_range(&self, r#match: &Match) -> [usize; 2] {
let byte_start = match r#match.position {
MatchPosition::Word { token_position, .. } => self.tokens[token_position].byte_start,
MatchPosition::Phrase { token_position_range: [ftp, ..], .. } => {
self.tokens[ftp].byte_start
}
};
[byte_start, byte_start + r#match.byte_len]
}
// TODO: Rename this
fn get_match_byte_position_rangee(
&self,
index: &mut usize,
crop_thing: CropThing,
) -> [usize; 2] {
let new_index = match crop_thing {
CropThing::First(_) if *index != 0 => *index - 1,
CropThing::Last(_) if *index != self.matches.len() - 1 => *index + 1,
_ => {
return self.get_match_byte_position_range(&self.matches[*index]);
}
};
let [byte_start, byte_end] = self.get_match_byte_position_range(&self.matches[new_index]);
// NOTE: This doesn't need additional checks, because `get_best_match_index_range` already
// guarantees that the next or preceding match contains the crop boundary
match crop_thing {
CropThing::First(crop_byte_start) if crop_byte_start < byte_end => {
*index -= 1;
[byte_start, byte_end]
}
CropThing::Last(crop_byte_end) if byte_start < crop_byte_end => {
*index += 1;
[byte_start, byte_end]
}
_ => self.get_match_byte_position_range(&self.matches[*index]),
}
}
/// TODO: Description
fn get_match_bounds(&self, mci: MatchesAndCropIndices) -> MatchBounds {
let MatchesAndCropIndices {
mut matches_first_index,
mut matches_last_index,
crop_byte_start,
crop_byte_end,
} = mci;
let [first_match_first_byte, first_match_last_byte] = self.get_match_byte_position_rangee(
&mut matches_first_index,
CropThing::First(crop_byte_start),
);
let first_match_first_byte = max(first_match_first_byte, crop_byte_start);
let [last_match_first_byte, last_match_last_byte] =
if matches_first_index != matches_last_index {
self.get_match_byte_position_rangee(
&mut matches_last_index,
CropThing::Last(crop_byte_end),
)
} else {
[first_match_first_byte, first_match_last_byte]
};
let last_match_last_byte = min(last_match_last_byte, crop_byte_end);
let selected_matches_len = matches_last_index - matches_first_index + 1;
let mut indices_size = 2 * selected_matches_len;
let crop_byte_start_is_not_first_match_start = crop_byte_start != first_match_first_byte;
let crop_byte_end_is_not_last_match_end = crop_byte_end != last_match_last_byte;
if crop_byte_start_is_not_first_match_start {
indices_size += 1;
}
if crop_byte_end_is_not_last_match_end {
indices_size += 1;
}
let mut indices = Vec::with_capacity(indices_size);
if crop_byte_start_is_not_first_match_start {
indices.push(crop_byte_start);
}
indices.push(first_match_first_byte);
if selected_matches_len > 1 {
indices.push(first_match_last_byte);
}
if selected_matches_len > 2 {
for index in (matches_first_index + 1)..matches_last_index {
let [m_byte_start, m_byte_end] =
self.get_match_byte_position_range(&self.matches[index]);
indices.push(m_byte_start);
indices.push(m_byte_end);
}
}
if selected_matches_len > 1 {
indices.push(last_match_first_byte);
}
indices.push(last_match_last_byte);
if crop_byte_end_is_not_last_match_end {
indices.push(crop_byte_end);
}
MatchBounds { highlight_toggle: !crop_byte_start_is_not_first_match_start, indices }
}
/// For when there are no matches, but crop is required.
fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> Option<MatchBounds> {
let final_token_index = get_adjusted_index_forward_for_crop_size(self.tokens, crop_size);
let final_token = &self.tokens[final_token_index];
if final_token_index == self.tokens.len() - 1 {
return None;
}
// TODO: Why is it that when we match all of the tokens we need to get byte_end instead of start?
Some(MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] })
}
fn get_matches_and_crop_indices(&self, crop_size: usize) -> MatchesAndCropIndices {
// TODO: This doesn't give back 2 phrases if one is out of crop window
// Solution: also get next and previous matches, and if they're in the crop window, even if partially, highlight them
let [matches_first_index, matches_last_index] =
super::best_match_range::get_best_match_index_range(
self.matches,
self.query_positions,
crop_size,
);
let first_match = &self.matches[matches_first_index];
let last_match = &self.matches[matches_last_index];
let last_match_last_word_pos = last_match.get_last_word_pos();
let first_match_first_word_pos = first_match.get_first_word_pos();
let words_count = last_match_last_word_pos - first_match_first_word_pos + 1;
let [index_backward, index_forward] = get_adjusted_indices_for_highlights_and_crop_size(
self.tokens,
first_match.get_first_token_pos(),
last_match.get_last_token_pos(),
words_count,
crop_size,
);
let is_index_backward_at_limit = index_backward == 0;
let is_index_forward_at_limit = index_forward == self.tokens.len() - 1;
let backward_token = &self.tokens[index_backward];
let crop_byte_start = if is_index_backward_at_limit {
backward_token.byte_start
} else {
backward_token.byte_end
};
let forward_token = &self.tokens[index_forward];
let crop_byte_end = if is_index_forward_at_limit {
forward_token.byte_end
} else {
forward_token.byte_start
};
MatchesAndCropIndices {
matches_first_index,
matches_last_index,
crop_byte_start,
crop_byte_end,
}
}
/// TODO: description
fn get_crop_and_highlight_bounds_with_matches(&self, crop_size: usize) -> MatchBounds {
self.get_match_bounds(self.get_matches_and_crop_indices(crop_size))
}
/// For crop but no highlight.
fn get_crop_bounds_with_matches(&self, crop_size: usize) -> MatchBounds {
let mci = self.get_matches_and_crop_indices(crop_size);
MatchBounds {
highlight_toggle: false,
indices: vec![mci.crop_byte_start, mci.crop_byte_end],
}
}
}
impl MatchBounds {
pub fn try_new(
tokens: &[Token],
matches: &[Match],
query_positions: &[QueryPosition],
format_options: FormatOptions,
) -> Option<MatchBounds> {
let mbh = MatchBoundsHelper { tokens, matches, query_positions };
if let Some(crop_size) = format_options.crop.filter(|v| *v != 0) {
if matches.is_empty() {
return mbh.get_crop_bounds_with_no_matches(crop_size);
}
if format_options.highlight {
return Some(mbh.get_crop_and_highlight_bounds_with_matches(crop_size));
}
return Some(mbh.get_crop_bounds_with_matches(crop_size));
}
if format_options.highlight && !matches.is_empty() {
Some(mbh.get_match_bounds(MatchesAndCropIndices {
matches_first_index: 0,
matches_last_index: matches.len() - 1,
crop_byte_start: 0,
crop_byte_end: tokens[tokens.len() - 1].byte_end,
}))
} else {
None
}
}
}
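
`MatchBounds` carries a flat list of byte offsets: consecutive pairs delimit alternating segments, and `highlight_toggle` gives the state of the first segment. A hedged sketch of how a consumer might decode it (the consuming side is not part of this commit):

// Decode a MatchBounds-style payload into (text, highlighted) segments.
fn decode(text: &str, indices: &[usize], mut highlighted: bool) -> Vec<(String, bool)> {
    let mut segments = Vec::new();
    for pair in indices.windows(2) {
        segments.push((text[pair[0]..pair[1]].to_string(), highlighted));
        highlighted = !highlighted;
    }
    segments
}

fn main() {
    let text = "the quick brown fox";
    // matches on "quick" (bytes 4..9) and "fox" (bytes 16..19); the crop
    // starts before the first match, so `highlight_toggle` would be false
    let indices = [0, 4, 9, 16, 19];
    assert_eq!(
        decode(text, &indices, false),
        vec![
            ("the ".to_string(), false),
            ("quick".to_string(), true),
            (" brown ".to_string(), false),
            ("fox".to_string(), true),
        ]
    );
}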

View file

@ -1,24 +1,89 @@
use std::cmp::Reverse;
use std::fmt;
use std::ops::RangeInclusive;
use std::fmt::{Debug, Formatter, Result};
use charabia::Token;
use super::super::interner::Interned;
use super::super::query_term::LocatedQueryTerm;
use super::super::{DedupInterner, Phrase};
use super::r#match::{Match, MatchPosition};
use crate::SearchContext;
pub struct LocatedMatchingPhrase {
pub value: Interned<Phrase>,
pub positions: RangeInclusive<WordId>,
enum PrefixedOrEquality {
Prefixed,
Equality,
NotApplicable,
}
pub struct LocatedMatchingWords {
pub value: Vec<Interned<String>>,
pub positions: RangeInclusive<WordId>,
pub is_prefix: bool,
pub original_char_count: usize,
impl PrefixedOrEquality {
fn new(string: &str, other_string: &str, is_other_string_prefix: bool) -> Self {
if string.is_empty() {
return if other_string.is_empty() { Self::Equality } else { Self::NotApplicable };
}
let mut other_string_iter = other_string.chars();
for c in string.chars() {
let Some(other_c) = other_string_iter.next() else {
return if is_other_string_prefix { Self::Prefixed } else { Self::NotApplicable };
};
if c != other_c {
return Self::NotApplicable;
}
}
if other_string_iter.next().is_some() {
return Self::NotApplicable;
}
Self::Equality
}
}
// TODO: Consider using a tuple here, because indexing this array out of bounds is only caught at runtime
pub type UserQueryPositionRange = [u16; 2];
struct LocatedMatchingPhrase {
value: Interned<Phrase>,
position: UserQueryPositionRange,
}
struct LocatedMatchingWords {
value: Vec<Interned<String>>,
position: UserQueryPositionRange,
is_prefix: bool,
original_char_count: usize,
}
struct TokenPositionHelper<'a> {
token: &'a Token<'a>,
position_by_word: usize,
position_by_token: usize,
}
impl<'a> TokenPositionHelper<'a> {
fn iter_from_tokens(tokens: &'a [Token]) -> impl Iterator<Item = Self> + Clone {
tokens
.iter()
.scan([0, 0], |[token_position, word_position], token| {
// TODO: Naming
let token_word_thingy = Self {
position_by_token: *token_position,
position_by_word: *word_position,
token,
};
*token_position += 1;
if !token.is_separator() {
*word_position += 1;
}
Some(token_word_thingy)
})
.filter(|t| !t.token.is_separator())
}
}
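
`iter_from_tokens` numbers every token but only advances the word counter on non-separators, then drops the separators. A simplified standalone model with a plain `(lemma, is_separator)` pair in place of charabia's `Token`:

fn positions(tokens: &[(&str, bool)]) -> Vec<(String, usize, usize)> {
    tokens
        .iter()
        .scan([0, 0], |[token_pos, word_pos], (lemma, is_separator)| {
            let current = (lemma.to_string(), *token_pos, *word_pos);
            *token_pos += 1;
            if !is_separator {
                *word_pos += 1;
            }
            Some((current, *is_separator))
        })
        .filter(|(_, is_separator)| !is_separator)
        .map(|(current, _)| current)
        .collect()
}

fn main() {
    let tokens = [("the", false), (" ", true), ("fox", false)];
    // "fox" is token 2 but word 1: the separator used up a token slot only
    assert_eq!(
        positions(&tokens),
        vec![("the".to_string(), 0, 0), ("fox".to_string(), 2, 1)]
    );
}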
/// Structure created from a query tree
@ -27,180 +92,246 @@ pub struct LocatedMatchingWords {
pub struct MatchingWords {
word_interner: DedupInterner<String>,
phrase_interner: DedupInterner<Phrase>,
phrases: Vec<LocatedMatchingPhrase>,
words: Vec<LocatedMatchingWords>,
located_matching_phrases: Vec<LocatedMatchingPhrase>,
located_matching_words: Vec<LocatedMatchingWords>,
}
#[cfg_attr(test, derive(Debug, PartialEq))]
pub struct QueryPosition {
pub range: UserQueryPositionRange,
pub index: usize,
}
impl MatchingWords {
pub fn new(ctx: SearchContext<'_>, located_terms: Vec<LocatedQueryTerm>) -> Self {
let mut phrases = Vec::new();
let mut words = Vec::new();
pub fn new(ctx: SearchContext, located_terms: &[LocatedQueryTerm]) -> Self {
let mut located_matching_phrases = Vec::new();
let mut located_matching_words = Vec::new();
// Extract and centralize the different phrases and words to match stored in a QueryTerm
// and wrap them in dedicated structures.
for located_term in located_terms {
let term = ctx.term_interner.get(located_term.value);
for LocatedQueryTerm { value, positions } in located_terms {
let term = ctx.term_interner.get(*value);
let (matching_words, matching_phrases) = term.all_computed_derivations();
for matching_phrase in matching_phrases {
phrases.push(LocatedMatchingPhrase {
value: matching_phrase,
positions: located_term.positions.clone(),
});
}
let position = [*positions.start(), *positions.end()];
words.push(LocatedMatchingWords {
located_matching_phrases.reserve(matching_phrases.len());
located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| {
LocatedMatchingPhrase { value: *matching_phrase, position }
}));
located_matching_words.push(LocatedMatchingWords {
value: matching_words,
positions: located_term.positions.clone(),
position,
is_prefix: term.is_prefix(),
original_char_count: term.original_word(&ctx).chars().count(),
});
}
// Sort words to put prefixes at the bottom, prioritizing exact matches.
words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
// Sort words so that non-prefix words come first, then by the width of their query position range in descending order.
// This only affects which candidate a token is matched against first.
located_matching_words.sort_unstable_by_key(|lmw| {
(lmw.is_prefix, Reverse(lmw.position[1] - lmw.position[0]))
});
Self {
phrases,
words,
located_matching_phrases,
located_matching_words,
word_interner: ctx.word_interner,
phrase_interner: ctx.phrase_interner,
}
}
/// Returns an iterator over terms that match or partially match the given token.
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
fn try_get_phrase_match<'a>(
&self,
token_position_helper_iter: &mut (impl Iterator<Item = TokenPositionHelper<'a>> + Clone),
) -> Option<(Match, UserQueryPositionRange)> {
let mut mapped_phrase_iter = self.located_matching_phrases.iter().map(|lmp| {
let words_iter = self
.phrase_interner
.get(lmp.value)
.words
.iter()
.map(|word_option| word_option.map(|word| self.word_interner.get(word).as_str()))
.peekable();
(lmp.position, words_iter)
});
'outer: loop {
let (query_position_range, mut words_iter) = mapped_phrase_iter.next()?;
// TODO: Is it worth only cloning if we have to?
let mut tph_iter = token_position_helper_iter.clone();
let mut first_tph_details = None;
let last_tph_details = loop {
// 1. get the next word from `words_iter` and the next token position helper from `tph_iter`
let (Some(word), Some(tph)) = (words_iter.next(), tph_iter.next()) else {
// 2. if either iterator is exhausted, move on to the next phrase and reset the token position helper iterator
continue 'outer;
};
// before matching, save the first token position helper's positions and byte/char offsets
if first_tph_details.is_none() {
first_tph_details = Some([
tph.position_by_token,
tph.position_by_word,
tph.token.char_start,
tph.token.byte_start,
]);
}
// 3. check if word matches our token
let is_matching = match word {
Some(word) => tph.token.lemma() == word,
// a `None` value in the phrase words iterator corresponds to a stop word,
// the value is considered a match if the current token is categorized as a stop word.
None => tph.token.is_stopword(),
};
// 4. if it does not, move on to the next phrase and reset the token position helper iterator
if !is_matching {
continue 'outer;
}
// 5. if it does, and there are no words left, time to return
if words_iter.peek().is_none() {
break [
tph.position_by_token,
tph.position_by_word,
tph.token.char_end,
tph.token.byte_end,
];
}
};
let [first_tph_position_by_token, first_tph_position_by_word, first_tph_char_start, first_tph_byte_start] =
first_tph_details.expect("TODO");
let [last_tph_position_by_token, last_tph_position_by_word, last_tph_char_end, last_tph_byte_end] =
last_tph_details;
// save new position in parameter iterator
*token_position_helper_iter = tph_iter;
return Some((
Match {
// do not +1, because Token index ranges are exclusive
byte_len: last_tph_byte_end - first_tph_byte_start,
char_count: last_tph_char_end - first_tph_char_start,
position: MatchPosition::Phrase {
word_position_range: [
first_tph_position_by_word,
last_tph_position_by_word,
],
token_position_range: [
first_tph_position_by_token,
last_tph_position_by_token,
],
},
},
query_position_range,
));
}
}
/// Try to match the token with one of the located_words.
fn match_unique_words<'a>(&'a self, token: &Token<'_>) -> Option<MatchType<'a>> {
for located_words in &self.words {
for word in &located_words.value {
let word = self.word_interner.get(*word);
// if the word is a prefix we match using starts_with.
if located_words.is_prefix && token.lemma().starts_with(word) {
let Some((char_index, c)) =
word.char_indices().take(located_words.original_char_count).last()
else {
continue;
};
let prefix_length = char_index + c.len_utf8();
let (char_count, byte_len) = token.original_lengths(prefix_length);
let ids = &located_words.positions;
return Some(MatchType::Full { ids, char_count, byte_len });
// else we exact match the token.
} else if token.lemma() == word {
let ids = &located_words.positions;
return Some(MatchType::Full {
char_count: token.char_end - token.char_start,
byte_len: token.byte_end - token.byte_start,
ids,
});
fn try_get_word_match(
&self,
tph: TokenPositionHelper,
text: &str,
) -> Option<(Match, UserQueryPositionRange)> {
let mut iter =
self.located_matching_words.iter().flat_map(|lw| lw.value.iter().map(move |w| (lw, w)));
loop {
let (located_words, word) = iter.next()?;
let word = self.word_interner.get(*word);
let [char_count, byte_len] =
match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix) {
PrefixedOrEquality::Prefixed => {
let prefix_byte_len = text[tph.token.byte_start..]
.char_indices()
.nth(located_words.original_char_count - 1)
.map(|(i, c)| i + c.len_utf8())
.expect("expected text to have n-th thing bal bla TODO");
// TODO: Investigate token original byte length and similar methods and why they're not good enough
[located_words.original_char_count, prefix_byte_len]
}
// do not +1, because Token index ranges are exclusive
PrefixedOrEquality::Equality => [
tph.token.char_end - tph.token.char_start,
tph.token.byte_end - tph.token.byte_start,
],
_ => continue,
};
return Some((
Match {
char_count,
byte_len,
position: MatchPosition::Word {
word_position: tph.position_by_word,
token_position: tph.position_by_token,
},
},
located_words.position,
));
}
}
pub fn get_matches_and_query_positions(
&self,
tokens: &[Token],
text: &str,
) -> (Vec<Match>, Vec<QueryPosition>) {
// TODO: Note in the doc that with the help of this iter, matches are guaranteed to be ordered
let mut token_position_helper_iter = TokenPositionHelper::iter_from_tokens(tokens);
let mut matches = Vec::new();
let mut query_positions = Vec::new();
loop {
// try and get a phrase match
if let Some((r#match, range)) =
self.try_get_phrase_match(&mut token_position_helper_iter)
{
matches.push(r#match);
query_positions.push(QueryPosition { range, index: matches.len() - 1 });
continue;
}
// if the above fails, try to get the next token position helper
if let Some(tph) = token_position_helper_iter.next() {
// and then try and get a word match
if let Some((r#match, range)) = self.try_get_word_match(tph, text) {
matches.push(r#match);
query_positions.push(QueryPosition { range, index: matches.len() - 1 });
}
}
} else {
// there are no more items in the iterator, we are done searching for matches
break;
};
}
None
// TODO: Explain why
query_positions.sort_unstable_by_key(|v| v.range[0]);
(matches, query_positions)
}
}
/// Iterator over terms that match the given token.
/// This allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
matching_words: &'a MatchingWords,
phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
token: &'b Token<'b>,
}
impl<'a> Iterator for MatchesIter<'a, '_> {
type Item = MatchType<'a>;
fn next(&mut self) -> Option<Self::Item> {
match self.phrases.next() {
// Try to match all the phrases first.
Some(located_phrase) => {
let phrase = self.matching_words.phrase_interner.get(located_phrase.value);
// create a PartialMatch struct to make it compute the first match
// instead of duplicating the code.
let ids = &located_phrase.positions;
// collect the references of words from the interner.
let words = phrase
.words
.iter()
.map(|word| {
word.map(|word| self.matching_words.word_interner.get(word).as_str())
})
.collect();
let partial = PartialMatch { matching_words: words, ids };
partial.match_token(self.token).or_else(|| self.next())
}
            // If no phrase matches, try to match unique words.
None => self.matching_words.match_unique_words(self.token),
}
}
}
/// Id of a matching term corresponding to a word written by the end user.
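/// For the query `split this world`, `split` would get id 0, `this` id 1 and
/// `world` id 2 (illustrative; see the test at the bottom of this file).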
pub type WordId = u16;
/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases we need to match several consecutive tokens for the match to be considered full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
Partial(PartialMatch<'a>),
}
/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: Vec<Option<&'a str>>,
ids: &'a RangeInclusive<WordId>,
}
impl<'a> PartialMatch<'a> {
/// Returns:
/// - None if the given token breaks the partial match
/// - Partial if the given token matches the partial match but doesn't complete it
/// - Full if the given token completes the partial match
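    ///
    /// A sketch of how consecutive tokens are meant to be fed in (`partial` and
    /// `tokens` are illustrative, not the actual call site):
    ///
    /// ```ignore
    /// let mut current = MatchType::Partial(partial);
    /// for token in tokens {
    ///     match current {
    ///         MatchType::Partial(p) => match p.match_token(&token) {
    ///             Some(next) => current = next, // still Partial, or finally Full
    ///             None => break,                // this token broke the sequence
    ///         },
    ///         MatchType::Full { .. } => break,  // every word matched consecutively
    ///     }
    /// }
    /// ```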
pub fn match_token(self, token: &Token<'_>) -> Option<MatchType<'a>> {
let Self { mut matching_words, ids, .. } = self;
let is_matching = match matching_words.first()? {
Some(word) => &token.lemma() == word,
// a None value in the phrase corresponds to a stop word,
            // the value is considered a match if the current token is categorized as a stop word.
None => token.is_stopword(),
};
// if there are remaining words to match in the phrase and the current token is matching,
// return a new Partial match allowing the highlighter to continue.
if is_matching && matching_words.len() > 1 {
matching_words.remove(0);
Some(MatchType::Partial(Self { matching_words, ids }))
// if there is no remaining word to match in the phrase and the current token is matching,
// return a Full match.
} else if is_matching {
Some(MatchType::Full {
char_count: token.char_end - token.char_start,
byte_len: token.byte_end - token.byte_start,
ids,
})
// if the current token doesn't match, return None to break the match sequence.
} else {
None
}
}
}
impl fmt::Debug for MatchingWords {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let MatchingWords {
            word_interner,
            phrase_interner,
            located_matching_phrases: phrases,
            located_matching_words: words,
        } = self;
let phrases: Vec<_> = phrases
.iter()
@ -213,119 +344,110 @@ impl fmt::Debug for MatchingWords {
.map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
.collect::<Vec<_>>()
.join(" "),
p.position,
)
})
.collect();
let words: Vec<_> = words
.iter()
.flat_map(|w| {
w.value
.iter()
.map(|s| (word_interner.get(*s), w.position, w.is_prefix))
.collect::<Vec<_>>()
})
.collect();
f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
}
}
#[cfg(test)]
pub(crate) mod tests {
use std::borrow::Cow;
use charabia::{TokenKind, TokenizerBuilder};
use super::super::super::located_query_terms_from_tokens;
use super::*;
use crate::search::new::matches::tests::temp_index_with_documents;
use crate::search::new::query_term::ExtractedTokens;
#[test]
fn matching_words() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let tokens = tokenizer.tokenize("split this world");
let text = "split this world";
let tokens = tokenizer.tokenize(text);
let ExtractedTokens { query_terms, .. } =
located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, &query_terms);
assert_eq!(
matching_words.get_matches_and_query_positions(
&[
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("split"),
char_end: "split".chars().count(),
byte_end: "split".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("worlded"),
char_end: "worlded".chars().count(),
byte_end: "worlded".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
}
],
text
),
(
vec![
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 0, token_position: 0 }
},
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 2, token_position: 2 }
},
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 3, token_position: 3 }
}
],
vec![
QueryPosition { range: [0, 0], index: 0 },
QueryPosition { range: [2, 2], index: 1 },
QueryPosition { range: [2, 2], index: 2 }
]
)
);
}
}

File diff suppressed because it is too large

View file

@ -1,15 +0,0 @@
use charabia::{SeparatorKind, Token, TokenKind};
pub enum SimpleTokenKind {
Separator(SeparatorKind),
NotSeparator,
}
impl SimpleTokenKind {
pub fn new(token: &&Token<'_>) -> Self {
match token.kind {
            TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
_ => Self::NotSeparator,
}
}
}