mirror of https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00

Merge 90056c64f5 into a88146d59e
This commit is contained in commit 7ded621743.
14 changed files with 1605 additions and 1431 deletions
@@ -1551,9 +1551,10 @@ fn retrieve_documents<S: AsRef<str>>(
     Ok(match &attributes_to_retrieve {
         Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
             &document?,
-            attributes_to_retrieve.iter().map(|s| s.as_ref()).chain(
-                (retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"),
-            ),
+            attributes_to_retrieve
+                .iter()
+                .map(|s| s.as_ref())
+                .chain(retrieve_vectors.should_retrieve().then_some("_vectors")),
         ),
         None => document?,
     })
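Both versions of this call lean on the same idiom: `bool::then_some` yields `Some("_vectors")` only when vectors must be retrieved, and chaining an `Option` onto an iterator appends zero or one extra element. A minimal standalone sketch of the idiom (the function and field names are invented for illustration):

    fn selected_fields(attributes: &[&str], retrieve_vectors: bool) -> Vec<String> {
        attributes
            .iter()
            .map(|s| s.to_string())
            // `Option` implements `IntoIterator`, so chaining it appends
            // the `_vectors` field only when the flag is true.
            .chain(retrieve_vectors.then_some("_vectors".to_string()))
            .collect()
    }

    fn main() {
        assert_eq!(selected_fields(&["title"], true), vec!["title", "_vectors"]);
        assert_eq!(selected_fields(&["title"], false), vec!["title"]);
    }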
@@ -1586,7 +1587,7 @@ fn retrieve_document<S: AsRef<str>>(
         attributes_to_retrieve
             .iter()
             .map(|s| s.as_ref())
-            .chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")),
+            .chain(retrieve_vectors.should_retrieve().then_some("_vectors")),
     ),
     None => document,
 };
@@ -815,7 +815,8 @@ impl SearchByIndex {
     let (result, _semantic_hit_count) =
         super::super::search_from_kind(index_uid.to_string(), search_kind, search)?;

-    let format = AttributesFormat {
+    let attributes_format = AttributesFormat {
         attributes_to_retrieve: query.attributes_to_retrieve,
         retrieve_vectors,
         attributes_to_highlight: query.attributes_to_highlight,

@@ -846,12 +847,11 @@ impl SearchByIndex {
     let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref());

-    let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer);
-
     let hit_maker =
-        HitMaker::new(&index, &rtxn, format, formatter_builder).map_err(|e| {
-            MeilisearchHttpError::from_milli(e, Some(index_uid.to_string()))
-        })?;
+        HitMaker::new(matching_words, tokenizer, attributes_format, &index, &rtxn)
+            .map_err(|e| {
+                MeilisearchHttpError::from_milli(e, Some(index_uid.to_string()))
+            })?;

     results_by_query.push(SearchResultByQuery {
         weight,
@@ -1,4 +1,5 @@
+use core::fmt;
 use std::borrow::Cow;
 use std::cmp::min;
 use std::collections::{BTreeMap, BTreeSet, HashSet};
 use std::str::FromStr;

@@ -28,11 +29,11 @@ use meilisearch_types::{milli, Document};
 use milli::tokenizer::{Language, TokenizerBuilder};
 use milli::{
     AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, LocalizedAttributesRule,
-    MatchBounds, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
+    MarkerOptions, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };
 use regex::Regex;
 use serde::{Deserialize, Serialize};
-use serde_json::{json, Value};
+use serde_json::{json, Map, Value};
+#[cfg(test)]
+mod mod_test;
 use utoipa::ToSchema;

@@ -47,7 +48,9 @@ pub use federated::{

 mod ranking_rules;

-type MatchesPosition = BTreeMap<String, Vec<MatchBounds>>;
+// TODO: Adapt this type to support cropping
+// { "_matchesPosition": { "overview": { first: false, highlighted: [[0,4,6,11,5,234,6,241,5]] } } }
+// type MatchesPosition = BTreeMap<String, Vec<MatchBounds>>;

 pub const DEFAULT_SEARCH_OFFSET: fn() -> usize = || 0;
 pub const DEFAULT_SEARCH_LIMIT: fn() -> usize = || 20;
@@ -810,11 +813,9 @@ pub struct SearchHit {
     #[serde(flatten)]
     #[schema(additional_properties, inline, value_type = HashMap<String, Value>)]
     pub document: Document,
-    #[serde(default, rename = "_formatted", skip_serializing_if = "Document::is_empty")]
+    #[serde(default, rename = "_formatted", skip_serializing_if = "Option::is_none")]
     #[schema(additional_properties, value_type = HashMap<String, Value>)]
-    pub formatted: Document,
-    #[serde(default, rename = "_matchesPosition", skip_serializing_if = "Option::is_none")]
-    pub matches_position: Option<MatchesPosition>,
+    pub formatted: Option<Document>,
     #[serde(default, rename = "_rankingScore", skip_serializing_if = "Option::is_none")]
     pub ranking_score: Option<f64>,
     #[serde(default, rename = "_rankingScoreDetails", skip_serializing_if = "Option::is_none")]

@@ -1291,6 +1292,7 @@ struct AttributesFormat {
     crop_marker: String,
     highlight_pre_tag: String,
     highlight_post_tag: String,
+    // TODO: Might want to rename this to signify that this will not yield _formatted anymore, only positions
     show_matches_position: bool,
     sort: Option<Vec<String>>,
     show_ranking_score: bool,
@@ -1298,7 +1300,7 @@ struct AttributesFormat {
     locales: Option<Vec<Language>>,
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy)]
 pub enum RetrieveVectors {
     /// Remove the `_vectors` field
     ///

@@ -1318,6 +1320,10 @@ impl RetrieveVectors {
             Self::Hide
         }
     }
+
+    pub fn should_retrieve(&self) -> bool {
+        matches!(self, Self::Retrieve)
+    }
 }

 struct HitMaker<'a> {
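The `PartialEq`/`Eq` derives can be dropped because the new `should_retrieve` helper relies on `matches!`, which expands to a `match` and therefore needs no equality impl. A minimal sketch of the pattern (variant set copied from the enum above, everything else illustrative):

    #[derive(Debug, Clone, Copy)]
    pub enum RetrieveVectors {
        Retrieve,
        Hide,
    }

    impl RetrieveVectors {
        // pattern matching instead of `==`, so no `PartialEq` derive is required
        pub fn should_retrieve(&self) -> bool {
            matches!(self, Self::Retrieve)
        }
    }

    fn main() {
        assert!(RetrieveVectors::Retrieve.should_retrieve());
        assert!(!RetrieveVectors::Hide.should_retrieve());
    }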
@@ -1329,7 +1335,7 @@ struct HitMaker<'a> {
     retrieve_vectors: RetrieveVectors,
     to_retrieve_ids: BTreeSet<FieldId>,
     embedding_configs: Vec<index::IndexEmbeddingConfig>,
-    formatter_builder: MatcherBuilder<'a>,
+    matcher_builder: MatcherBuilder<'a>,
     formatted_options: BTreeMap<FieldId, FormatOptions>,
     show_ranking_score: bool,
     show_ranking_score_details: bool,
@@ -1357,24 +1363,20 @@ impl<'a> HitMaker<'a> {
         tokenizer_builder.into_tokenizer()
     }

-    pub fn formatter_builder(
-        matching_words: milli::MatchingWords,
-        tokenizer: milli::tokenizer::Tokenizer<'_>,
-    ) -> MatcherBuilder<'_> {
-        let formatter_builder = MatcherBuilder::new(matching_words, tokenizer);
-
-        formatter_builder
-    }
-
     pub fn new(
+        matching_words: milli::MatchingWords,
+        tokenizer: milli::tokenizer::Tokenizer<'a>,
+        attr_fmt: AttributesFormat,
         index: &'a Index,
         rtxn: &'a RoTxn<'a>,
-        format: AttributesFormat,
-        mut formatter_builder: MatcherBuilder<'a>,
     ) -> milli::Result<Self> {
-        formatter_builder.crop_marker(format.crop_marker);
-        formatter_builder.highlight_prefix(format.highlight_pre_tag);
-        formatter_builder.highlight_suffix(format.highlight_post_tag);
+        let AttributesFormat { highlight_pre_tag, highlight_post_tag, crop_marker, .. } = attr_fmt;
+
+        let matcher_builder = MatcherBuilder::new(
+            matching_words,
+            tokenizer,
+            MarkerOptions { highlight_pre_tag, highlight_post_tag, crop_marker },
+        );
+
         let fields_ids_map = index.fields_ids_map(rtxn)?;
         let displayed_ids = index
@@ -1392,21 +1394,21 @@ impl<'a> HitMaker<'a> {
                 let displayed_names = index.displayed_fields(rtxn)?.unwrap();
                 !displayed_names.contains(&milli::constants::RESERVED_VECTORS_FIELD_NAME)
             }
-            // displayed_ids is a finit list, so hide if `_vectors` is not part of it
+            // displayed_ids is a finite list, so hide if `_vectors` is not part of it
             (Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
         };

         let displayed_ids =
             displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());

-        let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors {
+        let retrieve_vectors = if let RetrieveVectors::Retrieve = attr_fmt.retrieve_vectors {
             if vectors_is_hidden {
                 RetrieveVectors::Hide
             } else {
                 RetrieveVectors::Retrieve
             }
         } else {
-            format.retrieve_vectors
+            attr_fmt.retrieve_vectors
         };

         let fids = |attrs: &BTreeSet<String>| {
@@ -1423,7 +1425,7 @@ impl<'a> HitMaker<'a> {
             }
             ids
         };
-        let to_retrieve_ids: BTreeSet<_> = format
+        let to_retrieve_ids: BTreeSet<_> = attr_fmt
             .attributes_to_retrieve
             .as_ref()
             .map(fids)
@@ -1432,12 +1434,12 @@ impl<'a> HitMaker<'a> {
             .cloned()
             .collect();

-        let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
-        let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
+        let attr_to_highlight = attr_fmt.attributes_to_highlight.unwrap_or_default();
+        let attr_to_crop = attr_fmt.attributes_to_crop.unwrap_or_default();
         let formatted_options = compute_formatted_options(
             &attr_to_highlight,
             &attr_to_crop,
-            format.crop_length,
+            attr_fmt.crop_length,
             &to_retrieve_ids,
             &fields_ids_map,
             &displayed_ids,
@@ -1454,51 +1456,53 @@ impl<'a> HitMaker<'a> {
             retrieve_vectors,
             to_retrieve_ids,
             embedding_configs,
-            formatter_builder,
+            matcher_builder,
             formatted_options,
-            show_ranking_score: format.show_ranking_score,
-            show_ranking_score_details: format.show_ranking_score_details,
-            show_matches_position: format.show_matches_position,
-            sort: format.sort,
-            locales: format.locales,
+            show_ranking_score: attr_fmt.show_ranking_score,
+            show_ranking_score_details: attr_fmt.show_ranking_score_details,
+            show_matches_position: attr_fmt.show_matches_position,
+            sort: attr_fmt.sort,
+            locales: attr_fmt.locales,
         })
     }

-    pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> {
-        let (_, obkv) =
-            self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?;
+    fn make_document(&self, obkv: &obkv::KvReaderU16) -> milli::Result<Document> {
+        let mut document = serde_json::Map::new();

-        // First generate a document with all the displayed fields
-        let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?;
-
-        let add_vectors_fid =
-            self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve);
-
-        // select the attributes to retrieve
-        let attributes_to_retrieve = self
-            .to_retrieve_ids
-            .iter()
-            // skip the vectors_fid if RetrieveVectors::Hide
-            .filter(|fid| match self.vectors_fid {
-                Some(vectors_fid) => {
-                    !(self.retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid)
+        // recreate JSON with appropriate attributes
+        for (key, value) in obkv.iter() {
+            if self.vectors_fid.is_some_and(|vectors_fid| vectors_fid == key) {
+                // (vectors aren't considered in `displayedAttributes` and `attributesToRetrieve`, but rather with `retrieveVectors`)
+                if !self.retrieve_vectors.should_retrieve() {
+                    continue;
                 }
-                None => true,
-            })
-            // need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve`
-            .chain(add_vectors_fid.iter())
-            .map(|&fid| self.fields_ids_map.name(fid).expect("Missing field name"));
+            } else if !self.to_retrieve_ids.contains(&key) || !self.displayed_ids.contains(&key) {
+                // https://www.meilisearch.com/docs/reference/api/settings#displayed-attributes
+                // https://www.meilisearch.com/docs/reference/api/search#attributes-to-retrieve
+                continue;
+            }

-        let mut document =
-            permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
+            let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
+            let key = self.fields_ids_map.name(key).expect("Missing field name").to_string();

-        if self.retrieve_vectors == RetrieveVectors::Retrieve {
-            // Clippy is wrong
+            document.insert(key, value);
         }

+        Ok(document)
+    }
+
+    pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> {
+        let obkv = self.index.document(self.rtxn, id)?;
+
+        let mut document = self.make_document(obkv)?;
+
+        if self.retrieve_vectors.should_retrieve() {
             #[allow(clippy::manual_unwrap_or_default)]
             let mut vectors = match document.remove("_vectors") {
                 Some(Value::Object(map)) => map,
                 _ => Default::default(),
             };

             for (name, vector) in self.index.embeddings(self.rtxn, id)? {
                 let user_provided = self
                     .embedding_configs
@@ -1507,6 +1511,7 @@ impl<'a> HitMaker<'a> {
                     .is_some_and(|conf| conf.user_provided.contains(id));
                 let embeddings =
                     ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
+
                 vectors.insert(
                     name,
                     serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?,
@@ -1518,10 +1523,10 @@ impl<'a> HitMaker<'a> {
         let localized_attributes =
             self.index.localized_attributes_rules(self.rtxn)?.unwrap_or_default();

-        let (matches_position, formatted) = format_fields(
-            &displayed_document,
+        let formatted = format_fields(
+            &mut document,
             &self.fields_ids_map,
-            &self.formatter_builder,
+            &self.matcher_builder,
             &self.formatted_options,
             self.show_matches_position,
             &self.displayed_ids,
@@ -1538,13 +1543,7 @@ impl<'a> HitMaker<'a> {
         let ranking_score_details =
             self.show_ranking_score_details.then(|| ScoreDetails::to_json_map(score.iter()));

-        let hit = SearchHit {
-            document,
-            formatted,
-            matches_position,
-            ranking_score_details,
-            ranking_score,
-        };
+        let hit = SearchHit { document, formatted, ranking_score_details, ranking_score };

         Ok(hit)
     }
@@ -1553,7 +1552,7 @@ impl<'a> HitMaker<'a> {
 fn make_hits<'a>(
     index: &Index,
     rtxn: &RoTxn<'_>,
-    format: AttributesFormat,
+    attributes_format: AttributesFormat,
     matching_words: milli::MatchingWords,
     documents_ids_scores: impl Iterator<Item = (u32, &'a Vec<ScoreDetails>)> + 'a,
 ) -> milli::Result<Vec<SearchHit>> {
@@ -1568,9 +1567,7 @@ fn make_hits<'a>(
     let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref());

-    let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer);
-
-    let hit_maker = HitMaker::new(index, rtxn, format, formatter_builder)?;
+    let hit_maker = HitMaker::new(matching_words, tokenizer, attributes_format, index, rtxn)?;

     for (id, score) in documents_ids_scores {
         documents.push(hit_maker.make_hit(id, score)?);
@@ -1886,59 +1883,100 @@ fn add_non_formatted_ids_to_formatted_options(
     }
 }

-fn make_document(
-    displayed_attributes: &BTreeSet<FieldId>,
-    field_ids_map: &FieldsIdsMap,
-    obkv: &obkv::KvReaderU16,
-) -> milli::Result<Document> {
-    let mut document = serde_json::Map::new();
-
-    // recreate the original json
-    for (key, value) in obkv.iter() {
-        let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
-        let key = field_ids_map.name(key).expect("Missing field name").to_string();
-
-        document.insert(key, value);
-    }
-
-    // select the attributes to retrieve
-    let displayed_attributes = displayed_attributes
-        .iter()
-        .map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
-
-    let document = permissive_json_pointer::select_values(&document, displayed_attributes);
-    Ok(document)
-}
-
 #[allow(clippy::too_many_arguments)]
 fn format_fields(
-    document: &Document,
+    document: &mut Document,
     field_ids_map: &FieldsIdsMap,
-    builder: &MatcherBuilder<'_>,
+    matcher_builder: &MatcherBuilder<'_>,
     formatted_options: &BTreeMap<FieldId, FormatOptions>,
-    compute_matches: bool,
+    show_matches_position: bool,
     displayable_ids: &BTreeSet<FieldId>,
     locales: Option<&[Language]>,
     localized_attributes: &[LocalizedAttributesRule],
-) -> milli::Result<(Option<MatchesPosition>, Document)> {
-    let mut matches_position = compute_matches.then(BTreeMap::new);
-    let mut document = document.clone();
-
+) -> milli::Result<Option<Document>> {
     // reduce the formatted option list to the attributes that should be formatted,
     // instead of all the attributes to display.
-    let formatting_fields_options: Vec<_> = formatted_options
+    let formatting_fields_options = formatted_options
         .iter()
         .filter(|(_, option)| option.should_format())
         .map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option))
-        .collect();
+        .collect::<Vec<_>>();

     // select the attributes to retrieve
     let displayable_names =
         displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name"));

+    let get_format_options = |key: Cow<'_, str>| {
+        formatting_fields_options
+            .iter()
+            .filter(|(name, ..)| {
+                milli::is_faceted_by(name, &key) || milli::is_faceted_by(&key, name)
+            })
+            .map(|(_, option)| **option)
+            .reduce(|acc, option| acc.merge(option))
+    };
+
+    let get_locales = |key: Cow<'_, str>| {
+        // TODO: Should this be re computed every time?
+        // if no locales has been provided, we try to find the locales in the localized_attributes.
+        locales.or_else(|| {
+            localized_attributes
+                .iter()
+                .find(|rule| matches!(rule.match_str(&key), PatternMatch::Match))
+                .map(LocalizedAttributesRule::locales)
+        })
+    };
+
+    fn get_text(value: &mut Value) -> Option<Cow<'_, String>> {
+        match value {
+            Value::String(text) => Some(Cow::Borrowed(text)),
+            Value::Number(number) => Some(Cow::Owned(number.to_string())),
+            // boolean and null can not be matched by meili, can not be formatted
+            // and array or object cannot be yielded by `permissive_json_pointer::map_leaf_values`
+            _ => None,
+        }
+    }
+
+    if show_matches_position {
+        permissive_json_pointer::map_leaf_values(document, displayable_names, |key, _, value| {
+            let Some(text) = get_text(value) else {
+                *value = Value::Object(Map::from_iter(std::iter::once((
+                    "value".to_string(),
+                    value.take(),
+                ))));
+
+                return;
+            };
+
+            let locales = get_locales(Cow::from(key));
+            let mut matcher = matcher_builder.build(&text, locales);
+            let format_options = get_format_options(Cow::from(key));
+            let match_bounds = matcher.get_match_bounds(format_options);
+
+            let value_iter = std::iter::once(("value".to_string(), value.take()));
+
+            // do not include `matches` in case there is nothing to format
+            let json_map = if let Some(mb) = match_bounds {
+                let matches_iter = std::iter::once((
+                    "matches".to_string(),
+                    serde_json::to_value(mb).expect("TODO"),
+                ));
+                Map::from_iter(value_iter.chain(matches_iter))
+            } else {
+                Map::from_iter(value_iter)
+            };
+
+            *value = Value::Object(json_map);
+        });
+
+        return Ok(None);
+    }
+
+    let mut formatted_document = document.clone();
     permissive_json_pointer::map_leaf_values(
-        &mut document,
+        &mut formatted_document,
         displayable_names,
-        |key, array_indices, value| {
+        |key, _, value| {
             // To get the formatting option of each key we need to see all the rules that applies
             // to the value and merge them together. eg. If a user said he wanted to highlight `doggo`
             // and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only
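With `show_matches_position` set, the rewritten `format_fields` no longer returns a separate positions map; it rewrites each displayable leaf in place into an object carrying the original value and, when something matched, its match bounds. A hedged illustration of the resulting shape, assuming `serde_json` is available (field names and byte offsets invented):

    use serde_json::json;

    fn main() {
        // every displayable leaf becomes `{ "value": ..., "matches": ... }`;
        // leaves with nothing to match keep only the `value` key.
        let hit = json!({
            "overview": {
                "value": "The quick brown fox",
                // `MatchBounds` serialized in camelCase: flat byte offsets plus
                // a toggle saying whether the first span is highlighted
                "matches": { "highlightToggle": false, "indices": [4, 9] }
            },
            "release_year": { "value": 1999 }
        });
        println!("{hit:#}");
    }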
@@ -1946,37 +1984,22 @@ fn format_fields(
             // Warn: The time to compute the format list scales with the number of fields to format;
             // cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity:
             // d*f where d is the total number of fields to display and f is the total number of fields to format.
-            let format = formatting_fields_options
-                .iter()
-                .filter(|(name, _option)| {
-                    milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name)
-                })
-                .map(|(_, option)| **option)
-                .reduce(|acc, option| acc.merge(option));
-            let mut infos = Vec::new();
+            let Some(text) = get_text(value) else {
+                return;
+            };

-            // if no locales has been provided, we try to find the locales in the localized_attributes.
-            let locales = locales.or_else(|| {
-                localized_attributes
-                    .iter()
-                    .find(|rule| rule.match_str(key) == PatternMatch::Match)
-                    .map(LocalizedAttributesRule::locales)
-            });
+            let format_options = get_format_options(Cow::from(key));

-            *value = format_value(
-                std::mem::take(value),
-                builder,
-                format,
-                &mut infos,
-                compute_matches,
-                array_indices,
-                locales,
-            );
+            // there's nothing to format
+            if !format_options.is_some_and(|v| v.should_format()) {
+                return;
+            }

-            if let Some(matches) = matches_position.as_mut() {
-                if !infos.is_empty() {
-                    matches.insert(key.to_owned(), infos);
-                }
+            let locales = get_locales(Cow::from(key));
+
+            let mut matcher = matcher_builder.build(&text, locales);
+            if let Some(formatted_text) = matcher.get_formatted_text(format_options) {
+                *value = Value::String(formatted_text);
             }
         },
     );
@@ -1986,58 +2009,9 @@ fn format_fields(
         // This unwrap must be safe since we got the ids from the fields_ids_map just
         // before.
         .map(|&fid| field_ids_map.name(fid).unwrap());
-    let document = permissive_json_pointer::select_values(&document, selectors);
+    let formatted_document = permissive_json_pointer::select_values(&formatted_document, selectors);

-    Ok((matches_position, document))
-}
-
-fn format_value(
-    value: Value,
-    builder: &MatcherBuilder<'_>,
-    format_options: Option<FormatOptions>,
-    infos: &mut Vec<MatchBounds>,
-    compute_matches: bool,
-    array_indices: &[usize],
-    locales: Option<&[Language]>,
-) -> Value {
-    match value {
-        Value::String(old_string) => {
-            let mut matcher = builder.build(&old_string, locales);
-            if compute_matches {
-                let matches = matcher.matches(array_indices);
-                infos.extend_from_slice(&matches[..]);
-            }
-
-            match format_options {
-                Some(format_options) => {
-                    let value = matcher.format(format_options);
-                    Value::String(value.into_owned())
-                }
-                None => Value::String(old_string),
-            }
-        }
-        // `map_leaf_values` makes sure this is only called for leaf fields
-        Value::Array(_) => unreachable!(),
-        Value::Object(_) => unreachable!(),
-        Value::Number(number) => {
-            let s = number.to_string();
-
-            let mut matcher = builder.build(&s, locales);
-            if compute_matches {
-                let matches = matcher.matches(array_indices);
-                infos.extend_from_slice(&matches[..]);
-            }
-
-            match format_options {
-                Some(format_options) => {
-                    let value = matcher.format(format_options);
-                    Value::String(value.into_owned())
-                }
-                None => Value::String(s),
-            }
-        }
-        value => value,
-    }
+    Ok(Some(formatted_document))
 }

 pub(crate) fn parse_filter(
@@ -80,8 +80,9 @@ pub use self::localized_attributes_rules::LocalizedAttributesRule;
 pub use self::search::facet::{FacetValueHit, SearchForFacetValues};
 pub use self::search::similar::Similar;
 pub use self::search::{
-    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
-    Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
+    FacetDistribution, Filter, FormatOptions, MarkerOptions, MatchBounds, MatcherBuilder,
+    MatchingWords, OrderBy, Search, SearchResult, SemanticSearch, TermsMatchingStrategy,
+    DEFAULT_VALUES_PER_FACET,
 };
 pub use self::update::ChannelCongestion;
@@ -7,7 +7,9 @@ use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;

 pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
-pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
+pub use self::new::matches::{
+    FormatOptions, MarkerOptions, MatchBounds, MatcherBuilder, MatchingWords,
+};
 use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats};
 use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
 use crate::index::MatchingStrategy;

@@ -278,7 +280,7 @@ impl<'a> Search<'a> {
     // consume context and located_query_terms to build MatchingWords.
     let matching_words = match located_query_terms {
-        Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
+        Some(located_query_terms) => MatchingWords::new(ctx, &located_query_terms),
         None => MatchingWords::default(),
     };
crates/milli/src/search/new/matches/adjust_indices.rs (new file, 222 lines)

@@ -0,0 +1,222 @@
use std::cmp::Ordering;

use charabia::{SeparatorKind, Token};

#[derive(Clone)]
enum Direction {
    Forwards,
    Backwards,
}

impl Direction {
    fn switch(&mut self) {
        *self = match self {
            Direction::Backwards => Direction::Forwards,
            Direction::Forwards => Direction::Backwards,
        }
    }
}

fn get_adjusted_indices_for_too_few_words(
    tokens: &[Token],
    mut index_backward: usize,
    mut index_forward: usize,
    mut words_count: usize,
    crop_size: usize,
) -> [usize; 2] {
    let mut valid_index_backward = index_backward;
    let mut valid_index_forward = index_forward;

    let mut is_end_reached = index_forward == tokens.len() - 1;
    let mut is_beginning_reached = index_backward == 0;

    let mut is_index_backwards_at_hard_separator = false;
    let mut is_index_forwards_at_hard_separator = false;

    let mut is_crop_size_or_both_ends_reached =
        words_count == crop_size || (is_end_reached && is_beginning_reached);

    let mut dir = Direction::Forwards;

    loop {
        if is_crop_size_or_both_ends_reached {
            break;
        }

        let (index, valid_index) = match dir {
            Direction::Backwards => (&mut index_backward, &mut valid_index_backward),
            Direction::Forwards => (&mut index_forward, &mut valid_index_forward),
        };

        loop {
            match dir {
                Direction::Forwards => {
                    if is_end_reached {
                        break;
                    }

                    *index += 1;

                    is_end_reached = *index == tokens.len() - 1;
                }
                Direction::Backwards => {
                    if is_beginning_reached
                        || (!is_end_reached
                            && is_index_backwards_at_hard_separator
                            && !is_index_forwards_at_hard_separator)
                    {
                        break;
                    }

                    *index -= 1;

                    is_beginning_reached = *index == 0;
                }
            };

            if is_end_reached && is_beginning_reached {
                is_crop_size_or_both_ends_reached = true;
            }

            let maybe_is_token_hard_separator = tokens[*index]
                .separator_kind()
                .map(|sep_kind| matches!(sep_kind, SeparatorKind::Hard));

            // it's not a separator
            if maybe_is_token_hard_separator.is_none() {
                *valid_index = *index;
                words_count += 1;

                if words_count == crop_size {
                    is_crop_size_or_both_ends_reached = true;
                }

                break;
            }

            let is_index_at_hard_separator = match dir {
                Direction::Backwards => &mut is_index_backwards_at_hard_separator,
                Direction::Forwards => &mut is_index_forwards_at_hard_separator,
            };
            *is_index_at_hard_separator =
                maybe_is_token_hard_separator.is_some_and(|is_hard| is_hard);
        }

        dir.switch();

        // 1. if end is reached, we can only advance backwards
        // 2. if forwards index reached a hard separator and backwards is currently hard, we can go backwards
    }

    // keep advancing forward and backward to check if there are only separator tokens
    // left until the end; if so, include those too in the index range

    let saved_index = valid_index_forward;
    loop {
        if valid_index_forward == tokens.len() - 1 {
            break;
        }

        valid_index_forward += 1;

        if !tokens[valid_index_forward].is_separator() {
            valid_index_forward = saved_index;
            break;
        }
    }

    let saved_index = valid_index_backward;
    loop {
        if valid_index_backward == 0 {
            break;
        }

        valid_index_backward -= 1;

        if !tokens[valid_index_backward].is_separator() {
            valid_index_backward = saved_index;
            break;
        }
    }

    [valid_index_backward, valid_index_forward]
}

fn get_adjusted_index_forward_for_too_many_words(
    tokens: &[Token],
    index_backward: usize,
    mut index_forward: usize,
    mut words_count: usize,
    crop_size: usize,
) -> usize {
    loop {
        if index_forward == index_backward {
            return index_forward;
        }

        index_forward -= 1;

        if tokens[index_forward].is_separator() {
            continue;
        }

        words_count -= 1;

        if words_count == crop_size {
            break;
        }
    }

    index_forward
}

pub fn get_adjusted_indices_for_highlights_and_crop_size(
    tokens: &[Token],
    index_backward: usize,
    index_forward: usize,
    words_count: usize,
    crop_size: usize,
) -> [usize; 2] {
    match words_count.cmp(&crop_size) {
        Ordering::Equal | Ordering::Less => get_adjusted_indices_for_too_few_words(
            tokens,
            index_backward,
            index_forward,
            words_count,
            crop_size,
        ),
        Ordering::Greater => [
            index_backward,
            get_adjusted_index_forward_for_too_many_words(
                tokens,
                index_backward,
                index_forward,
                words_count,
                crop_size,
            ),
        ],
    }
}

pub fn get_adjusted_index_forward_for_crop_size(tokens: &[Token], crop_size: usize) -> usize {
    let mut words_count = 0;
    let mut index = 0;

    while index != tokens.len() - 1 {
        if !tokens[index].is_separator() {
            words_count += 1;

            if words_count == crop_size {
                break;
            }
        }

        index += 1;
    }

    if index == tokens.len() - 1 {
        return index;
    }

    index + 1
}
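A toy walk-through of the crop-window rule in `get_adjusted_index_forward_for_crop_size` (the stub type and helper below are invented for this example; the real code walks `charabia::Token`s): once the word budget is spent before the end of the text, the returned index is one past the last counted word, so the window may close on the following separator.

    struct TokenStub { is_separator: bool }

    fn forward_index_for_crop(tokens: &[TokenStub], crop_size: usize) -> usize {
        let mut words_count = 0;
        let mut index = 0;
        while index != tokens.len() - 1 {
            if !tokens[index].is_separator {
                words_count += 1;
                if words_count == crop_size {
                    break;
                }
            }
            index += 1;
        }
        // reached the end: crop the whole text; otherwise step past the last word
        if index == tokens.len() - 1 { index } else { index + 1 }
    }

    fn main() {
        // "one two three" tokenized as word/sep/word/sep/word
        let t = |is_separator| TokenStub { is_separator };
        let tokens = [t(false), t(true), t(false), t(true), t(false)];
        // a crop of 2 words ends at index 3, the separator after the second word
        assert_eq!(forward_index_for_crop(&tokens, 2), 3);
    }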
@@ -1,139 +0,0 @@
use super::matching_words::WordId;
use super::{Match, MatchPosition};

struct MatchIntervalWithScore {
    interval: [usize; 2],
    score: [i16; 3],
}

// count score for phrases
fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
    let words_in_phrase_minus_one = (lwp - fwp) as i16;
    // will always be ordered, so +1 for each space between words
    *order_score += words_in_phrase_minus_one;
    // distance will always be 1, so -1 for each space between words
    *distance_score -= words_in_phrase_minus_one;
}

/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn get_interval_score(matches: &[Match]) -> [i16; 3] {
    let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
    let mut order_score = 0;
    let mut distance_score = 0;

    let mut iter = matches.iter().peekable();
    while let Some(m) = iter.next() {
        if let Some(next_match) = iter.peek() {
            // if matches are ordered
            if next_match.ids.iter().min() > m.ids.iter().min() {
                order_score += 1;
            }

            let m_last_word_pos = match m.position {
                MatchPosition::Word { word_position, .. } => word_position,
                MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
                    tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
                    lwp
                }
            };
            let next_match_first_word_pos = next_match.get_first_word_pos();

            // compute distance between matches
            distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
        } else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
            // in case last match is a phrase, count score for its words
            tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
        }

        ids.extend(m.ids.iter());
    }

    ids.sort_unstable();
    ids.dedup();
    let uniq_score = ids.len() as i16;

    // rank by unique match count, then by distance between matches, then by ordered match count.
    [uniq_score, distance_score, order_score]
}

/// Returns the first and last match where the score computed by match_interval_score is the best.
pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
    if matches.is_empty() {
        panic!("`matches` should not be empty at this point");
    }

    // positions of the first and the last match of the best matches interval in `matches`.
    let mut best_interval: Option<MatchIntervalWithScore> = None;

    let mut save_best_interval = |interval_first, interval_last| {
        let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
        let is_interval_score_better = &best_interval
            .as_ref()
            .is_none_or(|MatchIntervalWithScore { score, .. }| interval_score > *score);

        if *is_interval_score_better {
            best_interval = Some(MatchIntervalWithScore {
                interval: [interval_first, interval_last],
                score: interval_score,
            });
        }
    };

    // we compute the matches interval if we have at least 2 matches.
    // current interval positions.
    let mut interval_first = 0;
    let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();

    for (index, next_match) in matches.iter().enumerate() {
        // if next match would make interval grow more than crop_size,
        // we compare the current interval with the best one,
        // then we increase `interval_first` until next match can be added.
        let next_match_last_word_pos = next_match.get_last_word_pos();

        // if the next match would mean that we pass the crop size window,
        // we take the last valid match, that didn't pass this boundary, which is `index` - 1,
        // and calculate a score for it, and check if it's better than our best so far
        if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
            // if index is 0 there is no last viable match
            if index != 0 {
                let interval_last = index - 1;
                // keep interval if it's the best
                save_best_interval(interval_first, interval_last);
            }

            // advance start of the interval while interval is longer than crop_size.
            loop {
                interval_first += 1;
                if interval_first == matches.len() {
                    interval_first -= 1;
                    break;
                }

                interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();

                if interval_first_match_first_word_pos > next_match_last_word_pos
                    || next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
                {
                    break;
                }
            }
        }
    }

    // compute the last interval score and compare it to the best one.
    let interval_last = matches.len() - 1;
    // if it's the last match with itself, we need to make sure it's
    // not a phrase longer than the crop window
    if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
        save_best_interval(interval_first, interval_last);
    }

    // if none of the matches fit the criteria above, default to the first one
    best_interval.map_or(
        [&matches[0], &matches[0]],
        |MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
    )
}
crates/milli/src/search/new/matches/best_match_range.rs (new file, 169 lines)

@@ -0,0 +1,169 @@
use std::cell::Cell;

use crate::search::new::matches::matching_words::QueryPosition;

use super::r#match::{Match, MatchPosition};

struct MatchesIndexRangeWithScore {
    matches_index_range: [usize; 2],
    score: [i16; 3],
}

/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn get_score(
    matches: &[Match],
    query_positions: &[QueryPosition],
    index_first: usize,
    index_last: usize,
) -> [i16; 3] {
    let order_score = Cell::new(0);
    let distance_score = Cell::new(0);

    let mut iter = (index_first..=index_last)
        .filter_map(|index| {
            query_positions.iter().find_map(move |v| (v.index == index).then(|| v.range[0]))
        })
        .peekable();
    while let (Some(range_first), Some(next_range_first)) = (iter.next(), iter.peek()) {
        if range_first < *next_range_first {
            order_score.set(order_score.get() + 1);
        }
    }

    // count score for phrases
    let tally_phrase_scores = |fwp, lwp| {
        let words_in_phrase_minus_one = (lwp - fwp) as i16;
        // will always be in the order of query, so +1 for each space between words
        order_score.set(order_score.get() + words_in_phrase_minus_one);
        // distance will always be 1, so -1 for each space between words
        distance_score.set(distance_score.get() - words_in_phrase_minus_one);
    };

    let mut iter = matches[index_first..=index_last].iter().peekable();
    while let Some(r#match) = iter.next() {
        if let Some(next_match) = iter.peek() {
            let match_last_word_pos = match r#match.position {
                MatchPosition::Word { word_position, .. } => word_position,
                MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } => {
                    tally_phrase_scores(fwp, lwp);
                    lwp
                }
            };
            let next_match_first_word_pos = next_match.get_first_word_pos();

            // compute distance between matches
            distance_score.set(
                distance_score.get()
                    - (next_match_first_word_pos - match_last_word_pos).min(7) as i16,
            );
        } else if let MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } =
            r#match.position
        {
            // in case last match is a phrase, count score for its words
            tally_phrase_scores(fwp, lwp);
        }
    }

    let mut uniqueness_score = 0i16;
    let mut current_range: Option<super::matching_words::UserQueryPositionRange> = None;

    for qp in query_positions.iter().filter(|v| v.index >= index_first && v.index <= index_last) {
        match current_range.as_mut() {
            Some([saved_range_start, saved_range_end]) => {
                let [range_start, range_end] = qp.range;

                if range_start > *saved_range_start {
                    uniqueness_score += (*saved_range_end - *saved_range_start) as i16 + 1;

                    *saved_range_start = range_start;
                    *saved_range_end = range_end;
                } else if range_end > *saved_range_end {
                    *saved_range_end = range_end;
                }
            }
            None => current_range = Some(qp.range),
        }
    }

    if let Some([saved_range_start, saved_range_end]) = current_range {
        uniqueness_score += (saved_range_end - saved_range_start) as i16 + 1;
    }

    // rank by unique match count, then by distance between matches, then by ordered match count.
    [uniqueness_score, distance_score.into_inner(), order_score.into_inner()]
}
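The returned `[i16; 3]` is compared with Rust's derived lexicographic array ordering, so uniqueness dominates, distance breaks ties, and order breaks remaining ties. A small sanity check (scores invented):

    fn main() {
        // [uniqueness, distance, order]
        let a: [i16; 3] = [2, -9, 0];
        let b: [i16; 3] = [1, 0, 5];
        assert!(a > b); // higher uniqueness wins outright

        let c: [i16; 3] = [2, -3, 0];
        assert!(c > a); // equal uniqueness: the less negative distance wins
    }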
/// Returns the first and last match where the score computed by match_interval_score is the best.
pub fn get_best_match_index_range(
    matches: &[Match],
    query_positions: &[QueryPosition],
    crop_size: usize,
) -> [usize; 2] {
    // positions of the first and the last match of the best matches index range in `matches`.
    let mut best_matches_index_range: Option<MatchesIndexRangeWithScore> = None;

    let mut save_best_matches_index_range = |index_first, index_last| {
        let score = get_score(matches, query_positions, index_first, index_last);
        let is_score_better = best_matches_index_range.as_ref().is_none_or(|v| score > v.score);

        if is_score_better {
            best_matches_index_range = Some(MatchesIndexRangeWithScore {
                matches_index_range: [index_first, index_last],
                score,
            });
        }
    };

    // we compute the matches index range if we have at least 2 matches.
    let mut index_first = 0;
    let mut first_match_first_word_pos = matches[index_first].get_first_word_pos();

    for (index, next_match) in matches.iter().enumerate() {
        // if next match would make index range grow more than crop_size,
        // we compare the current index range with the best one,
        // then we increase `index_first` until next match can be added.
        let next_match_last_word_pos = next_match.get_last_word_pos();

        // if the next match would mean that we pass the crop size window,
        // we take the last valid match, that didn't pass this boundary, which is `index` - 1,
        // and calculate a score for it, and check if it's better than our best so far
        if next_match_last_word_pos - first_match_first_word_pos + 1 > crop_size {
            // if index is 0 there is no previous viable match
            if index != 0 {
                // keep index range if it's the best
                save_best_matches_index_range(index_first, index - 1);
            }

            // advance `index_first` while index range is longer than crop_size.
            loop {
                if index_first == matches.len() - 1 {
                    break;
                }

                index_first += 1;
                first_match_first_word_pos = matches[index_first].get_first_word_pos();

                // also make sure that subtracting won't cause a panic
                if next_match_last_word_pos < first_match_first_word_pos
                    || next_match_last_word_pos - first_match_first_word_pos + 1 < crop_size
                {
                    break;
                }
            }
        }
    }

    // compute the last index range score and compare it to the best one.
    let index_last = matches.len() - 1;
    // if it's the last match with itself, we need to make sure it's
    // not a phrase longer than the crop window
    if index_first != index_last || matches[index_first].get_word_count() < crop_size {
        save_best_matches_index_range(index_first, index_last);
    }

    // if none of the matches fit the criteria above, default to the first one
    best_matches_index_range.map_or([0, 0], |v| v.matches_index_range)
}
@@ -1,62 +1,49 @@
 use super::matching_words::WordId;

-#[derive(Clone, Debug)]
+#[derive(Debug, PartialEq)]
 pub enum MatchPosition {
-    Word {
-        // position of the word in the whole text.
-        word_position: usize,
-        // position of the token in the whole text.
-        token_position: usize,
-    },
-    Phrase {
-        // position of the first and last word in the phrase in the whole text.
-        word_positions: [usize; 2],
-        // position of the first and last token in the phrase in the whole text.
-        token_positions: [usize; 2],
-    },
+    Word { word_position: usize, token_position: usize },
+    Phrase { word_position_range: [usize; 2], token_position_range: [usize; 2] },
 }

-#[derive(Clone, Debug)]
+#[derive(Debug, PartialEq)]
 pub struct Match {
     pub char_count: usize,
     // ids of the query words that matches.
     pub ids: Vec<WordId>,
     pub byte_len: usize,
     pub position: MatchPosition,
 }

 impl Match {
-    pub(super) fn get_first_word_pos(&self) -> usize {
+    pub fn get_first_word_pos(&self) -> usize {
         match self.position {
             MatchPosition::Word { word_position, .. } => word_position,
-            MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
+            MatchPosition::Phrase { word_position_range: [fwp, _], .. } => fwp,
         }
     }

-    pub(super) fn get_last_word_pos(&self) -> usize {
+    pub fn get_last_word_pos(&self) -> usize {
         match self.position {
             MatchPosition::Word { word_position, .. } => word_position,
-            MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
+            MatchPosition::Phrase { word_position_range: [_, lwp], .. } => lwp,
         }
     }

-    pub(super) fn get_first_token_pos(&self) -> usize {
+    pub fn get_first_token_pos(&self) -> usize {
         match self.position {
             MatchPosition::Word { token_position, .. } => token_position,
-            MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
+            MatchPosition::Phrase { token_position_range: [ftp, _], .. } => ftp,
         }
     }

-    pub(super) fn get_last_token_pos(&self) -> usize {
+    pub fn get_last_token_pos(&self) -> usize {
         match self.position {
             MatchPosition::Word { token_position, .. } => token_position,
-            MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
+            MatchPosition::Phrase { token_position_range: [_, ltp], .. } => ltp,
         }
     }

-    pub(super) fn get_word_count(&self) -> usize {
+    pub fn get_word_count(&self) -> usize {
         match self.position {
             MatchPosition::Word { .. } => 1,
-            MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
+            MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } => lwp - fwp + 1,
         }
     }
 }
crates/milli/src/search/new/matches/match_bounds.rs (new file, 270 lines)

@@ -0,0 +1,270 @@
use std::cmp::{max, min};

use super::{
    matching_words::QueryPosition,
    r#match::{Match, MatchPosition},
};

use super::adjust_indices::{
    get_adjusted_index_forward_for_crop_size, get_adjusted_indices_for_highlights_and_crop_size,
};
use charabia::Token;
use serde::Serialize;
use utoipa::ToSchema;

use super::FormatOptions;

// TODO: Differentiate if full match do not return None, instead return match bounds with full length
#[derive(Serialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct MatchBounds {
    pub highlight_toggle: bool,
    pub indices: Vec<usize>,
}
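The encoding is compact: `indices` is a flat list of byte offsets into the (possibly cropped) text, and `highlight_toggle` states whether the span starting at the first offset is highlighted, with each subsequent offset flipping the state. A hedged decoding sketch (`decode` is invented for illustration and is not part of this PR):

    // expand the flat offsets into (start, end, highlighted) spans
    fn decode(indices: &[usize], mut highlighted: bool) -> Vec<(usize, usize, bool)> {
        let mut spans = Vec::new();
        for pair in indices.windows(2) {
            spans.push((pair[0], pair[1], highlighted));
            highlighted = !highlighted; // every boundary flips the highlight state
        }
        spans
    }

    fn main() {
        // bytes 0..4 plain, 4..9 highlighted, 9..20 plain
        let spans = decode(&[0, 4, 9, 20], false);
        assert_eq!(spans, vec![(0, 4, false), (4, 9, true), (9, 20, false)]);
    }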
|
||||
struct MatchBoundsHelper<'a> {
|
||||
tokens: &'a [Token<'a>],
|
||||
matches: &'a [Match],
|
||||
query_positions: &'a [QueryPosition],
|
||||
}
|
||||
|
||||
struct MatchesAndCropIndices {
|
||||
matches_first_index: usize,
|
||||
matches_last_index: usize,
|
||||
crop_byte_start: usize,
|
||||
crop_byte_end: usize,
|
||||
}
|
||||
|
||||
enum CropThing {
|
||||
Last(usize),
|
||||
First(usize),
|
||||
}
|
||||
|
||||
impl MatchBoundsHelper<'_> {
|
||||
fn get_match_byte_position_range(&self, r#match: &Match) -> [usize; 2] {
|
||||
let byte_start = match r#match.position {
|
||||
MatchPosition::Word { token_position, .. } => self.tokens[token_position].byte_start,
|
||||
MatchPosition::Phrase { token_position_range: [ftp, ..], .. } => {
|
||||
self.tokens[ftp].byte_start
|
||||
}
|
||||
};
|
||||
|
||||
[byte_start, byte_start + r#match.byte_len]
|
||||
}
|
||||
|
||||
// TODO: Rename this
|
||||
fn get_match_byte_position_rangee(
|
||||
&self,
|
||||
index: &mut usize,
|
||||
crop_thing: CropThing,
|
||||
) -> [usize; 2] {
|
||||
let new_index = match crop_thing {
|
||||
CropThing::First(_) if *index != 0 => *index - 1,
|
||||
CropThing::Last(_) if *index != self.matches.len() - 1 => *index + 1,
|
||||
_ => {
|
||||
return self.get_match_byte_position_range(&self.matches[*index]);
|
||||
}
|
||||
};
|
||||
|
||||
let [byte_start, byte_end] = self.get_match_byte_position_range(&self.matches[new_index]);
|
||||
|
||||
// NOTE: This doesn't need additional checks, because `get_best_match_index_range` already
|
||||
// guarantees that the next or preceding match contains the crop boundary
|
||||
match crop_thing {
|
||||
CropThing::First(crop_byte_start) if crop_byte_start < byte_end => {
|
||||
*index -= 1;
|
||||
[byte_start, byte_end]
|
||||
}
|
||||
CropThing::Last(crop_byte_end) if byte_start < crop_byte_end => {
|
||||
*index += 1;
|
||||
[byte_start, byte_end]
|
||||
}
|
||||
_ => self.get_match_byte_position_range(&self.matches[*index]),
|
||||
}
|
||||
}
|
||||
|
||||
/// TODO: Description
|
||||
fn get_match_bounds(&self, mci: MatchesAndCropIndices) -> MatchBounds {
|
||||
let MatchesAndCropIndices {
|
||||
mut matches_first_index,
|
||||
mut matches_last_index,
|
||||
crop_byte_start,
|
||||
crop_byte_end,
|
||||
} = mci;
|
||||
|
||||
let [first_match_first_byte, first_match_last_byte] = self.get_match_byte_position_rangee(
|
||||
&mut matches_first_index,
|
||||
CropThing::First(crop_byte_start),
|
||||
);
|
||||
let first_match_first_byte = max(first_match_first_byte, crop_byte_start);
|
||||
|
||||
let [last_match_first_byte, last_match_last_byte] =
|
||||
if matches_first_index != matches_last_index {
|
||||
self.get_match_byte_position_rangee(
|
||||
&mut matches_last_index,
|
||||
CropThing::Last(crop_byte_end),
|
||||
)
|
||||
} else {
|
||||
[first_match_first_byte, first_match_last_byte]
|
||||
};
|
||||
let last_match_last_byte = min(last_match_last_byte, crop_byte_end);
|
||||
|
||||
let selected_matches_len = matches_last_index - matches_first_index + 1;
|
||||
let mut indices_size = 2 * selected_matches_len;
|
||||
|
||||
let crop_byte_start_is_not_first_match_start = crop_byte_start != first_match_first_byte;
|
||||
let crop_byte_end_is_not_last_match_end = crop_byte_end != last_match_last_byte;
|
||||
|
||||
if crop_byte_start_is_not_first_match_start {
|
||||
indices_size += 1;
|
||||
}
|
||||
|
||||
if crop_byte_end_is_not_last_match_end {
|
||||
indices_size += 1;
|
||||
}
|
||||
|
||||
let mut indices = Vec::with_capacity(indices_size);
|
||||
|
||||
if crop_byte_start_is_not_first_match_start {
|
||||
indices.push(crop_byte_start);
|
||||
}
|
||||
|
||||
indices.push(first_match_first_byte);
|
||||
|
||||
if selected_matches_len > 1 {
|
||||
indices.push(first_match_last_byte);
|
||||
}
|
||||
|
||||
if selected_matches_len > 2 {
|
||||
for index in (matches_first_index + 1)..matches_last_index {
|
||||
let [m_byte_start, m_byte_end] =
|
||||
self.get_match_byte_position_range(&self.matches[index]);
|
||||
|
||||
indices.push(m_byte_start);
|
||||
indices.push(m_byte_end);
|
||||
}
|
||||
}
|
||||
|
||||
if selected_matches_len > 1 {
|
||||
indices.push(last_match_first_byte);
|
||||
}
|
||||
|
||||
indices.push(last_match_last_byte);
|
||||
|
||||
if crop_byte_end_is_not_last_match_end {
|
||||
indices.push(crop_byte_end);
|
||||
}
|
||||
|
||||
MatchBounds { highlight_toggle: !crop_byte_start_is_not_first_match_start, indices }
|
||||
}
|
||||
|
||||
/// For crop but no highlight.
|
||||
fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> MatchBounds {
|
||||
let final_token_index = get_adjusted_index_forward_for_crop_size(self.tokens, crop_size);
|
||||
let final_token = &self.tokens[final_token_index];
|
||||
|
||||
// TODO: Why is it that when we match all of the tokens we need to get byte_end instead of start?
|
||||
|
||||
// TODO: Can here be an error, because it's byte_start but it could be byte_end?
|
||||
MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] }
}

fn get_matches_and_crop_indices(&self, crop_size: usize) -> MatchesAndCropIndices {
// Debug helper that prints the text split at the two given token indices.
let debug_print_split = |i1, i2| {
println!(
"{}|{}|{}\n{} {}",
self.tokens[..i1].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
self.tokens[i1..i2].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
self.tokens[i2..].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
i1,
i2
);
};

// TODO: This doesn't give back 2 phrases if one is out of the crop window
// Solution: also get the next and previous matches, and if they're in the crop window, even partially, highlight them
let [matches_first_index, matches_last_index] =
super::best_match_range::get_best_match_index_range(
self.matches,
self.query_positions,
crop_size,
);

let first_match = &self.matches[matches_first_index];
let last_match = &self.matches[matches_last_index];

let last_match_last_word_pos = last_match.get_last_word_pos();
let first_match_first_word_pos = first_match.get_first_word_pos();

let words_count = last_match_last_word_pos - first_match_first_word_pos + 1;
let [index_backward, index_forward] = get_adjusted_indices_for_highlights_and_crop_size(
self.tokens,
first_match.get_first_token_pos(),
last_match.get_last_token_pos(),
words_count,
crop_size,
);

debug_print_split(first_match.get_first_token_pos(), last_match.get_last_token_pos());
debug_print_split(index_backward, index_forward);

let backward_token = &self.tokens[index_backward];
let forward_token = &self.tokens[index_forward];

MatchesAndCropIndices {
matches_first_index,
matches_last_index,
crop_byte_start: backward_token.byte_start,
crop_byte_end: forward_token.byte_end,
}
}

/// Computes both crop and highlight bounds around the best matches.
fn get_crop_and_highlight_bounds_with_matches(&self, crop_size: usize) -> MatchBounds {
self.get_match_bounds(self.get_matches_and_crop_indices(crop_size))
}

/// For when there are matches and crop is required, but highlighting is not.
fn get_crop_bounds_with_matches(&self, crop_size: usize) -> MatchBounds {
let mci = self.get_matches_and_crop_indices(crop_size);

MatchBounds {
highlight_toggle: false,
indices: vec![mci.crop_byte_start, mci.crop_byte_end],
}
}
}

impl MatchBounds {
pub fn try_new(
tokens: &[Token],
matches: &[Match],
query_positions: &[QueryPosition],
format_options: FormatOptions,
) -> Option<MatchBounds> {
let mbh = MatchBoundsHelper { tokens, matches, query_positions };

if let Some(crop_size) = format_options.crop.filter(|v| *v != 0) {
if matches.is_empty() {
return Some(mbh.get_crop_bounds_with_no_matches(crop_size));
}

if format_options.highlight {
return Some(mbh.get_crop_and_highlight_bounds_with_matches(crop_size));
}

return Some(mbh.get_crop_bounds_with_matches(crop_size));
}

if !format_options.highlight || matches.is_empty() {
return None;
}

Some(mbh.get_match_bounds(MatchesAndCropIndices {
matches_first_index: 0,
matches_last_index: matches.len() - 1,
crop_byte_start: 0,
crop_byte_end: tokens[tokens.len() - 1].byte_end,
}))
}
}
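
// A minimal sketch of how a caller could consume `MatchBounds`, assuming
// `indices` holds byte offsets that delimit alternating plain/highlighted
// segments and `highlight_toggle` says whether the first segment is
// highlighted (the `render` helper is hypothetical):
//
// fn render(text: &str, bounds: &MatchBounds) -> String {
//     let mut out = String::new();
//     let mut highlighted = bounds.highlight_toggle;
//     for pair in bounds.indices.windows(2) {
//         let segment = &text[pair[0]..pair[1]];
//         if highlighted {
//             out.push_str("<em>");
//             out.push_str(segment);
//             out.push_str("</em>");
//         } else {
//             out.push_str(segment);
//         }
//         highlighted = !highlighted;
//     }
//     out
// }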

@ -1,24 +1,89 @@
use std::cmp::Reverse;
use std::fmt;
use std::ops::RangeInclusive;
use std::fmt::{Debug, Formatter, Result};

use charabia::Token;

use super::super::interner::Interned;
use super::super::query_term::LocatedQueryTerm;
use super::super::{DedupInterner, Phrase};
use super::r#match::{Match, MatchPosition};
use crate::SearchContext;

pub struct LocatedMatchingPhrase {
pub value: Interned<Phrase>,
pub positions: RangeInclusive<WordId>,
enum PrefixedOrEquality {
Prefixed,
Equality,
NotApplicable,
}

pub struct LocatedMatchingWords {
pub value: Vec<Interned<String>>,
pub positions: RangeInclusive<WordId>,
pub is_prefix: bool,
pub original_char_count: usize,
impl PrefixedOrEquality {
fn new(string: &str, other_string: &str, is_other_string_prefix: bool) -> Self {
if string.is_empty() {
return if other_string.is_empty() { Self::Equality } else { Self::NotApplicable };
}

let mut other_string_iter = other_string.chars();

for c in string.chars() {
let Some(other_c) = other_string_iter.next() else {
return if is_other_string_prefix { Self::Prefixed } else { Self::NotApplicable };
};

if c != other_c {
return Self::NotApplicable;
}
}

if other_string_iter.next().is_some() {
return Self::NotApplicable;
}

Self::Equality
}
}
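
// A few illustrative outcomes of the rule above (hypothetical inputs; the
// first argument is the token lemma, the second the query word):
//
// // the two strings are identical
// assert!(matches!(PrefixedOrEquality::new("world", "world", false), PrefixedOrEquality::Equality));
// // the query word "worl" is a prefix of the token lemma
// assert!(matches!(PrefixedOrEquality::new("world", "worl", true), PrefixedOrEquality::Prefixed));
// // the query word is longer than the token lemma, so it can never match
// assert!(matches!(PrefixedOrEquality::new("worl", "world", true), PrefixedOrEquality::NotApplicable));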

// TODO: Consider using a tuple here; unlike array indexing, out-of-range tuple access is caught at compile time
pub type UserQueryPositionRange = [u16; 2];

struct LocatedMatchingPhrase {
value: Interned<Phrase>,
position: UserQueryPositionRange,
}

struct LocatedMatchingWords {
value: Vec<Interned<String>>,
position: UserQueryPositionRange,
is_prefix: bool,
original_char_count: usize,
}

struct TokenPositionHelper<'a> {
token: &'a Token<'a>,
position_by_word: usize,
position_by_token: usize,
}

impl<'a> TokenPositionHelper<'a> {
fn iter_from_tokens(tokens: &'a [Token]) -> impl Iterator<Item = Self> + Clone {
tokens
.iter()
.scan([0, 0], |[token_position, word_position], token| {
let helper = Self {
position_by_token: *token_position,
position_by_word: *word_position,
token,
};

*token_position += 1;

if !token.is_separator() {
*word_position += 1;
}

Some(helper)
})
.filter(|t| !t.token.is_separator())
}
}
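
// For example, for the tokens of "split this world" (where the spaces are
// separator tokens), the iterator yields only the word tokens, but counts
// both kinds of position: "split" is (token 0, word 0), "this" is
// (token 2, word 1) and "world" is (token 4, word 2).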

/// Structure created from a query tree

@ -27,180 +92,263 @@ pub struct LocatedMatchingWords {
pub struct MatchingWords {
word_interner: DedupInterner<String>,
phrase_interner: DedupInterner<Phrase>,
phrases: Vec<LocatedMatchingPhrase>,
words: Vec<LocatedMatchingWords>,
located_matching_phrases: Vec<LocatedMatchingPhrase>,
located_matching_words: Vec<LocatedMatchingWords>,
}

#[cfg_attr(test, derive(Debug, PartialEq))]
pub struct QueryPosition {
pub range: UserQueryPositionRange,
pub index: usize,
}

impl MatchingWords {
pub fn new(ctx: SearchContext<'_>, located_terms: Vec<LocatedQueryTerm>) -> Self {
let mut phrases = Vec::new();
let mut words = Vec::new();
pub fn new(ctx: SearchContext, located_terms: &[LocatedQueryTerm]) -> Self {
let mut located_matching_phrases = Vec::new();
let mut located_matching_words = Vec::new();

// Extract and centralize the different phrases and words to match stored in a QueryTerm
// and wrap them in dedicated structures.
for located_term in located_terms {
let term = ctx.term_interner.get(located_term.value);
for LocatedQueryTerm { value, positions } in located_terms {
let term = ctx.term_interner.get(*value);
let (matching_words, matching_phrases) = term.all_computed_derivations();

for matching_phrase in matching_phrases {
phrases.push(LocatedMatchingPhrase {
value: matching_phrase,
positions: located_term.positions.clone(),
});
let position = [*positions.start(), *positions.end()];

if !matching_phrases.is_empty() {
located_matching_phrases.reserve(matching_phrases.len());
located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| {
LocatedMatchingPhrase { value: *matching_phrase, position }
}));
}

words.push(LocatedMatchingWords {
value: matching_words,
positions: located_term.positions.clone(),
is_prefix: term.is_prefix(),
original_char_count: term.original_word(&ctx).chars().count(),
});
if !matching_words.is_empty() {
located_matching_words.push(LocatedMatchingWords {
value: matching_words,
position,
is_prefix: term.is_prefix(),
original_char_count: term.original_word(&ctx).chars().count(),
});
}
}

// Sort words to put prefixes at the bottom, prioritizing exact matches.
words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
// Sort words by having `is_prefix` as false first, and then by their lengths in reverse order.
// This is only meant to influence which word a token is matched against first.
located_matching_words.sort_unstable_by_key(|lmw| {
(lmw.is_prefix, Reverse(lmw.position[1] - lmw.position[0]))
});

Self {
phrases,
words,
located_matching_phrases,
located_matching_words,
word_interner: ctx.word_interner,
phrase_interner: ctx.phrase_interner,
}
}

/// Returns an iterator over terms that match or partially match the given token.
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
fn try_get_phrase_match<'a>(
&self,
token_position_helper_iter: &mut (impl Iterator<Item = TokenPositionHelper<'a>> + Clone),
) -> Option<(Match, UserQueryPositionRange)> {
let mut mapped_phrase_iter = self.located_matching_phrases.iter().map(|lmp| {
let words = &self.phrase_interner.get(lmp.value).words;

let words_iter = words
.iter()
.map(|maybe_word| maybe_word.map(|word| self.word_interner.get(word).as_str()))
.peekable();

(lmp.position, words_iter)
});

'outer: loop {
let (query_position_range, mut words_iter) = mapped_phrase_iter.next()?;

// TODO: if it's worth it, only clone when we have to
let mut tph_iter = token_position_helper_iter.clone();

let mut first_tph_details = None;
let last_tph_details = loop {
// 1. get a word from `words_iter` and a token position helper from `tph_iter`
let (Some(word), Some(tph)) = (words_iter.next(), tph_iter.next()) else {
// 2. if either iterator runs out, move on to the next phrase and reset the token iterator
continue 'outer;
};

// save the position and byte details of the first matched token
if first_tph_details.is_none() {
first_tph_details = Some([
tph.position_by_token,
tph.position_by_word,
tph.token.char_start,
tph.token.byte_start,
]);
}

// 3. check if the word matches our token
let is_matching = match word {
Some(word) => tph.token.lemma() == word,
// a `None` value in the phrase words iterator corresponds to a stop word,
// the value is considered a match if the current token is categorized as a stop word.
None => tph.token.is_stopword(),
};

// 4. if it does not, move on to the next phrase and reset the token iterator
if !is_matching {
continue 'outer;
}

// 5. if it does, and there are no words left, time to return
if words_iter.peek().is_none() {
break [
tph.position_by_token,
tph.position_by_word,
tph.token.char_end,
tph.token.byte_end,
];
}
};

let [first_tph_position_by_token, first_tph_position_by_word, first_tph_char_start, first_tph_byte_start] =
first_tph_details.expect("the first matched token's details to have been saved");
let [last_tph_position_by_token, last_tph_position_by_word, last_tph_char_end, last_tph_byte_end] =
last_tph_details;

// save the new position in the parameter iterator
*token_position_helper_iter = tph_iter;

return Some((
Match {
// do not +1, because Token index ranges are exclusive
byte_len: last_tph_byte_end - first_tph_byte_start,
char_count: last_tph_char_end - first_tph_char_start,
position: MatchPosition::Phrase {
word_position_range: [
first_tph_position_by_word,
last_tph_position_by_word,
],
token_position_range: [
first_tph_position_by_token,
last_tph_position_by_token,
],
},
},
query_position_range,
));
}
}
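
// For example, with the matching phrase `"new york"` and the text
// "big new york building": while the token iterator sits on "big" no phrase
// matches, so the caller consumes that token as a word candidate; on the
// next attempt "new" and "york" match consecutively, a single `Match`
// spanning both tokens is returned, and the token iterator is left on
// "building".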

/// Try to match the token with one of the located_words.
fn match_unique_words<'a>(&'a self, token: &Token<'_>) -> Option<MatchType<'a>> {
for located_words in &self.words {
for word in &located_words.value {
let word = self.word_interner.get(*word);
// if the word is a prefix we match using starts_with.
if located_words.is_prefix && token.lemma().starts_with(word) {
let Some((char_index, c)) =
word.char_indices().take(located_words.original_char_count).last()
else {
continue;
fn try_get_word_match(
&self,
tph: TokenPositionHelper,
text: &str,
) -> Option<(Match, UserQueryPositionRange)> {
// TODO: There is potentially an optimization to be made here:
// if we matched a term, can we skip checking it in further iterations?

println!(
"{:?}",
self.located_matching_words
.iter()
.flat_map(|lw| lw.value.iter().map(move |w| (
lw.is_prefix,
lw.original_char_count,
self.word_interner.get(*w)
)))
.collect::<Vec<_>>()
);

self.located_matching_words
.iter()
.flat_map(|lw| lw.value.iter().map(move |w| (lw, self.word_interner.get(*w))))
.find_map(|(located_words, word)| {
let [char_count, byte_len] =
match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix)
{
PrefixedOrEquality::Prefixed => {
let prefix_byte_len = text[tph.token.byte_start..]
.char_indices()
.nth(located_words.original_char_count - 1)
.map(|(i, c)| i + c.len_utf8())
.expect("the text to contain at least as many chars as the query word");

// TODO: Investigate the token's original byte length and similar methods, and why they're not good enough
// They might not distinguish between the original and the normalized byte length

[located_words.original_char_count, prefix_byte_len]
}
// do not +1, because Token index ranges are exclusive
PrefixedOrEquality::Equality => [
tph.token.char_end - tph.token.char_start,
tph.token.byte_end - tph.token.byte_start,
],
_ => return None,
};
let prefix_length = char_index + c.len_utf8();
let (char_count, byte_len) = token.original_lengths(prefix_length);
let ids = &located_words.positions;
return Some(MatchType::Full { ids, char_count, byte_len });
// otherwise we exact-match the token.
} else if token.lemma() == word {
let ids = &located_words.positions;
return Some(MatchType::Full {
char_count: token.char_end - token.char_start,
byte_len: token.byte_end - token.byte_start,
ids,
});
}
}
}

None
}
}
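
// Worked example for the `Prefixed` arm above, with hypothetical values: if
// the query word is "héll" (4 chars, 5 bytes) and the token at `byte_start`
// begins with "héllo", then `char_indices().nth(3)` yields `(4, 'l')`, so
// `prefix_byte_len` is 4 + 1 = 5, the byte length of the matched prefix.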

/// Iterator over terms that match the given token.
/// This allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
matching_words: &'a MatchingWords,
phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
token: &'b Token<'b>,
}

impl<'a> Iterator for MatchesIter<'a, '_> {
type Item = MatchType<'a>;

fn next(&mut self) -> Option<Self::Item> {
match self.phrases.next() {
// Try to match all the phrases first.
Some(located_phrase) => {
let phrase = self.matching_words.phrase_interner.get(located_phrase.value);

// create a PartialMatch struct to make it compute the first match
// instead of duplicating the code.
let ids = &located_phrase.positions;
// collect the references of words from the interner.
let words = phrase
.words
.iter()
.map(|word| {
word.map(|word| self.matching_words.word_interner.get(word).as_str())
})
.collect();
let partial = PartialMatch { matching_words: words, ids };

partial.match_token(self.token).or_else(|| self.next())
}
// If no phrase matches, try to match unique words.
None => self.matching_words.match_unique_words(self.token),
}
}
}

/// Id of a matching term corresponding to a word written by the end user.
pub type WordId = u16;

/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases we need to match consecutively several tokens to consider that the match is full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
Partial(PartialMatch<'a>),
}

/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: Vec<Option<&'a str>>,
ids: &'a RangeInclusive<WordId>,
}

impl<'a> PartialMatch<'a> {
/// Returns:
/// - None if the given token breaks the partial match
/// - Partial if the given token matches the partial match but doesn't complete it
/// - Full if the given token completes the partial match
pub fn match_token(self, token: &Token<'_>) -> Option<MatchType<'a>> {
let Self { mut matching_words, ids, .. } = self;

let is_matching = match matching_words.first()? {
Some(word) => &token.lemma() == word,
// a None value in the phrase corresponds to a stop word,
// the value is considered a match if the current token is categorized as a stop word.
None => token.is_stopword(),
};

// if there are remaining words to match in the phrase and the current token is matching,
// return a new Partial match allowing the highlighter to continue.
if is_matching && matching_words.len() > 1 {
matching_words.remove(0);
Some(MatchType::Partial(Self { matching_words, ids }))
// if there is no remaining word to match in the phrase and the current token is matching,
// return a Full match.
} else if is_matching {
Some(MatchType::Full {
char_count: token.char_end - token.char_start,
byte_len: token.byte_end - token.byte_start,
ids,
Some((
Match {
char_count,
byte_len,
position: MatchPosition::Word {
word_position: tph.position_by_word,
token_position: tph.position_by_token,
},
},
located_words.position,
))
})
// if the current token doesn't match, return None to break the match sequence.
} else {
None
}

pub fn get_matches_and_query_positions(
&self,
tokens: &[Token],
text: &str,
) -> (Vec<Match>, Vec<QueryPosition>) {
// TODO: Note in the doc that, thanks to this iterator, matches are guaranteed to be ordered
let mut token_position_helper_iter = TokenPositionHelper::iter_from_tokens(tokens);
let mut matches = Vec::new();
let mut query_positions = Vec::new();

loop {
// try and get a phrase match
if let Some((r#match, range)) =
self.try_get_phrase_match(&mut token_position_helper_iter)
{
matches.push(r#match);
query_positions.push(QueryPosition { range, index: matches.len() - 1 });

continue;
}

// if the above fails, try to get the next token position helper
if let Some(tph) = token_position_helper_iter.next() {
// and then try and get a word match
if let Some((r#match, range)) = self.try_get_word_match(tph, text) {
matches.push(r#match);
query_positions.push(QueryPosition { range, index: matches.len() - 1 });
}
} else {
// there are no more items in the iterator, we are done searching for matches
break;
};
}

// sort by the start of each query position range, so that downstream consumers
// can iterate the positions in user-query order
query_positions.sort_unstable_by_key(|v| v.range[0]);

(matches, query_positions)
}
}
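
// A minimal sketch of the intended call flow; tokenizing the text and
// building `MatchingWords` from the query terms are assumed to have
// happened already:
//
// let (matches, query_positions) =
//     matching_words.get_matches_and_query_positions(&tokens, text);
// let bounds = MatchBounds::try_new(&tokens, &matches, &query_positions, format_options);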

impl fmt::Debug for MatchingWords {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
impl Debug for MatchingWords {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
let MatchingWords {
word_interner,
phrase_interner,
located_matching_phrases: phrases,
located_matching_words: words,
} = self;

let phrases: Vec<_> = phrases
.iter()

@ -213,37 +361,33 @@ impl fmt::Debug for MatchingWords {
.map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
.collect::<Vec<_>>()
.join(" "),
p.positions.clone(),
p.position,
)
})
.collect();

let words: Vec<_> = words
.iter()
.flat_map(|w| {
w.value
.iter()
.map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
.map(|s| (word_interner.get(*s), w.position, w.is_prefix))
.collect::<Vec<_>>()
})
.collect();

f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
}
}

#[cfg(test)]
pub(crate) mod tests {
use std::borrow::Cow;

use charabia::{TokenKind, TokenizerBuilder};

mod tests {
use super::super::super::located_query_terms_from_tokens;
use super::*;
use crate::index::tests::TempIndex;
use crate::search::new::query_term::ExtractedTokens;
use charabia::{TokenKind, TokenizerBuilder};
use std::borrow::Cow;

pub(crate) fn temp_index_with_documents() -> TempIndex {
fn temp_index_with_documents() -> TempIndex {
let temp_index = TempIndex::new();
temp_index
.add_documents(documents!([

@ -262,70 +406,77 @@ pub(crate) mod tests {
let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let tokens = tokenizer.tokenize("split this world");
let text = "split this world";
let tokens = tokenizer.tokenize(text);
let ExtractedTokens { query_terms, .. } =
located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, query_terms);
let matching_words = MatchingWords::new(ctx, &query_terms);

assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("split"),
char_end: "split".chars().count(),
byte_end: "split".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
})
.next(),
None
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("worlded"),
char_end: "worlded".chars().count(),
byte_end: "worlded".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
})
.next(),
None
matching_words.get_matches_and_query_positions(
&[
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("split"),
char_end: "split".chars().count(),
byte_end: "split".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("worlded"),
char_end: "worlded".chars().count(),
byte_end: "worlded".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
}
],
text
),
(
vec![
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 0, token_position: 0 }
},
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 2, token_position: 2 }
},
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 3, token_position: 3 }
}
],
vec![
QueryPosition { range: [0, 0], index: 0 },
QueryPosition { range: [2, 2], index: 1 },
QueryPosition { range: [2, 2], index: 2 }
]
)
);
}
}

File diff suppressed because it is too large
@ -1,15 +0,0 @@
use charabia::{SeparatorKind, Token, TokenKind};

pub enum SimpleTokenKind {
Separator(SeparatorKind),
NotSeparator,
}

impl SimpleTokenKind {
pub fn new(token: &&Token<'_>) -> Self {
match token.kind {
TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
_ => Self::NotSeparator,
}
}
}

@ -489,8 +489,7 @@ impl QueryTerm {
let mut words = BTreeSet::new();
let mut phrases = BTreeSet::new();

let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
&self.zero_typo;
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, .. } = &self.zero_typo;
words.extend(zero_typo.iter().copied());
words.extend(prefix_of.iter().copied());
phrases.extend(phrase.iter().copied());