diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 50eec46fe..919fb0a74 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1551,9 +1551,10 @@ fn retrieve_documents>( Ok(match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document?, - attributes_to_retrieve.iter().map(|s| s.as_ref()).chain( - (retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"), - ), + attributes_to_retrieve + .iter() + .map(|s| s.as_ref()) + .chain(retrieve_vectors.should_retrieve().then_some("_vectors")), ), None => document?, }) @@ -1586,7 +1587,7 @@ fn retrieve_document>( attributes_to_retrieve .iter() .map(|s| s.as_ref()) - .chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")), + .chain(retrieve_vectors.should_retrieve().then_some("_vectors")), ), None => document, }; diff --git a/crates/meilisearch/src/search/federated/perform.rs b/crates/meilisearch/src/search/federated/perform.rs index 5ad64d63c..0c5a94b4c 100644 --- a/crates/meilisearch/src/search/federated/perform.rs +++ b/crates/meilisearch/src/search/federated/perform.rs @@ -815,7 +815,8 @@ impl SearchByIndex { let (result, _semantic_hit_count) = super::super::search_from_kind(index_uid.to_string(), search_kind, search)?; - let format = AttributesFormat { + + let attributes_format = AttributesFormat { attributes_to_retrieve: query.attributes_to_retrieve, retrieve_vectors, attributes_to_highlight: query.attributes_to_highlight, @@ -846,12 +847,11 @@ impl SearchByIndex { let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref()); - let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); - let hit_maker = - HitMaker::new(&index, &rtxn, format, formatter_builder).map_err(|e| { - MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())) - })?; + HitMaker::new(matching_words, tokenizer, attributes_format, &index, &rtxn) + .map_err(|e| { + MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())) + })?; results_by_query.push(SearchResultByQuery { weight, diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 5e543c53f..8ea21cf32 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -1,4 +1,5 @@ use core::fmt; +use std::borrow::Cow; use std::cmp::min; use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::str::FromStr; @@ -28,11 +29,11 @@ use meilisearch_types::{milli, Document}; use milli::tokenizer::{Language, TokenizerBuilder}; use milli::{ AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, LocalizedAttributesRule, - MatchBounds, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, + MarkerOptions, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; use regex::Regex; use serde::{Deserialize, Serialize}; -use serde_json::{json, Value}; +use serde_json::{json, Map, Value}; #[cfg(test)] mod mod_test; use utoipa::ToSchema; @@ -47,7 +48,9 @@ pub use federated::{ mod ranking_rules; -type MatchesPosition = BTreeMap>; +// TODO: Adapt this type to support cropping +// { "_matchesPosition": { "overview": { first: false, highlighted: [[0,4,6,11,5,234,6,241,5]] } } } +// type MatchesPosition = BTreeMap>; pub const DEFAULT_SEARCH_OFFSET: fn() -> usize = || 0; pub const DEFAULT_SEARCH_LIMIT: fn() -> usize = || 20; @@ -810,11 +813,9 @@ pub struct SearchHit { #[serde(flatten)] #[schema(additional_properties, inline, value_type = HashMap)] pub document: Document, - #[serde(default, rename = "_formatted", skip_serializing_if = "Document::is_empty")] + #[serde(default, rename = "_formatted", skip_serializing_if = "Option::is_none")] #[schema(additional_properties, value_type = HashMap)] - pub formatted: Document, - #[serde(default, rename = "_matchesPosition", skip_serializing_if = "Option::is_none")] - pub matches_position: Option, + pub formatted: Option, #[serde(default, rename = "_rankingScore", skip_serializing_if = "Option::is_none")] pub ranking_score: Option, #[serde(default, rename = "_rankingScoreDetails", skip_serializing_if = "Option::is_none")] @@ -1291,6 +1292,7 @@ struct AttributesFormat { crop_marker: String, highlight_pre_tag: String, highlight_post_tag: String, + // TODO: Might want to rename this to signify that this will not yield _formatted anymore, only positions show_matches_position: bool, sort: Option>, show_ranking_score: bool, @@ -1298,7 +1300,7 @@ struct AttributesFormat { locales: Option>, } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy)] pub enum RetrieveVectors { /// Remove the `_vectors` field /// @@ -1318,6 +1320,10 @@ impl RetrieveVectors { Self::Hide } } + + pub fn should_retrieve(&self) -> bool { + matches!(self, Self::Retrieve) + } } struct HitMaker<'a> { @@ -1329,7 +1335,7 @@ struct HitMaker<'a> { retrieve_vectors: RetrieveVectors, to_retrieve_ids: BTreeSet, embedding_configs: Vec, - formatter_builder: MatcherBuilder<'a>, + matcher_builder: MatcherBuilder<'a>, formatted_options: BTreeMap, show_ranking_score: bool, show_ranking_score_details: bool, @@ -1357,24 +1363,20 @@ impl<'a> HitMaker<'a> { tokenizer_builder.into_tokenizer() } - pub fn formatter_builder( - matching_words: milli::MatchingWords, - tokenizer: milli::tokenizer::Tokenizer<'_>, - ) -> MatcherBuilder<'_> { - let formatter_builder = MatcherBuilder::new(matching_words, tokenizer); - - formatter_builder - } - pub fn new( + matching_words: milli::MatchingWords, + tokenizer: milli::tokenizer::Tokenizer<'a>, + attr_fmt: AttributesFormat, index: &'a Index, rtxn: &'a RoTxn<'a>, - format: AttributesFormat, - mut formatter_builder: MatcherBuilder<'a>, ) -> milli::Result { - formatter_builder.crop_marker(format.crop_marker); - formatter_builder.highlight_prefix(format.highlight_pre_tag); - formatter_builder.highlight_suffix(format.highlight_post_tag); + let AttributesFormat { highlight_pre_tag, highlight_post_tag, crop_marker, .. } = attr_fmt; + + let matcher_builder = MatcherBuilder::new( + matching_words, + tokenizer, + MarkerOptions { highlight_pre_tag, highlight_post_tag, crop_marker }, + ); let fields_ids_map = index.fields_ids_map(rtxn)?; let displayed_ids = index @@ -1392,21 +1394,21 @@ impl<'a> HitMaker<'a> { let displayed_names = index.displayed_fields(rtxn)?.unwrap(); !displayed_names.contains(&milli::constants::RESERVED_VECTORS_FIELD_NAME) } - // displayed_ids is a finit list, so hide if `_vectors` is not part of it + // displayed_ids is a finite list, so hide if `_vectors` is not part of it (Some(map), Some(vectors_fid)) => map.contains(&vectors_fid), }; let displayed_ids = displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); - let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors { + let retrieve_vectors = if let RetrieveVectors::Retrieve = attr_fmt.retrieve_vectors { if vectors_is_hidden { RetrieveVectors::Hide } else { RetrieveVectors::Retrieve } } else { - format.retrieve_vectors + attr_fmt.retrieve_vectors }; let fids = |attrs: &BTreeSet| { @@ -1423,7 +1425,7 @@ impl<'a> HitMaker<'a> { } ids }; - let to_retrieve_ids: BTreeSet<_> = format + let to_retrieve_ids: BTreeSet<_> = attr_fmt .attributes_to_retrieve .as_ref() .map(fids) @@ -1432,12 +1434,12 @@ impl<'a> HitMaker<'a> { .cloned() .collect(); - let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default(); - let attr_to_crop = format.attributes_to_crop.unwrap_or_default(); + let attr_to_highlight = attr_fmt.attributes_to_highlight.unwrap_or_default(); + let attr_to_crop = attr_fmt.attributes_to_crop.unwrap_or_default(); let formatted_options = compute_formatted_options( &attr_to_highlight, &attr_to_crop, - format.crop_length, + attr_fmt.crop_length, &to_retrieve_ids, &fields_ids_map, &displayed_ids, @@ -1454,51 +1456,53 @@ impl<'a> HitMaker<'a> { retrieve_vectors, to_retrieve_ids, embedding_configs, - formatter_builder, + matcher_builder, formatted_options, - show_ranking_score: format.show_ranking_score, - show_ranking_score_details: format.show_ranking_score_details, - show_matches_position: format.show_matches_position, - sort: format.sort, - locales: format.locales, + show_ranking_score: attr_fmt.show_ranking_score, + show_ranking_score_details: attr_fmt.show_ranking_score_details, + show_matches_position: attr_fmt.show_matches_position, + sort: attr_fmt.sort, + locales: attr_fmt.locales, }) } - pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result { - let (_, obkv) = - self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?; + fn make_document(&self, obkv: &obkv::KvReaderU16) -> milli::Result { + let mut document = serde_json::Map::new(); - // First generate a document with all the displayed fields - let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?; - - let add_vectors_fid = - self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve); - - // select the attributes to retrieve - let attributes_to_retrieve = self - .to_retrieve_ids - .iter() - // skip the vectors_fid if RetrieveVectors::Hide - .filter(|fid| match self.vectors_fid { - Some(vectors_fid) => { - !(self.retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid) + // recreate JSON with appropriate attributes + for (key, value) in obkv.iter() { + if self.vectors_fid.is_some_and(|vectors_fid| vectors_fid == key) { + // (vectors aren't considered in `displayedAttributes` and `attributesToRetrieve`, but rather with `retrieveVectors`) + if !self.retrieve_vectors.should_retrieve() { + continue; } - None => true, - }) - // need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve` - .chain(add_vectors_fid.iter()) - .map(|&fid| self.fields_ids_map.name(fid).expect("Missing field name")); + } else if !self.to_retrieve_ids.contains(&key) || !self.displayed_ids.contains(&key) { + // https://www.meilisearch.com/docs/reference/api/settings#displayed-attributes + // https://www.meilisearch.com/docs/reference/api/search#attributes-to-retrieve + continue; + } - let mut document = - permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); + let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; + let key = self.fields_ids_map.name(key).expect("Missing field name").to_string(); - if self.retrieve_vectors == RetrieveVectors::Retrieve { - // Clippy is wrong + document.insert(key, value); + } + + Ok(document) + } + + pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result { + let obkv = self.index.document(self.rtxn, id)?; + + let mut document = self.make_document(obkv)?; + + if self.retrieve_vectors.should_retrieve() { #[allow(clippy::manual_unwrap_or_default)] let mut vectors = match document.remove("_vectors") { Some(Value::Object(map)) => map, _ => Default::default(), }; + for (name, vector) in self.index.embeddings(self.rtxn, id)? { let user_provided = self .embedding_configs @@ -1507,6 +1511,7 @@ impl<'a> HitMaker<'a> { .is_some_and(|conf| conf.user_provided.contains(id)); let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; + vectors.insert( name, serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?, @@ -1518,10 +1523,10 @@ impl<'a> HitMaker<'a> { let localized_attributes = self.index.localized_attributes_rules(self.rtxn)?.unwrap_or_default(); - let (matches_position, formatted) = format_fields( - &displayed_document, + let formatted = format_fields( + &mut document, &self.fields_ids_map, - &self.formatter_builder, + &self.matcher_builder, &self.formatted_options, self.show_matches_position, &self.displayed_ids, @@ -1538,13 +1543,7 @@ impl<'a> HitMaker<'a> { let ranking_score_details = self.show_ranking_score_details.then(|| ScoreDetails::to_json_map(score.iter())); - let hit = SearchHit { - document, - formatted, - matches_position, - ranking_score_details, - ranking_score, - }; + let hit = SearchHit { document, formatted, ranking_score_details, ranking_score }; Ok(hit) } @@ -1553,7 +1552,7 @@ impl<'a> HitMaker<'a> { fn make_hits<'a>( index: &Index, rtxn: &RoTxn<'_>, - format: AttributesFormat, + attributes_format: AttributesFormat, matching_words: milli::MatchingWords, documents_ids_scores: impl Iterator)> + 'a, ) -> milli::Result> { @@ -1568,9 +1567,7 @@ fn make_hits<'a>( let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref()); - let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); - - let hit_maker = HitMaker::new(index, rtxn, format, formatter_builder)?; + let hit_maker = HitMaker::new(matching_words, tokenizer, attributes_format, index, rtxn)?; for (id, score) in documents_ids_scores { documents.push(hit_maker.make_hit(id, score)?); @@ -1886,59 +1883,100 @@ fn add_non_formatted_ids_to_formatted_options( } } -fn make_document( - displayed_attributes: &BTreeSet, - field_ids_map: &FieldsIdsMap, - obkv: &obkv::KvReaderU16, -) -> milli::Result { - let mut document = serde_json::Map::new(); - - // recreate the original json - for (key, value) in obkv.iter() { - let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; - let key = field_ids_map.name(key).expect("Missing field name").to_string(); - - document.insert(key, value); - } - - // select the attributes to retrieve - let displayed_attributes = displayed_attributes - .iter() - .map(|&fid| field_ids_map.name(fid).expect("Missing field name")); - - let document = permissive_json_pointer::select_values(&document, displayed_attributes); - Ok(document) -} - #[allow(clippy::too_many_arguments)] fn format_fields( - document: &Document, + document: &mut Document, field_ids_map: &FieldsIdsMap, - builder: &MatcherBuilder<'_>, + matcher_builder: &MatcherBuilder<'_>, formatted_options: &BTreeMap, - compute_matches: bool, + show_matches_position: bool, displayable_ids: &BTreeSet, locales: Option<&[Language]>, localized_attributes: &[LocalizedAttributesRule], -) -> milli::Result<(Option, Document)> { - let mut matches_position = compute_matches.then(BTreeMap::new); - let mut document = document.clone(); - +) -> milli::Result> { // reduce the formatted option list to the attributes that should be formatted, // instead of all the attributes to display. - let formatting_fields_options: Vec<_> = formatted_options + let formatting_fields_options = formatted_options .iter() .filter(|(_, option)| option.should_format()) .map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option)) - .collect(); + .collect::>(); // select the attributes to retrieve let displayable_names = displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name")); + + let get_format_options = |key: Cow<'_, str>| { + formatting_fields_options + .iter() + .filter(|(name, ..)| { + milli::is_faceted_by(name, &key) || milli::is_faceted_by(&key, name) + }) + .map(|(_, option)| **option) + .reduce(|acc, option| acc.merge(option)) + }; + + let get_locales = |key: Cow<'_, str>| { + // TODO: Should this be re computed every time? + // if no locales has been provided, we try to find the locales in the localized_attributes. + locales.or_else(|| { + localized_attributes + .iter() + .find(|rule| matches!(rule.match_str(&key), PatternMatch::Match)) + .map(LocalizedAttributesRule::locales) + }) + }; + + fn get_text(value: &mut Value) -> Option> { + match value { + Value::String(text) => Some(Cow::Borrowed(text)), + Value::Number(number) => Some(Cow::Owned(number.to_string())), + // boolean and null can not be matched by meili, can not be formatted + // and array or object cannot be yielded by `permissive_json_pointer::map_leaf_values` + _ => None, + } + } + + if show_matches_position { + permissive_json_pointer::map_leaf_values(document, displayable_names, |key, _, value| { + let Some(text) = get_text(value) else { + *value = Value::Object(Map::from_iter(std::iter::once(( + "value".to_string(), + value.take(), + )))); + + return; + }; + + let locales = get_locales(Cow::from(key)); + let mut matcher = matcher_builder.build(&text, locales); + let format_options = get_format_options(Cow::from(key)); + let match_bounds = matcher.get_match_bounds(format_options); + + let value_iter = std::iter::once(("value".to_string(), value.take())); + + // do not include `matches` in case there is nothing to format + let json_map = if let Some(mb) = match_bounds { + let matches_iter = std::iter::once(( + "matches".to_string(), + serde_json::to_value(mb).expect("TODO"), + )); + Map::from_iter(value_iter.chain(matches_iter)) + } else { + Map::from_iter(value_iter) + }; + + *value = Value::Object(json_map); + }); + + return Ok(None); + } + + let mut formatted_document = document.clone(); permissive_json_pointer::map_leaf_values( - &mut document, + &mut formatted_document, displayable_names, - |key, array_indices, value| { + |key, _, value| { // To get the formatting option of each key we need to see all the rules that applies // to the value and merge them together. eg. If a user said he wanted to highlight `doggo` // and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only @@ -1946,37 +1984,22 @@ fn format_fields( // Warn: The time to compute the format list scales with the number of fields to format; // cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity: // d*f where d is the total number of fields to display and f is the total number of fields to format. - let format = formatting_fields_options - .iter() - .filter(|(name, _option)| { - milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name) - }) - .map(|(_, option)| **option) - .reduce(|acc, option| acc.merge(option)); - let mut infos = Vec::new(); + let Some(text) = get_text(value) else { + return; + }; - // if no locales has been provided, we try to find the locales in the localized_attributes. - let locales = locales.or_else(|| { - localized_attributes - .iter() - .find(|rule| rule.match_str(key) == PatternMatch::Match) - .map(LocalizedAttributesRule::locales) - }); + let format_options = get_format_options(Cow::from(key)); - *value = format_value( - std::mem::take(value), - builder, - format, - &mut infos, - compute_matches, - array_indices, - locales, - ); + // there's nothing to format + if !format_options.is_some_and(|v| v.should_format()) { + return; + } - if let Some(matches) = matches_position.as_mut() { - if !infos.is_empty() { - matches.insert(key.to_owned(), infos); - } + let locales = get_locales(Cow::from(key)); + + let mut matcher = matcher_builder.build(&text, locales); + if let Some(formatted_text) = matcher.get_formatted_text(format_options) { + *value = Value::String(formatted_text); } }, ); @@ -1986,58 +2009,9 @@ fn format_fields( // This unwrap must be safe since we got the ids from the fields_ids_map just // before. .map(|&fid| field_ids_map.name(fid).unwrap()); - let document = permissive_json_pointer::select_values(&document, selectors); + let formatted_document = permissive_json_pointer::select_values(&formatted_document, selectors); - Ok((matches_position, document)) -} - -fn format_value( - value: Value, - builder: &MatcherBuilder<'_>, - format_options: Option, - infos: &mut Vec, - compute_matches: bool, - array_indices: &[usize], - locales: Option<&[Language]>, -) -> Value { - match value { - Value::String(old_string) => { - let mut matcher = builder.build(&old_string, locales); - if compute_matches { - let matches = matcher.matches(array_indices); - infos.extend_from_slice(&matches[..]); - } - - match format_options { - Some(format_options) => { - let value = matcher.format(format_options); - Value::String(value.into_owned()) - } - None => Value::String(old_string), - } - } - // `map_leaf_values` makes sure this is only called for leaf fields - Value::Array(_) => unreachable!(), - Value::Object(_) => unreachable!(), - Value::Number(number) => { - let s = number.to_string(); - - let mut matcher = builder.build(&s, locales); - if compute_matches { - let matches = matcher.matches(array_indices); - infos.extend_from_slice(&matches[..]); - } - - match format_options { - Some(format_options) => { - let value = matcher.format(format_options); - Value::String(value.into_owned()) - } - None => Value::String(s), - } - } - value => value, - } + Ok(Some(formatted_document)) } pub(crate) fn parse_filter( diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 504b4c68d..fbaf36fe8 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -80,8 +80,9 @@ pub use self::localized_attributes_rules::LocalizedAttributesRule; pub use self::search::facet::{FacetValueHit, SearchForFacetValues}; pub use self::search::similar::Similar; pub use self::search::{ - FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy, - Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, + FacetDistribution, Filter, FormatOptions, MarkerOptions, MatchBounds, MatcherBuilder, + MatchingWords, OrderBy, Search, SearchResult, SemanticSearch, TermsMatchingStrategy, + DEFAULT_VALUES_PER_FACET, }; pub use self::update::ChannelCongestion; diff --git a/crates/milli/src/search/mod.rs b/crates/milli/src/search/mod.rs index 62183afc3..2ddb2ddb9 100644 --- a/crates/milli/src/search/mod.rs +++ b/crates/milli/src/search/mod.rs @@ -7,7 +7,9 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET}; -pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords}; +pub use self::new::matches::{ + FormatOptions, MarkerOptions, MatchBounds, MatcherBuilder, MatchingWords, +}; use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats}; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::index::MatchingStrategy; @@ -278,7 +280,7 @@ impl<'a> Search<'a> { // consume context and located_query_terms to build MatchingWords. let matching_words = match located_query_terms { - Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms), + Some(located_query_terms) => MatchingWords::new(ctx, &located_query_terms), None => MatchingWords::default(), }; diff --git a/crates/milli/src/search/new/matches/adjust_indices.rs b/crates/milli/src/search/new/matches/adjust_indices.rs new file mode 100644 index 000000000..6c5df2ddf --- /dev/null +++ b/crates/milli/src/search/new/matches/adjust_indices.rs @@ -0,0 +1,222 @@ +use std::cmp::Ordering; + +use charabia::{SeparatorKind, Token}; + +#[derive(Clone)] +enum Direction { + Forwards, + Backwards, +} + +impl Direction { + fn switch(&mut self) { + *self = match self { + Direction::Backwards => Direction::Forwards, + Direction::Forwards => Direction::Backwards, + } + } +} + +fn get_adjusted_indices_for_too_few_words( + tokens: &[Token], + mut index_backward: usize, + mut index_forward: usize, + mut words_count: usize, + crop_size: usize, +) -> [usize; 2] { + let mut valid_index_backward = index_backward; + let mut valid_index_forward = index_forward; + + let mut is_end_reached = index_forward == tokens.len() - 1; + let mut is_beginning_reached = index_backward == 0; + + let mut is_index_backwards_at_hard_separator = false; + let mut is_index_forwards_at_hard_separator = false; + + let mut is_crop_size_or_both_ends_reached = + words_count == crop_size || (is_end_reached && is_beginning_reached); + + let mut dir = Direction::Forwards; + + loop { + if is_crop_size_or_both_ends_reached { + break; + } + + let (index, valid_index) = match dir { + Direction::Backwards => (&mut index_backward, &mut valid_index_backward), + Direction::Forwards => (&mut index_forward, &mut valid_index_forward), + }; + + loop { + match dir { + Direction::Forwards => { + if is_end_reached { + break; + } + + *index += 1; + + is_end_reached = *index == tokens.len() - 1; + } + Direction::Backwards => { + if is_beginning_reached + || (!is_end_reached + && is_index_backwards_at_hard_separator + && !is_index_forwards_at_hard_separator) + { + break; + } + + *index -= 1; + + is_beginning_reached = *index == 0; + } + }; + + if is_end_reached && is_beginning_reached { + is_crop_size_or_both_ends_reached = true; + } + + let maybe_is_token_hard_separator = tokens[*index] + .separator_kind() + .map(|sep_kind| matches!(sep_kind, SeparatorKind::Hard)); + + // it's not a separator + if maybe_is_token_hard_separator.is_none() { + *valid_index = *index; + words_count += 1; + + if words_count == crop_size { + is_crop_size_or_both_ends_reached = true; + } + + break; + } + + let is_index_at_hard_separator = match dir { + Direction::Backwards => &mut is_index_backwards_at_hard_separator, + Direction::Forwards => &mut is_index_forwards_at_hard_separator, + }; + *is_index_at_hard_separator = + maybe_is_token_hard_separator.is_some_and(|is_hard| is_hard); + } + + dir.switch(); + + // 1. if end is reached, we can only advance backwards + // 2. if forwards index reached a hard separator and backwards is currently hard, we can go backwards + } + + // keep advancing forward and backward to check if there's only separator tokens + // left until the end if so, then include those too in the index range + + let saved_index = valid_index_forward; + loop { + if valid_index_forward == tokens.len() - 1 { + break; + } + + valid_index_forward += 1; + + if !tokens[valid_index_forward].is_separator() { + valid_index_forward = saved_index; + break; + } + } + + let saved_index = valid_index_backward; + loop { + if valid_index_backward == 0 { + break; + } + + valid_index_backward -= 1; + + if !tokens[valid_index_backward].is_separator() { + valid_index_backward = saved_index; + break; + } + } + + [valid_index_backward, valid_index_forward] +} + +fn get_adjusted_index_forward_for_too_many_words( + tokens: &[Token], + index_backward: usize, + mut index_forward: usize, + mut words_count: usize, + crop_size: usize, +) -> usize { + loop { + if index_forward == index_backward { + return index_forward; + } + + index_forward -= 1; + + if tokens[index_forward].is_separator() { + continue; + } + + words_count -= 1; + + if words_count == crop_size { + break; + } + } + + index_forward +} + +pub fn get_adjusted_indices_for_highlights_and_crop_size( + tokens: &[Token], + index_backward: usize, + index_forward: usize, + words_count: usize, + crop_size: usize, +) -> [usize; 2] { + match words_count.cmp(&crop_size) { + Ordering::Equal | Ordering::Less => get_adjusted_indices_for_too_few_words( + tokens, + index_backward, + index_forward, + words_count, + crop_size, + ), + Ordering::Greater => [ + index_backward, + get_adjusted_index_forward_for_too_many_words( + tokens, + index_backward, + index_forward, + words_count, + crop_size, + ), + ], + } +} + +pub fn get_adjusted_index_forward_for_crop_size(tokens: &[Token], crop_size: usize) -> usize { + let mut words_count = 0; + let mut index = 0; + + while index != tokens.len() - 1 { + if !tokens[index].is_separator() { + words_count += 1; + + if words_count == crop_size { + break; + } + } + + index += 1; + } + + if index == tokens.len() - 1 { + return index; + } + + index + 1 +} diff --git a/crates/milli/src/search/new/matches/best_match_interval.rs b/crates/milli/src/search/new/matches/best_match_interval.rs deleted file mode 100644 index 1a8914e98..000000000 --- a/crates/milli/src/search/new/matches/best_match_interval.rs +++ /dev/null @@ -1,139 +0,0 @@ -use super::matching_words::WordId; -use super::{Match, MatchPosition}; - -struct MatchIntervalWithScore { - interval: [usize; 2], - score: [i16; 3], -} - -// count score for phrases -fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) { - let words_in_phrase_minus_one = (lwp - fwp) as i16; - // will always be ordered, so +1 for each space between words - *order_score += words_in_phrase_minus_one; - // distance will always be 1, so -1 for each space between words - *distance_score -= words_in_phrase_minus_one; -} - -/// Compute the score of a match interval: -/// 1) count unique matches -/// 2) calculate distance between matches -/// 3) count ordered matches -fn get_interval_score(matches: &[Match]) -> [i16; 3] { - let mut ids: Vec = Vec::with_capacity(matches.len()); - let mut order_score = 0; - let mut distance_score = 0; - - let mut iter = matches.iter().peekable(); - while let Some(m) = iter.next() { - if let Some(next_match) = iter.peek() { - // if matches are ordered - if next_match.ids.iter().min() > m.ids.iter().min() { - order_score += 1; - } - - let m_last_word_pos = match m.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => { - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - lwp - } - }; - let next_match_first_word_pos = next_match.get_first_word_pos(); - - // compute distance between matches - distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; - } else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position { - // in case last match is a phrase, count score for its words - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - } - - ids.extend(m.ids.iter()); - } - - ids.sort_unstable(); - ids.dedup(); - let uniq_score = ids.len() as i16; - - // rank by unique match count, then by distance between matches, then by ordered match count. - [uniq_score, distance_score, order_score] -} - -/// Returns the first and last match where the score computed by match_interval_score is the best. -pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] { - if matches.is_empty() { - panic!("`matches` should not be empty at this point"); - } - - // positions of the first and the last match of the best matches interval in `matches`. - let mut best_interval: Option = None; - - let mut save_best_interval = |interval_first, interval_last| { - let interval_score = get_interval_score(&matches[interval_first..=interval_last]); - let is_interval_score_better = &best_interval - .as_ref() - .is_none_or(|MatchIntervalWithScore { score, .. }| interval_score > *score); - - if *is_interval_score_better { - best_interval = Some(MatchIntervalWithScore { - interval: [interval_first, interval_last], - score: interval_score, - }); - } - }; - - // we compute the matches interval if we have at least 2 matches. - // current interval positions. - let mut interval_first = 0; - let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - - for (index, next_match) in matches.iter().enumerate() { - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. - let next_match_last_word_pos = next_match.get_last_word_pos(); - - // if the next match would mean that we pass the crop size window, - // we take the last valid match, that didn't pass this boundry, which is `index` - 1, - // and calculate a score for it, and check if it's better than our best so far - if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { - // if index is 0 there is no last viable match - if index != 0 { - let interval_last = index - 1; - // keep interval if it's the best - save_best_interval(interval_first, interval_last); - } - - // advance start of the interval while interval is longer than crop_size. - loop { - interval_first += 1; - if interval_first == matches.len() { - interval_first -= 1; - break; - } - - interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - - if interval_first_match_first_word_pos > next_match_last_word_pos - || next_match_last_word_pos - interval_first_match_first_word_pos < crop_size - { - break; - } - } - } - } - - // compute the last interval score and compare it to the best one. - let interval_last = matches.len() - 1; - // if it's the last match with itself, we need to make sure it's - // not a phrase longer than the crop window - if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { - save_best_interval(interval_first, interval_last); - } - - // if none of the matches fit the criteria above, default to the first one - best_interval.map_or( - [&matches[0], &matches[0]], - |MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]], - ) -} diff --git a/crates/milli/src/search/new/matches/best_match_range.rs b/crates/milli/src/search/new/matches/best_match_range.rs new file mode 100644 index 000000000..6c8857cdd --- /dev/null +++ b/crates/milli/src/search/new/matches/best_match_range.rs @@ -0,0 +1,169 @@ +use std::cell::Cell; + +use crate::search::new::matches::matching_words::QueryPosition; + +use super::r#match::{Match, MatchPosition}; + +struct MatchesIndexRangeWithScore { + matches_index_range: [usize; 2], + score: [i16; 3], +} + +/// Compute the score of a match interval: +/// 1) count unique matches +/// 2) calculate distance between matches +/// 3) count ordered matches +fn get_score( + matches: &[Match], + query_positions: &[QueryPosition], + index_first: usize, + index_last: usize, +) -> [i16; 3] { + let order_score = Cell::new(0); + let distance_score = Cell::new(0); + + let mut iter = (index_first..=index_last) + .filter_map(|index| { + query_positions.iter().find_map(move |v| (v.index == index).then(|| v.range[0])) + }) + .peekable(); + while let (Some(range_first), Some(next_range_first)) = (iter.next(), iter.peek()) { + if range_first < *next_range_first { + order_score.set(order_score.get() + 1); + } + } + + // count score for phrases + let tally_phrase_scores = |fwp, lwp| { + let words_in_phrase_minus_one = (lwp - fwp) as i16; + // will always be in the order of query, so +1 for each space between words + order_score.set(order_score.get() + words_in_phrase_minus_one); + // distance will always be 1, so -1 for each space between words + distance_score.set(distance_score.get() - words_in_phrase_minus_one); + }; + + let mut iter = matches[index_first..=index_last].iter().peekable(); + while let Some(r#match) = iter.next() { + if let Some(next_match) = iter.peek() { + let match_last_word_pos = match r#match.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } => { + tally_phrase_scores(fwp, lwp); + lwp + } + }; + let next_match_first_word_pos = next_match.get_first_word_pos(); + + // compute distance between matches + distance_score.set( + distance_score.get() + - (next_match_first_word_pos - match_last_word_pos).min(7) as i16, + ); + } else if let MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } = + r#match.position + { + // in case last match is a phrase, count score for its words + tally_phrase_scores(fwp, lwp); + } + } + + let mut uniqueness_score = 0i16; + let mut current_range: Option = None; + + for qp in query_positions.iter().filter(|v| v.index >= index_first && v.index <= index_last) { + match current_range.as_mut() { + Some([saved_range_start, saved_range_end]) => { + let [range_start, range_end] = qp.range; + + if range_start > *saved_range_start { + uniqueness_score += (*saved_range_end - *saved_range_start) as i16 + 1; + + *saved_range_start = range_start; + *saved_range_end = range_end; + } else if range_end > *saved_range_end { + *saved_range_end = range_end; + } + } + None => current_range = Some(qp.range), + } + } + + if let Some([saved_range_start, saved_range_end]) = current_range { + uniqueness_score += (saved_range_end - saved_range_start) as i16 + 1; + } + + // rank by unique match count, then by distance between matches, then by ordered match count. + [uniqueness_score, distance_score.into_inner(), order_score.into_inner()] +} + +/// Returns the first and last match where the score computed by match_interval_score is the best. +pub fn get_best_match_index_range( + matches: &[Match], + query_positions: &[QueryPosition], + crop_size: usize, +) -> [usize; 2] { + // positions of the first and the last match of the best matches index range in `matches`. + let mut best_matches_index_range: Option = None; + + let mut save_best_matches_index_range = |index_first, index_last| { + let score = get_score(matches, query_positions, index_first, index_last); + let is_score_better = best_matches_index_range.as_ref().is_none_or(|v| score > v.score); + + if is_score_better { + best_matches_index_range = Some(MatchesIndexRangeWithScore { + matches_index_range: [index_first, index_last], + score, + }); + } + }; + + // we compute the matches index range if we have at least 2 matches. + let mut index_first = 0; + let mut first_match_first_word_pos = matches[index_first].get_first_word_pos(); + + for (index, next_match) in matches.iter().enumerate() { + // if next match would make index range gross more than crop_size, + // we compare the current index range with the best one, + // then we increase `index_first` until next match can be added. + let next_match_last_word_pos = next_match.get_last_word_pos(); + + // if the next match would mean that we pass the crop size window, + // we take the last valid match, that didn't pass this boundry, which is `index` - 1, + // and calculate a score for it, and check if it's better than our best so far + if next_match_last_word_pos - first_match_first_word_pos + 1 > crop_size { + // if index is 0 there is no previous viable match + if index != 0 { + // keep index range if it's the best + save_best_matches_index_range(index_first, index - 1); + } + + // advance `index_first` while index range is longer than crop_size. + loop { + if index_first == matches.len() - 1 { + break; + } + + index_first += 1; + first_match_first_word_pos = matches[index_first].get_first_word_pos(); + + // also make sure that subtracting won't cause a panic + if next_match_last_word_pos < first_match_first_word_pos + || next_match_last_word_pos - first_match_first_word_pos + 1 < crop_size + { + break; + } + } + } + } + + // compute the last index range score and compare it to the best one. + let index_last = matches.len() - 1; + // if it's the last match with itself, we need to make sure it's + // not a phrase longer than the crop window + if index_first != index_last || matches[index_first].get_word_count() < crop_size { + save_best_matches_index_range(index_first, index_last); + } + + // if none of the matches fit the criteria above, default to the first one + best_matches_index_range.map_or([0, 0], |v| v.matches_index_range) +} diff --git a/crates/milli/src/search/new/matches/match.rs b/crates/milli/src/search/new/matches/match.rs index 2eef4d5a6..570ea2e8e 100644 --- a/crates/milli/src/search/new/matches/match.rs +++ b/crates/milli/src/search/new/matches/match.rs @@ -1,62 +1,49 @@ -use super::matching_words::WordId; - -#[derive(Clone, Debug)] +#[derive(Debug, PartialEq)] pub enum MatchPosition { - Word { - // position of the word in the whole text. - word_position: usize, - // position of the token in the whole text. - token_position: usize, - }, - Phrase { - // position of the first and last word in the phrase in the whole text. - word_positions: [usize; 2], - // position of the first and last token in the phrase in the whole text. - token_positions: [usize; 2], - }, + Word { word_position: usize, token_position: usize }, + Phrase { word_position_range: [usize; 2], token_position_range: [usize; 2] }, } -#[derive(Clone, Debug)] +#[derive(Debug, PartialEq)] pub struct Match { pub char_count: usize, - // ids of the query words that matches. - pub ids: Vec, + pub byte_len: usize, pub position: MatchPosition, } impl Match { - pub(super) fn get_first_word_pos(&self) -> usize { + pub fn get_first_word_pos(&self) -> usize { match self.position { MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp, + MatchPosition::Phrase { word_position_range: [fwp, _], .. } => fwp, } } - pub(super) fn get_last_word_pos(&self) -> usize { + pub fn get_last_word_pos(&self) -> usize { match self.position { MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp, + MatchPosition::Phrase { word_position_range: [_, lwp], .. } => lwp, } } - pub(super) fn get_first_token_pos(&self) -> usize { + pub fn get_first_token_pos(&self) -> usize { match self.position { MatchPosition::Word { token_position, .. } => token_position, - MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp, + MatchPosition::Phrase { token_position_range: [ftp, _], .. } => ftp, } } - pub(super) fn get_last_token_pos(&self) -> usize { + pub fn get_last_token_pos(&self) -> usize { match self.position { MatchPosition::Word { token_position, .. } => token_position, - MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp, + MatchPosition::Phrase { token_position_range: [_, ltp], .. } => ltp, } } - pub(super) fn get_word_count(&self) -> usize { + pub fn get_word_count(&self) -> usize { match self.position { MatchPosition::Word { .. } => 1, - MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1, + MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } => lwp - fwp + 1, } } } diff --git a/crates/milli/src/search/new/matches/match_bounds.rs b/crates/milli/src/search/new/matches/match_bounds.rs new file mode 100644 index 000000000..44f88b648 --- /dev/null +++ b/crates/milli/src/search/new/matches/match_bounds.rs @@ -0,0 +1,270 @@ +use std::cmp::{max, min}; + +use super::{ + matching_words::QueryPosition, + r#match::{Match, MatchPosition}, +}; + +use super::adjust_indices::{ + get_adjusted_index_forward_for_crop_size, get_adjusted_indices_for_highlights_and_crop_size, +}; +use charabia::Token; +use serde::Serialize; +use utoipa::ToSchema; + +use super::FormatOptions; + +// TODO: Differentiate if full match do not return None, instead return match bounds with full length +#[derive(Serialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct MatchBounds { + pub highlight_toggle: bool, + pub indices: Vec, +} + +struct MatchBoundsHelper<'a> { + tokens: &'a [Token<'a>], + matches: &'a [Match], + query_positions: &'a [QueryPosition], +} + +struct MatchesAndCropIndices { + matches_first_index: usize, + matches_last_index: usize, + crop_byte_start: usize, + crop_byte_end: usize, +} + +enum CropThing { + Last(usize), + First(usize), +} + +impl MatchBoundsHelper<'_> { + fn get_match_byte_position_range(&self, r#match: &Match) -> [usize; 2] { + let byte_start = match r#match.position { + MatchPosition::Word { token_position, .. } => self.tokens[token_position].byte_start, + MatchPosition::Phrase { token_position_range: [ftp, ..], .. } => { + self.tokens[ftp].byte_start + } + }; + + [byte_start, byte_start + r#match.byte_len] + } + + // TODO: Rename this + fn get_match_byte_position_rangee( + &self, + index: &mut usize, + crop_thing: CropThing, + ) -> [usize; 2] { + let new_index = match crop_thing { + CropThing::First(_) if *index != 0 => *index - 1, + CropThing::Last(_) if *index != self.matches.len() - 1 => *index + 1, + _ => { + return self.get_match_byte_position_range(&self.matches[*index]); + } + }; + + let [byte_start, byte_end] = self.get_match_byte_position_range(&self.matches[new_index]); + + // NOTE: This doesn't need additional checks, because `get_best_match_index_range` already + // guarantees that the next or preceding match contains the crop boundary + match crop_thing { + CropThing::First(crop_byte_start) if crop_byte_start < byte_end => { + *index -= 1; + [byte_start, byte_end] + } + CropThing::Last(crop_byte_end) if byte_start < crop_byte_end => { + *index += 1; + [byte_start, byte_end] + } + _ => self.get_match_byte_position_range(&self.matches[*index]), + } + } + + /// TODO: Description + fn get_match_bounds(&self, mci: MatchesAndCropIndices) -> MatchBounds { + let MatchesAndCropIndices { + mut matches_first_index, + mut matches_last_index, + crop_byte_start, + crop_byte_end, + } = mci; + + let [first_match_first_byte, first_match_last_byte] = self.get_match_byte_position_rangee( + &mut matches_first_index, + CropThing::First(crop_byte_start), + ); + let first_match_first_byte = max(first_match_first_byte, crop_byte_start); + + let [last_match_first_byte, last_match_last_byte] = + if matches_first_index != matches_last_index { + self.get_match_byte_position_rangee( + &mut matches_last_index, + CropThing::Last(crop_byte_end), + ) + } else { + [first_match_first_byte, first_match_last_byte] + }; + let last_match_last_byte = min(last_match_last_byte, crop_byte_end); + + let selected_matches_len = matches_last_index - matches_first_index + 1; + let mut indices_size = 2 * selected_matches_len; + + let crop_byte_start_is_not_first_match_start = crop_byte_start != first_match_first_byte; + let crop_byte_end_is_not_last_match_end = crop_byte_end != last_match_last_byte; + + if crop_byte_start_is_not_first_match_start { + indices_size += 1; + } + + if crop_byte_end_is_not_last_match_end { + indices_size += 1; + } + + let mut indices = Vec::with_capacity(indices_size); + + if crop_byte_start_is_not_first_match_start { + indices.push(crop_byte_start); + } + + indices.push(first_match_first_byte); + + if selected_matches_len > 1 { + indices.push(first_match_last_byte); + } + + if selected_matches_len > 2 { + for index in (matches_first_index + 1)..matches_last_index { + let [m_byte_start, m_byte_end] = + self.get_match_byte_position_range(&self.matches[index]); + + indices.push(m_byte_start); + indices.push(m_byte_end); + } + } + + if selected_matches_len > 1 { + indices.push(last_match_first_byte); + } + + indices.push(last_match_last_byte); + + if crop_byte_end_is_not_last_match_end { + indices.push(crop_byte_end); + } + + MatchBounds { highlight_toggle: !crop_byte_start_is_not_first_match_start, indices } + } + + /// For crop but no highlight. + fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> MatchBounds { + let final_token_index = get_adjusted_index_forward_for_crop_size(self.tokens, crop_size); + let final_token = &self.tokens[final_token_index]; + + // TODO: Why is it that when we match all of the tokens we need to get byte_end instead of start? + + // TODO: Can here be an error, because it's byte_start but it could be byte_end? + MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] } + } + + fn get_matches_and_crop_indices(&self, crop_size: usize) -> MatchesAndCropIndices { + let asd = |i1, i2| { + println!( + "{}|{}|{}\n{} {}", + self.tokens[..i1].iter().map(|v| v.lemma()).collect::>().join(""), + self.tokens[i1..i2].iter().map(|v| v.lemma()).collect::>().join(""), + self.tokens[i2..].iter().map(|v| v.lemma()).collect::>().join(""), + i1, + i2 + ); + }; + + // TODO: This doesn't give back 2 phrases if one is out of crop window + // Solution: also get next and previous matches, and if they're in the crop window, even if partially, highlight them + let [matches_first_index, matches_last_index] = + super::best_match_range::get_best_match_index_range( + self.matches, + self.query_positions, + crop_size, + ); + + let first_match = &self.matches[matches_first_index]; + let last_match = &self.matches[matches_last_index]; + + let last_match_last_word_pos = last_match.get_last_word_pos(); + let first_match_first_word_pos = first_match.get_first_word_pos(); + + let words_count = last_match_last_word_pos - first_match_first_word_pos + 1; + let [index_backward, index_forward] = get_adjusted_indices_for_highlights_and_crop_size( + self.tokens, + first_match.get_first_token_pos(), + last_match.get_last_token_pos(), + words_count, + crop_size, + ); + + asd(first_match.get_first_token_pos(), last_match.get_last_token_pos()); + asd(index_backward, index_forward); + + let backward_token = &self.tokens[index_backward]; + let forward_token = &self.tokens[index_forward]; + + MatchesAndCropIndices { + matches_first_index, + matches_last_index, + crop_byte_start: backward_token.byte_start, + crop_byte_end: forward_token.byte_end, + } + } + + /// TODO: description + fn get_crop_and_highlight_bounds_with_matches(&self, crop_size: usize) -> MatchBounds { + self.get_match_bounds(self.get_matches_and_crop_indices(crop_size)) + } + + /// For when there are no matches, but crop is required. + fn get_crop_bounds_with_matches(&self, crop_size: usize) -> MatchBounds { + let mci = self.get_matches_and_crop_indices(crop_size); + + MatchBounds { + highlight_toggle: false, + indices: vec![mci.crop_byte_start, mci.crop_byte_end], + } + } +} + +impl MatchBounds { + pub fn try_new( + tokens: &[Token], + matches: &[Match], + query_positions: &[QueryPosition], + format_options: FormatOptions, + ) -> Option { + let mbh = MatchBoundsHelper { tokens, matches, query_positions }; + + if let Some(crop_size) = format_options.crop.filter(|v| *v != 0) { + if matches.is_empty() { + return Some(mbh.get_crop_bounds_with_no_matches(crop_size)); + } + + if format_options.highlight { + return Some(mbh.get_crop_and_highlight_bounds_with_matches(crop_size)); + } + + return Some(mbh.get_crop_bounds_with_matches(crop_size)); + } + + if !format_options.highlight || matches.is_empty() { + return None; + } + + Some(mbh.get_match_bounds(MatchesAndCropIndices { + matches_first_index: 0, + matches_last_index: matches.len() - 1, + crop_byte_start: 0, + crop_byte_end: tokens[tokens.len() - 1].byte_end, + })) + } +} diff --git a/crates/milli/src/search/new/matches/matching_words.rs b/crates/milli/src/search/new/matches/matching_words.rs index 64235298b..3edc3eb38 100644 --- a/crates/milli/src/search/new/matches/matching_words.rs +++ b/crates/milli/src/search/new/matches/matching_words.rs @@ -1,24 +1,89 @@ use std::cmp::Reverse; -use std::fmt; -use std::ops::RangeInclusive; +use std::fmt::{Debug, Formatter, Result}; use charabia::Token; use super::super::interner::Interned; use super::super::query_term::LocatedQueryTerm; use super::super::{DedupInterner, Phrase}; +use super::r#match::{Match, MatchPosition}; use crate::SearchContext; -pub struct LocatedMatchingPhrase { - pub value: Interned, - pub positions: RangeInclusive, +enum PrefixedOrEquality { + Prefixed, + Equality, + NotApplicable, } -pub struct LocatedMatchingWords { - pub value: Vec>, - pub positions: RangeInclusive, - pub is_prefix: bool, - pub original_char_count: usize, +impl PrefixedOrEquality { + fn new(string: &str, other_string: &str, is_other_string_prefix: bool) -> Self { + if string.is_empty() { + return if other_string.is_empty() { Self::Equality } else { Self::NotApplicable }; + } + + let mut other_string_iter = other_string.chars(); + + for c in string.chars() { + let Some(other_c) = other_string_iter.next() else { + return if is_other_string_prefix { Self::Prefixed } else { Self::NotApplicable }; + }; + + if c != other_c { + return Self::NotApplicable; + } + } + + if other_string_iter.next().is_some() { + return Self::NotApplicable; + } + + Self::Equality + } +} + +// TODO: Consider using a tuple here, because indexing this thing out of bounds only incurs a runtime error +pub type UserQueryPositionRange = [u16; 2]; + +struct LocatedMatchingPhrase { + value: Interned, + position: UserQueryPositionRange, +} + +struct LocatedMatchingWords { + value: Vec>, + position: UserQueryPositionRange, + is_prefix: bool, + original_char_count: usize, +} + +struct TokenPositionHelper<'a> { + token: &'a Token<'a>, + position_by_word: usize, + position_by_token: usize, +} + +impl<'a> TokenPositionHelper<'a> { + fn iter_from_tokens(tokens: &'a [Token]) -> impl Iterator + Clone { + tokens + .iter() + .scan([0, 0], |[token_position, word_position], token| { + // TODO: Naming + let token_word_thingy = Self { + position_by_token: *token_position, + position_by_word: *word_position, + token, + }; + + *token_position += 1; + + if !token.is_separator() { + *word_position += 1; + } + + Some(token_word_thingy) + }) + .filter(|t| !t.token.is_separator()) + } } /// Structure created from a query tree @@ -27,180 +92,263 @@ pub struct LocatedMatchingWords { pub struct MatchingWords { word_interner: DedupInterner, phrase_interner: DedupInterner, - phrases: Vec, - words: Vec, + located_matching_phrases: Vec, + located_matching_words: Vec, +} + +#[cfg_attr(test, derive(Debug, PartialEq))] +pub struct QueryPosition { + pub range: UserQueryPositionRange, + pub index: usize, } impl MatchingWords { - pub fn new(ctx: SearchContext<'_>, located_terms: Vec) -> Self { - let mut phrases = Vec::new(); - let mut words = Vec::new(); + pub fn new(ctx: SearchContext, located_terms: &[LocatedQueryTerm]) -> Self { + let mut located_matching_phrases = Vec::new(); + let mut located_matching_words = Vec::new(); // Extract and centralize the different phrases and words to match stored in a QueryTerm // and wrap them in dedicated structures. - for located_term in located_terms { - let term = ctx.term_interner.get(located_term.value); + for LocatedQueryTerm { value, positions } in located_terms { + let term = ctx.term_interner.get(*value); let (matching_words, matching_phrases) = term.all_computed_derivations(); - for matching_phrase in matching_phrases { - phrases.push(LocatedMatchingPhrase { - value: matching_phrase, - positions: located_term.positions.clone(), - }); + let position = [*positions.start(), *positions.end()]; + + if !matching_phrases.is_empty() { + located_matching_phrases.reserve(matching_phrases.len()); + located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| { + LocatedMatchingPhrase { value: *matching_phrase, position } + })); } - words.push(LocatedMatchingWords { - value: matching_words, - positions: located_term.positions.clone(), - is_prefix: term.is_prefix(), - original_char_count: term.original_word(&ctx).chars().count(), - }); + if !matching_words.is_empty() { + located_matching_words.push(LocatedMatchingWords { + value: matching_words, + position, + is_prefix: term.is_prefix(), + original_char_count: term.original_word(&ctx).chars().count(), + }); + } } - // Sort word to put prefixes at the bottom prioritizing the exact matches. - words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len()))); + // Sort words by having `is_prefix` as false first and then by their lengths in reverse order. + // This is only meant to help with what we match a token against first. + located_matching_words.sort_unstable_by_key(|lmw| { + (lmw.is_prefix, Reverse(lmw.position[1] - lmw.position[0])) + }); Self { - phrases, - words, + located_matching_phrases, + located_matching_words, word_interner: ctx.word_interner, phrase_interner: ctx.phrase_interner, } } - /// Returns an iterator over terms that match or partially match the given token. - pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> { - MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token } + fn try_get_phrase_match<'a>( + &self, + token_position_helper_iter: &mut (impl Iterator> + Clone), + ) -> Option<(Match, UserQueryPositionRange)> { + let mut mapped_phrase_iter = self.located_matching_phrases.iter().map(|lmp| { + let words = &self.phrase_interner.get(lmp.value).words; + + let words_iter = words + .iter() + .map(|maybe_word| maybe_word.map(|word| self.word_interner.get(word).as_str())) + .peekable(); + + (lmp.position, words_iter) + }); + + 'outer: loop { + let (query_position_range, mut words_iter) = mapped_phrase_iter.next()?; + + // TODO: if it's worth it, clone only if we have to + let mut tph_iter = token_position_helper_iter.clone(); + + let mut first_tph_details = None; + let last_tph_details = loop { + // 1. get word from `words_iter` and token word thingy from `token_word_thingy_iter` + let (Some(word), Some(tph)) = (words_iter.next(), tph_iter.next()) else { + // 2. if there are no more words or token word thingys, get to next phrase and reset `token_word_thingy_iter` + continue 'outer; + }; + + // ?. save first token position bla bla bla + if first_tph_details.is_none() { + first_tph_details = Some([ + tph.position_by_token, + tph.position_by_word, + tph.token.char_start, + tph.token.byte_start, + ]); + } + + // 3. check if word matches our token + let is_matching = match word { + Some(word) => tph.token.lemma() == word, + // a `None` value in the phrase words iterator corresponds to a stop word, + // the value is considered a match if the current token is categorized as a stop word. + None => tph.token.is_stopword(), + }; + + // 4. if it does not, get to next phrase and restart `token_word_thingy_iter` + if !is_matching { + continue 'outer; + } + + // 5. if it does, and there are no words left, time to return + if words_iter.peek().is_none() { + break [ + tph.position_by_token, + tph.position_by_word, + tph.token.char_end, + tph.token.byte_end, + ]; + } + }; + + let [first_tph_position_by_token, first_tph_position_by_word, first_tph_char_start, first_tph_byte_start] = + first_tph_details.expect("TODO"); + let [last_tph_position_by_token, last_tph_position_by_word, last_tph_char_end, last_tph_byte_end] = + last_tph_details; + + // save new position in parameter iterator + *token_position_helper_iter = tph_iter; + + return Some(( + Match { + // do not +1, because Token index ranges are exclusive + byte_len: last_tph_byte_end - first_tph_byte_start, + char_count: last_tph_char_end - first_tph_char_start, + position: MatchPosition::Phrase { + word_position_range: [ + first_tph_position_by_word, + last_tph_position_by_word, + ], + token_position_range: [ + first_tph_position_by_token, + last_tph_position_by_token, + ], + }, + }, + query_position_range, + )); + } } /// Try to match the token with one of the located_words. - fn match_unique_words<'a>(&'a self, token: &Token<'_>) -> Option> { - for located_words in &self.words { - for word in &located_words.value { - let word = self.word_interner.get(*word); - // if the word is a prefix we match using starts_with. - if located_words.is_prefix && token.lemma().starts_with(word) { - let Some((char_index, c)) = - word.char_indices().take(located_words.original_char_count).last() - else { - continue; + fn try_get_word_match( + &self, + tph: TokenPositionHelper, + text: &str, + ) -> Option<(Match, UserQueryPositionRange)> { + // TODO: There is potentially an optimization to be made here + // if we matched a term then we can skip checking it for further iterations? + + println!( + "{:?}", + self.located_matching_words + .iter() + .flat_map(|lw| lw.value.iter().map(move |w| ( + lw.is_prefix, + lw.original_char_count, + self.word_interner.get(*w) + ))) + .collect::>() + ); + + self.located_matching_words + .iter() + .flat_map(|lw| lw.value.iter().map(move |w| (lw, self.word_interner.get(*w)))) + .find_map(|(located_words, word)| { + let [char_count, byte_len] = + match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix) + { + PrefixedOrEquality::Prefixed => { + let prefix_byte_len = text[tph.token.byte_start..] + .char_indices() + .nth(located_words.original_char_count - 1) + .map(|(i, c)| i + c.len_utf8()) + .expect("expected text to have n-th thing bal bla TODO"); + + // TODO: Investigate token original byte length and similar methods and why they're not good enough + // That might be because token original byte length only or could also refer to the normalized byte length + + [located_words.original_char_count, prefix_byte_len] + } + // do not +1, because Token index ranges are exclusive + PrefixedOrEquality::Equality => [ + tph.token.char_end - tph.token.char_start, + tph.token.byte_end - tph.token.byte_start, + ], + _ => return None, }; - let prefix_length = char_index + c.len_utf8(); - let (char_count, byte_len) = token.original_lengths(prefix_length); - let ids = &located_words.positions; - return Some(MatchType::Full { ids, char_count, byte_len }); - // else we exact match the token. - } else if token.lemma() == word { - let ids = &located_words.positions; - return Some(MatchType::Full { - char_count: token.char_end - token.char_start, - byte_len: token.byte_end - token.byte_start, - ids, - }); - } - } - } - None - } -} - -/// Iterator over terms that match the given token, -/// This allow to lazily evaluate matches. -pub struct MatchesIter<'a, 'b> { - matching_words: &'a MatchingWords, - phrases: Box + 'a>, - token: &'b Token<'b>, -} - -impl<'a> Iterator for MatchesIter<'a, '_> { - type Item = MatchType<'a>; - - fn next(&mut self) -> Option { - match self.phrases.next() { - // Try to match all the phrases first. - Some(located_phrase) => { - let phrase = self.matching_words.phrase_interner.get(located_phrase.value); - - // create a PartialMatch struct to make it compute the first match - // instead of duplicating the code. - let ids = &located_phrase.positions; - // collect the references of words from the interner. - let words = phrase - .words - .iter() - .map(|word| { - word.map(|word| self.matching_words.word_interner.get(word).as_str()) - }) - .collect(); - let partial = PartialMatch { matching_words: words, ids }; - - partial.match_token(self.token).or_else(|| self.next()) - } - // If no phrases matches, try to match uiques words. - None => self.matching_words.match_unique_words(self.token), - } - } -} - -/// Id of a matching term corespounding to a word written by the end user. -pub type WordId = u16; - -/// A given token can partially match a query word for several reasons: -/// - split words -/// - multi-word synonyms -/// In these cases we need to match consecutively several tokens to consider that the match is full. -#[derive(Debug, PartialEq)] -pub enum MatchType<'a> { - Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive }, - Partial(PartialMatch<'a>), -} - -/// Structure helper to match several tokens in a row in order to complete a partial match. -#[derive(Debug, PartialEq)] -pub struct PartialMatch<'a> { - matching_words: Vec>, - ids: &'a RangeInclusive, -} - -impl<'a> PartialMatch<'a> { - /// Returns: - /// - None if the given token breaks the partial match - /// - Partial if the given token matches the partial match but doesn't complete it - /// - Full if the given token completes the partial match - pub fn match_token(self, token: &Token<'_>) -> Option> { - let Self { mut matching_words, ids, .. } = self; - - let is_matching = match matching_words.first()? { - Some(word) => &token.lemma() == word, - // a None value in the phrase corresponds to a stop word, - // the walue is considered a match if the current token is categorized as a stop word. - None => token.is_stopword(), - }; - - // if there are remaining words to match in the phrase and the current token is matching, - // return a new Partial match allowing the highlighter to continue. - if is_matching && matching_words.len() > 1 { - matching_words.remove(0); - Some(MatchType::Partial(Self { matching_words, ids })) - // if there is no remaining word to match in the phrase and the current token is matching, - // return a Full match. - } else if is_matching { - Some(MatchType::Full { - char_count: token.char_end - token.char_start, - byte_len: token.byte_end - token.byte_start, - ids, + Some(( + Match { + char_count, + byte_len, + position: MatchPosition::Word { + word_position: tph.position_by_word, + token_position: tph.position_by_token, + }, + }, + located_words.position, + )) }) - // if the current token doesn't match, return None to break the match sequence. - } else { - None + } + + pub fn get_matches_and_query_positions( + &self, + tokens: &[Token], + text: &str, + ) -> (Vec, Vec) { + // TODO: Note in the doc that with the help of this iter, matches are guaranteed to be ordered + let mut token_position_helper_iter = TokenPositionHelper::iter_from_tokens(tokens); + let mut matches = Vec::new(); + let mut query_positions = Vec::new(); + + loop { + // try and get a phrase match + if let Some((r#match, range)) = + self.try_get_phrase_match(&mut token_position_helper_iter) + { + matches.push(r#match); + query_positions.push(QueryPosition { range, index: matches.len() - 1 }); + + continue; + } + + // if the above fails, try get next token position helper + if let Some(tph) = token_position_helper_iter.next() { + // and then try and get a word match + if let Some((r#match, range)) = self.try_get_word_match(tph, text) { + matches.push(r#match); + query_positions.push(QueryPosition { range, index: matches.len() - 1 }); + } + } else { + // there are no more items in the iterator, we are done searching for matches + break; + }; } + + // TODO: Explain why + query_positions.sort_unstable_by_key(|v| v.range[0]); + + (matches, query_positions) } } -impl fmt::Debug for MatchingWords { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let MatchingWords { word_interner, phrase_interner, phrases, words } = self; +impl Debug for MatchingWords { + fn fmt(&self, f: &mut Formatter<'_>) -> Result { + let MatchingWords { + word_interner, + phrase_interner, + located_matching_phrases: phrases, + located_matching_words: words, + } = self; let phrases: Vec<_> = phrases .iter() @@ -213,37 +361,33 @@ impl fmt::Debug for MatchingWords { .map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w))) .collect::>() .join(" "), - p.positions.clone(), + p.position, ) }) .collect(); - let words: Vec<_> = words .iter() .flat_map(|w| { w.value .iter() - .map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix)) + .map(|s| (word_interner.get(*s), w.position, w.is_prefix)) .collect::>() }) .collect(); - f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish() } } #[cfg(test)] -pub(crate) mod tests { - use std::borrow::Cow; - - use charabia::{TokenKind, TokenizerBuilder}; - +mod tests { use super::super::super::located_query_terms_from_tokens; use super::*; use crate::index::tests::TempIndex; use crate::search::new::query_term::ExtractedTokens; + use charabia::{TokenKind, TokenizerBuilder}; + use std::borrow::Cow; - pub(crate) fn temp_index_with_documents() -> TempIndex { + fn temp_index_with_documents() -> TempIndex { let temp_index = TempIndex::new(); temp_index .add_documents(documents!([ @@ -262,70 +406,77 @@ pub(crate) mod tests { let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap(); let mut builder = TokenizerBuilder::default(); let tokenizer = builder.build(); - let tokens = tokenizer.tokenize("split this world"); + let text = "split this world"; + let tokens = tokenizer.tokenize(text); let ExtractedTokens { query_terms, .. } = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); - let matching_words = MatchingWords::new(ctx, query_terms); + let matching_words = MatchingWords::new(ctx, &query_terms); assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("split"), - char_end: "split".chars().count(), - byte_end: "split".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("nyc"), - char_end: "nyc".chars().count(), - byte_end: "nyc".len(), - ..Default::default() - }) - .next(), - None - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("world"), - char_end: "world".chars().count(), - byte_end: "world".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("worlded"), - char_end: "worlded".chars().count(), - byte_end: "worlded".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("thisnew"), - char_end: "thisnew".chars().count(), - byte_end: "thisnew".len(), - ..Default::default() - }) - .next(), - None + matching_words.get_matches_and_query_positions( + &[ + Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("split"), + char_end: "split".chars().count(), + byte_end: "split".len(), + ..Default::default() + }, + Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("nyc"), + char_end: "nyc".chars().count(), + byte_end: "nyc".len(), + ..Default::default() + }, + Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("world"), + char_end: "world".chars().count(), + byte_end: "world".len(), + ..Default::default() + }, + Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("worlded"), + char_end: "worlded".chars().count(), + byte_end: "worlded".len(), + ..Default::default() + }, + Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("thisnew"), + char_end: "thisnew".chars().count(), + byte_end: "thisnew".len(), + ..Default::default() + } + ], + text + ), + ( + vec![ + Match { + char_count: 5, + byte_len: 5, + position: MatchPosition::Word { word_position: 0, token_position: 0 } + }, + Match { + char_count: 5, + byte_len: 5, + position: MatchPosition::Word { word_position: 2, token_position: 2 } + }, + Match { + char_count: 5, + byte_len: 5, + position: MatchPosition::Word { word_position: 3, token_position: 3 } + } + ], + vec![ + QueryPosition { range: [0, 0], index: 0 }, + QueryPosition { range: [2, 2], index: 1 }, + QueryPosition { range: [2, 2], index: 2 } + ] + ) ); } } diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index 2d6f2cf17..f47582af7 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -1,92 +1,54 @@ -mod best_match_interval; +mod adjust_indices; +mod best_match_range; mod r#match; +mod match_bounds; mod matching_words; -mod simple_token_kind; -use std::borrow::Cow; -use std::cmp::{max, min}; - -use charabia::{Language, SeparatorKind, Token, Tokenizer}; -use either::Either; -use itertools::Itertools; +use charabia::{Language, Token, Tokenizer}; +pub use match_bounds::MatchBounds; pub use matching_words::MatchingWords; -use matching_words::{MatchType, PartialMatch}; -use r#match::{Match, MatchPosition}; -use serde::{Deserialize, Serialize}; -use simple_token_kind::SimpleTokenKind; -use utoipa::ToSchema; +use matching_words::QueryPosition; +use r#match::Match; -const DEFAULT_CROP_MARKER: &str = "…"; -const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; -const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; - -/// Structure used to build a Matcher allowing to customize formatting tags. -pub struct MatcherBuilder<'m> { - matching_words: MatchingWords, - tokenizer: Tokenizer<'m>, - crop_marker: Option, - highlight_prefix: Option, - highlight_suffix: Option, +pub struct MarkerOptions { + pub highlight_pre_tag: String, + pub highlight_post_tag: String, + pub crop_marker: String, } -impl<'m> MatcherBuilder<'m> { - pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self { - Self { - matching_words, - tokenizer, - crop_marker: None, - highlight_prefix: None, - highlight_suffix: None, - } - } +/// Structure used to build a Matcher allowing to customize formatting tags. +pub struct MatcherBuilder<'a> { + matching_words: MatchingWords, + tokenizer: Tokenizer<'a>, + marker_options: MarkerOptions, +} - pub fn crop_marker(&mut self, marker: String) -> &Self { - self.crop_marker = Some(marker); - self - } - - pub fn highlight_prefix(&mut self, prefix: String) -> &Self { - self.highlight_prefix = Some(prefix); - self - } - - pub fn highlight_suffix(&mut self, suffix: String) -> &Self { - self.highlight_suffix = Some(suffix); - self +impl<'a> MatcherBuilder<'a> { + pub fn new( + matching_words: MatchingWords, + tokenizer: Tokenizer<'a>, + marker_options: MarkerOptions, + ) -> Self { + Self { matching_words, tokenizer, marker_options } } pub fn build<'t, 'lang>( &self, text: &'t str, locales: Option<&'lang [Language]>, - ) -> Matcher<'t, 'm, '_, 'lang> { - let crop_marker = match &self.crop_marker { - Some(marker) => marker.as_str(), - None => DEFAULT_CROP_MARKER, - }; - - let highlight_prefix = match &self.highlight_prefix { - Some(marker) => marker.as_str(), - None => DEFAULT_HIGHLIGHT_PREFIX, - }; - let highlight_suffix = match &self.highlight_suffix { - Some(marker) => marker.as_str(), - None => DEFAULT_HIGHLIGHT_SUFFIX, - }; + ) -> Matcher<'t, 'a, '_, 'lang> { Matcher { text, matching_words: &self.matching_words, tokenizer: &self.tokenizer, - crop_marker, - highlight_prefix, - highlight_suffix, - matches: None, + marker_options: &self.marker_options, + tokens_matches_and_query_positions: None, locales, } } } -#[derive(Copy, Clone, Default, Debug)] +#[derive(Copy, Clone, Default)] pub struct FormatOptions { pub highlight: bool, pub crop: Option, @@ -102,14 +64,6 @@ impl FormatOptions { } } -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, ToSchema)] -pub struct MatchBounds { - pub start: usize, - pub length: usize, - #[serde(skip_serializing_if = "Option::is_none", default)] - pub indices: Option>, -} - /// Structure used to analyze a string, compute words that match, /// and format the source string, returning a highlighted and cropped sub-string. pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { @@ -117,388 +71,88 @@ pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { matching_words: &'b MatchingWords, tokenizer: &'b Tokenizer<'tokenizer>, locales: Option<&'lang [Language]>, - crop_marker: &'b str, - highlight_prefix: &'b str, - highlight_suffix: &'b str, - matches: Option<(Vec>, Vec)>, + marker_options: &'b MarkerOptions, + tokens_matches_and_query_positions: Option<((Vec, Vec), Vec>)>, } -impl<'t> Matcher<'t, '_, '_, '_> { - /// Iterates over tokens and save any of them that matches the query. - fn compute_matches(&mut self) -> &mut Self { - /// some words are counted as matches only if they are close together and in the good order, - /// compute_partial_match peek into next words to validate if the match is complete. - fn compute_partial_match<'a>( - mut partial: PartialMatch<'a>, - first_token_position: usize, - first_word_position: usize, - first_word_char_start: &usize, - words_positions: &mut impl Iterator)>, - matches: &mut Vec, - ) -> bool { - for (token_position, word_position, word) in words_positions { - partial = match partial.match_token(word) { - // token matches the partial match, but the match is not full, - // we temporarily save the current token then we try to match the next one. - Some(MatchType::Partial(partial)) => partial, - // partial match is now full, we keep this matches and we advance positions - Some(MatchType::Full { ids, .. }) => { - // save the token that closes the partial match as a match. - matches.push(Match { - char_count: word.char_end - *first_word_char_start, - ids: ids.clone().collect(), - position: MatchPosition::Phrase { - word_positions: [first_word_position, word_position], - token_positions: [first_token_position, token_position], - }, - }); - - // the match is complete, we return true. - return true; - } - // no match, continue to next match. - None => break, - }; - } - - // the match is not complete, we return false. - false +impl Matcher<'_, '_, '_, '_> { + /// TODO: description + pub fn get_match_bounds( + &mut self, + // TODO: Add option to count UTF-16 segments, or whatever JS works with when slicing strings + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#utf-16_characters_unicode_code_points_and_grapheme_clusters + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/slice + format_options: Option, + ) -> Option { + if self.text.is_empty() { + return None; } - let tokens: Vec<_> = - self.tokenizer.tokenize_with_allow_list(self.text, self.locales).collect(); - let mut matches = Vec::new(); + let ((matches, query_positions), tokens) = + self.tokens_matches_and_query_positions.get_or_insert_with(|| { + let tokens = self + .tokenizer + .tokenize_with_allow_list(self.text, self.locales) + .collect::>(); - let mut words_positions = tokens - .iter() - .scan((0, 0), |(token_position, word_position), token| { - let current_token_position = *token_position; - let current_word_position = *word_position; - *token_position += 1; - if !token.is_separator() { - *word_position += 1; - } + (self.matching_words.get_matches_and_query_positions(&tokens, self.text), tokens) + }); - Some((current_token_position, current_word_position, token)) - }) - .filter(|(_, _, token)| !token.is_separator()); - - while let Some((token_position, word_position, word)) = words_positions.next() { - for match_type in self.matching_words.match_token(word) { - match match_type { - // we match, we save the current token as a match, - // then we continue the rest of the tokens. - MatchType::Full { ids, char_count, .. } => { - let ids: Vec<_> = ids.clone().collect(); - matches.push(Match { - char_count, - ids, - position: MatchPosition::Word { word_position, token_position }, - }); - break; - } - // we match partially, iterate over next tokens to check if we can complete the match. - MatchType::Partial(partial) => { - // if match is completed, we break the matching loop over the current token, - // then we continue the rest of the tokens. - let mut wp = words_positions.clone(); - if compute_partial_match( - partial, - token_position, - word_position, - &word.char_start, - &mut wp, - &mut matches, - ) { - words_positions = wp; - break; - } - } - } - } - } - - self.matches = Some((tokens, matches)); - self + MatchBounds::try_new(tokens, matches, query_positions, format_options.unwrap_or_default()) } - /// Returns boundaries of the words that match the query. - pub fn matches(&mut self, array_indices: &[usize]) -> Vec { - match &self.matches { - None => self.compute_matches().matches(array_indices), - Some((tokens, matches)) => matches - .iter() - .map(|m| MatchBounds { - start: tokens[m.get_first_token_pos()].byte_start, - length: self.calc_byte_length(tokens, m), - indices: if array_indices.is_empty() { - None - } else { - Some(array_indices.to_owned()) - }, - }) - .collect(), + pub fn get_formatted_text(&mut self, format_options: Option) -> Option { + let MatchBounds { mut highlight_toggle, ref indices } = + self.get_match_bounds(format_options)?; + + let MarkerOptions { highlight_pre_tag, highlight_post_tag, crop_marker } = + &self.marker_options; + + let mut formatted_text = Vec::new(); + + let mut indices_iter = indices.iter(); + let mut previous_index = indices_iter.next().expect("TODO"); + + // push crop marker if it's not the start of the text + if !crop_marker.is_empty() && *previous_index != 0 { + formatted_text.push(crop_marker.as_str()); } - } - fn calc_byte_length(&self, tokens: &[Token<'t>], m: &Match) -> usize { - (m.get_first_token_pos()..=m.get_last_token_pos()) - .flat_map(|i| match &tokens[i].char_map { - Some(char_map) => { - char_map.iter().map(|(original, _)| *original as usize).collect_vec() - } - None => tokens[i].lemma().chars().map(|c| c.len_utf8()).collect_vec(), - }) - .take(m.char_count) - .sum() - } - - /// Returns the bounds in byte index of the crop window. - fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] { - let ( - mut remaining_words, - is_iterating_forward, - before_tokens_starting_index, - after_tokens_starting_index, - ) = if !matches.is_empty() { - let [matches_first, matches_last] = - best_match_interval::find_best_match_interval(matches, crop_size); - - let matches_size = - matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1; - - let is_crop_size_gte_match_size = crop_size >= matches_size; - let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size; - - let remaining_words = if is_crop_size_gte_match_size { - crop_size - matches_size - } else { - // in case matches size is greater than crop size, which implies there's only one match, - // we count words backwards, because we have to remove words, as they're extra words outside of - // crop window - matches_size - crop_size - }; - - let after_tokens_starting_index = if matches_size == 0 { - 0 - } else { - let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1; - if last_match_last_token_position_plus_one < tokens.len() { - last_match_last_token_position_plus_one - } else { - // we have matched the end of possible tokens, there's nothing to advance - tokens.len() - } - }; - - ( - remaining_words, - is_iterating_forward, - if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 }, - after_tokens_starting_index, - ) - } else { - (crop_size, true, 0, 0) - }; - - // create the initial state of the crop window: 2 iterators starting from the matches positions, - // a reverse iterator starting from the first match token position and going towards the beginning of the text, - let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable(); - // an iterator ... - let mut after_tokens = if is_iterating_forward { - // ... starting from the last match token position and going towards the end of the text. - Either::Left(tokens[after_tokens_starting_index..].iter().peekable()) - } else { - // ... starting from the last match token position and going towards the start of the text. - Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable()) - }; - - // grows the crop window peeking in both directions - // until the window contains the good number of words: - while remaining_words > 0 { - let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new); - let after_token_kind = - after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new); - - match (before_token_kind, after_token_kind) { - // we can expand both sides. - (Some(before_token_kind), Some(after_token_kind)) => { - match (before_token_kind, after_token_kind) { - // if they are both separators and are the same kind then advance both, - // or expand in the soft separator separator side. - ( - SimpleTokenKind::Separator(before_token_separator_kind), - SimpleTokenKind::Separator(after_token_separator_kind), - ) => { - if before_token_separator_kind == after_token_separator_kind { - before_tokens.next(); - - // this avoid having an ending separator before crop marker. - if remaining_words > 1 { - after_tokens.next(); - } - } else if matches!(before_token_separator_kind, SeparatorKind::Hard) { - after_tokens.next(); - } else { - before_tokens.next(); - } - } - // if one of the tokens is a word, we expend in the side of the word. - // left is a word, advance left. - (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => { - before_tokens.next(); - remaining_words -= 1; - } - // right is a word, advance right. - (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => { - after_tokens.next(); - remaining_words -= 1; - } - // both are words, advance left then right if remaining_word > 0. - (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => { - before_tokens.next(); - remaining_words -= 1; - - if remaining_words > 0 { - after_tokens.next(); - remaining_words -= 1; - } - } - } - } - // the end of the text is reached, advance left. - (Some(before_token_kind), None) => { - before_tokens.next(); - if matches!(before_token_kind, SimpleTokenKind::NotSeparator) { - remaining_words -= 1; - } - } - // the start of the text is reached, advance right. - (None, Some(after_token_kind)) => { - after_tokens.next(); - if matches!(after_token_kind, SimpleTokenKind::NotSeparator) { - remaining_words -= 1; - } - } - // no more token to add. - (None, None) => break, + for index in indices_iter { + if highlight_toggle { + formatted_text.push(highlight_pre_tag.as_str()); } - } - // finally, keep the byte index of each bound of the crop window. - let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); - let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); + formatted_text.push(&self.text[*previous_index..*index]); - [crop_byte_start, crop_byte_end] - } - - // Returns the formatted version of the original text. - pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { - if !format_options.highlight && format_options.crop.is_none() { - // compute matches is not needed if no highlight nor crop is requested. - Cow::Borrowed(self.text) - } else { - match &self.matches { - Some((tokens, matches)) => { - // If the text has to be cropped, crop around the best interval. - let [crop_byte_start, crop_byte_end] = match format_options.crop { - Some(crop_size) if crop_size > 0 => { - self.crop_bounds(tokens, matches, crop_size) - } - _ => [0, self.text.len()], - }; - - let mut formatted = Vec::new(); - - // push crop marker if it's not the start of the text. - if crop_byte_start > 0 && !self.crop_marker.is_empty() { - formatted.push(self.crop_marker); - } - - let mut byte_index = crop_byte_start; - - if format_options.highlight { - // insert highlight markers around matches. - for m in matches { - let [m_byte_start, m_byte_end] = match m.position { - MatchPosition::Word { token_position, .. } => { - let token = &tokens[token_position]; - [&token.byte_start, &token.byte_end] - } - MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => { - [&tokens[ftp].byte_start, &tokens[ltp].byte_end] - } - }; - - // skip matches out of the crop window - if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end { - continue; - } - - // adjust start and end to the crop window size - let [m_byte_start, m_byte_end] = [ - max(m_byte_start, &crop_byte_start), - min(m_byte_end, &crop_byte_end), - ]; - - // push text that is positioned before our matches - if byte_index < *m_byte_start { - formatted.push(&self.text[byte_index..*m_byte_start]); - } - - formatted.push(self.highlight_prefix); - - // TODO: This is additional work done, charabia::token::Token byte_len - // should already get us the original byte length, however, that doesn't work as - // it's supposed to, investigate why - let highlight_byte_index = self.text[*m_byte_start..] - .char_indices() - .nth(m.char_count) - .map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end)); - formatted.push(&self.text[*m_byte_start..highlight_byte_index]); - - formatted.push(self.highlight_suffix); - - // if it's a prefix highlight, we put the end of the word after the highlight marker. - if highlight_byte_index < *m_byte_end { - formatted.push(&self.text[highlight_byte_index..*m_byte_end]); - } - - byte_index = *m_byte_end; - } - } - - // push the rest of the text between last match and the end of crop. - if byte_index < crop_byte_end { - formatted.push(&self.text[byte_index..crop_byte_end]); - } - - // push crop marker if it's not the end of the text. - if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() { - formatted.push(self.crop_marker); - } - - if formatted.len() == 1 { - // avoid concatenating if there is already 1 slice. - Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end]) - } else { - Cow::Owned(formatted.concat()) - } - } - None => self.compute_matches().format(format_options), + if highlight_toggle { + formatted_text.push(highlight_post_tag.as_str()); } + + highlight_toggle = !highlight_toggle; + previous_index = index; } + + // push crop marker if it's not the end of the text + if !crop_marker.is_empty() && *previous_index < self.text.len() { + formatted_text.push(crop_marker.as_str()); + } + + if formatted_text.len() == 1 { + // avoid concatenating if there is only one element + return Some(formatted_text[0].to_string()); + } + + Some(formatted_text.concat()) } } #[cfg(test)] mod tests { - use charabia::TokenizerBuilder; - use matching_words::tests::temp_index_with_documents; - use super::*; use crate::index::tests::TempIndex; use crate::{execute_search, filtered_universe, SearchContext, TimeBudget}; + use charabia::TokenizerBuilder; impl<'a> MatcherBuilder<'a> { fn new_test(rtxn: &'a heed::RoTxn<'a>, index: &'a TempIndex, query: &str) -> Self { @@ -526,423 +180,321 @@ mod tests { .unwrap(); // consume context and located_query_terms to build MatchingWords. - let matching_words = match located_query_terms { - Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms), - None => MatchingWords::default(), - }; + let matching_words = located_query_terms + .map(|located_query_terms| MatchingWords::new(ctx, &located_query_terms)) + .unwrap_or_default(); - MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer()) + MatcherBuilder::new( + matching_words, + TokenizerBuilder::default().into_tokenizer(), + MarkerOptions { + highlight_pre_tag: "".to_string(), + highlight_post_tag: "".to_string(), + crop_marker: "…".to_string(), + }, + ) } } - #[test] - fn format_identity() { - let temp_index = temp_index_with_documents(); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); - - let format_options = FormatOptions { highlight: false, crop: None }; - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text, None); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text, None); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text, None); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - } - - #[test] - fn format_highlight() { - let temp_index = temp_index_with_documents(); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); - - let format_options = FormatOptions { highlight: true, crop: None }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text, None); - assert_eq!(&matcher.format(format_options), ""); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text, None); - assert_eq!(&matcher.format(format_options), ":-)"); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text, None); - // no crop should return complete text, because there is no matches. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text, None); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text, None); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Natalie risk her future to build a world with the boy she loves." - ); - } - - #[test] - fn highlight_unicode() { - let temp_index = temp_index_with_documents(); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "world"); - let format_options = FormatOptions { highlight: true, crop: None }; - - // Text containing prefix match. - let text = "Ŵôřlḑôle"; - let mut matcher = builder.build(text, None); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Ŵôřlḑôle" - ); - - // Text containing unicode match. - let text = "Ŵôřlḑ"; - let mut matcher = builder.build(text, None); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Ŵôřlḑ" - ); - - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "westfali"); - let format_options = FormatOptions { highlight: true, crop: None }; - - // Text containing unicode match. - let text = "Westfália"; - let mut matcher = builder.build(text, None); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Westfália" - ); - } - - #[test] - fn format_crop() { - let temp_index = temp_index_with_documents(); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); - - let format_options = FormatOptions { highlight: false, crop: Some(10) }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"" - ); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @":-)" - ); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text, None); - // no highlight should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"A quick brown fox can not jump 32 feet, right…" - ); - - // Text without any match starting by a separator. - let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; - let mut matcher = builder.build(text, None); - // no highlight should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"(A quick brown fox can not jump 32 feet, right…" - ); - - // Test phrase propagation - let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; - let mut matcher = builder.build(text, None); - // should crop the phrase instead of croping around the match. - insta::assert_snapshot!( - matcher.format(format_options), - @"…Split The World is a book written by Emily Henry…" - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text, None); - // no highlight should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…future to build a world with the boy she loves…" - ); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text, None); - // no highlight should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing a match unordered and a match ordered. - let text = "The world split void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text, None); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - - // Text containing matches with different density. - let text = "split void the void void world void void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text, None); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - - // Text containing matches with same word. - let text = "split split split split split split void void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text, None); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - } - - #[test] - fn format_highlight_crop() { - let temp_index = temp_index_with_documents(); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); - - let format_options = FormatOptions { highlight: true, crop: Some(10) }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"" - ); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @":-)" - ); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text, None); - // both should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"A quick brown fox can not jump 32 feet, right…" - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text, None); - // both should return 10 last words with a marker at the start and highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…future to build a world with the boy she loves…" - ); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text, None); - // both should return 10 last words with a marker at the start and highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing a match unordered and a match ordered. - let text = "The world split void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text, None); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - } - - #[test] - fn format_highlight_crop_phrase_query() { - //! testing: https://github.com/meilisearch/meilisearch/issues/3975 + pub fn rename_me( + format_options: Option, + text: &str, + query: &str, + expected_maybe_text: Option<&str>, + ) { let temp_index = TempIndex::new(); - let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; + // document will always contain the same exact text normally + // TODO: Describe this better and ask if this is actually the case temp_index .add_documents(documents!([ - { "id": 1, "text": text } + { "id": 1, "text": text.to_string() }, ])) .unwrap(); let rtxn = temp_index.read_txn().unwrap(); - - let format_options = FormatOptions { highlight: true, crop: Some(10) }; - - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); + let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query); let mut matcher = builder.build(text, None); - // should return 10 words with a marker at the start as well the end, and the highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…the power to split the world between those who embraced…" - ); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); - let mut matcher = builder.build(text, None); - // should highlight "those" and the phrase "and those". - insta::assert_snapshot!( - matcher.format(format_options), - @"…world between those who embraced progress and those who resisted…" + assert_eq!( + matcher.get_formatted_text(format_options), + expected_maybe_text.map(|v| v.to_string()) ); + } - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention had the power to split the world\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"The groundbreaking invention had the power to split the world…" - ); + struct FormatVariations<'a> { + highlight_with_crop: Option<&'a str>, + highlight: Option<&'a str>, + crop: Option<&'a str>, + } - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention had the power to split the world between those\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"The groundbreaking invention had the power to split the world…" - ); + impl<'a> FormatVariations<'a> { + fn get(&self) -> [(Option, Option<&'a str>); 5] { + [ + (None, None), + (Some(FormatOptions { highlight: true, crop: Some(2) }), self.highlight_with_crop), + (Some(FormatOptions { highlight: true, crop: None }), self.highlight), + (Some(FormatOptions { highlight: false, crop: Some(2) }), self.crop), + (Some(FormatOptions { highlight: false, crop: None }), None), + ] + } + } - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - // TODO: Should include exclamation mark without crop markers - @"…between those who embraced progress and those who resisted change…" - ); - - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"groundbreaking invention\" \"split the world between\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"…groundbreaking invention had the power to split the world between…" - ); - - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"groundbreaking invention\" \"had the power to split the world between those\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"…invention had the power to split the world between those…" + /// "Dei store fiskane eta dei små — dei liger under som minst förmå." + /// + /// (Men are like fish; the great ones devour the small.) + fn rename_me_with_base_text( + format_options: Option, + query: &str, + expected_maybe_text: Option<&str>, + ) { + rename_me( + format_options, + "Dei store fiskane eta dei små — dei liger under som minst förmå.", + query, + expected_maybe_text, ); } #[test] - fn smaller_crop_size() { - //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 - let temp_index = temp_index_with_documents(); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); + fn empty_query() { + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("Dei store…"), + highlight: None, + crop: Some("Dei store…"), + } + .get()) + { + rename_me_with_base_text(format_options, "", expected_maybe_text); + } + } - let text = "void void split the world void void."; + #[test] + fn only_separators() { + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some(":-…"), + highlight: None, + crop: Some(":-…"), + } + .get()) + { + rename_me(format_options, ":-)", ":-)", expected_maybe_text); + } + } - // set a smaller crop size - let format_options = FormatOptions { highlight: false, crop: Some(2) }; - let mut matcher = builder.build(text, None); - // because crop size < query size, partially format matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…split the…" - ); + #[test] + fn highlight_end() { + // TODO: Why is "förmå" marked as prefix in located matching words? + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("…minst förmå."), + highlight: Some("Dei store fiskane eta dei små — dei liger under som minst förmå."), + crop: Some("…minst förmå."), + } + .get()) { + rename_me_with_base_text(format_options, "minst förmå", expected_maybe_text); + } + } - // set a smaller crop size - let format_options = FormatOptions { highlight: false, crop: Some(1) }; - let mut matcher = builder.build(text, None); - // because crop size < query size, partially format matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…split…" - ); + #[test] + fn highlight_beginning_and_middle() { + // TODO: Why is "store" marked as prefix in located matching words? + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("Dei store…"), + highlight: Some("Dei store fiskane eta dei små — dei liger under som minst förmå."), + crop: Some("Dei store…"), + } + .get()) { + rename_me_with_base_text(format_options, "Dei store", expected_maybe_text); + } + } - // set crop size to 0 - let format_options = FormatOptions { highlight: false, crop: Some(0) }; - let mut matcher = builder.build(text, None); - // because crop size is 0, crop is ignored. - insta::assert_snapshot!( - matcher.format(format_options), - @"void void split the world void void." + #[test] + fn partial_match_middle() { + // TODO: Is this intentional? + // Here the only interned word is "forma", hence it cannot find the searched prefix + // word "fo" inside "forma" within milli::search::new::matches::matching_words::MatchingWords::try_get_word_match + // `milli::search::new::query_term::QueryTerm::all_computed_derivations` might be at fault here + + // interned words = ["forma"] + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("…förmå, på…"), + highlight: Some("altså, förmå, på en måte"), + crop: Some("…förmå, på…"), + } + .get()) + { + rename_me(format_options, "altså, förmå, på en måte", "fo", expected_maybe_text); + } + + // interned words = ["fo", "forma"] + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("…fo rmå…"), + highlight: Some("altså, fo rmå, på en måte"), + crop: Some("…fo förmå…"), + } + .get()) + { + rename_me(format_options, "altså, fo förmå, på en måte", "fo", expected_maybe_text); + } + } + + #[test] + fn partial_match_end() { + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("förmå, på…"), + highlight: Some("förmå, på en måte"), + crop: Some("förmå, på…"), + } + .get()) + { + rename_me(format_options, "förmå, på en måte", "fo", expected_maybe_text); + } + + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("fo rmå…"), + highlight: Some("fo rmå, på en måte"), + crop: Some("fo förmå…"), + } + .get()) + { + rename_me(format_options, "fo förmå, på en måte", "fo", expected_maybe_text); + } + } + + #[test] + fn partial_match_beginning() { + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("altså, förmå"), + highlight: Some("altså, förmå"), + crop: Some("altså, förmå"), + } + .get()) + { + rename_me(format_options, "altså, förmå", "fo", expected_maybe_text); + } + + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("…fo rmå"), + highlight: Some("altså, fo rmå"), + crop: Some("…fo förmå"), + } + .get()) + { + rename_me(format_options, "altså, fo förmå", "fo", expected_maybe_text); + } + } + + #[test] + fn separator_at_end() { + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("…minst förmå. , ;"), + highlight: Some("; , — dei liger under som minst förmå. , ;"), + crop: Some("…minst förmå. , ;"), + } + .get()) + { + rename_me( + format_options, + "; , — dei liger under som minst förmå. , ;", + "minst", + expected_maybe_text, + ); + } + } + + #[test] + fn separator_at_beginning() { + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("; , — dei liger…"), + highlight: Some("; , — dei liger under som minst förmå. , ;"), + crop: Some("; , — dei liger…"), + } + .get()) + { + rename_me( + format_options, + "; , — dei liger under som minst förmå. , ;", + "dei", + expected_maybe_text, + ); + } + } + + #[test] + fn phrase() { + for (format_options, expected_maybe_text) in (FormatVariations { + highlight_with_crop: Some("…dei liger…"), + highlight: Some( + "Dei store fiskane eta dei små — dei liger under som minst förmå.", + ), + crop: Some("…dei liger…"), + } + .get()) + { + rename_me_with_base_text(format_options, "\"dei liger\"", expected_maybe_text); + } + } + + #[test] + fn phrase_highlight_bigger_than_crop() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: Some(1) }), + "\"dei liger\"", + Some("…dei…"), ); } #[test] - fn partial_matches() { - let temp_index = temp_index_with_documents(); - let rtxn = temp_index.read_txn().unwrap(); - let mut builder = - MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\""); - builder.highlight_prefix("_".to_string()); - builder.highlight_suffix("_".to_string()); + fn phrase_bigger_than_crop() { + rename_me_with_base_text( + Some(FormatOptions { highlight: false, crop: Some(1) }), + "\"dei liger\"", + Some("…dei…"), + ); + } - let format_options = FormatOptions { highlight: true, crop: None }; + #[test] + fn phrase_highlight_crop_middle() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: Some(4) }), + "\"dei liger\"", + Some("…små — dei liger under…"), + ); + } - let text = "the do or die can't be he do and or isn't he"; - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"_the_ _do or_ die can't be he do and or isn'_t he_" + #[test] + fn phrase_crop_middle() { + rename_me_with_base_text( + Some(FormatOptions { highlight: false, crop: Some(4) }), + "\"dei liger\"", + Some("…små — dei liger under…"), + ); + } + + #[test] + fn phrase_highlight_crop_end() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: Some(4) }), + "\"minst förmå\"", + Some("…under som minst förmå."), + ); + } + + #[test] + fn phrase_crop_end() { + rename_me_with_base_text( + Some(FormatOptions { highlight: false, crop: Some(4) }), + "\"minst förmå\"", + Some("…under som minst förmå."), + ); + } + + #[test] + fn phrase_highlight_crop_beginning() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: Some(4) }), + "\"Dei store\"", + Some("Dei store fiskane eta…"), ); } } diff --git a/crates/milli/src/search/new/matches/simple_token_kind.rs b/crates/milli/src/search/new/matches/simple_token_kind.rs deleted file mode 100644 index b34a8c985..000000000 --- a/crates/milli/src/search/new/matches/simple_token_kind.rs +++ /dev/null @@ -1,15 +0,0 @@ -use charabia::{SeparatorKind, Token, TokenKind}; - -pub enum SimpleTokenKind { - Separator(SeparatorKind), - NotSeparator, -} - -impl SimpleTokenKind { - pub fn new(token: &&Token<'_>) -> Self { - match token.kind { - TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), - _ => Self::NotSeparator, - } - } -} diff --git a/crates/milli/src/search/new/query_term/mod.rs b/crates/milli/src/search/new/query_term/mod.rs index ba8964e34..748248fc3 100644 --- a/crates/milli/src/search/new/query_term/mod.rs +++ b/crates/milli/src/search/new/query_term/mod.rs @@ -489,8 +489,7 @@ impl QueryTerm { let mut words = BTreeSet::new(); let mut phrases = BTreeSet::new(); - let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } = - &self.zero_typo; + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, .. } = &self.zero_typo; words.extend(zero_typo.iter().copied()); words.extend(prefix_of.iter().copied()); phrases.extend(phrase.iter().copied());