Refactor matches, change behaviour of showMatchesPosition

F. Levi 2025-06-07 11:45:01 +03:00
parent 97aeb6db4d
commit 24f213c343
13 changed files with 1504 additions and 1395 deletions

View file

@ -1551,9 +1551,10 @@ fn retrieve_documents<S: AsRef<str>>(
Ok(match &attributes_to_retrieve {
Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
&document?,
attributes_to_retrieve.iter().map(|s| s.as_ref()).chain(
(retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"),
),
attributes_to_retrieve
.iter()
.map(|s| s.as_ref())
.chain(retrieve_vectors.should_retrieve().then_some("_vectors")),
),
None => document?,
})
@ -1586,7 +1587,7 @@ fn retrieve_document<S: AsRef<str>>(
attributes_to_retrieve
.iter()
.map(|s| s.as_ref())
.chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")),
.chain(retrieve_vectors.should_retrieve().then_some("_vectors")),
),
None => document,
};
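
A note on the `should_retrieve()` chaining above: `bool::then_some` yields either `Some("_vectors")` or `None`, so `chain` appends at most one extra attribute. A minimal standalone sketch of the pattern, with a plain `Vec<&str>` standing in for the real attribute iterator:

fn selected<'a>(attrs: &[&'a str], retrieve_vectors: bool) -> Vec<&'a str> {
    attrs
        .iter()
        .copied()
        // `then_some` yields `Some("_vectors")` only when the flag is set,
        // so `chain` appends either one extra attribute or nothing at all.
        .chain(retrieve_vectors.then_some("_vectors"))
        .collect()
}

fn main() {
    assert_eq!(selected(&["title"], true), vec!["title", "_vectors"]);
    assert_eq!(selected(&["title"], false), vec!["title"]);
}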

View file

@ -815,7 +815,8 @@ impl SearchByIndex {
let (result, _semantic_hit_count) =
super::super::search_from_kind(index_uid.to_string(), search_kind, search)?;
let format = AttributesFormat {
let attributes_format = AttributesFormat {
attributes_to_retrieve: query.attributes_to_retrieve,
retrieve_vectors,
attributes_to_highlight: query.attributes_to_highlight,
@ -846,12 +847,11 @@ impl SearchByIndex {
let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref());
let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer);
let hit_maker =
HitMaker::new(&index, &rtxn, format, formatter_builder).map_err(|e| {
MeilisearchHttpError::from_milli(e, Some(index_uid.to_string()))
})?;
HitMaker::new(matching_words, tokenizer, attributes_format, &index, &rtxn)
.map_err(|e| {
MeilisearchHttpError::from_milli(e, Some(index_uid.to_string()))
})?;
results_by_query.push(SearchResultByQuery {
weight,

View file

@ -1,4 +1,5 @@
use core::fmt;
use std::borrow::Cow;
use std::cmp::min;
use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::str::FromStr;
@ -27,11 +28,11 @@ use meilisearch_types::{milli, Document};
use milli::tokenizer::{Language, TokenizerBuilder};
use milli::{
AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, LocalizedAttributesRule,
MatchBounds, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
MarkerOptions, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
};
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use serde_json::{json, Map, Value};
#[cfg(test)]
mod mod_test;
use utoipa::ToSchema;
@ -46,7 +47,9 @@ pub use federated::{
mod ranking_rules;
type MatchesPosition = BTreeMap<String, Vec<MatchBounds>>;
// TODO: Adapt this type to support cropping
// { "_matchesPosition": { "overview": { first: false, highlighted: [[0,4,6,11,5,234,6,241,5]] } } }
// type MatchesPosition = BTreeMap<String, Vec<MatchBounds>>;
pub const DEFAULT_SEARCH_OFFSET: fn() -> usize = || 0;
pub const DEFAULT_SEARCH_LIMIT: fn() -> usize = || 20;
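
For reference, the shape sketched in the TODO above would make a hit's matches position look roughly like the following. This is a hypothetical sketch only; the commented-out alias shows the old type and the final layout is still marked TODO:

use serde_json::json;

fn main() {
    // Hypothetical per-field entry: a crop flag plus flattened byte indices,
    // mirroring the TODO comment above. Not a final API.
    let hit_fragment = json!({
        "_matchesPosition": {
            "overview": {
                "first": false,
                "highlighted": [[0, 4, 6, 11, 5, 234, 6, 241, 5]]
            }
        }
    });
    println!("{hit_fragment}");
}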
@ -742,11 +745,9 @@ pub struct SearchHit {
#[serde(flatten)]
#[schema(additional_properties, inline, value_type = HashMap<String, Value>)]
pub document: Document,
#[serde(default, rename = "_formatted", skip_serializing_if = "Document::is_empty")]
#[serde(default, rename = "_formatted", skip_serializing_if = "Option::is_none")]
#[schema(additional_properties, value_type = HashMap<String, Value>)]
pub formatted: Document,
#[serde(default, rename = "_matchesPosition", skip_serializing_if = "Option::is_none")]
pub matches_position: Option<MatchesPosition>,
pub formatted: Option<Document>,
#[serde(default, rename = "_rankingScore", skip_serializing_if = "Option::is_none")]
pub ranking_score: Option<f64>,
#[serde(default, rename = "_rankingScoreDetails", skip_serializing_if = "Option::is_none")]
@ -1223,6 +1224,7 @@ struct AttributesFormat {
crop_marker: String,
highlight_pre_tag: String,
highlight_post_tag: String,
// TODO: Might want to rename this to signify that this will not yield _formatted anymore, only positions
show_matches_position: bool,
sort: Option<Vec<String>>,
show_ranking_score: bool,
@ -1230,7 +1232,7 @@ struct AttributesFormat {
locales: Option<Vec<Language>>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[derive(Debug, Clone, Copy)]
pub enum RetrieveVectors {
/// Remove the `_vectors` field
///
@ -1250,6 +1252,10 @@ impl RetrieveVectors {
Self::Hide
}
}
pub fn should_retrieve(&self) -> bool {
matches!(self, Self::Retrieve)
}
}
struct HitMaker<'a> {
@ -1261,7 +1267,7 @@ struct HitMaker<'a> {
retrieve_vectors: RetrieveVectors,
to_retrieve_ids: BTreeSet<FieldId>,
embedding_configs: Vec<milli::index::IndexEmbeddingConfig>,
formatter_builder: MatcherBuilder<'a>,
matcher_builder: MatcherBuilder<'a>,
formatted_options: BTreeMap<FieldId, FormatOptions>,
show_ranking_score: bool,
show_ranking_score_details: bool,
@ -1289,24 +1295,20 @@ impl<'a> HitMaker<'a> {
tokenizer_builder.into_tokenizer()
}
pub fn formatter_builder(
matching_words: milli::MatchingWords,
tokenizer: milli::tokenizer::Tokenizer<'_>,
) -> MatcherBuilder<'_> {
let formatter_builder = MatcherBuilder::new(matching_words, tokenizer);
formatter_builder
}
pub fn new(
matching_words: milli::MatchingWords,
tokenizer: milli::tokenizer::Tokenizer<'a>,
attr_fmt: AttributesFormat,
index: &'a Index,
rtxn: &'a RoTxn<'a>,
format: AttributesFormat,
mut formatter_builder: MatcherBuilder<'a>,
) -> milli::Result<Self> {
formatter_builder.crop_marker(format.crop_marker);
formatter_builder.highlight_prefix(format.highlight_pre_tag);
formatter_builder.highlight_suffix(format.highlight_post_tag);
let AttributesFormat { highlight_pre_tag, highlight_post_tag, crop_marker, .. } = attr_fmt;
let matcher_builder = MatcherBuilder::new(
matching_words,
tokenizer,
MarkerOptions { highlight_pre_tag, highlight_post_tag, crop_marker },
);
let fields_ids_map = index.fields_ids_map(rtxn)?;
let displayed_ids = index
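
`HitMaker::new` now builds the matcher directly: the three setter calls were folded into a `MarkerOptions` struct passed to `MatcherBuilder::new`. A sketch of the same constructor-absorbs-options pattern, with hypothetical local stand-ins for milli's types:

// Hypothetical stand-ins, only to illustrate the constructor change.
struct MarkerOptions {
    highlight_pre_tag: String,
    highlight_post_tag: String,
    crop_marker: String,
}

struct MatcherBuilder {
    options: MarkerOptions,
}

impl MatcherBuilder {
    // Before: three setter calls after construction. After: one options
    // struct handed over up front, so a half-configured builder cannot exist.
    fn new(options: MarkerOptions) -> Self {
        Self { options }
    }
}

fn main() {
    let builder = MatcherBuilder::new(MarkerOptions {
        highlight_pre_tag: "<em>".into(),
        highlight_post_tag: "</em>".into(),
        crop_marker: "…".into(),
    });
    assert_eq!(builder.options.crop_marker, "…");
}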
@ -1324,21 +1326,21 @@ impl<'a> HitMaker<'a> {
let displayed_names = index.displayed_fields(rtxn)?.unwrap();
!displayed_names.contains(&milli::constants::RESERVED_VECTORS_FIELD_NAME)
}
// displayed_ids is a finit list, so hide if `_vectors` is not part of it
// displayed_ids is a finite list, so hide if `_vectors` is not part of it
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
};
let displayed_ids =
displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors {
let retrieve_vectors = if let RetrieveVectors::Retrieve = attr_fmt.retrieve_vectors {
if vectors_is_hidden {
RetrieveVectors::Hide
} else {
RetrieveVectors::Retrieve
}
} else {
format.retrieve_vectors
attr_fmt.retrieve_vectors
};
let fids = |attrs: &BTreeSet<String>| {
@ -1355,7 +1357,7 @@ impl<'a> HitMaker<'a> {
}
ids
};
let to_retrieve_ids: BTreeSet<_> = format
let to_retrieve_ids: BTreeSet<_> = attr_fmt
.attributes_to_retrieve
.as_ref()
.map(fids)
@ -1364,12 +1366,12 @@ impl<'a> HitMaker<'a> {
.cloned()
.collect();
let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
let attr_to_highlight = attr_fmt.attributes_to_highlight.unwrap_or_default();
let attr_to_crop = attr_fmt.attributes_to_crop.unwrap_or_default();
let formatted_options = compute_formatted_options(
&attr_to_highlight,
&attr_to_crop,
format.crop_length,
attr_fmt.crop_length,
&to_retrieve_ids,
&fields_ids_map,
&displayed_ids,
@ -1386,51 +1388,53 @@ impl<'a> HitMaker<'a> {
retrieve_vectors,
to_retrieve_ids,
embedding_configs,
formatter_builder,
matcher_builder,
formatted_options,
show_ranking_score: format.show_ranking_score,
show_ranking_score_details: format.show_ranking_score_details,
show_matches_position: format.show_matches_position,
sort: format.sort,
locales: format.locales,
show_ranking_score: attr_fmt.show_ranking_score,
show_ranking_score_details: attr_fmt.show_ranking_score_details,
show_matches_position: attr_fmt.show_matches_position,
sort: attr_fmt.sort,
locales: attr_fmt.locales,
})
}
pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> {
let (_, obkv) =
self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?;
fn make_document(&self, obkv: &obkv::KvReaderU16) -> milli::Result<Document> {
let mut document = serde_json::Map::new();
// First generate a document with all the displayed fields
let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?;
let add_vectors_fid =
self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve);
// select the attributes to retrieve
let attributes_to_retrieve = self
.to_retrieve_ids
.iter()
// skip the vectors_fid if RetrieveVectors::Hide
.filter(|fid| match self.vectors_fid {
Some(vectors_fid) => {
!(self.retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid)
// recreate JSON with appropriate attributes
for (key, value) in obkv.iter() {
if self.vectors_fid.is_some_and(|vectors_fid| vectors_fid == key) {
// (vectors aren't considered in `displayedAttributes` and `attributesToRetrieve`, but rather with `retrieveVectors`)
if !self.retrieve_vectors.should_retrieve() {
continue;
}
None => true,
})
// need to retrieve the existing `_vectors` field if `RetrieveVectors::Retrieve` is set
.chain(add_vectors_fid.iter())
.map(|&fid| self.fields_ids_map.name(fid).expect("Missing field name"));
} else if !self.to_retrieve_ids.contains(&key) || !self.displayed_ids.contains(&key) {
// https://www.meilisearch.com/docs/reference/api/settings#displayed-attributes
// https://www.meilisearch.com/docs/reference/api/search#attributes-to-retrieve
continue;
}
let mut document =
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
let key = self.fields_ids_map.name(key).expect("Missing field name").to_string();
if self.retrieve_vectors == RetrieveVectors::Retrieve {
// Clippy is wrong
document.insert(key, value);
}
Ok(document)
}
pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> {
let obkv = self.index.document(self.rtxn, id)?;
let mut document = self.make_document(obkv)?;
if self.retrieve_vectors.should_retrieve() {
#[allow(clippy::manual_unwrap_or_default)]
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in self.index.embeddings(self.rtxn, id)? {
let user_provided = self
.embedding_configs
@ -1439,6 +1443,7 @@ impl<'a> HitMaker<'a> {
.is_some_and(|conf| conf.user_provided.contains(id));
let embeddings =
ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
vectors.insert(
name,
serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?,
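
The loop above assembles one `_vectors` entry per embedder; serialized through `ExplicitVectors`, an entry plausibly looks like the following sketch (the embedder name "default" and the values are assumptions):

use serde_json::json;

fn main() {
    let user_provided = true;
    // One object per embedder: the raw embeddings plus a `regenerate` flag
    // derived from whether the vector was user-provided.
    let vectors = json!({
        "default": {
            "embeddings": [[0.1, 0.2, 0.3]],
            "regenerate": !user_provided,
        }
    });
    println!("{vectors}");
}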
@ -1450,10 +1455,10 @@ impl<'a> HitMaker<'a> {
let localized_attributes =
self.index.localized_attributes_rules(self.rtxn)?.unwrap_or_default();
let (matches_position, formatted) = format_fields(
&displayed_document,
let formatted = format_fields(
&mut document,
&self.fields_ids_map,
&self.formatter_builder,
&self.matcher_builder,
&self.formatted_options,
self.show_matches_position,
&self.displayed_ids,
@ -1470,13 +1475,7 @@ impl<'a> HitMaker<'a> {
let ranking_score_details =
self.show_ranking_score_details.then(|| ScoreDetails::to_json_map(score.iter()));
let hit = SearchHit {
document,
formatted,
matches_position,
ranking_score_details,
ranking_score,
};
let hit = SearchHit { document, formatted, ranking_score_details, ranking_score };
Ok(hit)
}
@ -1485,7 +1484,7 @@ impl<'a> HitMaker<'a> {
fn make_hits<'a>(
index: &Index,
rtxn: &RoTxn<'_>,
format: AttributesFormat,
attributes_format: AttributesFormat,
matching_words: milli::MatchingWords,
documents_ids_scores: impl Iterator<Item = (u32, &'a Vec<ScoreDetails>)> + 'a,
) -> milli::Result<Vec<SearchHit>> {
@ -1500,9 +1499,7 @@ fn make_hits<'a>(
let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref());
let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer);
let hit_maker = HitMaker::new(index, rtxn, format, formatter_builder)?;
let hit_maker = HitMaker::new(matching_words, tokenizer, attributes_format, index, rtxn)?;
for (id, score) in documents_ids_scores {
documents.push(hit_maker.make_hit(id, score)?);
@ -1818,59 +1815,100 @@ fn add_non_formatted_ids_to_formatted_options(
}
}
fn make_document(
displayed_attributes: &BTreeSet<FieldId>,
field_ids_map: &FieldsIdsMap,
obkv: &obkv::KvReaderU16,
) -> milli::Result<Document> {
let mut document = serde_json::Map::new();
// recreate the original json
for (key, value) in obkv.iter() {
let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
let key = field_ids_map.name(key).expect("Missing field name").to_string();
document.insert(key, value);
}
// select the attributes to retrieve
let displayed_attributes = displayed_attributes
.iter()
.map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
let document = permissive_json_pointer::select_values(&document, displayed_attributes);
Ok(document)
}
#[allow(clippy::too_many_arguments)]
fn format_fields(
document: &Document,
document: &mut Document,
field_ids_map: &FieldsIdsMap,
builder: &MatcherBuilder<'_>,
matcher_builder: &MatcherBuilder<'_>,
formatted_options: &BTreeMap<FieldId, FormatOptions>,
compute_matches: bool,
show_matches_position: bool,
displayable_ids: &BTreeSet<FieldId>,
locales: Option<&[Language]>,
localized_attributes: &[LocalizedAttributesRule],
) -> milli::Result<(Option<MatchesPosition>, Document)> {
let mut matches_position = compute_matches.then(BTreeMap::new);
let mut document = document.clone();
) -> milli::Result<Option<Document>> {
// reduce the formatted option list to the attributes that should be formatted,
// instead of all the attributes to display.
let formatting_fields_options: Vec<_> = formatted_options
let formatting_fields_options = formatted_options
.iter()
.filter(|(_, option)| option.should_format())
.map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option))
.collect();
.collect::<Vec<_>>();
// select the attributes to retrieve
let displayable_names =
displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
let get_format_options = |key: Cow<'_, str>| {
formatting_fields_options
.iter()
.filter(|(name, ..)| {
milli::is_faceted_by(name, &key) || milli::is_faceted_by(&key, name)
})
.map(|(_, option)| **option)
.reduce(|acc, option| acc.merge(option))
};
let get_locales = |key: Cow<'_, str>| {
// TODO: Should this be recomputed every time?
// if no locales have been provided, we try to find the locales in the localized_attributes.
locales.or_else(|| {
localized_attributes
.iter()
.find(|rule| matches!(rule.match_str(&key), PatternMatch::Match))
.map(LocalizedAttributesRule::locales)
})
};
fn get_text(value: &mut Value) -> Option<Cow<'_, String>> {
match value {
Value::String(text) => Some(Cow::Borrowed(text)),
Value::Number(number) => Some(Cow::Owned(number.to_string())),
// booleans and nulls cannot be matched by meili and cannot be formatted,
// and arrays or objects are never yielded by `permissive_json_pointer::map_leaf_values`
_ => None,
}
}
if show_matches_position {
permissive_json_pointer::map_leaf_values(document, displayable_names, |key, _, value| {
let Some(text) = get_text(value) else {
*value = Value::Object(Map::from_iter(std::iter::once((
"value".to_string(),
value.take(),
))));
return;
};
let locales = get_locales(Cow::from(key));
let mut matcher = matcher_builder.build(&text, locales);
let format_options = get_format_options(Cow::from(key));
let match_bounds = matcher.get_match_bounds(format_options);
let value_iter = std::iter::once(("value".to_string(), value.take()));
// do not include `matches` in case there is nothing to format
let json_map = if let Some(mb) = match_bounds {
let matches_iter = std::iter::once((
"matches".to_string(),
serde_json::to_value(mb).expect("TODO"),
));
Map::from_iter(value_iter.chain(matches_iter))
} else {
Map::from_iter(value_iter)
};
*value = Value::Object(json_map);
});
return Ok(None);
}
let mut formatted_document = document.clone();
permissive_json_pointer::map_leaf_values(
&mut document,
&mut formatted_document,
displayable_names,
|key, array_indices, value| {
|key, _, value| {
// To get the formatting option of each key we need to see all the rules that apply
// to the value and merge them together. e.g. if a user wants to highlight `doggo`
// and crop `doggo.name`, `doggo.name` needs to be highlighted + cropped while `doggo.age` is only highlighted.
@ -1878,37 +1916,22 @@ fn format_fields(
// Warn: The time to compute the format list scales with the number of fields to format;
// cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity:
// d*f where d is the total number of fields to display and f is the total number of fields to format.
let format = formatting_fields_options
.iter()
.filter(|(name, _option)| {
milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name)
})
.map(|(_, option)| **option)
.reduce(|acc, option| acc.merge(option));
let mut infos = Vec::new();
let Some(text) = get_text(value) else {
return;
};
// if no locales have been provided, we try to find the locales in the localized_attributes.
let locales = locales.or_else(|| {
localized_attributes
.iter()
.find(|rule| rule.match_str(key) == PatternMatch::Match)
.map(LocalizedAttributesRule::locales)
});
let format_options = get_format_options(Cow::from(key));
*value = format_value(
std::mem::take(value),
builder,
format,
&mut infos,
compute_matches,
array_indices,
locales,
);
// there's nothing to format
if !format_options.is_some_and(|v| v.should_format()) {
return;
}
if let Some(matches) = matches_position.as_mut() {
if !infos.is_empty() {
matches.insert(key.to_owned(), infos);
}
let locales = get_locales(Cow::from(key));
let mut matcher = matcher_builder.build(&text, locales);
if let Some(formatted_text) = matcher.get_formatted_text(format_options) {
*value = Value::String(formatted_text);
}
},
);
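
The quadratic merge described in the comment above boils down to `FormatOptions::merge` being folded over every rule that matches a leaf. A standalone sketch with a simplified `FormatOptions` (field names assumed to mirror milli's):

#[derive(Clone, Copy, Debug, PartialEq)]
struct FormatOptions {
    highlight: bool,
    crop: Option<usize>,
}

impl FormatOptions {
    fn merge(self, other: Self) -> Self {
        Self {
            highlight: self.highlight || other.highlight,
            crop: self.crop.or(other.crop),
        }
    }
}

fn main() {
    // Rules matching the leaf `doggo.name`: `doggo` is highlighted and
    // `doggo.name` is cropped, so the merged options carry both.
    let from_doggo = FormatOptions { highlight: true, crop: None };
    let from_doggo_name = FormatOptions { highlight: false, crop: Some(10) };
    let merged = [from_doggo, from_doggo_name]
        .into_iter()
        .reduce(FormatOptions::merge)
        .unwrap();
    assert_eq!(merged, FormatOptions { highlight: true, crop: Some(10) });
}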
@ -1918,58 +1941,9 @@ fn format_fields(
// This unwrap must be safe since we got the ids from the fields_ids_map just
// before.
.map(|&fid| field_ids_map.name(fid).unwrap());
let document = permissive_json_pointer::select_values(&document, selectors);
let formatted_document = permissive_json_pointer::select_values(&formatted_document, selectors);
Ok((matches_position, document))
}
fn format_value(
value: Value,
builder: &MatcherBuilder<'_>,
format_options: Option<FormatOptions>,
infos: &mut Vec<MatchBounds>,
compute_matches: bool,
array_indices: &[usize],
locales: Option<&[Language]>,
) -> Value {
match value {
Value::String(old_string) => {
let mut matcher = builder.build(&old_string, locales);
if compute_matches {
let matches = matcher.matches(array_indices);
infos.extend_from_slice(&matches[..]);
}
match format_options {
Some(format_options) => {
let value = matcher.format(format_options);
Value::String(value.into_owned())
}
None => Value::String(old_string),
}
}
// `map_leaf_values` makes sure this is only called for leaf fields
Value::Array(_) => unreachable!(),
Value::Object(_) => unreachable!(),
Value::Number(number) => {
let s = number.to_string();
let mut matcher = builder.build(&s, locales);
if compute_matches {
let matches = matcher.matches(array_indices);
infos.extend_from_slice(&matches[..]);
}
match format_options {
Some(format_options) => {
let value = matcher.format(format_options);
Value::String(value.into_owned())
}
None => Value::String(s),
}
}
value => value,
}
Ok(Some(formatted_document))
}
pub(crate) fn parse_filter(

View file

@ -79,8 +79,9 @@ pub use self::localized_attributes_rules::LocalizedAttributesRule;
pub use self::search::facet::{FacetValueHit, SearchForFacetValues};
pub use self::search::similar::Similar;
pub use self::search::{
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
FacetDistribution, Filter, FormatOptions, MarkerOptions, MatchBounds, MatcherBuilder,
MatchingWords, OrderBy, Search, SearchResult, SemanticSearch, TermsMatchingStrategy,
DEFAULT_VALUES_PER_FACET,
};
pub use self::update::ChannelCongestion;

View file

@ -7,7 +7,9 @@ use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
pub use self::new::matches::{
FormatOptions, MarkerOptions, MatchBounds, MatcherBuilder, MatchingWords,
};
use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats};
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
use crate::score_details::{ScoreDetails, ScoringStrategy};
@ -277,7 +279,7 @@ impl<'a> Search<'a> {
// consume context and located_query_terms to build MatchingWords.
let matching_words = match located_query_terms {
Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
Some(located_query_terms) => MatchingWords::new(ctx, &located_query_terms),
None => MatchingWords::default(),
};

View file

@ -0,0 +1,204 @@
use std::cmp::Ordering;
use charabia::{SeparatorKind, Token, TokenKind};
enum SimpleTokenKind {
Separator(SeparatorKind),
NonSeparator,
Done,
}
impl SimpleTokenKind {
fn new(token: &Token) -> Self {
match token.kind {
TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
_ => Self::NonSeparator,
}
}
}
struct CropBoundsHelper<'a> {
tokens: &'a [Token<'a>],
index_backward: usize,
backward_token_kind: SimpleTokenKind,
index_forward: usize,
forward_token_kind: SimpleTokenKind,
}
impl CropBoundsHelper<'_> {
fn advance_backward(&mut self) {
if matches!(self.backward_token_kind, SimpleTokenKind::Done) {
return;
}
if self.index_backward != 0 {
self.index_backward -= 1;
self.backward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_backward]);
} else {
self.backward_token_kind = SimpleTokenKind::Done;
}
}
fn advance_forward(&mut self) {
if matches!(self.forward_token_kind, SimpleTokenKind::Done) {
return;
}
if self.index_forward != self.tokens.len() - 1 {
self.index_forward += 1;
self.forward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_forward]);
} else {
self.forward_token_kind = SimpleTokenKind::Done;
}
}
}
fn get_adjusted_indices_for_too_few_words(
tokens: &[Token],
index_backward: usize,
index_forward: usize,
mut words_count: usize,
crop_size: usize,
) -> [usize; 2] {
let crop_size = crop_size + 2;
let mut cbh = CropBoundsHelper {
tokens,
index_backward,
backward_token_kind: SimpleTokenKind::new(&tokens[index_backward]),
index_forward,
forward_token_kind: SimpleTokenKind::new(&tokens[index_forward]),
};
loop {
match [&cbh.backward_token_kind, &cbh.forward_token_kind] {
// if they are both separators and are the same kind then advance both,
// or expand in the soft separator side
[SimpleTokenKind::Separator(backward_sk), SimpleTokenKind::Separator(forward_sk)] => {
if backward_sk == forward_sk {
cbh.advance_backward();
// this avoids having an ending separator before crop marker
if words_count < crop_size - 1 {
cbh.advance_forward();
}
} else if matches!(backward_sk, SeparatorKind::Hard) {
cbh.advance_forward();
} else {
cbh.advance_backward();
}
}
// both are words, advance left then right if we haven't reached `crop_size`
[SimpleTokenKind::NonSeparator, SimpleTokenKind::NonSeparator] => {
cbh.advance_backward();
words_count += 1;
if words_count != crop_size {
cbh.advance_forward();
words_count += 1;
}
}
[SimpleTokenKind::Done, SimpleTokenKind::Done] => break,
// if one of the tokens is non-separator and the other a separator, we expand in the non-separator side
// if one of the sides reached the end, we expand in the opposite direction
[backward_stk, SimpleTokenKind::Done]
| [backward_stk @ SimpleTokenKind::NonSeparator, SimpleTokenKind::Separator(_)] => {
if matches!(backward_stk, SimpleTokenKind::NonSeparator) {
words_count += 1;
}
cbh.advance_backward();
}
[SimpleTokenKind::Done, forward_stk]
| [SimpleTokenKind::Separator(_), forward_stk @ SimpleTokenKind::NonSeparator] => {
if matches!(forward_stk, SimpleTokenKind::NonSeparator) {
words_count += 1;
}
cbh.advance_forward();
}
}
if words_count == crop_size {
break;
}
}
[cbh.index_backward, cbh.index_forward]
}
fn get_adjusted_index_forward_for_too_many_words(
tokens: &[Token],
mut index_forward: usize,
mut words_count: usize,
crop_size: usize,
) -> usize {
while index_forward != 0 {
if matches!(SimpleTokenKind::new(&tokens[index_forward]), SimpleTokenKind::NonSeparator) {
words_count -= 1;
if words_count == crop_size {
break;
}
}
index_forward -= 1;
}
if index_forward == 0 {
return index_forward;
}
index_forward - 1
}
pub fn get_adjusted_indices_for_highlights_and_crop_size(
tokens: &[Token],
index_backward: usize,
index_forward: usize,
words_count: usize,
crop_size: usize,
) -> [usize; 2] {
match words_count.cmp(&crop_size) {
Ordering::Less => get_adjusted_indices_for_too_few_words(
tokens,
index_backward,
index_forward,
words_count,
crop_size,
),
Ordering::Equal => [
if index_backward != 0 { index_backward - 1 } else { index_backward },
if index_forward != tokens.len() - 1 { index_forward + 1 } else { index_forward },
],
Ordering::Greater => [
index_backward,
get_adjusted_index_forward_for_too_many_words(
tokens,
index_forward,
words_count,
crop_size,
),
],
}
}
pub fn get_adjusted_index_forward_for_crop_size(tokens: &[Token], crop_size: usize) -> usize {
let mut words_count = 0;
let mut index = 0;
while index != tokens.len() - 1 {
if matches!(SimpleTokenKind::new(&tokens[index]), SimpleTokenKind::NonSeparator) {
words_count += 1;
if words_count == crop_size {
break;
}
}
index += 1;
}
if index == tokens.len() - 1 {
return index;
}
index + 1
}

View file

@ -1,139 +0,0 @@
use super::matching_words::WordId;
use super::{Match, MatchPosition};
struct MatchIntervalWithScore {
interval: [usize; 2],
score: [i16; 3],
}
// count score for phrases
fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
let words_in_phrase_minus_one = (lwp - fwp) as i16;
// will always be ordered, so +1 for each space between words
*order_score += words_in_phrase_minus_one;
// distance will always be 1, so -1 for each space between words
*distance_score -= words_in_phrase_minus_one;
}
/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn get_interval_score(matches: &[Match]) -> [i16; 3] {
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
let mut order_score = 0;
let mut distance_score = 0;
let mut iter = matches.iter().peekable();
while let Some(m) = iter.next() {
if let Some(next_match) = iter.peek() {
// if matches are ordered
if next_match.ids.iter().min() > m.ids.iter().min() {
order_score += 1;
}
let m_last_word_pos = match m.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
lwp
}
};
let next_match_first_word_pos = next_match.get_first_word_pos();
// compute distance between matches
distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
} else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
// in case last match is a phrase, count score for its words
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
}
ids.extend(m.ids.iter());
}
ids.sort_unstable();
ids.dedup();
let uniq_score = ids.len() as i16;
// rank by unique match count, then by distance between matches, then by ordered match count.
[uniq_score, distance_score, order_score]
}
/// Returns the first and last match where the score computed by match_interval_score is the best.
pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
if matches.is_empty() {
panic!("`matches` should not be empty at this point");
}
// positions of the first and the last match of the best matches interval in `matches`.
let mut best_interval: Option<MatchIntervalWithScore> = None;
let mut save_best_interval = |interval_first, interval_last| {
let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
let is_interval_score_better = &best_interval
.as_ref()
.is_none_or(|MatchIntervalWithScore { score, .. }| interval_score > *score);
if *is_interval_score_better {
best_interval = Some(MatchIntervalWithScore {
interval: [interval_first, interval_last],
score: interval_score,
});
}
};
// we compute the matches interval if we have at least 2 matches.
// current interval positions.
let mut interval_first = 0;
let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
for (index, next_match) in matches.iter().enumerate() {
// if the next match would make the interval grow past crop_size,
// we compare the current interval with the best one,
// then we increase `interval_first` until next match can be added.
let next_match_last_word_pos = next_match.get_last_word_pos();
// if the next match would mean that we pass the crop size window,
// we take the last valid match that didn't pass this boundary, which is `index` - 1,
// and calculate a score for it, and check if it's better than our best so far
if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
// if index is 0 there is no last viable match
if index != 0 {
let interval_last = index - 1;
// keep interval if it's the best
save_best_interval(interval_first, interval_last);
}
// advance start of the interval while interval is longer than crop_size.
loop {
interval_first += 1;
if interval_first == matches.len() {
interval_first -= 1;
break;
}
interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
if interval_first_match_first_word_pos > next_match_last_word_pos
|| next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
{
break;
}
}
}
}
// compute the last interval score and compare it to the best one.
let interval_last = matches.len() - 1;
// if the interval is a single match, make sure it's not a phrase longer than the crop window
if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
save_best_interval(interval_first, interval_last);
}
// if none of the matches fit the criteria above, default to the first one
best_interval.map_or(
[&matches[0], &matches[0]],
|MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
)
}

View file

@ -0,0 +1,169 @@
use std::cell::Cell;
use crate::search::new::matches::matching_words::QueryPosition;
use super::r#match::{Match, MatchPosition};
struct MatchesIndexRangeWithScore {
matches_index_range: [usize; 2],
score: [i16; 3],
}
/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn get_score(
matches: &[Match],
query_positions: &[QueryPosition],
index_first: usize,
index_last: usize,
) -> [i16; 3] {
let order_score = Cell::new(0);
let distance_score = Cell::new(0);
let mut iter = (index_first..=index_last)
.filter_map(|index| {
query_positions.iter().find_map(move |v| (v.index == index).then(|| v.range[0]))
})
.peekable();
while let (Some(range_first), Some(next_range_first)) = (iter.next(), iter.peek()) {
if range_first < *next_range_first {
order_score.set(order_score.get() + 1);
}
}
// count score for phrases
let tally_phrase_scores = |fwp, lwp| {
let words_in_phrase_minus_one = (lwp - fwp) as i16;
// will always be in the order of query, so +1 for each space between words
order_score.set(order_score.get() + words_in_phrase_minus_one);
// distance will always be 1, so -1 for each space between words
distance_score.set(distance_score.get() - words_in_phrase_minus_one);
};
let mut iter = matches[index_first..=index_last].iter().peekable();
while let Some(r#match) = iter.next() {
if let Some(next_match) = iter.peek() {
let match_last_word_pos = match r#match.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } => {
tally_phrase_scores(fwp, lwp);
lwp
}
};
let next_match_first_word_pos = next_match.get_first_word_pos();
// compute distance between matches
distance_score.set(
distance_score.get()
- (next_match_first_word_pos - match_last_word_pos).min(7) as i16,
);
} else if let MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } =
r#match.position
{
// in case last match is a phrase, count score for its words
tally_phrase_scores(fwp, lwp);
}
}
let mut uniqueness_score = 0i16;
let mut current_range: Option<super::matching_words::UserQueryPositionRange> = None;
for qp in query_positions.iter().filter(|v| v.index >= index_first && v.index <= index_last) {
match current_range.as_mut() {
Some([saved_range_start, saved_range_end]) => {
let [range_start, range_end] = qp.range;
if range_start > *saved_range_start {
uniqueness_score += (*saved_range_end - *saved_range_start) as i16 + 1;
*saved_range_start = range_start;
*saved_range_end = range_end;
} else if range_end > *saved_range_end {
*saved_range_end = range_end;
}
}
None => current_range = Some(qp.range),
}
}
if let Some([saved_range_start, saved_range_end]) = current_range {
uniqueness_score += (saved_range_end - saved_range_start) as i16 + 1;
}
// rank by unique match count, then by distance between matches, then by ordered match count.
[uniqueness_score, distance_score.into_inner(), order_score.into_inner()]
}
/// Returns the first and last match where the score computed by match_interval_score is the best.
pub fn get_best_match_index_range(
matches: &[Match],
query_positions: &[QueryPosition],
crop_size: usize,
) -> [usize; 2] {
// positions of the first and the last match of the best matches index range in `matches`.
let mut best_matches_index_range: Option<MatchesIndexRangeWithScore> = None;
let mut save_best_matches_index_range = |index_first, index_last| {
let score = get_score(matches, query_positions, index_first, index_last);
let is_score_better = best_matches_index_range.as_ref().is_none_or(|v| score > v.score);
if is_score_better {
best_matches_index_range = Some(MatchesIndexRangeWithScore {
matches_index_range: [index_first, index_last],
score,
});
}
};
// we compute the matches index range if we have at least 2 matches.
let mut index_first = 0;
let mut first_match_first_word_pos = matches[index_first].get_first_word_pos();
for (index, next_match) in matches.iter().enumerate() {
// if the next match would make the index range grow past crop_size,
// we compare the current index range with the best one,
// then we increase `index_first` until next match can be added.
let next_match_last_word_pos = next_match.get_last_word_pos();
// if the next match would mean that we pass the crop size window,
// we take the last valid match that didn't pass this boundary, which is `index` - 1,
// and calculate a score for it, and check if it's better than our best so far
if next_match_last_word_pos - first_match_first_word_pos + 1 > crop_size {
// if index is 0 there is no previous viable match
if index != 0 {
// keep index range if it's the best
save_best_matches_index_range(index_first, index - 1);
}
// advance `index_first` while index range is longer than crop_size.
loop {
if index_first == matches.len() - 1 {
break;
}
index_first += 1;
first_match_first_word_pos = matches[index_first].get_first_word_pos();
// also make sure that subtracting won't cause a panic
if next_match_last_word_pos < first_match_first_word_pos
|| next_match_last_word_pos - first_match_first_word_pos + 1 < crop_size
{
break;
}
}
}
}
// compute the last index range score and compare it to the best one.
let index_last = matches.len() - 1;
// if the index range is a single match, make sure it's not a phrase longer than the crop window
if index_first != index_last || matches[index_first].get_word_count() < crop_size {
save_best_matches_index_range(index_first, index_last);
}
// if none of the matches fit the criteria above, default to the first one
best_matches_index_range.map_or([0, 0], |v| v.matches_index_range)
}
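
The `[i16; 3]` scores above are compared lexicographically (arrays of `Ord` elements derive `Ord`), so uniqueness dominates, distance breaks uniqueness ties, and order breaks both. A tiny illustration:

fn main() {
    // [uniqueness, distance, order]: distance penalties are negative, so
    // tighter intervals score higher at equal uniqueness.
    let two_unique_far_apart: [i16; 3] = [2, -7, 1];
    let two_unique_adjacent: [i16; 3] = [2, -1, 1];
    let one_unique_adjacent: [i16; 3] = [1, 0, 0];
    assert!(two_unique_adjacent > two_unique_far_apart);
    assert!(two_unique_far_apart > one_unique_adjacent);
}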

View file

@ -1,62 +1,49 @@
use super::matching_words::WordId;
#[derive(Clone, Debug)]
#[derive(Debug, PartialEq)]
pub enum MatchPosition {
Word {
// position of the word in the whole text.
word_position: usize,
// position of the token in the whole text.
token_position: usize,
},
Phrase {
// position of the first and last word in the phrase in the whole text.
word_positions: [usize; 2],
// position of the first and last token in the phrase in the whole text.
token_positions: [usize; 2],
},
Word { word_position: usize, token_position: usize },
Phrase { word_position_range: [usize; 2], token_position_range: [usize; 2] },
}
#[derive(Clone, Debug)]
#[derive(Debug, PartialEq)]
pub struct Match {
pub char_count: usize,
// ids of the query words that matches.
pub ids: Vec<WordId>,
pub byte_len: usize,
pub position: MatchPosition,
}
impl Match {
pub(super) fn get_first_word_pos(&self) -> usize {
pub fn get_first_word_pos(&self) -> usize {
match self.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
MatchPosition::Phrase { word_position_range: [fwp, _], .. } => fwp,
}
}
pub(super) fn get_last_word_pos(&self) -> usize {
pub fn get_last_word_pos(&self) -> usize {
match self.position {
MatchPosition::Word { word_position, .. } => word_position,
MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
MatchPosition::Phrase { word_position_range: [_, lwp], .. } => lwp,
}
}
pub(super) fn get_first_token_pos(&self) -> usize {
pub fn get_first_token_pos(&self) -> usize {
match self.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
MatchPosition::Phrase { token_position_range: [ftp, _], .. } => ftp,
}
}
pub(super) fn get_last_token_pos(&self) -> usize {
pub fn get_last_token_pos(&self) -> usize {
match self.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
MatchPosition::Phrase { token_position_range: [_, ltp], .. } => ltp,
}
}
pub(super) fn get_word_count(&self) -> usize {
pub fn get_word_count(&self) -> usize {
match self.position {
MatchPosition::Word { .. } => 1,
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } => lwp - fwp + 1,
}
}
}
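
A quick sketch of the renamed inclusive ranges, using a local copy of the enum (the real type lives in this module):

#[derive(Debug, PartialEq)]
enum MatchPosition {
    Word { word_position: usize, token_position: usize },
    Phrase { word_position_range: [usize; 2], token_position_range: [usize; 2] },
}

fn word_count(position: &MatchPosition) -> usize {
    match position {
        MatchPosition::Word { .. } => 1,
        // both ends of the range are inclusive, hence the +1
        MatchPosition::Phrase { word_position_range: [fwp, lwp], .. } => lwp - fwp + 1,
    }
}

fn main() {
    let phrase = MatchPosition::Phrase {
        word_position_range: [3, 5],
        token_position_range: [6, 10],
    };
    assert_eq!(word_count(&phrase), 3); // words 3, 4 and 5
}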

View file

@ -0,0 +1,272 @@
use std::cmp::{max, min};
use super::{
matching_words::QueryPosition,
r#match::{Match, MatchPosition},
};
use super::adjust_indices::{
get_adjusted_index_forward_for_crop_size, get_adjusted_indices_for_highlights_and_crop_size,
};
use charabia::Token;
use serde::Serialize;
use utoipa::ToSchema;
use super::FormatOptions;
#[derive(Serialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct MatchBounds {
pub highlight_toggle: bool,
pub indices: Vec<usize>,
}
struct MatchBoundsHelper<'a> {
tokens: &'a [Token<'a>],
matches: &'a [Match],
query_positions: &'a [QueryPosition],
}
struct MatchesAndCropIndices {
matches_first_index: usize,
matches_last_index: usize,
crop_byte_start: usize,
crop_byte_end: usize,
}
enum CropThing {
Last(usize),
First(usize),
}
impl MatchBoundsHelper<'_> {
fn get_match_byte_position_range(&self, r#match: &Match) -> [usize; 2] {
let byte_start = match r#match.position {
MatchPosition::Word { token_position, .. } => self.tokens[token_position].byte_start,
MatchPosition::Phrase { token_position_range: [ftp, ..], .. } => {
self.tokens[ftp].byte_start
}
};
[byte_start, byte_start + r#match.byte_len]
}
// TODO: Rename this
fn get_match_byte_position_rangee(
&self,
index: &mut usize,
crop_thing: CropThing,
) -> [usize; 2] {
let new_index = match crop_thing {
CropThing::First(_) if *index != 0 => *index - 1,
CropThing::Last(_) if *index != self.matches.len() - 1 => *index + 1,
_ => {
return self.get_match_byte_position_range(&self.matches[*index]);
}
};
let [byte_start, byte_end] = self.get_match_byte_position_range(&self.matches[new_index]);
// NOTE: This doesn't need additional checks, because `get_best_match_index_range` already
// guarantees that the next or preceding match contains the crop boundary
match crop_thing {
CropThing::First(crop_byte_start) if crop_byte_start < byte_end => {
*index -= 1;
[byte_start, byte_end]
}
CropThing::Last(crop_byte_end) if byte_start < crop_byte_end => {
*index += 1;
[byte_start, byte_end]
}
_ => self.get_match_byte_position_range(&self.matches[*index]),
}
}
/// TODO: Description
fn get_match_bounds(&self, mci: MatchesAndCropIndices) -> MatchBounds {
let MatchesAndCropIndices {
mut matches_first_index,
mut matches_last_index,
crop_byte_start,
crop_byte_end,
} = mci;
let [first_match_first_byte, first_match_last_byte] = self.get_match_byte_position_rangee(
&mut matches_first_index,
CropThing::First(crop_byte_start),
);
let first_match_first_byte = max(first_match_first_byte, crop_byte_start);
let [last_match_first_byte, last_match_last_byte] =
if matches_first_index != matches_last_index {
self.get_match_byte_position_rangee(
&mut matches_last_index,
CropThing::Last(crop_byte_end),
)
} else {
[first_match_first_byte, first_match_last_byte]
};
let last_match_last_byte = min(last_match_last_byte, crop_byte_end);
let selected_matches_len = matches_last_index - matches_first_index + 1;
let mut indices_size = 2 * selected_matches_len;
let crop_byte_start_is_not_first_match_start = crop_byte_start != first_match_first_byte;
let crop_byte_end_is_not_last_match_end = crop_byte_end != last_match_last_byte;
if crop_byte_start_is_not_first_match_start {
indices_size += 1;
}
if crop_byte_end_is_not_last_match_end {
indices_size += 1;
}
let mut indices = Vec::with_capacity(indices_size);
if crop_byte_start_is_not_first_match_start {
indices.push(crop_byte_start);
}
indices.push(first_match_first_byte);
if selected_matches_len > 1 {
indices.push(first_match_last_byte);
}
if selected_matches_len > 2 {
for index in (matches_first_index + 1)..matches_last_index {
let [m_byte_start, m_byte_end] =
self.get_match_byte_position_range(&self.matches[index]);
indices.push(m_byte_start);
indices.push(m_byte_end);
}
}
if selected_matches_len > 1 {
indices.push(last_match_first_byte);
}
indices.push(last_match_last_byte);
if crop_byte_end_is_not_last_match_end {
indices.push(crop_byte_end);
}
MatchBounds { highlight_toggle: !crop_byte_start_is_not_first_match_start, indices }
}
/// For when there are no matches, but crop is required.
fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> Option<MatchBounds> {
let final_token_index = get_adjusted_index_forward_for_crop_size(self.tokens, crop_size);
let final_token = &self.tokens[final_token_index];
if final_token_index == self.tokens.len() - 1 {
return None;
}
// TODO: Why is it that when we match all of the tokens we need to get byte_end instead of start?
Some(MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] })
}
fn get_matches_and_crop_indices(&self, crop_size: usize) -> MatchesAndCropIndices {
// TODO: This doesn't give back 2 phrases if one is out of crop window
// Solution: also get next and previous matches, and if they're in the crop window, even if partially, highlight them
let [matches_first_index, matches_last_index] =
super::best_match_range::get_best_match_index_range(
self.matches,
self.query_positions,
crop_size,
);
let first_match = &self.matches[matches_first_index];
let last_match = &self.matches[matches_last_index];
let last_match_last_word_pos = last_match.get_last_word_pos();
let first_match_first_word_pos = first_match.get_first_word_pos();
let words_count = last_match_last_word_pos - first_match_first_word_pos + 1;
let [index_backward, index_forward] = get_adjusted_indices_for_highlights_and_crop_size(
self.tokens,
first_match.get_first_token_pos(),
last_match.get_last_token_pos(),
words_count,
crop_size,
);
let is_index_backward_at_limit = index_backward == 0;
let is_index_forward_at_limit = index_forward == self.tokens.len() - 1;
let backward_token = &self.tokens[index_backward];
let crop_byte_start = if is_index_backward_at_limit {
backward_token.byte_start
} else {
backward_token.byte_end
};
let forward_token = &self.tokens[index_forward];
let crop_byte_end = if is_index_forward_at_limit {
forward_token.byte_end
} else {
forward_token.byte_start
};
MatchesAndCropIndices {
matches_first_index,
matches_last_index,
crop_byte_start,
crop_byte_end,
}
}
/// TODO: description
fn get_crop_and_highlight_bounds_with_matches(&self, crop_size: usize) -> MatchBounds {
self.get_match_bounds(self.get_matches_and_crop_indices(crop_size))
}
/// For crop but no highlight.
fn get_crop_bounds_with_matches(&self, crop_size: usize) -> MatchBounds {
let mci = self.get_matches_and_crop_indices(crop_size);
MatchBounds {
highlight_toggle: false,
indices: vec![mci.crop_byte_start, mci.crop_byte_end],
}
}
}
impl MatchBounds {
pub fn try_new(
tokens: &[Token],
matches: &[Match],
query_positions: &[QueryPosition],
format_options: FormatOptions,
) -> Option<MatchBounds> {
let mbh = MatchBoundsHelper { tokens, matches, query_positions };
if let Some(crop_size) = format_options.crop.filter(|v| *v != 0) {
if matches.is_empty() {
return mbh.get_crop_bounds_with_no_matches(crop_size);
}
if format_options.highlight {
return Some(mbh.get_crop_and_highlight_bounds_with_matches(crop_size));
}
return Some(mbh.get_crop_bounds_with_matches(crop_size));
}
if format_options.highlight && !matches.is_empty() {
Some(mbh.get_match_bounds(MatchesAndCropIndices {
matches_first_index: 0,
matches_last_index: matches.len() - 1,
crop_byte_start: 0,
crop_byte_end: tokens[tokens.len() - 1].byte_end,
}))
} else {
None
}
}
}
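
`MatchBounds` carries a flat list of byte offsets: consecutive pairs delimit alternating segments, and `highlight_toggle` gives the state of the first segment. A hedged sketch of how a consumer might decode it (the consuming side is not part of this commit):

// Decode a MatchBounds-style payload into (text, highlighted) segments.
fn decode(text: &str, indices: &[usize], mut highlighted: bool) -> Vec<(String, bool)> {
    let mut segments = Vec::new();
    for pair in indices.windows(2) {
        segments.push((text[pair[0]..pair[1]].to_string(), highlighted));
        highlighted = !highlighted;
    }
    segments
}

fn main() {
    let text = "the quick brown fox";
    // matches on "quick" (bytes 4..9) and "fox" (bytes 16..19); the crop
    // starts before the first match, so `highlight_toggle` would be false
    let indices = [0, 4, 9, 16, 19];
    assert_eq!(
        decode(text, &indices, false),
        vec![
            ("the ".to_string(), false),
            ("quick".to_string(), true),
            (" brown ".to_string(), false),
            ("fox".to_string(), true),
        ]
    );
}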

View file

@ -1,24 +1,89 @@
use std::cmp::Reverse;
use std::fmt;
use std::ops::RangeInclusive;
use std::fmt::{Debug, Formatter, Result};
use charabia::Token;
use super::super::interner::Interned;
use super::super::query_term::LocatedQueryTerm;
use super::super::{DedupInterner, Phrase};
use super::r#match::{Match, MatchPosition};
use crate::SearchContext;
pub struct LocatedMatchingPhrase {
pub value: Interned<Phrase>,
pub positions: RangeInclusive<WordId>,
enum PrefixedOrEquality {
Prefixed,
Equality,
NotApplicable,
}
pub struct LocatedMatchingWords {
pub value: Vec<Interned<String>>,
pub positions: RangeInclusive<WordId>,
pub is_prefix: bool,
pub original_char_count: usize,
impl PrefixedOrEquality {
fn new(string: &str, other_string: &str, is_other_string_prefix: bool) -> Self {
if string.is_empty() {
return if other_string.is_empty() { Self::Equality } else { Self::NotApplicable };
}
let mut other_string_iter = other_string.chars();
for c in string.chars() {
let Some(other_c) = other_string_iter.next() else {
return if is_other_string_prefix { Self::Prefixed } else { Self::NotApplicable };
};
if c != other_c {
return Self::NotApplicable;
}
}
if other_string_iter.next().is_some() {
return Self::NotApplicable;
}
Self::Equality
}
}
// TODO: Consider using a tuple here, because indexing this array out of bounds is only caught at runtime
pub type UserQueryPositionRange = [u16; 2];
struct LocatedMatchingPhrase {
value: Interned<Phrase>,
position: UserQueryPositionRange,
}
struct LocatedMatchingWords {
value: Vec<Interned<String>>,
position: UserQueryPositionRange,
is_prefix: bool,
original_char_count: usize,
}
struct TokenPositionHelper<'a> {
token: &'a Token<'a>,
position_by_word: usize,
position_by_token: usize,
}
impl<'a> TokenPositionHelper<'a> {
fn iter_from_tokens(tokens: &'a [Token]) -> impl Iterator<Item = Self> + Clone {
tokens
.iter()
.scan([0, 0], |[token_position, word_position], token| {
// TODO: Naming
let token_word_thingy = Self {
position_by_token: *token_position,
position_by_word: *word_position,
token,
};
*token_position += 1;
if !token.is_separator() {
*word_position += 1;
}
Some(token_word_thingy)
})
.filter(|t| !t.token.is_separator())
}
}
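
`iter_from_tokens` numbers every token but only advances the word counter on non-separators, then drops the separators. A simplified standalone model with a plain `(lemma, is_separator)` pair in place of charabia's `Token`:

fn positions(tokens: &[(&str, bool)]) -> Vec<(String, usize, usize)> {
    tokens
        .iter()
        .scan([0, 0], |[token_pos, word_pos], (lemma, is_separator)| {
            let current = (lemma.to_string(), *token_pos, *word_pos);
            *token_pos += 1;
            if !is_separator {
                *word_pos += 1;
            }
            Some((current, *is_separator))
        })
        .filter(|(_, is_separator)| !is_separator)
        .map(|(current, _)| current)
        .collect()
}

fn main() {
    let tokens = [("the", false), (" ", true), ("fox", false)];
    // "fox" is token 2 but word 1: the separator used up a token slot only
    assert_eq!(
        positions(&tokens),
        vec![("the".to_string(), 0, 0), ("fox".to_string(), 2, 1)]
    );
}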
/// Structure created from a query tree
@ -27,180 +92,246 @@ pub struct LocatedMatchingWords {
pub struct MatchingWords {
word_interner: DedupInterner<String>,
phrase_interner: DedupInterner<Phrase>,
phrases: Vec<LocatedMatchingPhrase>,
words: Vec<LocatedMatchingWords>,
located_matching_phrases: Vec<LocatedMatchingPhrase>,
located_matching_words: Vec<LocatedMatchingWords>,
}
#[cfg_attr(test, derive(Debug, PartialEq))]
pub struct QueryPosition {
pub range: UserQueryPositionRange,
pub index: usize,
}
impl MatchingWords {
pub fn new(ctx: SearchContext<'_>, located_terms: Vec<LocatedQueryTerm>) -> Self {
let mut phrases = Vec::new();
let mut words = Vec::new();
pub fn new(ctx: SearchContext, located_terms: &[LocatedQueryTerm]) -> Self {
let mut located_matching_phrases = Vec::new();
let mut located_matching_words = Vec::new();
// Extract and centralize the different phrases and words to match stored in a QueryTerm
// and wrap them in dedicated structures.
for located_term in located_terms {
let term = ctx.term_interner.get(located_term.value);
for LocatedQueryTerm { value, positions } in located_terms {
let term = ctx.term_interner.get(*value);
let (matching_words, matching_phrases) = term.all_computed_derivations();
for matching_phrase in matching_phrases {
phrases.push(LocatedMatchingPhrase {
value: matching_phrase,
positions: located_term.positions.clone(),
});
}
let position = [*positions.start(), *positions.end()];
words.push(LocatedMatchingWords {
located_matching_phrases.reserve(matching_phrases.len());
located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| {
LocatedMatchingPhrase { value: *matching_phrase, position }
}));
located_matching_words.push(LocatedMatchingWords {
value: matching_words,
positions: located_term.positions.clone(),
position,
is_prefix: term.is_prefix(),
original_char_count: term.original_word(&ctx).chars().count(),
});
}
// Sort words to put prefixes at the bottom, prioritizing exact matches.
words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
// Sort words so that non-prefix words come first, then by the width of their query position range in descending order.
// This only affects which candidate a token is matched against first.
located_matching_words.sort_unstable_by_key(|lmw| {
(lmw.is_prefix, Reverse(lmw.position[1] - lmw.position[0]))
});
Self {
phrases,
words,
located_matching_phrases,
located_matching_words,
word_interner: ctx.word_interner,
phrase_interner: ctx.phrase_interner,
}
}
/// Returns an iterator over terms that match or partially match the given token.
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
fn try_get_phrase_match<'a>(
&self,
token_position_helper_iter: &mut (impl Iterator<Item = TokenPositionHelper<'a>> + Clone),
) -> Option<(Match, UserQueryPositionRange)> {
let mut mapped_phrase_iter = self.located_matching_phrases.iter().map(|lmp| {
let words_iter = self
.phrase_interner
.get(lmp.value)
.words
.iter()
.map(|word_option| word_option.map(|word| self.word_interner.get(word).as_str()))
.peekable();
(lmp.position, words_iter)
});
'outer: loop {
let (query_position_range, mut words_iter) = mapped_phrase_iter.next()?;
// TODO: Is it worth only cloning if we have to?
let mut tph_iter = token_position_helper_iter.clone();
let mut first_tph_details = None;
let last_tph_details = loop {
// 1. get the next word from `words_iter` and the next token position helper from `tph_iter`
let (Some(word), Some(tph)) = (words_iter.next(), tph_iter.next()) else {
// 2. if either iterator is exhausted, move on to the next phrase and reset the token position helper iterator
continue 'outer;
};
// before matching, save the first token position helper's positions and byte/char offsets
if first_tph_details.is_none() {
first_tph_details = Some([
tph.position_by_token,
tph.position_by_word,
tph.token.char_start,
tph.token.byte_start,
]);
}
// 3. check if word matches our token
let is_matching = match word {
Some(word) => tph.token.lemma() == word,
// a `None` value in the phrase words iterator corresponds to a stop word,
// the value is considered a match if the current token is categorized as a stop word.
None => tph.token.is_stopword(),
};
// 4. if it does not, move on to the next phrase and reset the token position helper iterator
if !is_matching {
continue 'outer;
}
// 5. if it does, and there are no words left, time to return
if words_iter.peek().is_none() {
break [
tph.position_by_token,
tph.position_by_word,
tph.token.char_end,
tph.token.byte_end,
];
}
};
let [first_tph_position_by_token, first_tph_position_by_word, first_tph_char_start, first_tph_byte_start] =
first_tph_details.expect("TODO");
let [last_tph_position_by_token, last_tph_position_by_word, last_tph_char_end, last_tph_byte_end] =
last_tph_details;
// save new position in parameter iterator
*token_position_helper_iter = tph_iter;
return Some((
Match {
// do not +1, because Token index ranges are exclusive
byte_len: last_tph_byte_end - first_tph_byte_start,
char_count: last_tph_char_end - first_tph_char_start,
position: MatchPosition::Phrase {
word_position_range: [
first_tph_position_by_word,
last_tph_position_by_word,
],
token_position_range: [
first_tph_position_by_token,
last_tph_position_by_token,
],
},
},
query_position_range,
));
}
}
/// Try to match the token with one of the located_words.
fn match_unique_words<'a>(&'a self, token: &Token<'_>) -> Option<MatchType<'a>> {
for located_words in &self.words {
for word in &located_words.value {
let word = self.word_interner.get(*word);
// if the word is a prefix we match using starts_with.
if located_words.is_prefix && token.lemma().starts_with(word) {
let Some((char_index, c)) =
word.char_indices().take(located_words.original_char_count).last()
else {
continue;
};
let prefix_length = char_index + c.len_utf8();
let (char_count, byte_len) = token.original_lengths(prefix_length);
let ids = &located_words.positions;
return Some(MatchType::Full { ids, char_count, byte_len });
// else we exact match the token.
} else if token.lemma() == word {
let ids = &located_words.positions;
return Some(MatchType::Full {
char_count: token.char_end - token.char_start,
byte_len: token.byte_end - token.byte_start,
ids,
});
fn try_get_word_match(
&self,
tph: TokenPositionHelper,
text: &str,
) -> Option<(Match, UserQueryPositionRange)> {
let mut iter =
self.located_matching_words.iter().flat_map(|lw| lw.value.iter().map(move |w| (lw, w)));
loop {
let (located_words, word) = iter.next()?;
let word = self.word_interner.get(*word);
let [char_count, byte_len] =
match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix) {
PrefixedOrEquality::Prefixed => {
let prefix_byte_len = text[tph.token.byte_start..]
.char_indices()
.nth(located_words.original_char_count - 1)
.map(|(i, c)| i + c.len_utf8())
.expect("expected text to have n-th thing bal bla TODO");
// TODO: Investigate token original byte length and similar methods and why they're not good enough
[located_words.original_char_count, prefix_byte_len]
}
// do not +1, because Token index ranges are exclusive
PrefixedOrEquality::Equality => [
tph.token.char_end - tph.token.char_start,
tph.token.byte_end - tph.token.byte_start,
],
_ => continue,
};
return Some((
Match {
char_count,
byte_len,
position: MatchPosition::Word {
word_position: tph.position_by_word,
token_position: tph.position_by_token,
},
},
located_words.position,
));
}
}
pub fn get_matches_and_query_positions(
&self,
tokens: &[Token],
text: &str,
) -> (Vec<Match>, Vec<QueryPosition>) {
// TODO: Note in the doc that with the help of this iter, matches are guaranteed to be ordered
let mut token_position_helper_iter = TokenPositionHelper::iter_from_tokens(tokens);
let mut matches = Vec::new();
let mut query_positions = Vec::new();
loop {
// try and get a phrase match
if let Some((r#match, range)) =
self.try_get_phrase_match(&mut token_position_helper_iter)
{
matches.push(r#match);
query_positions.push(QueryPosition { range, index: matches.len() - 1 });
continue;
}
// if the above fails, try to get the next token position helper
if let Some(tph) = token_position_helper_iter.next() {
// and then try and get a word match
if let Some((r#match, range)) = self.try_get_word_match(tph, text) {
matches.push(r#match);
query_positions.push(QueryPosition { range, index: matches.len() - 1 });
}
}
} else {
// there are no more items in the iterator, we are done searching for matches
break;
};
}
None
// TODO: Explain why
query_positions.sort_unstable_by_key(|v| v.range[0]);
(matches, query_positions)
}
}
/// Iterator over terms that match the given token.
/// This allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
matching_words: &'a MatchingWords,
phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
token: &'b Token<'b>,
}
impl<'a> Iterator for MatchesIter<'a, '_> {
type Item = MatchType<'a>;
fn next(&mut self) -> Option<Self::Item> {
match self.phrases.next() {
// Try to match all the phrases first.
Some(located_phrase) => {
let phrase = self.matching_words.phrase_interner.get(located_phrase.value);
// create a PartialMatch struct to make it compute the first match
// instead of duplicating the code.
let ids = &located_phrase.positions;
// collect the references of words from the interner.
let words = phrase
.words
.iter()
.map(|word| {
word.map(|word| self.matching_words.word_interner.get(word).as_str())
})
.collect();
let partial = PartialMatch { matching_words: words, ids };
partial.match_token(self.token).or_else(|| self.next())
}
            // If no phrase matches, try to match unique words.
None => self.matching_words.match_unique_words(self.token),
}
}
}
/// Id of a matching term corresponding to a word written by the end user.
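/// For the query `split this world`, `split` would get id 0, `this` id 1 and
/// `world` id 2 (illustrative; see the test at the bottom of this file).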
pub type WordId = u16;
/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases we need to match several consecutive tokens for the match to be considered full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
Partial(PartialMatch<'a>),
}
/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: Vec<Option<&'a str>>,
ids: &'a RangeInclusive<WordId>,
}
impl<'a> PartialMatch<'a> {
/// Returns:
/// - None if the given token breaks the partial match
/// - Partial if the given token matches the partial match but doesn't complete it
/// - Full if the given token completes the partial match
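    ///
    /// A sketch of how consecutive tokens are meant to be fed in (`partial` and
    /// `tokens` are illustrative, not the actual call site):
    ///
    /// ```ignore
    /// let mut current = MatchType::Partial(partial);
    /// for token in tokens {
    ///     match current {
    ///         MatchType::Partial(p) => match p.match_token(&token) {
    ///             Some(next) => current = next, // still Partial, or finally Full
    ///             None => break,                // this token broke the sequence
    ///         },
    ///         MatchType::Full { .. } => break,  // every word matched consecutively
    ///     }
    /// }
    /// ```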
pub fn match_token(self, token: &Token<'_>) -> Option<MatchType<'a>> {
let Self { mut matching_words, ids, .. } = self;
let is_matching = match matching_words.first()? {
Some(word) => &token.lemma() == word,
// a None value in the phrase corresponds to a stop word,
            // the value is considered a match if the current token is categorized as a stop word.
None => token.is_stopword(),
};
// if there are remaining words to match in the phrase and the current token is matching,
// return a new Partial match allowing the highlighter to continue.
if is_matching && matching_words.len() > 1 {
matching_words.remove(0);
Some(MatchType::Partial(Self { matching_words, ids }))
// if there is no remaining word to match in the phrase and the current token is matching,
// return a Full match.
} else if is_matching {
Some(MatchType::Full {
char_count: token.char_end - token.char_start,
byte_len: token.byte_end - token.byte_start,
ids,
})
// if the current token doesn't match, return None to break the match sequence.
} else {
None
}
}
}
impl fmt::Debug for MatchingWords {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let MatchingWords {
            word_interner,
            phrase_interner,
            located_matching_phrases: phrases,
            located_matching_words: words,
        } = self;
let phrases: Vec<_> = phrases
.iter()
@ -213,119 +344,110 @@ impl fmt::Debug for MatchingWords {
.map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
.collect::<Vec<_>>()
.join(" "),
p.position,
)
})
.collect();
let words: Vec<_> = words
.iter()
.flat_map(|w| {
w.value
.iter()
.map(|s| (word_interner.get(*s), w.position, w.is_prefix))
.collect::<Vec<_>>()
})
.collect();
f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
}
}
#[cfg(test)]
pub(crate) mod tests {
use std::borrow::Cow;
use charabia::{TokenKind, TokenizerBuilder};
use super::super::super::located_query_terms_from_tokens;
use super::*;
use crate::search::new::matches::tests::temp_index_with_documents;
use crate::search::new::query_term::ExtractedTokens;
#[test]
fn matching_words() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let tokens = tokenizer.tokenize("split this world");
let text = "split this world";
let tokens = tokenizer.tokenize(text);
let ExtractedTokens { query_terms, .. } =
located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, &query_terms);
assert_eq!(
matching_words.get_matches_and_query_positions(
&[
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("split"),
char_end: "split".chars().count(),
byte_end: "split".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("worlded"),
char_end: "worlded".chars().count(),
byte_end: "worlded".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
}
],
text
),
(
vec![
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 0, token_position: 0 }
},
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 2, token_position: 2 }
},
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 3, token_position: 3 }
}
],
vec![
QueryPosition { range: [0, 0], index: 0 },
QueryPosition { range: [2, 2], index: 1 },
QueryPosition { range: [2, 2], index: 2 }
]
)
);
}
}

File diff suppressed because it is too large

View file

@ -1,15 +0,0 @@
use charabia::{SeparatorKind, Token, TokenKind};
pub enum SimpleTokenKind {
Separator(SeparatorKind),
NotSeparator,
}
impl SimpleTokenKind {
pub fn new(token: &&Token<'_>) -> Self {
match token.kind {
            TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
_ => Self::NotSeparator,
}
}
}