MeiliSearch/milli/src/search/new/matches/matching_words.rs

use std::cmp::Reverse;
use std::fmt;
use std::ops::RangeInclusive;

use charabia::Token;

use super::super::interner::Interned;
use super::super::query_term::LocatedQueryTerm;
use super::super::{DedupInterner, Phrase};
use crate::SearchContext;

/// A phrase derived from a query term, together with the query positions it stands for.
pub struct LocatedMatchingPhrase {
    /// Interned handle of the phrase; it is resolved through the phrase interner
    /// owned by `MatchingWords`.
    pub value: Interned<Phrase>,
    /// Range of query word ids (`WordId`) copied from the located query term
    /// this phrase was derived from.
    pub positions: RangeInclusive<WordId>,
}

/// The single-word derivations of one query term, together with the query
/// positions they stand for.
pub struct LocatedMatchingWords {
    /// Interned handles of every word derivation computed for the term; they are
    /// resolved through the word interner owned by `MatchingWords`.
    pub value: Vec<Interned<String>>,
    /// Range of query word ids (`WordId`) copied from the located query term.
    pub positions: RangeInclusive<WordId>,
    /// Whether the term is a prefix: when `true`, a token matches as soon as its
    /// lemma starts with one of the words instead of requiring equality.
    pub is_prefix: bool,
    /// Number of `char`s in the term's original word; it caps how many characters
    /// of a prefix match are reported as matched.
    pub original_char_count: usize,
}

/// Structure created from the located terms of a query,
/// referencing the words and phrases that match those terms.
///
/// It owns the interners taken from the search context so that the interned
/// handles stored in `phrases` and `words` can be resolved back to text.
#[derive(Default)]
pub struct MatchingWords {
    // Resolves `Interned<String>` handles to the actual words.
    word_interner: DedupInterner<String>,
    // Resolves `Interned<Phrase>` handles to the actual phrases.
    phrase_interner: DedupInterner<Phrase>,
    // Phrase derivations of every query term; these are tried before single words.
    phrases: Vec<LocatedMatchingPhrase>,
    // Word derivations, one entry per query term, sorted so that non-prefix
    // entries come before prefix entries.
    words: Vec<LocatedMatchingWords>,
}

impl MatchingWords {
    pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
        let mut phrases = Vec::new();
        let mut words = Vec::new();

        // Extract and centralize the different phrases and words to match stored in a QueryTerm
        // and wrap them in dedicated structures.
        for located_term in located_terms {
            let term = ctx.term_interner.get(located_term.value);
            let (matching_words, matching_phrases) = term.all_computed_derivations();

            for matching_phrase in matching_phrases {
                phrases.push(LocatedMatchingPhrase {
                    value: matching_phrase,
                    positions: located_term.positions.clone(),
                });
            }

            words.push(LocatedMatchingWords {
                value: matching_words,
                positions: located_term.positions.clone(),
                is_prefix: term.is_prefix(),
                original_char_count: term.original_word(&ctx).chars().count(),
            });
        }

        // Sort word to put prefixes at the bottom prioritizing the exact matches.
        words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));

        Self {
            phrases,
            words,
            word_interner: ctx.word_interner,
            phrase_interner: ctx.phrase_interner,
        }
    }

    /// Returns an iterator over terms that match or partially match the given token.
    pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
        MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
    }

    /// Try to match the token with one of the located_words.
    fn match_unique_words<'a>(&'a self, token: &Token) -> Option<MatchType<'a>> {
        for located_words in &self.words {
            for word in &located_words.value {
                let word = self.word_interner.get(*word);
                // if the word is a prefix we match using starts_with.
                if located_words.is_prefix && token.lemma().starts_with(word) {
                    let Some((char_index, c)) = word.char_indices().take(located_words.original_char_count).last() else {
                        continue;
                    };
                    let prefix_length = char_index + c.len_utf8();
                    let char_len = token.original_lengths(prefix_length).0;
                    let ids = &located_words.positions;
                    return Some(MatchType::Full { char_len, ids });
                // else we exact match the token.
                } else if token.lemma() == word {
                    let char_len = token.char_end - token.char_start;
                    let ids = &located_words.positions;
                    return Some(MatchType::Full { char_len, ids });
                }
            }
        }

        None
    }
}

/// Iterator over the terms that match the given token.
/// This allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
    // Source of the interners and of the single-word derivations.
    matching_words: &'a MatchingWords,
    // Phrases that have not been tried against the token yet.
    phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
    // The token every candidate is matched against.
    token: &'b Token<'b>,
}

impl<'a> Iterator for MatchesIter<'a, '_> {
    type Item = MatchType<'a>;

    /// Yields the next match for the token.
    ///
    /// Phrases are tried first, in order; phrases whose first word does not match
    /// the token are skipped. Once every phrase has been consumed, each call falls
    /// back to the single-word lookup and returns its result as is.
    fn next(&mut self) -> Option<Self::Item> {
        // Copy the shared references out of `self` so that `self.phrases` can be
        // advanced while they are in use.
        let matching_words = self.matching_words;
        let token = self.token;

        for located_phrase in self.phrases.by_ref() {
            let phrase = matching_words.phrase_interner.get(located_phrase.value);

            // Resolve the phrase into plain string slices; a `None` entry is kept
            // as is and stands for a stop word.
            let expected_words = phrase
                .words
                .iter()
                .map(|word| word.map(|word| matching_words.word_interner.get(word).as_str()))
                .collect();

            // Start from an empty partial match and let it decide whether the token
            // opens (or, for a one-word phrase, completes) the phrase, instead of
            // duplicating that logic here.
            let candidate = PartialMatch {
                matching_words: expected_words,
                ids: &located_phrase.positions,
                char_len: 0,
            };

            if let Some(found) = candidate.match_token(token) {
                return Some(found);
            }
        }

        // No phrase is left to try: fall back to the unique words.
        matching_words.match_unique_words(token)
    }
}

/// Id of a matching term corresponding to a word written by the end user.
pub type WordId = u16;

/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
///
/// In these cases we need to match consecutively several tokens to consider that the match is full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
    /// The token completes a match: `char_len` is the number of characters of the
    /// token that are matched and `ids` is the range of query word ids it matches.
    Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
    /// The token matches the beginning of a multi-word phrase; the following
    /// tokens must be fed to the contained `PartialMatch` to continue the match.
    Partial(PartialMatch<'a>),
}

/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
    // Words of the phrase that still have to be matched, in order.
    // A `None` entry stands for a stop word.
    matching_words: Vec<Option<&'a str>>,
    // Range of query word ids the phrase matches.
    ids: &'a RangeInclusive<WordId>,
    // Character length of the last token that matched (0 before any token matched).
    char_len: usize,
}

impl<'a> PartialMatch<'a> {
    /// Feeds the next token to the partial match.
    ///
    /// Returns:
    /// - `None` if the given token breaks the partial match
    ///   (or if there is no word left to match)
    /// - `Partial` if the given token matches the next expected word but other
    ///   words remain to be matched
    /// - `Full` if the given token matches the last expected word
    pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
        let Self { matching_words: mut remaining, ids, .. } = self;

        let token_matches = match *remaining.first()? {
            // A `None` value in the phrase corresponds to a stop word: it is
            // considered a match if the current token is categorized as a stop word.
            None => token.is_stopword(),
            Some(expected) => token.lemma() == expected,
        };

        let char_len = token.char_end - token.char_start;

        // The token doesn't match: break the match sequence.
        if !token_matches {
            return None;
        }

        if remaining.len() > 1 {
            // Other words remain after this one: drop the word just matched and
            // hand back a new partial match so the highlighter can continue.
            remaining.remove(0);
            Some(MatchType::Partial(PartialMatch { matching_words: remaining, ids, char_len }))
        } else {
            // This was the last word of the phrase: the match is complete.
            Some(MatchType::Full { char_len, ids })
        }
    }

    /// Character length recorded for this partial match.
    pub fn char_len(&self) -> usize {
        self.char_len
    }
}

impl fmt::Debug for MatchingWords {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let MatchingWords { word_interner, phrase_interner, phrases, words } = self;

        let phrases: Vec<_> = phrases
            .iter()
            .map(|p| {
                (
                    phrase_interner
                        .get(p.value)
                        .words
                        .iter()
                        .map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
                        .collect::<Vec<_>>()
                        .join(" "),
                    p.positions.clone(),
                )
            })
            .collect();

        let words: Vec<_> = words
            .iter()
            .flat_map(|w| {
                w.value
                    .iter()
                    .map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
                    .collect::<Vec<_>>()
            })
            .collect();

        f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
    }
}

#[cfg(test)]
pub(crate) mod tests {
    use std::borrow::Cow;

    use charabia::{TokenKind, TokenizerBuilder};

    use super::super::super::located_query_terms_from_tokens;
    use super::*;
    use crate::index::tests::TempIndex;

    /// Creates a temporary index filled with the three documents the matching tests rely on.
    pub(crate) fn temp_index_with_documents() -> TempIndex {
        let index = TempIndex::new();
        index
            .add_documents(documents!([
                { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
                { "id": 2, "name": "Westfália" },
                { "id": 3, "name": "Ŵôřlḑôle" },
            ]))
            .unwrap();
        index
    }

    /// Builds a word token whose lemma is `lemma` and whose end offsets span the whole lemma.
    fn word_token(lemma: &str) -> Token<'_> {
        Token {
            kind: TokenKind::Word,
            lemma: Cow::Borrowed(lemma),
            char_end: lemma.chars().count(),
            byte_end: lemma.len(),
            ..Default::default()
        }
    }

    #[test]
    fn matching_words() {
        let index = temp_index_with_documents();
        let rtxn = index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&index, &rtxn);
        let mut builder = TokenizerBuilder::default();
        let tokenizer = builder.build();
        let tokens = tokenizer.tokenize("split this world");
        let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
        let matching_words = MatchingWords::new(ctx, query_terms);

        // Exact match on the first query word.
        assert_eq!(
            matching_words.match_token(&word_token("split")).next(),
            Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
        );
        // A word that is not part of the query.
        assert_eq!(matching_words.match_token(&word_token("nyc")).next(), None);
        // Exact match on the last query word.
        assert_eq!(
            matching_words.match_token(&word_token("world")).next(),
            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
        );
        // The last query word is a prefix: only its 5 characters are reported.
        assert_eq!(
            matching_words.match_token(&word_token("worlded")).next(),
            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
        );
        // A non-final query word must not match as a prefix.
        assert_eq!(matching_words.match_token(&word_token("thisnew")).next(), None);
    }
}
add new highlighter 2023-04-06 12:15:37 +02:00			`use std::cmp::Reverse;`
Integrate the new Highlighter in the search 2023-04-06 13:58:56 +02:00			`use std::fmt;`
add new highlighter 2023-04-06 12:15:37 +02:00			`use std::ops::RangeInclusive;`

			`use charabia::Token;`

			`use super::super::interner::Interned;`
Matching words fixes 2023-04-11 15:41:44 +02:00			`use super::super::query_term::LocatedQueryTerm;`
add new highlighter 2023-04-06 12:15:37 +02:00			`use super::super::{DedupInterner, Phrase};`
			`use crate::SearchContext;`

			`pub struct LocatedMatchingPhrase {`
			`pub value: Interned<Phrase>,`
			`pub positions: RangeInclusive<WordId>,`
			`}`

			`pub struct LocatedMatchingWords {`
			`pub value: Vec<Interned<String>>,`
			`pub positions: RangeInclusive<WordId>,`
			`pub is_prefix: bool,`
Patch prefix match 2023-04-06 17:22:31 +02:00			`pub original_char_count: usize,`
add new highlighter 2023-04-06 12:15:37 +02:00			`}`

			`/// Structure created from a query tree`
			`/// referencing words that match the given query tree.`
Integrate the new Highlighter in the search 2023-04-06 13:58:56 +02:00			`#[derive(Default)]`
Make the matcher consume the search context 2023-04-06 12:28:28 +02:00			`pub struct MatchingWords {`
			`word_interner: DedupInterner<String>,`
			`phrase_interner: DedupInterner<Phrase>,`
add new highlighter 2023-04-06 12:15:37 +02:00			`phrases: Vec<LocatedMatchingPhrase>,`
			`words: Vec<LocatedMatchingWords>,`
			`}`

Make the matcher consume the search context 2023-04-06 12:28:28 +02:00			`impl MatchingWords {`
			`pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {`
add new highlighter 2023-04-06 12:15:37 +02:00			`let mut phrases = Vec::new();`
			`let mut words = Vec::new();`

Matching words fixes 2023-04-11 15:41:44 +02:00			`// Extract and centralize the different phrases and words to match stored in a QueryTerm`
add new highlighter 2023-04-06 12:15:37 +02:00			`// and wrap them in dedicated structures.`
			`for located_term in located_terms {`
			`let term = ctx.term_interner.get(located_term.value);`
Matching words fixes 2023-04-11 15:41:44 +02:00			`let (matching_words, matching_phrases) = term.all_computed_derivations();`
add new highlighter 2023-04-06 12:15:37 +02:00
			`for matching_phrase in matching_phrases {`
			`phrases.push(LocatedMatchingPhrase {`
			`value: matching_phrase,`
			`positions: located_term.positions.clone(),`
			`});`
			`}`
Patch prefix match 2023-04-06 17:22:31 +02:00
add new highlighter 2023-04-06 12:15:37 +02:00			`words.push(LocatedMatchingWords {`
			`value: matching_words,`
			`positions: located_term.positions.clone(),`
Fix prefix highlighting 2023-05-04 16:53:50 +02:00			`is_prefix: term.is_prefix(),`
Matching words fixes 2023-04-11 15:41:44 +02:00			`original_char_count: term.original_word(&ctx).chars().count(),`
add new highlighter 2023-04-06 12:15:37 +02:00			`});`
			`}`

			`// Sort word to put prefixes at the bottom prioritizing the exact matches.`
			`words.sort_unstable_by_key(\|lmw\| (lmw.is_prefix, Reverse(lmw.positions.len())));`

			`Self {`
			`phrases,`
			`words,`
Make the matcher consume the search context 2023-04-06 12:28:28 +02:00			`word_interner: ctx.word_interner,`
			`phrase_interner: ctx.phrase_interner,`
add new highlighter 2023-04-06 12:15:37 +02:00			`}`
			`}`

			`/// Returns an iterator over terms that match or partially match the given token.`
Make the matcher consume the search context 2023-04-06 12:28:28 +02:00			`pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {`
add new highlighter 2023-04-06 12:15:37 +02:00			`MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }`
			`}`

			`/// Try to match the token with one of the located_words.`
Make the matcher consume the search context 2023-04-06 12:28:28 +02:00			`fn match_unique_words<'a>(&'a self, token: &Token) -> Option<MatchType<'a>> {`
add new highlighter 2023-04-06 12:15:37 +02:00			`for located_words in &self.words {`
			`for word in &located_words.value {`
			`let word = self.word_interner.get(*word);`
			`// if the word is a prefix we match using starts_with.`
			`if located_words.is_prefix && token.lemma().starts_with(word) {`
Patch prefix match 2023-04-06 17:22:31 +02:00			`let Some((char_index, c)) = word.char_indices().take(located_words.original_char_count).last() else {`
			`continue;`
			`};`
			`let prefix_length = char_index + c.len_utf8();`
			`let char_len = token.original_lengths(prefix_length).0;`
add new highlighter 2023-04-06 12:15:37 +02:00			`let ids = &located_words.positions;`
			`return Some(MatchType::Full { char_len, ids });`
			`// else we exact match the token.`
			`} else if token.lemma() == word {`
			`let char_len = token.char_end - token.char_start;`
			`let ids = &located_words.positions;`
			`return Some(MatchType::Full { char_len, ids });`
			`}`
			`}`
			`}`

			`None`
			`}`
			`}`

			`/// Iterator over terms that match the given token,`
			`/// This allow to lazily evaluate matches.`
			`pub struct MatchesIter<'a, 'b> {`
Make the matcher consume the search context 2023-04-06 12:28:28 +02:00			`matching_words: &'a MatchingWords,`
add new highlighter 2023-04-06 12:15:37 +02:00			`phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,`
			`token: &'b Token<'b>,`
			`}`

			`impl<'a> Iterator for MatchesIter<'a, '_> {`
			`type Item = MatchType<'a>;`

			`fn next(&mut self) -> Option<Self::Item> {`
			`match self.phrases.next() {`
			`// Try to match all the phrases first.`
			`Some(located_phrase) => {`
			`let phrase = self.matching_words.phrase_interner.get(located_phrase.value);`

			`// create a PartialMatch struct to make it compute the first match`
			`// instead of duplicating the code.`
			`let ids = &located_phrase.positions;`
			`// collect the references of words from the interner.`
			`let words = phrase`
			`.words`
			`.iter()`
			`.map(\|word\| {`
			`word.map(\|word\| self.matching_words.word_interner.get(word).as_str())`
			`})`
			`.collect();`
			`let partial = PartialMatch { matching_words: words, ids, char_len: 0 };`

			`partial.match_token(self.token).or_else(\|\| self.next())`
			`}`
			`// If no phrases matches, try to match uiques words.`
			`None => self.matching_words.match_unique_words(self.token),`
			`}`
			`}`
			`}`

			`/// Id of a matching term corespounding to a word written by the end user.`
			`pub type WordId = u16;`

			`/// A given token can partially match a query word for several reasons:`
			`/// - split words`
			`/// - multi-word synonyms`
			`/// In these cases we need to match consecutively several tokens to consider that the match is full.`
			`#[derive(Debug, PartialEq)]`
			`pub enum MatchType<'a> {`
			`Full { char_len: usize, ids: &'a RangeInclusive<WordId> },`
			`Partial(PartialMatch<'a>),`
			`}`

			`/// Structure helper to match several tokens in a row in order to complete a partial match.`
			`#[derive(Debug, PartialEq)]`
			`pub struct PartialMatch<'a> {`
			`matching_words: Vec<Option<&'a str>>,`
			`ids: &'a RangeInclusive<WordId>,`
			`char_len: usize,`
			`}`

			`impl<'a> PartialMatch<'a> {`
			`/// Returns:`
			`/// - None if the given token breaks the partial match`
			`/// - Partial if the given token matches the partial match but doesn't complete it`
			`/// - Full if the given token completes the partial match`
			`pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {`
			`let Self { mut matching_words, ids, .. } = self;`

			`let is_matching = match matching_words.first()? {`
			`Some(word) => &token.lemma() == word,`
			`// a None value in the phrase corresponds to a stop word,`
			`// the walue is considered a match if the current token is categorized as a stop word.`
			`None => token.is_stopword(),`
			`};`

			`let char_len = token.char_end - token.char_start;`
			`// if there are remaining words to match in the phrase and the current token is matching,`
			`// return a new Partial match allowing the highlighter to continue.`
			`if is_matching && matching_words.len() > 1 {`
			`matching_words.remove(0);`
			`Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))`
			`// if there is no remaining word to match in the phrase and the current token is matching,`
			`// return a Full match.`
			`} else if is_matching {`
			`Some(MatchType::Full { char_len, ids })`
			`// if the current token doesn't match, return None to break the match sequence.`
			`} else {`
			`None`
			`}`
			`}`

			`pub fn char_len(&self) -> usize {`
			`self.char_len`
			`}`
			`}`

Integrate the new Highlighter in the search 2023-04-06 13:58:56 +02:00			`impl fmt::Debug for MatchingWords {`
			`fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {`
			`let MatchingWords { word_interner, phrase_interner, phrases, words } = self;`

			`let phrases: Vec<_> = phrases`
			`.iter()`
			`.map(\|p\| {`
			`(`
			`phrase_interner`
			`.get(p.value)`
			`.words`
			`.iter()`
			`.map(\|w\| w.map_or("STOP_WORD", \|w\| word_interner.get(w)))`
			`.collect::<Vec<_>>()`
			`.join(" "),`
			`p.positions.clone(),`
			`)`
			`})`
			`.collect();`

			`let words: Vec<_> = words`
			`.iter()`
			`.flat_map(\|w\| {`
			`w.value`
			`.iter()`
			`.map(\|s\| (word_interner.get(*s), w.positions.clone(), w.is_prefix))`
			`.collect::<Vec<_>>()`
			`})`
			`.collect();`

			`f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()`
			`}`
			`}`

add new highlighter 2023-04-06 12:15:37 +02:00			`#[cfg(test)]`
			`pub(crate) mod tests {`
			`use std::borrow::Cow;`

			`use charabia::{TokenKind, TokenizerBuilder};`

rename located_query_terms_from_string -> located_query_terms_from_tokens 2023-05-02 18:53:01 +02:00			`use super::super::super::located_query_terms_from_tokens;`
add new highlighter 2023-04-06 12:15:37 +02:00			`use super::*;`
			`use crate::index::tests::TempIndex;`

			`pub(crate) fn temp_index_with_documents() -> TempIndex {`
			`let temp_index = TempIndex::new();`
			`temp_index`
			`.add_documents(documents!([`
Make all search tests pass, fix distinctAttribute bug 2023-04-24 12:11:25 +02:00			`{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },`
Fix prefix highlighting 2023-05-04 16:53:50 +02:00			`{ "id": 2, "name": "Westfália" },`
			`{ "id": 3, "name": "Ŵôřlḑôle" },`
add new highlighter 2023-04-06 12:15:37 +02:00			`]))`
			`.unwrap();`
			`temp_index`
			`}`

			`#[test]`
			`fn matching_words() {`
			`let temp_index = temp_index_with_documents();`
			`let rtxn = temp_index.read_txn().unwrap();`
			`let mut ctx = SearchContext::new(&temp_index, &rtxn);`
Update Charabia 2023-06-28 18:52:32 +02:00			`let mut builder = TokenizerBuilder::default();`
			`let tokenizer = builder.build();`
add new highlighter 2023-04-06 12:15:37 +02:00			`let tokens = tokenizer.tokenize("split this world");`
rename located_query_terms_from_string -> located_query_terms_from_tokens 2023-05-02 18:53:01 +02:00			`let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();`
Make the matcher consume the search context 2023-04-06 12:28:28 +02:00			`let matching_words = MatchingWords::new(ctx, query_terms);`
add new highlighter 2023-04-06 12:15:37 +02:00
			`assert_eq!(`
			`matching_words`
			`.match_token(&Token {`
			`kind: TokenKind::Word,`
			`lemma: Cow::Borrowed("split"),`
			`char_end: "split".chars().count(),`
			`byte_end: "split".len(),`
			`..Default::default()`
			`})`
			`.next(),`
			`Some(MatchType::Full { char_len: 5, ids: &(0..=0) })`
			`);`
			`assert_eq!(`
			`matching_words`
			`.match_token(&Token {`
			`kind: TokenKind::Word,`
			`lemma: Cow::Borrowed("nyc"),`
			`char_end: "nyc".chars().count(),`
			`byte_end: "nyc".len(),`
			`..Default::default()`
			`})`
			`.next(),`
			`None`
			`);`
			`assert_eq!(`
			`matching_words`
			`.match_token(&Token {`
			`kind: TokenKind::Word,`
			`lemma: Cow::Borrowed("world"),`
			`char_end: "world".chars().count(),`
			`byte_end: "world".len(),`
			`..Default::default()`
			`})`
			`.next(),`
			`Some(MatchType::Full { char_len: 5, ids: &(2..=2) })`
			`);`
			`assert_eq!(`
			`matching_words`
			`.match_token(&Token {`
			`kind: TokenKind::Word,`
			`lemma: Cow::Borrowed("worlded"),`
			`char_end: "worlded".chars().count(),`
			`byte_end: "worlded".len(),`
			`..Default::default()`
			`})`
			`.next(),`
Fix prefix highlighting 2023-05-04 16:53:50 +02:00			`Some(MatchType::Full { char_len: 5, ids: &(2..=2) })`
add new highlighter 2023-04-06 12:15:37 +02:00			`);`
			`assert_eq!(`
			`matching_words`
			`.match_token(&Token {`
			`kind: TokenKind::Word,`
			`lemma: Cow::Borrowed("thisnew"),`
			`char_end: "thisnew".chars().count(),`
			`byte_end: "thisnew".len(),`
			`..Default::default()`
			`})`
			`.next(),`
			`None`
			`);`
			`}`
			`}`