Merge #4033

4033: Fix thai synonyms r=Kerollmops a=Kerollmops Fixes #4031 Co-authored-by: Kerollmops <clement@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com>
2025-07-01 19:08:29 +02:00 · 2023-09-05 13:54:33 +00:00 · 2023-09-05 13:54:33 +00:00 · 287cf25d39
commit 287cf25d39
parent cea93e9a37 66aa6d5871
2 changed files with 41 additions and 4 deletions
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@ -226,9 +226,9 @@ fn process_tokens<'a>(
 ) -> impl Iterator<Item = (usize, Token<'a>)> {
    tokens
        .skip_while(|token| token.is_separator())
-        .scan((0, None), |(offset, prev_kind), token| {
+        .scan((0, None), |(offset, prev_kind), mut token| {
            match token.kind {
-                TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
+                TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
                    *offset += match *prev_kind {
                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                        Some(_) => 1,
@ -244,7 +244,7 @@ fn process_tokens<'a>(
                {
                    *prev_kind = Some(token.kind);
                }
-                _ => (),
+                _ => token.kind = TokenKind::Unknown,
            }
            Some((*offset, token))
        })
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@ -573,7 +573,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                    tokenizer
                        .tokenize(text)
                        .filter_map(|token| {
-                            if token.is_word() {
+                            if token.is_word() && !token.lemma().is_empty() {
                                Some(token.lemma().to_string())
                            } else {
                                None
@ -1422,6 +1422,43 @@ mod tests {
        assert!(result.documents_ids.is_empty());
    }
    // Checks that synonyms made of Thai words are stored and usable at search time:
    // two documents are indexed, both of their names are declared as synonyms of
    // "japanese", and a search for "japanese" is expected to return both documents.
    #[test]
    fn thai_synonyms() {
        let mut index = TempIndex::new();
        // The documents below carry no explicit id field, so docid autogeneration is enabled.
        index.index_documents_config.autogenerate_docids = true;
        let mut wtxn = index.write_txn().unwrap();
        // Send 2 documents; their names differ only by the first character.
        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "name": "ยี่ปุ่น" },
                    { "name": "ญี่ปุ่น" },
                ]),
            )
            .unwrap();
        // In the same transaction, map "japanese" to both document names as synonyms.
        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_synonyms(btreemap! {
                    "japanese".to_string() => vec![S("ญี่ปุ่น"), S("ยี่ปุ่น")],
                });
            })
            .unwrap();
        // Commit documents and settings together before reading anything back.
        wtxn.commit().unwrap();
        // Ensure the synonyms were actually stored.
        let rtxn = index.read_txn().unwrap();
        let synonyms = index.synonyms(&rtxn).unwrap();
        assert!(!synonyms.is_empty()); // at this point the index should return something
        // Check that the synonyms are used at search time: both documents must match.
        let result = index.search(&rtxn).query("japanese").execute().unwrap();
        assert_eq!(result.documents_ids.len(), 2);
    }
    #[test]
    fn setting_searchable_recomputes_other_settings() {
        let index = TempIndex::new();