diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index ac041a8b0..1c24a0fcf 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -226,9 +226,9 @@ fn process_tokens<'a>( ) -> impl Iterator)> { tokens .skip_while(|token| token.is_separator()) - .scan((0, None), |(offset, prev_kind), token| { + .scan((0, None), |(offset, prev_kind), mut token| { match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { + TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { *offset += match *prev_kind { Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, Some(_) => 1, @@ -244,7 +244,7 @@ fn process_tokens<'a>( { *prev_kind = Some(token.kind); } - _ => (), + _ => token.kind = TokenKind::Unknown, } Some((*offset, token)) }) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 7e52c04c1..b0452315d 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -573,7 +573,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { tokenizer .tokenize(text) .filter_map(|token| { - if token.is_word() { + if token.is_word() && !token.lemma().is_empty() { Some(token.lemma().to_string()) } else { None @@ -1422,6 +1422,43 @@ mod tests { assert!(result.documents_ids.is_empty()); } + #[test] + fn thai_synonyms() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + let mut wtxn = index.write_txn().unwrap(); + // Send 2 documents; their ids are autogenerated. + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "name": "ยี่ปุ่น" }, + { "name": "ญี่ปุ่น" }, + ]), + ) + .unwrap(); + + // In the same transaction provide some synonyms + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_synonyms(btreemap! 
{ + "japanese".to_string() => vec![S("ญี่ปุ่น"), S("ยี่ปุ่น")], + }); + }) + .unwrap(); + wtxn.commit().unwrap(); + + // Ensure synonyms are effectively stored + let rtxn = index.read_txn().unwrap(); + let synonyms = index.synonyms(&rtxn).unwrap(); + assert!(!synonyms.is_empty()); // at this point the index should return something + + // Check that we can use synonyms + let result = index.search(&rtxn).query("japanese").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + } + #[test] fn setting_searchable_recomputes_other_settings() { let index = TempIndex::new();