4033: Fix Thai synonyms r=Kerollmops a=Kerollmops

Fixes #4031

Co-authored-by: Kerollmops <clement@meilisearch.com>
Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2023-09-05 13:54:33 +00:00 committed by GitHub
commit 287cf25d39
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 41 additions and 4 deletions

View File

@ -226,9 +226,9 @@ fn process_tokens<'a>(
) -> impl Iterator<Item = (usize, Token<'a>)> { ) -> impl Iterator<Item = (usize, Token<'a>)> {
tokens tokens
.skip_while(|token| token.is_separator()) .skip_while(|token| token.is_separator())
.scan((0, None), |(offset, prev_kind), token| { .scan((0, None), |(offset, prev_kind), mut token| {
match token.kind { match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
*offset += match *prev_kind { *offset += match *prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1, Some(_) => 1,
@ -244,7 +244,7 @@ fn process_tokens<'a>(
{ {
*prev_kind = Some(token.kind); *prev_kind = Some(token.kind);
} }
_ => (), _ => token.kind = TokenKind::Unknown,
} }
Some((*offset, token)) Some((*offset, token))
}) })

View File

@ -573,7 +573,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
tokenizer tokenizer
.tokenize(text) .tokenize(text)
.filter_map(|token| { .filter_map(|token| {
if token.is_word() { if token.is_word() && !token.lemma().is_empty() {
Some(token.lemma().to_string()) Some(token.lemma().to_string())
} else { } else {
None None
@ -1422,6 +1422,43 @@ mod tests {
assert!(result.documents_ids.is_empty()); assert!(result.documents_ids.is_empty());
} }
#[test]
fn thai_synonyms() {
    // Regression test: Thai words registered as synonyms of "japanese" must
    // be stored and must be usable at search time.
    let mut temp_index = TempIndex::new();
    temp_index.index_documents_config.autogenerate_docids = true;

    let mut txn = temp_index.write_txn().unwrap();

    // Index two documents, each with a Thai `name`; the documents carry no
    // explicit id, so document ids are autogenerated (enabled above).
    let thai_documents = documents!([
        { "name": "ยี่ปุ่น" },
        { "name": "ญี่ปุ่น" },
    ]);
    temp_index.add_documents_using_wtxn(&mut txn, thai_documents).unwrap();

    // Within the same write transaction, declare both Thai words as synonyms
    // of "japanese".
    temp_index
        .update_settings_using_wtxn(&mut txn, |settings| {
            settings.set_synonyms(btreemap! {
                "japanese".to_string() => vec![S("ญี่ปุ่น"), S("ยี่ปุ่น")],
            });
        })
        .unwrap();
    txn.commit().unwrap();

    // The synonyms read back from the index must not be empty.
    let reader = temp_index.read_txn().unwrap();
    let stored_synonyms = temp_index.synonyms(&reader).unwrap();
    assert!(!stored_synonyms.is_empty());

    // Querying "japanese" must return both documents.
    let mut search = temp_index.search(&reader);
    search.query("japanese");
    let outcome = search.execute().unwrap();
    let hit_count = outcome.documents_ids.len();
    assert_eq!(hit_count, 2);
}
#[test] #[test]
fn setting_searchable_recomputes_other_settings() { fn setting_searchable_recomputes_other_settings() {
let index = TempIndex::new(); let index = TempIndex::new();