From 085aad0a94b6431e72849f4807181a5015994ff7 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Mon, 4 Sep 2023 14:39:33 +0200
Subject: [PATCH 1/3] Add a test

---
 milli/src/update/settings.rs | 37 ++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 7e52c04c1..023e09aa0 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1422,6 +1422,43 @@ mod tests {
         assert!(result.documents_ids.is_empty());
     }
 
+    #[test]
+    fn thai_synonyms() {
+        let mut index = TempIndex::new();
+        index.index_documents_config.autogenerate_docids = true;
+
+        let mut wtxn = index.write_txn().unwrap();
+        // Send 2 documents.
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "name": "ยี่ปุ่น" },
+                    { "name": "ญี่ปุ่น" },
+                ]),
+            )
+            .unwrap();
+
+        // In the same transaction, provide some synonyms.
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_synonyms(btreemap! {
+                    "japanese".to_string() => vec!["ญี่ปุ่น", "ยี่ปุ่น"],
+                });
+            })
+            .unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure the synonyms are effectively stored.
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(!synonyms.is_empty()); // at this point the index should return something
+
+        // Check that we can use the synonyms.
+        let result = index.search(&rtxn).query("japanese").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+    }
+
     #[test]
     fn setting_searchable_recomputes_other_settings() {
         let index = TempIndex::new();

From 8ac5b765bc0d22dd8448e671c66529ca668e3389 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Mon, 4 Sep 2023 14:39:52 +0200
Subject: [PATCH 2/3] Fix synonyms normalization

---
 milli/src/update/settings.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 023e09aa0..b0452315d 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -573,7 +573,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                 tokenizer
                     .tokenize(text)
                     .filter_map(|token| {
-                        if token.is_word() {
+                        if token.is_word() && !token.lemma().is_empty() {
                             Some(token.lemma().to_string())
                         } else {
                             None
@@ -1443,7 +1443,7 @@ mod tests {
         index
             .update_settings_using_wtxn(&mut wtxn, |settings| {
                 settings.set_synonyms(btreemap! {
{ - "japanese".to_string() => vec!["ญี่ปุ่น", "ยี่ปุ่น"], + "japanese".to_string() => vec![S("ญี่ปุ่น"), S("ยี่ปุ่น")], }); }) .unwrap(); From 66aa6d5871050ea0c18bd672dba83681ded8869a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 5 Sep 2023 15:44:14 +0200 Subject: [PATCH 3/3] Ignore tokens with empty normalized value during indexing process --- .../index_documents/extract/extract_docid_word_positions.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index ac041a8b0..1c24a0fcf 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -226,9 +226,9 @@ fn process_tokens<'a>( ) -> impl Iterator)> { tokens .skip_while(|token| token.is_separator()) - .scan((0, None), |(offset, prev_kind), token| { + .scan((0, None), |(offset, prev_kind), mut token| { match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { + TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { *offset += match *prev_kind { Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, Some(_) => 1, @@ -244,7 +244,7 @@ fn process_tokens<'a>( { *prev_kind = Some(token.kind); } - _ => (), + _ => token.kind = TokenKind::Unknown, } Some((*offset, token)) })