Check that the unidecoded words are not empty

This commit is contained in:
Clément Renault 2019-11-04 16:58:02 +01:00
parent 4571b80a49
commit 3b1cbed238
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -139,11 +139,12 @@ fn index_token(
if !lower.contains(is_cjk) { if !lower.contains(is_cjk) {
let unidecoded = deunicode_with_tofu(&lower, ""); let unidecoded = deunicode_with_tofu(&lower, "");
if unidecoded != lower { if unidecoded != lower && !unidecoded.is_empty() {
let token = Token { let token = Token {
word: &unidecoded, word: &unidecoded,
..token ..token
}; };
match token_to_docindex(id, attr, token) { match token_to_docindex(id, attr, token) {
Some(docindex) => { Some(docindex) => {
let word = Vec::from(token.word); let word = Vec::from(token.word);
@ -252,4 +253,22 @@ mod tests {
.get(&"éteindre".to_owned().into_bytes()) .get(&"éteindre".to_owned().into_bytes())
.is_some()); .is_some());
} }
/// Regression test for indexing a text made only of a flag emoji.
///
/// After `"🇯🇵"` is indexed and the indexer is built, the resulting word map
/// must still contain an entry keyed by the original UTF-8 bytes of the text.
// NOTE(review): presumably this emoji transliterates to an empty string, which
// is the situation the test name refers to — confirm against `deunicode_with_tofu`.
#[test]
fn no_empty_unidecode() {
    let text = "🇯🇵";
    let attr = SchemaAttr(0);
    let docid = DocumentId(0);

    let mut indexer = RawIndexer::new(fst::Set::default());
    indexer.index_text(docid, attr, text);
    let indexed = indexer.build();

    // Look the word up under the exact bytes of the original text.
    let key = "🇯🇵".to_owned().into_bytes();
    let entry = indexed.words_doc_indexes.get(&key);
    assert!(entry.is_some());
}
} }