Check that the unidecoded words are not empty

This commit is contained in:
Clément Renault 2019-11-04 16:58:02 +01:00
parent 4571b80a49
commit 3b1cbed238
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
1 changed file with 20 additions and 1 deletion

View File

@@ -139,11 +139,12 @@ fn index_token(
if !lower.contains(is_cjk) {
let unidecoded = deunicode_with_tofu(&lower, "");
if unidecoded != lower {
if unidecoded != lower && !unidecoded.is_empty() {
let token = Token {
word: &unidecoded,
..token
};
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
@@ -252,4 +253,22 @@ mod tests {
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
/// Indexing a text made only of a flag emoji must still register the emoji itself as a word.
///
/// NOTE(review): this presumably guards the case where the unidecoded form of the word is
/// empty — confirm against `index_token`.
#[test]
fn no_empty_unidecode() {
    let flag = "🇯🇵";

    let mut raw_indexer = RawIndexer::new(fst::Set::default());
    raw_indexer.index_text(DocumentId(0), SchemaAttr(0), flag);

    let indexed = raw_indexer.build();
    let key = flag.as_bytes().to_vec();

    // The original (non-transliterated) word must be present in the index.
    assert!(indexed.words_doc_indexes.get(&key).is_some());
}
}