integration with new tokenizer wip

This commit is contained in:
mpostma 2020-11-19 18:23:08 +01:00 committed by many
parent 8a4d05b7bb
commit 5e00842087
No known key found for this signature in database
GPG key ID: 2CEF23B75189EACA
5 changed files with 94 additions and 60 deletions

View file

@ -2,13 +2,3 @@ mod dfa;
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
/// Normalizes `string` for matching.
///
/// The input is lowercased first. If the lowercased text contains any
/// character accepted by `is_cjk`, it is returned as-is; otherwise it is
/// passed through `deunicode::deunicode_with_tofu` with an empty replacement
/// string and that result is returned.
// NOTE(review): `is_cjk` is defined outside this view — presumably a
// `char -> bool` predicate for CJK characters; confirm against its definition.
pub fn normalize_str(string: &str) -> String {
    let lowered = string.to_lowercase();

    if lowered.contains(is_cjk) {
        // Text with a matching character skips the deunicode pass entirely.
        lowered
    } else {
        deunicode::deunicode_with_tofu(&lowered, "")
    }
}