mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-29 16:24:26 +01:00
Introduce a simple FST-based Chinese word segmenter
This commit is contained in:
parent
6cc6addc2f
commit
acd2a63879
@ -11,7 +11,7 @@ bstr = "0.2.13"
|
|||||||
byteorder = "1.3.4"
|
byteorder = "1.3.4"
|
||||||
csv = "1.1.3"
|
csv = "1.1.3"
|
||||||
flate2 = "1.0.17"
|
flate2 = "1.0.17"
|
||||||
fst = "0.4.3"
|
fst = "0.4.4"
|
||||||
fxhash = "0.2.1"
|
fxhash = "0.2.1"
|
||||||
heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
|
heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
|
||||||
human_format = "1.0.3"
|
human_format = "1.0.3"
|
||||||
@ -48,6 +48,9 @@ warp = "0.2.2"
|
|||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
criterion = "0.3"
|
criterion = "0.3"
|
||||||
|
|
||||||
|
[build-dependencies]
|
||||||
|
fst = "0.4.4"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
|
|
||||||
|
27
build.rs
Normal file
27
build.rs
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
use std::env;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{BufRead, BufReader};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use fst::SetBuilder;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let chinese_words_txt = "chinese-words.txt";
|
||||||
|
let out_dir = env::var("OUT_DIR").unwrap();
|
||||||
|
let chinese_words_fst = PathBuf::from(out_dir).join("chinese-words.fst");
|
||||||
|
|
||||||
|
// Tell Cargo that if the given file changes, to rerun this build script.
|
||||||
|
println!("cargo:rerun-if-changed={}", chinese_words_txt);
|
||||||
|
|
||||||
|
let chinese_words_txt = File::open(chinese_words_txt).map(BufReader::new).unwrap();
|
||||||
|
let chinese_words_fst = File::create(chinese_words_fst).unwrap();
|
||||||
|
|
||||||
|
let mut builder = SetBuilder::new(chinese_words_fst).unwrap();
|
||||||
|
for result in chinese_words_txt.lines() {
|
||||||
|
let line = result.unwrap();
|
||||||
|
if let Some(s) = line.split(' ').next() {
|
||||||
|
builder.insert(s).unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
builder.finish().unwrap();
|
||||||
|
}
|
349046
chinese-words.txt
Normal file
349046
chinese-words.txt
Normal file
File diff suppressed because it is too large
Load Diff
162
src/tokenizer.rs
162
src/tokenizer.rs
@ -1,5 +1,14 @@
|
|||||||
|
use std::{str, iter, mem};
|
||||||
|
|
||||||
|
use fst::raw::{Fst, Output};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
use slice_group_by::StrGroupBy;
|
use slice_group_by::StrGroupBy;
|
||||||
|
|
||||||
|
use CharCategory::*;
|
||||||
|
|
||||||
|
/// Raw bytes of `chinese-words.fst`, embedded at compile time from `$OUT_DIR`
/// (the file `build.rs` writes there).
const CHINESE_FST_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/chinese-words.fst"));
|
||||||
|
/// FST view over `CHINESE_FST_BYTES`, built lazily on first access.
/// Panics on that first access if the embedded bytes are not a valid FST.
static CHINESE_WORDS_FST: Lazy<Fst<&[u8]>> = Lazy::new(|| Fst::new(CHINESE_FST_BYTES).unwrap());
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||||
pub enum TokenType {
|
pub enum TokenType {
|
||||||
Word,
|
Word,
|
||||||
@ -8,14 +17,157 @@ pub enum TokenType {
|
|||||||
|
|
||||||
pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
|
pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
|
||||||
text
|
text
|
||||||
.linear_group_by_key(|c| c.is_alphanumeric())
|
.linear_group_by_key(CharCategory::new)
|
||||||
.map(|s| {
|
.flat_map(|mut string| {
|
||||||
let first = s.chars().next().unwrap();
|
let first = string.chars().next().unwrap();
|
||||||
let type_ = if first.is_alphanumeric() { TokenType::Word } else { TokenType::Space };
|
let category = CharCategory::new(first);
|
||||||
(type_, s)
|
iter::from_fn(move || {
|
||||||
|
if string.is_empty() { return None }
|
||||||
|
match category {
|
||||||
|
Chinese => {
|
||||||
|
let fst = &CHINESE_WORDS_FST;
|
||||||
|
match find_longest_prefix(fst, string.as_bytes()) {
|
||||||
|
Some((_, l)) => {
|
||||||
|
let s = &string[..l];
|
||||||
|
string = &string[l..];
|
||||||
|
Some((TokenType::Word, s))
|
||||||
|
},
|
||||||
|
None => {
|
||||||
|
let first = string.chars().next().unwrap();
|
||||||
|
let len = first.len_utf8();
|
||||||
|
let (head, tail) = string.split_at(len);
|
||||||
|
string = tail;
|
||||||
|
Some((TokenType::Word, head))
|
||||||
|
},
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Alphanumeric => Some((TokenType::Word, mem::take(&mut string))),
|
||||||
|
Space => Some((TokenType::Space, mem::take(&mut string))),
|
||||||
|
}
|
||||||
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn only_token((t, w): (TokenType, &str)) -> Option<&str> {
|
pub fn only_token((t, w): (TokenType, &str)) -> Option<&str> {
|
||||||
if t == TokenType::Word { Some(w) } else { None }
|
if t == TokenType::Word { Some(w) } else { None }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||||
|
enum CharCategory {
|
||||||
|
Chinese,
|
||||||
|
Alphanumeric,
|
||||||
|
Space,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CharCategory {
|
||||||
|
fn new(c: char) -> Self {
|
||||||
|
if c.is_alphanumeric() {
|
||||||
|
if is_chinese(c) { Chinese } else { Alphanumeric }
|
||||||
|
} else {
|
||||||
|
Space
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when `c` falls inside one of the code point ranges this
/// tokenizer treats as Chinese.
fn is_chinese(c: char) -> bool {
    match c {
        // CJK Unified Ideographs (up to U+9FEF).
        '\u{4E00}'..='\u{9FEF}'
        // CJK Unified Ideographs Extension A.
        | '\u{3400}'..='\u{4DBF}'
        // CJK Unified Ideographs Extensions B through F.
        | '\u{20000}'..='\u{2A6DF}'
        | '\u{2A700}'..='\u{2B73F}'
        | '\u{2B740}'..='\u{2B81F}'
        | '\u{2B820}'..='\u{2CEAF}'
        | '\u{2CEB0}'..='\u{2EBEF}'
        // Ideographic number zero.
        | '\u{3007}' => true,
        _ => false,
    }
}
|
||||||
|
|
||||||
|
/// Find the longest key that is prefix of the given value.
|
||||||
|
///
|
||||||
|
/// If the key exists, then `Some((value, key_len))` is returned, where
|
||||||
|
/// `value` is the value associated with the key, and `key_len` is the
|
||||||
|
/// length of the found key. Otherwise `None` is returned.
|
||||||
|
///
|
||||||
|
/// This can be used to e.g. build tokenizing functions.
|
||||||
|
//
|
||||||
|
// https://github.com/BurntSushi/fst/pull/104
|
||||||
|
#[inline]
|
||||||
|
fn find_longest_prefix(fst: &Fst<&[u8]>, value: &[u8]) -> Option<(u64, usize)> {
|
||||||
|
let mut node = fst.root();
|
||||||
|
let mut out = Output::zero();
|
||||||
|
let mut last_match = None;
|
||||||
|
for (i, &b) in value.iter().enumerate() {
|
||||||
|
if let Some(trans_index) = node.find_input(b) {
|
||||||
|
let t = node.transition(trans_index);
|
||||||
|
node = fst.node(t.addr);
|
||||||
|
out = out.cat(t.out);
|
||||||
|
if node.is_final() {
|
||||||
|
last_match = Some((out.cat(node.final_output()).value(), i + 1));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return last_match;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
last_match
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the tokenizer to exhaustion and collects every token it yields.
    fn tokens(text: &str) -> Vec<(TokenType, &str)> {
        simple_tokenizer(text).collect()
    }

    #[test]
    fn without_chinese() {
        let expected = vec![
            (TokenType::Word, "hello"),
            (TokenType::Space, " "),
            (TokenType::Word, "world"),
            (TokenType::Space, "!"),
        ];
        assert_eq!(tokens("hello world!"), expected);
    }

    #[test]
    fn only_chinese() {
        let expected = vec![
            (TokenType::Word, "今天"),
            (TokenType::Word, "的"),
            (TokenType::Word, "天气"),
            (TokenType::Word, "真好"),
        ];
        assert_eq!(tokens("今天的天气真好"), expected);
    }

    #[test]
    fn mixup_chinese_with_alphabet() {
        let expected = vec![
            (TokenType::Word, "今天"),
            (TokenType::Word, "的"),
            (TokenType::Word, "天气"),
            (TokenType::Word, "真好"),
            (TokenType::Word, "Apple"),
            (TokenType::Space, " "),
            (TokenType::Word, "is"),
            (TokenType::Space, " "),
            (TokenType::Word, "good"),
            (TokenType::Word, "今天"),
            (TokenType::Word, "的"),
            (TokenType::Word, "天气"),
            (TokenType::Word, "真好"),
        ];
        assert_eq!(tokens("今天的天气真好Apple is good今天的天气真好"), expected);
    }

    #[test]
    fn unknown_chinese() {
        // None of these characters starts a dictionary word here, so each one
        // is expected to come out as its own token.
        let expected = vec![
            (TokenType::Word, "被"),
            (TokenType::Word, "虾"),
            (TokenType::Word, "头"),
            (TokenType::Word, "大"),
            (TokenType::Word, "讚"),
            (TokenType::Word, "好"),
            (TokenType::Word, "识"),
            (TokenType::Word, "𠱁"),
            (TokenType::Word, "女"),
            (TokenType::Word, "仔"),
        ];
        assert_eq!(tokens("被虾头大讚好识𠱁女仔"), expected);
    }
}
|
||||||
|
Loading…
Reference in New Issue
Block a user