Improve and simplify the query tokenizer

Kerollmops 2020-10-07 11:51:33 +02:00
parent 4e9bd1fef5
commit 38820bc75c
2 changed files with 42 additions and 60 deletions

query tokens module (QueryToken / QueryTokens):

@@ -1,6 +1,20 @@
-use std::{mem, str};
+use std::str;
 
-use QueryToken::{Quoted, Free};
+use crate::tokenizer::{simple_tokenizer, TokenType};
+
+#[derive(Debug)]
+enum State {
+    Free,
+    Quoted,
+}
+
+impl State {
+    fn swap(&mut self) {
+        match self {
+            State::Quoted => *self = State::Free,
+            State::Free => *self = State::Quoted,
+        }
+    }
+}
 
 #[derive(Debug, PartialEq, Eq)]
 pub enum QueryToken<'a> {
@@ -8,35 +22,16 @@ pub enum QueryToken<'a> {
     Quoted(&'a str),
 }
 
-#[derive(Debug)]
-enum State {
-    Free(usize),
-    Quoted(usize),
-    Fused,
-}
-
-impl State {
-    fn is_quoted(&self) -> bool {
-        match self { State::Quoted(_) => true, _ => false }
-    }
-
-    fn replace_by(&mut self, state: State) -> State {
-        mem::replace(self, state)
-    }
-}
-
 pub struct QueryTokens<'a> {
     state: State,
-    string: &'a str,
-    string_chars: str::CharIndices<'a>,
+    iter: Box<dyn Iterator<Item=(TokenType, &'a str)> + 'a>,
 }
 
-impl<'a> QueryTokens<'a> {
-    pub fn new(query: &'a str) -> QueryTokens<'a> {
+impl QueryTokens<'_> {
+    pub fn new(query: &str) -> QueryTokens {
         QueryTokens {
-            state: State::Free(0),
-            string: query,
-            string_chars: query.char_indices(),
+            state: State::Free,
+            iter: Box::new(simple_tokenizer(query)),
         }
     }
 }
@@ -46,33 +41,16 @@ impl<'a> Iterator for QueryTokens<'a> {
     fn next(&mut self) -> Option<Self::Item> {
         loop {
-            let (i, afteri, c) = match self.string_chars.next() {
-                Some((i, c)) => (i, i + c.len_utf8(), c),
-                None => return match self.state.replace_by(State::Fused) {
-                    State::Free(s) => if !self.string[s..].is_empty() {
-                        Some(Free(&self.string[s..]))
-                    } else {
-                        None
-                    },
-                    State::Quoted(s) => Some(Quoted(&self.string[s..])),
-                    State::Fused => None,
-                },
-            };
-
-            if c == '"' {
-                match self.state.replace_by(State::Free(afteri)) {
-                    State::Quoted(s) => return Some(Quoted(&self.string[s..i])),
-                    State::Free(s) => {
-                        self.state = State::Quoted(afteri);
-                        if i > s { return Some(Free(&self.string[s..i])) }
-                    },
-                    State::Fused => return None,
-                }
-            } else if !self.state.is_quoted() && !c.is_alphanumeric() {
-                match self.state.replace_by(State::Free(afteri)) {
-                    State::Free(s) if i > s => return Some(Free(&self.string[s..i])),
-                    _ => self.state = State::Free(afteri),
-                }
-            }
+            match self.iter.next()? {
+                (TokenType::Other, "\"") => self.state.swap(),
+                (TokenType::Word, token) => {
+                    let token = match self.state {
+                        State::Quoted => QueryToken::Quoted(token),
+                        State::Free => QueryToken::Free(token),
+                    };
+                    return Some(token);
+                },
+                (_, _) => (),
+            }
         }
     }
 }
@@ -158,19 +136,20 @@ mod tests {
     #[test]
    fn multi_quoted_strings() {
         let mut iter = QueryTokens::new("\"hello world\" coucou \"monde est beau\"");
-        assert_eq!(iter.next(), Some(Quoted("hello world")));
+        assert_eq!(iter.next(), Some(Quoted("hello")));
+        assert_eq!(iter.next(), Some(Quoted("world")));
         assert_eq!(iter.next(), Some(Free("coucou")));
-        assert_eq!(iter.next(), Some(Quoted("monde est beau")));
+        assert_eq!(iter.next(), Some(Quoted("monde")));
+        assert_eq!(iter.next(), Some(Quoted("est")));
+        assert_eq!(iter.next(), Some(Quoted("beau")));
         assert_eq!(iter.next(), None);
     }
 
     #[test]
     fn chinese() {
         let mut iter = QueryTokens::new("汽车男生");
-        assert_eq!(iter.next(), Some(Free("")));
-        assert_eq!(iter.next(), Some(Free("")));
-        assert_eq!(iter.next(), Some(Free("")));
-        assert_eq!(iter.next(), Some(Free("")));
+        assert_eq!(iter.next(), Some(Free("汽车")));
+        assert_eq!(iter.next(), Some(Free("男生")));
         assert_eq!(iter.next(), None);
     }
 }
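
The new QueryTokens is a small state machine over the (TokenType, &str) pairs produced by simple_tokenizer: a bare " token flips the state between Free and Quoted, every Word token is emitted tagged with the current state, and spaces and remaining punctuation are skipped. The sketch below is a self-contained approximation of that behavior, using a hand-built token stream in place of the crate's tokenizer; the names and the stream are illustrative only, not the actual milli code.

// Standalone sketch of the quote-aware query tokenizer above.
// TokenType and the hard-coded token stream are simplified stand-ins
// for the crate's simple_tokenizer output.
#[derive(Debug, Clone, Copy)]
enum TokenType { Word, Space, Other }

#[derive(Debug, PartialEq)]
enum QueryToken<'a> { Free(&'a str), Quoted(&'a str) }

fn query_tokens<'a, I>(iter: I) -> impl Iterator<Item = QueryToken<'a>>
where
    I: IntoIterator<Item = (TokenType, &'a str)>,
{
    let mut quoted = false;
    iter.into_iter().filter_map(move |(ty, s)| match (ty, s) {
        // A lone double quote toggles the Free/Quoted state and emits nothing.
        (TokenType::Other, "\"") => { quoted = !quoted; None }
        // Words are tagged with the state that was active when they were seen.
        (TokenType::Word, word) if quoted => Some(QueryToken::Quoted(word)),
        (TokenType::Word, word) => Some(QueryToken::Free(word)),
        // Spaces and other punctuation are ignored.
        _ => None,
    })
}

fn main() {
    // Token stream for the query: "hello world" coucou
    let stream = vec![
        (TokenType::Other, "\""),
        (TokenType::Word, "hello"),
        (TokenType::Space, " "),
        (TokenType::Word, "world"),
        (TokenType::Other, "\""),
        (TokenType::Space, " "),
        (TokenType::Word, "coucou"),
    ];
    let tokens: Vec<_> = query_tokens(stream).collect();
    assert_eq!(tokens, vec![
        QueryToken::Quoted("hello"),
        QueryToken::Quoted("world"),
        QueryToken::Free("coucou"),
    ]);
    println!("{:?}", tokens);
}

This also explains the updated tests: quoted phrases are no longer returned as a single Quoted("hello world") slice but as one Quoted token per word, since the underlying tokenizer has already split them.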

tokenizer module (crate::tokenizer, simple_tokenizer):

@@ -13,6 +13,7 @@ static CHINESE_WORDS_FST: Lazy<Fst<&[u8]>> = Lazy::new(|| Fst::new(CHINESE_FST_B
 pub enum TokenType {
     Word,
     Space,
+    Other,
 }
 
 pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
@@ -43,6 +44,7 @@ pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
             },
             Alphanumeric => Some((TokenType::Word, mem::take(&mut string))),
             Space => Some((TokenType::Space, mem::take(&mut string))),
+            Other => Some((TokenType::Other, mem::take(&mut string))),
         }
     })
 })
@@ -57,6 +59,7 @@ enum CharCategory {
     Chinese,
     Alphanumeric,
     Space,
+    Other,
 }
 
 impl CharCategory {
@@ -64,7 +67,7 @@ impl CharCategory {
         if c.is_alphanumeric() {
             if is_chinese(c) { Chinese } else { Alphanumeric }
         } else {
-            Space
+            if c.is_whitespace() { Space } else { Other }
         }
     }
 }
@@ -122,7 +125,7 @@ mod tests {
         assert_eq!(iter.next(), Some((TokenType::Word, "hello")));
         assert_eq!(iter.next(), Some((TokenType::Space, " ")));
         assert_eq!(iter.next(), Some((TokenType::Word, "world")));
-        assert_eq!(iter.next(), Some((TokenType::Space, "!")));
+        assert_eq!(iter.next(), Some((TokenType::Other, "!")));
         assert_eq!(iter.next(), None);
     }