mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 22:14:32 +01:00
Improve and simplify the query tokenizer
This commit is contained in:
parent
4e9bd1fef5
commit
38820bc75c
@ -1,6 +1,20 @@
|
|||||||
use std::{mem, str};
|
use std::str;
|
||||||
|
use crate::tokenizer::{simple_tokenizer, TokenType};
|
||||||
|
|
||||||
use QueryToken::{Quoted, Free};
|
#[derive(Debug)]
|
||||||
|
enum State {
|
||||||
|
Free,
|
||||||
|
Quoted,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl State {
|
||||||
|
fn swap(&mut self) {
|
||||||
|
match self {
|
||||||
|
State::Quoted => *self = State::Free,
|
||||||
|
State::Free => *self = State::Quoted,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
pub enum QueryToken<'a> {
|
pub enum QueryToken<'a> {
|
||||||
@ -8,35 +22,16 @@ pub enum QueryToken<'a> {
|
|||||||
Quoted(&'a str),
|
Quoted(&'a str),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
enum State {
|
|
||||||
Free(usize),
|
|
||||||
Quoted(usize),
|
|
||||||
Fused,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl State {
|
|
||||||
fn is_quoted(&self) -> bool {
|
|
||||||
match self { State::Quoted(_) => true, _ => false }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn replace_by(&mut self, state: State) -> State {
|
|
||||||
mem::replace(self, state)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct QueryTokens<'a> {
|
pub struct QueryTokens<'a> {
|
||||||
state: State,
|
state: State,
|
||||||
string: &'a str,
|
iter: Box<dyn Iterator<Item=(TokenType, &'a str)> + 'a>,
|
||||||
string_chars: str::CharIndices<'a>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> QueryTokens<'a> {
|
impl QueryTokens<'_> {
|
||||||
pub fn new(query: &'a str) -> QueryTokens<'a> {
|
pub fn new(query: &str) -> QueryTokens {
|
||||||
QueryTokens {
|
QueryTokens {
|
||||||
state: State::Free(0),
|
state: State::Free,
|
||||||
string: query,
|
iter: Box::new(simple_tokenizer(query)),
|
||||||
string_chars: query.char_indices(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -46,33 +41,16 @@ impl<'a> Iterator for QueryTokens<'a> {
|
|||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
loop {
|
loop {
|
||||||
let (i, afteri, c) = match self.string_chars.next() {
|
match self.iter.next()? {
|
||||||
Some((i, c)) => (i, i + c.len_utf8(), c),
|
(TokenType::Other, "\"") => self.state.swap(),
|
||||||
None => return match self.state.replace_by(State::Fused) {
|
(TokenType::Word, token) => {
|
||||||
State::Free(s) => if !self.string[s..].is_empty() {
|
let token = match self.state {
|
||||||
Some(Free(&self.string[s..]))
|
State::Quoted => QueryToken::Quoted(token),
|
||||||
} else {
|
State::Free => QueryToken::Free(token),
|
||||||
None
|
|
||||||
},
|
|
||||||
State::Quoted(s) => Some(Quoted(&self.string[s..])),
|
|
||||||
State::Fused => None,
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
|
return Some(token);
|
||||||
if c == '"' {
|
|
||||||
match self.state.replace_by(State::Free(afteri)) {
|
|
||||||
State::Quoted(s) => return Some(Quoted(&self.string[s..i])),
|
|
||||||
State::Free(s) => {
|
|
||||||
self.state = State::Quoted(afteri);
|
|
||||||
if i > s { return Some(Free(&self.string[s..i])) }
|
|
||||||
},
|
},
|
||||||
State::Fused => return None,
|
(_, _) => (),
|
||||||
}
|
|
||||||
} else if !self.state.is_quoted() && !c.is_alphanumeric() {
|
|
||||||
match self.state.replace_by(State::Free(afteri)) {
|
|
||||||
State::Free(s) if i > s => return Some(Free(&self.string[s..i])),
|
|
||||||
_ => self.state = State::Free(afteri),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -158,19 +136,20 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn multi_quoted_strings() {
|
fn multi_quoted_strings() {
|
||||||
let mut iter = QueryTokens::new("\"hello world\" coucou \"monde est beau\"");
|
let mut iter = QueryTokens::new("\"hello world\" coucou \"monde est beau\"");
|
||||||
assert_eq!(iter.next(), Some(Quoted("hello world")));
|
assert_eq!(iter.next(), Some(Quoted("hello")));
|
||||||
|
assert_eq!(iter.next(), Some(Quoted("world")));
|
||||||
assert_eq!(iter.next(), Some(Free("coucou")));
|
assert_eq!(iter.next(), Some(Free("coucou")));
|
||||||
assert_eq!(iter.next(), Some(Quoted("monde est beau")));
|
assert_eq!(iter.next(), Some(Quoted("monde")));
|
||||||
|
assert_eq!(iter.next(), Some(Quoted("est")));
|
||||||
|
assert_eq!(iter.next(), Some(Quoted("beau")));
|
||||||
assert_eq!(iter.next(), None);
|
assert_eq!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn chinese() {
|
fn chinese() {
|
||||||
let mut iter = QueryTokens::new("汽车男生");
|
let mut iter = QueryTokens::new("汽车男生");
|
||||||
assert_eq!(iter.next(), Some(Free("汽")));
|
assert_eq!(iter.next(), Some(Free("汽车")));
|
||||||
assert_eq!(iter.next(), Some(Free("车")));
|
assert_eq!(iter.next(), Some(Free("男生")));
|
||||||
assert_eq!(iter.next(), Some(Free("男")));
|
|
||||||
assert_eq!(iter.next(), Some(Free("生")));
|
|
||||||
assert_eq!(iter.next(), None);
|
assert_eq!(iter.next(), None);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -13,6 +13,7 @@ static CHINESE_WORDS_FST: Lazy<Fst<&[u8]>> = Lazy::new(|| Fst::new(CHINESE_FST_B
|
|||||||
pub enum TokenType {
|
pub enum TokenType {
|
||||||
Word,
|
Word,
|
||||||
Space,
|
Space,
|
||||||
|
Other,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
|
pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
|
||||||
@ -43,6 +44,7 @@ pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
|
|||||||
},
|
},
|
||||||
Alphanumeric => Some((TokenType::Word, mem::take(&mut string))),
|
Alphanumeric => Some((TokenType::Word, mem::take(&mut string))),
|
||||||
Space => Some((TokenType::Space, mem::take(&mut string))),
|
Space => Some((TokenType::Space, mem::take(&mut string))),
|
||||||
|
Other => Some((TokenType::Other, mem::take(&mut string))),
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
@ -57,6 +59,7 @@ enum CharCategory {
|
|||||||
Chinese,
|
Chinese,
|
||||||
Alphanumeric,
|
Alphanumeric,
|
||||||
Space,
|
Space,
|
||||||
|
Other,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CharCategory {
|
impl CharCategory {
|
||||||
@ -64,7 +67,7 @@ impl CharCategory {
|
|||||||
if c.is_alphanumeric() {
|
if c.is_alphanumeric() {
|
||||||
if is_chinese(c) { Chinese } else { Alphanumeric }
|
if is_chinese(c) { Chinese } else { Alphanumeric }
|
||||||
} else {
|
} else {
|
||||||
Space
|
if c.is_whitespace() { Space } else { Other }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -122,7 +125,7 @@ mod tests {
|
|||||||
assert_eq!(iter.next(), Some((TokenType::Word, "hello")));
|
assert_eq!(iter.next(), Some((TokenType::Word, "hello")));
|
||||||
assert_eq!(iter.next(), Some((TokenType::Space, " ")));
|
assert_eq!(iter.next(), Some((TokenType::Space, " ")));
|
||||||
assert_eq!(iter.next(), Some((TokenType::Word, "world")));
|
assert_eq!(iter.next(), Some((TokenType::Word, "world")));
|
||||||
assert_eq!(iter.next(), Some((TokenType::Space, "!")));
|
assert_eq!(iter.next(), Some((TokenType::Other, "!")));
|
||||||
assert_eq!(iter.next(), None);
|
assert_eq!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user