use meilisearch_tokenizer::{Token, TokenKind};
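
/// Whether the current token is inside or outside of a pair of double quotes.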
#[derive(Debug)]
enum State {
    Free,
    Quoted,
}

impl State {
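    /// Toggles the state, typically when a double quote is encountered.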
    fn swap(&mut self) {
        match self {
            State::Quoted => *self = State::Free,
            State::Free => *self = State::Quoted,
        }
    }
}

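/// A query token, labeled with whether it was found inside a quoted phrase.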
#[derive(Debug, PartialEq, Eq)]
pub enum QueryToken<'a> {
    Free(Token<'a>),
    Quoted(Token<'a>),
}
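
/// Wraps a stream of tokens into [`QueryToken`]s, flagging as `Quoted` every
/// word found between a pair of double quotes and as `Free` every other word.
///
/// Illustrative sketch based on the tests below (marked `ignore` since the
/// exact crate paths are assumptions):
///
/// ```ignore
/// let stop_words = fst::Set::default();
/// let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
/// let analyzed = analyzer.analyze("hello \"world\"");
/// let mut iter = query_tokens(analyzed.tokens());
/// // yields Free("hello"), then Quoted("world"), then None
/// ```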
pub fn query_tokens<'a>(mut tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = QueryToken<'a>> {
    let mut state = State::Free;
    let f = move || {
        loop {
            let token = tokens.next()?;
            match token.kind() {
                // A lone double quote, even surrounded by whitespace,
                // toggles the quoting state.
                _ if token.text().trim() == "\"" => state.swap(),
                TokenKind::Word => {
                    let token = match state {
                        State::Quoted => QueryToken::Quoted(token),
                        State::Free => QueryToken::Free(token),
                    };
                    return Some(token);
                },
                // Separators and other non-word tokens are skipped.
                _ => (),
            }
        }
    };
    std::iter::from_fn(f)
}

#[cfg(test)]
mod tests {
    use super::*;
    use QueryToken::{Quoted, Free};
    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
    use fst::Set;

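    /// Asserts that a `QueryToken` has both the expected variant and the
    /// expected text, panicking with a descriptive message otherwise.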
    macro_rules! assert_eq_query_token {
        ($test:expr, Quoted($val:literal)) => {
            match $test {
                Quoted(val) => assert_eq!(val.text(), $val),
                Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()),
            }
        };
        ($test:expr, Free($val:literal)) => {
            match $test {
                Quoted(val) => panic!("expected Free(\"{}\"), found Quoted(\"{}\")", $val, val.text()),
                Free(val) => assert_eq!(val.text(), $val),
            }
        };
    }
|
2020-06-05 09:48:46 +02:00
|
|
|
|
2020-08-15 20:37:13 +02:00
|
|
|
#[test]
|
|
|
|
fn empty() {
|
2020-12-23 19:09:01 +01:00
|
|
|
let stop_words = Set::default();
|
|
|
|
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
|
|
|
|
let query = "";
|
|
|
|
let analyzed = analyzer.analyze(query);
|
|
|
|
let tokens = analyzed.tokens();
|
|
|
|
let mut iter = query_tokens(tokens);
|
|
|
|
assert!(iter.next().is_none());
|
|
|
|
|
|
|
|
let query = " ";
|
|
|
|
let analyzed = analyzer.analyze(query);
|
|
|
|
let tokens = analyzed.tokens();
|
|
|
|
let mut iter = query_tokens(tokens);
|
|
|
|
assert!(iter.next().is_none());
|
2020-08-15 20:37:13 +02:00
|
|
|
}

    #[test]
    fn one_quoted_string() {
        let stop_words = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
        let query = "\"hello\"";
        let analyzed = analyzer.analyze(query);
        let tokens = analyzed.tokens();
        let mut iter = query_tokens(tokens);
        assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
        assert!(iter.next().is_none());
    }

    #[test]
    fn one_pending_quoted_string() {
        let stop_words = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
        let query = "\"hello";
        let analyzed = analyzer.analyze(query);
        let tokens = analyzed.tokens();
        let mut iter = query_tokens(tokens);
        assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
        assert!(iter.next().is_none());
    }

    #[test]
    fn one_non_quoted_string() {
        let stop_words = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
        let query = "hello";
        let analyzed = analyzer.analyze(query);
        let tokens = analyzed.tokens();
        let mut iter = query_tokens(tokens);
        assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
        assert!(iter.next().is_none());
    }

    #[test]
    fn quoted_directly_followed_by_free_strings() {
        let stop_words = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
        let query = "\"hello\"world";
        let analyzed = analyzer.analyze(query);
        let tokens = analyzed.tokens();
        let mut iter = query_tokens(tokens);
        assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
        assert_eq_query_token!(iter.next().unwrap(), Free("world"));
        assert!(iter.next().is_none());
    }

    #[test]
    fn free_directly_followed_by_quoted_strings() {
        let stop_words = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
        let query = "hello\"world\"";
        let analyzed = analyzer.analyze(query);
        let tokens = analyzed.tokens();
        let mut iter = query_tokens(tokens);
        assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
        assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
        assert!(iter.next().is_none());
    }

    #[test]
    fn free_followed_by_quoted_strings() {
        let stop_words = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
        let query = "hello \"world\"";
        let analyzed = analyzer.analyze(query);
        let tokens = analyzed.tokens();
        let mut iter = query_tokens(tokens);
        assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
        assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
        assert!(iter.next().is_none());
    }

    #[test]
    fn multiple_spaces_separated_strings() {
        let stop_words = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
        let query = "hello    world   ";
        let analyzed = analyzer.analyze(query);
        let tokens = analyzed.tokens();
        let mut iter = query_tokens(tokens);
        assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
        assert_eq_query_token!(iter.next().unwrap(), Free("world"));
        assert!(iter.next().is_none());
    }

    #[test]
    fn multi_interleaved_quoted_free_strings() {
        let stop_words = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
        let query = "hello \"world\" coucou \"monde\"";
        let analyzed = analyzer.analyze(query);
        let tokens = analyzed.tokens();
        let mut iter = query_tokens(tokens);
        assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
        assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
        assert_eq_query_token!(iter.next().unwrap(), Free("coucou"));
        assert_eq_query_token!(iter.next().unwrap(), Quoted("monde"));
        assert!(iter.next().is_none());
    }

    #[test]
    fn multi_quoted_strings() {
        let stop_words = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
        let query = "\"hello world\" coucou \"monde est beau\"";
        let analyzed = analyzer.analyze(query);
        let tokens = analyzed.tokens();
        let mut iter = query_tokens(tokens);
        assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
        assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
        assert_eq_query_token!(iter.next().unwrap(), Free("coucou"));
        assert_eq_query_token!(iter.next().unwrap(), Quoted("monde"));
        assert_eq_query_token!(iter.next().unwrap(), Quoted("est"));
        assert_eq_query_token!(iter.next().unwrap(), Quoted("beau"));
        assert!(iter.next().is_none());
    }

    #[test]
    fn chinese() {
        let stop_words = Set::default();
        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
        // The analyzer segments Chinese text into words without any separators.
        let query = "汽车男生";
        let analyzed = analyzer.analyze(query);
        let tokens = analyzed.tokens();
        let mut iter = query_tokens(tokens);
        assert_eq_query_token!(iter.next().unwrap(), Free("汽车"));
        assert_eq_query_token!(iter.next().unwrap(), Free("男生"));
        assert!(iter.next().is_none());
    }
}