mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-25 20:57:35 +01:00
512 lines
14 KiB
Rust
512 lines
14 KiB
Rust
use self::SeparatorCategory::*;
|
|
use deunicode::deunicode_char;
|
|
use slice_group_by::StrGroupBy;
|
|
use std::iter::Peekable;
|
|
|
|
pub fn is_cjk(c: char) -> bool {
|
|
(c >= '\u{1100}' && c <= '\u{11ff}') // Hangul Jamo
|
|
|| (c >= '\u{2e80}' && c <= '\u{2eff}') // CJK Radicals Supplement
|
|
|| (c >= '\u{2f00}' && c <= '\u{2fdf}') // Kangxi radical
|
|
|| (c >= '\u{3000}' && c <= '\u{303f}') // Japanese-style punctuation
|
|
|| (c >= '\u{3040}' && c <= '\u{309f}') // Japanese Hiragana
|
|
|| (c >= '\u{30a0}' && c <= '\u{30ff}') // Japanese Katakana
|
|
|| (c >= '\u{3100}' && c <= '\u{312f}')
|
|
|| (c >= '\u{3130}' && c <= '\u{318F}') // Hangul Compatibility Jamo
|
|
|| (c >= '\u{3200}' && c <= '\u{32ff}') // Enclosed CJK Letters and Months
|
|
|| (c >= '\u{3400}' && c <= '\u{4dbf}') // CJK Unified Ideographs Extension A
|
|
|| (c >= '\u{4e00}' && c <= '\u{9fff}') // CJK Unified Ideographs
|
|
|| (c >= '\u{a960}' && c <= '\u{a97f}') // Hangul Jamo Extended-A
|
|
|| (c >= '\u{ac00}' && c <= '\u{d7a3}') // Hangul Syllables
|
|
|| (c >= '\u{d7b0}' && c <= '\u{d7ff}') // Hangul Jamo Extended-B
|
|
|| (c >= '\u{f900}' && c <= '\u{faff}') // CJK Compatibility Ideographs
|
|
|| (c >= '\u{ff00}' && c <= '\u{ffef}') // Full-width roman characters and half-width katakana
|
|
}
|
|
|
|
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
|
enum SeparatorCategory {
|
|
Soft,
|
|
Hard,
|
|
}
|
|
|
|
impl SeparatorCategory {
|
|
fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
|
|
if let (Soft, Soft) = (self, other) {
|
|
Soft
|
|
} else {
|
|
Hard
|
|
}
|
|
}
|
|
|
|
fn to_usize(self) -> usize {
|
|
match self {
|
|
Soft => 1,
|
|
Hard => 8,
|
|
}
|
|
}
|
|
}
|
|
|
|
fn is_separator(c: char) -> bool {
|
|
classify_separator(c).is_some()
|
|
}
|
|
|
|
fn classify_separator(c: char) -> Option<SeparatorCategory> {
|
|
match c {
|
|
c if c.is_whitespace() => Some(Soft), // whitespaces
|
|
c if deunicode_char(c) == Some("'") => Some(Soft), // quotes
|
|
c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes
|
|
'-' | '_' | '\'' | ':' | '/' | '\\' => Some(Soft),
|
|
'.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
|
enum CharCategory {
|
|
Separator(SeparatorCategory),
|
|
Cjk,
|
|
Other,
|
|
}
|
|
|
|
fn classify_char(c: char) -> CharCategory {
|
|
if let Some(category) = classify_separator(c) {
|
|
CharCategory::Separator(category)
|
|
} else if is_cjk(c) {
|
|
CharCategory::Cjk
|
|
} else {
|
|
CharCategory::Other
|
|
}
|
|
}
|
|
|
|
fn is_str_word(s: &str) -> bool {
|
|
!s.chars().any(is_separator)
|
|
}
|
|
|
|
fn same_group_category(a: char, b: char) -> bool {
|
|
match (classify_char(a), classify_char(b)) {
|
|
(CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
|
|
(CharCategory::Separator(_), CharCategory::Separator(_)) => true,
|
|
(a, b) => a == b,
|
|
}
|
|
}
|
|
|
|
// fold the number of chars along with the index position
|
|
fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) {
|
|
(n + 1, i + c.len_utf8())
|
|
}
|
|
|
|
pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
|
|
Tokenizer::new(query).map(|t| t.word)
|
|
}
|
|
|
|
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
|
pub struct Token<'a> {
|
|
pub word: &'a str,
|
|
pub word_index: usize,
|
|
pub char_index: usize,
|
|
}
|
|
|
|
pub struct Tokenizer<'a> {
|
|
inner: &'a str,
|
|
word_index: usize,
|
|
char_index: usize,
|
|
}
|
|
|
|
impl<'a> Tokenizer<'a> {
|
|
pub fn new(string: &str) -> Tokenizer {
|
|
// skip every separator and set `char_index`
|
|
// to the number of char trimmed
|
|
let (count, index) = string
|
|
.char_indices()
|
|
.take_while(|(_, c)| is_separator(*c))
|
|
.fold((0, 0), chars_count_index);
|
|
|
|
Tokenizer {
|
|
inner: &string[index..],
|
|
word_index: 0,
|
|
char_index: count,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'a> Iterator for Tokenizer<'a> {
|
|
type Item = Token<'a>;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
let mut iter = self.inner.linear_group_by(same_group_category).peekable();
|
|
|
|
while let (Some(string), next_string) = (iter.next(), iter.peek()) {
|
|
let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
|
|
|
|
if !is_str_word(string) {
|
|
self.word_index += string
|
|
.chars()
|
|
.filter_map(classify_separator)
|
|
.fold(Soft, |a, x| a.merge(x))
|
|
.to_usize();
|
|
self.char_index += count;
|
|
self.inner = &self.inner[index..];
|
|
continue;
|
|
}
|
|
|
|
let token = Token {
|
|
word: string,
|
|
word_index: self.word_index,
|
|
char_index: self.char_index,
|
|
};
|
|
|
|
if next_string.filter(|s| is_str_word(s)).is_some() {
|
|
self.word_index += 1;
|
|
}
|
|
|
|
self.char_index += count;
|
|
self.inner = &self.inner[index..];
|
|
|
|
return Some(token);
|
|
}
|
|
|
|
self.inner = "";
|
|
None
|
|
}
|
|
}
|
|
|
|
pub struct SeqTokenizer<'a, I>
|
|
where
|
|
I: Iterator<Item = &'a str>,
|
|
{
|
|
inner: I,
|
|
current: Option<Peekable<Tokenizer<'a>>>,
|
|
word_offset: usize,
|
|
char_offset: usize,
|
|
}
|
|
|
|
impl<'a, I> SeqTokenizer<'a, I>
|
|
where
|
|
I: Iterator<Item = &'a str>,
|
|
{
|
|
pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
|
|
let current = iter.next().map(|s| Tokenizer::new(s).peekable());
|
|
SeqTokenizer {
|
|
inner: iter,
|
|
current,
|
|
word_offset: 0,
|
|
char_offset: 0,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'a, I> Iterator for SeqTokenizer<'a, I>
|
|
where
|
|
I: Iterator<Item = &'a str>,
|
|
{
|
|
type Item = Token<'a>;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
match &mut self.current {
|
|
Some(current) => {
|
|
match current.next() {
|
|
Some(token) => {
|
|
// we must apply the word and char offsets
|
|
// to the token before returning it
|
|
let token = Token {
|
|
word: token.word,
|
|
word_index: token.word_index + self.word_offset,
|
|
char_index: token.char_index + self.char_offset,
|
|
};
|
|
|
|
// if this is the last iteration on this text
|
|
// we must save the offsets for next texts
|
|
if current.peek().is_none() {
|
|
let hard_space = SeparatorCategory::Hard.to_usize();
|
|
self.word_offset = token.word_index + hard_space;
|
|
self.char_offset = token.char_index + hard_space;
|
|
}
|
|
|
|
Some(token)
|
|
}
|
|
None => {
|
|
// no more words in this text we must
|
|
// start tokenizing the next text
|
|
self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
|
|
self.next()
|
|
}
|
|
}
|
|
}
|
|
// no more texts available
|
|
None => None,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn easy() {
|
|
let mut tokenizer = Tokenizer::new("salut");
|
|
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "salut",
|
|
word_index: 0,
|
|
char_index: 0
|
|
})
|
|
);
|
|
assert_eq!(tokenizer.next(), None);
|
|
|
|
let mut tokenizer = Tokenizer::new("yo ");
|
|
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "yo",
|
|
word_index: 0,
|
|
char_index: 0
|
|
})
|
|
);
|
|
assert_eq!(tokenizer.next(), None);
|
|
}
|
|
|
|
#[test]
|
|
fn hard() {
|
|
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
|
|
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "yo",
|
|
word_index: 0,
|
|
char_index: 4
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "lolo",
|
|
word_index: 1,
|
|
char_index: 7
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "aïe",
|
|
word_index: 9,
|
|
char_index: 13
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "ouch",
|
|
word_index: 17,
|
|
char_index: 18
|
|
})
|
|
);
|
|
assert_eq!(tokenizer.next(), None);
|
|
|
|
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
|
|
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "yo",
|
|
word_index: 0,
|
|
char_index: 0
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "lolo",
|
|
word_index: 8,
|
|
char_index: 5
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "wtf",
|
|
word_index: 16,
|
|
char_index: 12
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "lol",
|
|
word_index: 17,
|
|
char_index: 18
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "aïe",
|
|
word_index: 25,
|
|
char_index: 24
|
|
})
|
|
);
|
|
assert_eq!(tokenizer.next(), None);
|
|
}
|
|
|
|
#[test]
|
|
fn hard_long_chars() {
|
|
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
|
|
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "yo",
|
|
word_index: 0,
|
|
char_index: 4
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "😂",
|
|
word_index: 1,
|
|
char_index: 7
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "aïe",
|
|
word_index: 9,
|
|
char_index: 10
|
|
})
|
|
);
|
|
assert_eq!(tokenizer.next(), None);
|
|
|
|
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
|
|
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "yo",
|
|
word_index: 0,
|
|
char_index: 0
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "lolo",
|
|
word_index: 8,
|
|
char_index: 5
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "😱",
|
|
word_index: 16,
|
|
char_index: 12
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "lol",
|
|
word_index: 17,
|
|
char_index: 16
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "😣",
|
|
word_index: 25,
|
|
char_index: 22
|
|
})
|
|
);
|
|
assert_eq!(tokenizer.next(), None);
|
|
}
|
|
|
|
#[test]
|
|
fn hard_kanjis() {
|
|
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
|
|
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "\u{2ec4}",
|
|
word_index: 0,
|
|
char_index: 0
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "lolilol",
|
|
word_index: 1,
|
|
char_index: 1
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "\u{2ec7}",
|
|
word_index: 2,
|
|
char_index: 8
|
|
})
|
|
);
|
|
assert_eq!(tokenizer.next(), None);
|
|
|
|
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
|
|
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "\u{2ec4}",
|
|
word_index: 0,
|
|
char_index: 0
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "\u{2ed3}",
|
|
word_index: 1,
|
|
char_index: 1
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "\u{2ef2}",
|
|
word_index: 2,
|
|
char_index: 2
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "lolilol",
|
|
word_index: 3,
|
|
char_index: 4
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "hello",
|
|
word_index: 4,
|
|
char_index: 14
|
|
})
|
|
);
|
|
assert_eq!(
|
|
tokenizer.next(),
|
|
Some(Token {
|
|
word: "\u{2ec7}",
|
|
word_index: 5,
|
|
char_index: 23
|
|
})
|
|
);
|
|
assert_eq!(tokenizer.next(), None);
|
|
}
|
|
}
|