MeiliSearch/meilisearch-tokenizer/src/lib.rs

use self::SeparatorCategory::*;
use deunicode::deunicode_char;
use slice_group_by::StrGroupBy;
use std::iter::Peekable;

/// Returns `true` when `c` falls inside one of the Unicode ranges this
/// tokenizer treats as CJK (Chinese, Japanese, Korean) text.
///
/// Every character for which this returns `true` is tokenized on its own
/// instead of being grouped with its neighbours.
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{1100}'..='\u{11ff}' // Hangul Jamo
        | '\u{2e80}'..='\u{2eff}' // CJK Radicals Supplement
        | '\u{2f00}'..='\u{2fdf}' // Kangxi radical
        | '\u{3000}'..='\u{303f}' // Japanese-style punctuation
        | '\u{3040}'..='\u{309f}' // Japanese Hiragana
        | '\u{30a0}'..='\u{30ff}' // Japanese Katakana
        | '\u{3100}'..='\u{312f}' // Bopomofo
        | '\u{3130}'..='\u{318f}' // Hangul Compatibility Jamo
        | '\u{3200}'..='\u{32ff}' // Enclosed CJK Letters and Months
        | '\u{3400}'..='\u{4dbf}' // CJK Unified Ideographs Extension A
        | '\u{4e00}'..='\u{9fff}' // CJK Unified Ideographs
        | '\u{a960}'..='\u{a97f}' // Hangul Jamo Extended-A
        | '\u{ac00}'..='\u{d7a3}' // Hangul Syllables
        | '\u{d7b0}'..='\u{d7ff}' // Hangul Jamo Extended-B
        | '\u{f900}'..='\u{faff}' // CJK Compatibility Ideographs
        | '\u{ff00}'..='\u{ffef}' // Full-width roman characters and half-width katakana
        => true,
        _ => false,
    }
}

/// Strength of a separator found between two words.
///
/// `classify_separator` assigns `Soft` to whitespace, quotes and the
/// characters `- _ : / \`, and `Hard` to `. ; , ! ? ( )`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum SeparatorCategory {
    /// Light separator; advances the word index by 1.
    Soft,
    /// Strong separator; advances the word index by 8.
    Hard,
}

impl SeparatorCategory {
    /// Combines two categories: the result is `Soft` only when both inputs
    /// are `Soft`; a single `Hard` makes the combination `Hard`.
    fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
        if self == Self::Soft && other == Self::Soft {
            Self::Soft
        } else {
            Self::Hard
        }
    }

    /// Number of word positions this separator category accounts for.
    fn to_usize(self) -> usize {
        match self {
            Self::Hard => 8,
            Self::Soft => 1,
        }
    }
}

/// Tells whether `c` is a separator, i.e. whether `classify_separator`
/// assigns it a category.
fn is_separator(c: char) -> bool {
    let category = classify_separator(c);
    category.is_some()
}

/// Returns the separator category of `c`, or `None` when `c` is not a
/// separator.
///
/// Checks are applied in this order: Unicode whitespace, characters that
/// `deunicode_char` transliterates to a single or double quote, then the
/// explicit ASCII punctuation lists.
fn classify_separator(c: char) -> Option<SeparatorCategory> {
    // whitespaces
    if c.is_whitespace() {
        return Some(Soft);
    }

    // quotes and double quotes, including their non-ASCII look-alikes
    match deunicode_char(c) {
        Some("'") | Some("\"") => return Some(Soft),
        _ => {}
    }

    match c {
        '-' | '_' | '\'' | ':' | '/' | '\\' => Some(Soft),
        '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
        _ => None,
    }
}

/// Coarse class of a character, as computed by `classify_char` and used to
/// decide which neighbouring characters belong to the same group.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum CharCategory {
    /// A separator character, together with its soft/hard category.
    Separator(SeparatorCategory),
    /// A non-separator character for which `is_cjk` returns `true`.
    Cjk,
    /// Any character that is neither a separator nor CJK.
    Other,
}

/// Maps `c` to its `CharCategory`.
///
/// Separator detection takes precedence over the CJK check, so a character
/// that is both a separator and inside a CJK range is reported as a separator.
fn classify_char(c: char) -> CharCategory {
    match classify_separator(c) {
        Some(separator) => CharCategory::Separator(separator),
        None if is_cjk(c) => CharCategory::Cjk,
        None => CharCategory::Other,
    }
}

fn is_str_word(s: &str) -> bool {
    !s.chars().any(is_separator)
}

/// Grouping predicate: tells whether the adjacent characters `a` and `b`
/// belong to the same group.
///
/// Two separators always group together regardless of their soft/hard
/// category, two "other" characters group together, and a CJK character
/// never groups with anything (not even another CJK character).
fn same_group_category(a: char, b: char) -> bool {
    match (classify_char(a), classify_char(b)) {
        (CharCategory::Separator(_), CharCategory::Separator(_)) => true,
        (CharCategory::Other, CharCategory::Other) => true,
        // any pair involving a CJK character, or mixing categories
        _ => false,
    }
}

// Fold step for a `char_indices()` iterator: counts the chars seen so far and
// tracks the byte offset located just after the current char.
fn chars_count_index(acc: (usize, usize), item: (usize, char)) -> (usize, usize) {
    let (seen, _) = acc;
    let (offset, ch) = item;
    (seen + 1, offset + ch.len_utf8())
}

/// Splits `query` into its words, in order, discarding the positional
/// information the tokenizer computes for each token.
pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
    let tokens = Tokenizer::new(query);
    tokens.map(|token| token.word)
}

/// A word extracted from a text, along with its position.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Token<'a> {
    /// The word itself, borrowed from the tokenized text.
    pub word: &'a str,
    /// Position of the word; separators between words advance it by 1 (soft)
    /// or 8 (hard), so it is not a dense counter.
    pub word_index: usize,
    /// Position of the first char of the word, counted in chars (not bytes).
    pub char_index: usize,
}

/// Iterator that splits a single text into `Token`s.
pub struct Tokenizer<'a> {
    // part of the text that has not been tokenized yet
    inner: &'a str,
    // word index that will be given to the next token
    word_index: usize,
    // number of chars consumed so far, i.e. char index of `inner`'s start
    char_index: usize,
}

impl<'a> Tokenizer<'a> {
    pub fn new(string: &str) -> Tokenizer {
        // skip every separator and set `char_index`
        // to the number of char trimmed
        let (count, index) = string
            .char_indices()
            .take_while(|(_, c)| is_separator(*c))
            .fold((0, 0), chars_count_index);

        Tokenizer {
            inner: &string[index..],
            word_index: 0,
            char_index: count,
        }
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = Token<'a>;

    /// Returns the next word of the text, skipping any run of separators
    /// located before it and updating the word/char positions accordingly.
    fn next(&mut self) -> Option<Self::Item> {
        let mut groups = self.inner.linear_group_by(same_group_category).peekable();

        while let Some(group) = groups.next() {
            // `group` always starts at the beginning of `self.inner`
            let char_len = group.chars().count();
            let byte_len = group.len();

            if !is_str_word(group) {
                // a run of separators: it weighs `Hard` as soon as one of
                // its chars is a hard separator, `Soft` otherwise
                let weight = group
                    .chars()
                    .filter_map(classify_separator)
                    .fold(Soft, |acc, category| acc.merge(category));

                self.word_index += weight.to_usize();
                self.char_index += char_len;
                self.inner = &self.inner[byte_len..];
                continue;
            }

            let token = Token {
                word: group,
                word_index: self.word_index,
                char_index: self.char_index,
            };

            // two words can be adjacent without any separator in between
            // (e.g. around CJK chars), the word index must still advance
            let followed_by_word = groups.peek().map_or(false, |next| is_str_word(next));
            if followed_by_word {
                self.word_index += 1;
            }

            self.char_index += char_len;
            self.inner = &self.inner[byte_len..];

            return Some(token);
        }

        // nothing left but (possibly) trailing separators
        self.inner = "";
        None
    }
}

/// Iterator that tokenizes a sequence of texts as if they were one document,
/// shifting the word and char indexes of each text past those of the
/// previous ones.
pub struct SeqTokenizer<'a, I>
where
    I: Iterator<Item = &'a str>,
{
    // texts that have not been tokenized yet
    inner: I,
    // tokenizer of the text being processed, `None` once texts are exhausted
    current: Option<Peekable<Tokenizer<'a>>>,
    // offset added to the word index of every token of the current text
    word_offset: usize,
    // offset added to the char index of every token of the current text
    char_offset: usize,
}

impl<'a, I> SeqTokenizer<'a, I>
where
    I: Iterator<Item = &'a str>,
{
    /// Creates a sequence tokenizer over the texts yielded by `iter`.
    ///
    /// The first text is pulled from `iter` immediately; both offsets start
    /// at zero.
    pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
        let first_text = iter.next();
        let current = first_text.map(|text| Tokenizer::new(text).peekable());

        Self {
            inner: iter,
            current,
            word_offset: 0,
            char_offset: 0,
        }
    }
}

impl<'a, I> Iterator for SeqTokenizer<'a, I>
where
    I: Iterator<Item = &'a str>,
{
    type Item = Token<'a>;

    /// Returns the next token of the sequence, moving on to the following
    /// text whenever the current one is exhausted.
    ///
    /// Returns `None` once every text has been consumed.
    fn next(&mut self) -> Option<Self::Item> {
        // Iterate instead of recursing: a text that yields no token (empty
        // or made only of separators) would otherwise add one stack frame,
        // and a long run of such texts could overflow the stack.
        loop {
            // no more texts available
            let current = self.current.as_mut()?;

            match current.next() {
                Some(token) => {
                    // we must apply the word and char offsets
                    // to the token before returning it
                    let token = Token {
                        word: token.word,
                        word_index: token.word_index + self.word_offset,
                        char_index: token.char_index + self.char_offset,
                    };

                    // if this is the last token of this text we must save
                    // the offsets for the next texts: they are separated
                    // from this one as if by a hard separator
                    // NOTE(review): the char offset is derived from the start
                    // of the last token, not its end - confirm this is intended
                    if current.peek().is_none() {
                        let hard_space = SeparatorCategory::Hard.to_usize();
                        self.word_offset = token.word_index + hard_space;
                        self.char_offset = token.char_index + hard_space;
                    }

                    return Some(token);
                }
                None => {
                    // no more words in this text we must
                    // start tokenizing the next text
                    self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `text` and checks that it yields exactly the `expected`
    /// `(word, word_index, char_index)` triples, in order, and nothing more.
    fn assert_tokens(text: &str, expected: &[(&str, usize, usize)]) {
        let mut tokenizer = Tokenizer::new(text);

        for &(word, word_index, char_index) in expected {
            assert_eq!(
                tokenizer.next(),
                Some(Token {
                    word,
                    word_index,
                    char_index
                })
            );
        }

        assert_eq!(tokenizer.next(), None);
    }

    #[test]
    fn easy() {
        assert_tokens("salut", &[("salut", 0, 0)]);
        assert_tokens("yo    ", &[("yo", 0, 0)]);
    }

    #[test]
    fn hard() {
        assert_tokens(
            " .? yo lolo. aïe (ouch)",
            &[("yo", 0, 4), ("lolo", 1, 7), ("aïe", 9, 13), ("ouch", 17, 18)],
        );

        assert_tokens(
            "yo ! lolo ? wtf - lol . aïe ,",
            &[
                ("yo", 0, 0),
                ("lolo", 8, 5),
                ("wtf", 16, 12),
                ("lol", 17, 18),
                ("aïe", 25, 24),
            ],
        );
    }

    #[test]
    fn hard_long_chars() {
        assert_tokens(
            " .? yo 😂. aïe",
            &[("yo", 0, 4), ("😂", 1, 7), ("aïe", 9, 10)],
        );

        assert_tokens(
            "yo ! lolo ? 😱 - lol . 😣 ,",
            &[
                ("yo", 0, 0),
                ("lolo", 8, 5),
                ("😱", 16, 12),
                ("lol", 17, 16),
                ("😣", 25, 22),
            ],
        );
    }

    #[test]
    fn hard_kanjis() {
        assert_tokens(
            "\u{2ec4}lolilol\u{2ec7}",
            &[("\u{2ec4}", 0, 0), ("lolilol", 1, 1), ("\u{2ec7}", 2, 8)],
        );

        assert_tokens(
            "\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello    \u{2ec7}",
            &[
                ("\u{2ec4}", 0, 0),
                ("\u{2ed3}", 1, 1),
                ("\u{2ef2}", 2, 2),
                ("lolilol", 3, 4),
                ("hello", 4, 14),
                ("\u{2ec7}", 5, 23),
            ],
        );
    }
}
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`use self::SeparatorCategory::*;`
Make the tokenizer understand strange whitespaces/quotes 2019-11-04 16:10:13 +01:00			`use deunicode::deunicode_char;`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`use slice_group_by::StrGroupBy;`
			`use std::iter::Peekable;`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00
feat: Move tokenizer things into the meilidb-tokenizer workspace 2019-02-25 18:24:46 +01:00			`pub fn is_cjk(c: char) -> bool {`
Update cjk filter 2020-01-30 01:55:16 +01:00			`(c >= '\u{1100}' && c <= '\u{11ff}') // Hangul Jamo`
			`\|\| (c >= '\u{2e80}' && c <= '\u{2eff}') // CJK Radicals Supplement`
			`\|\| (c >= '\u{2f00}' && c <= '\u{2fdf}') // Kangxi radical`
			`\|\| (c >= '\u{3000}' && c <= '\u{303f}') // Japanese-style punctuation`
			`\|\| (c >= '\u{3040}' && c <= '\u{309f}') // Japanese Hiragana`
			`\|\| (c >= '\u{30a0}' && c <= '\u{30ff}') // Japanese Katakana`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`\|\| (c >= '\u{3100}' && c <= '\u{312f}')`
Update cjk filter 2020-01-30 01:55:16 +01:00			`\|\| (c >= '\u{3130}' && c <= '\u{318F}') // Hangul Compatibility Jamo`
			`\|\| (c >= '\u{3200}' && c <= '\u{32ff}') // Enclosed CJK Letters and Months`
			`\|\| (c >= '\u{3400}' && c <= '\u{4dbf}') // CJK Unified Ideographs Extension A`
			`\|\| (c >= '\u{4e00}' && c <= '\u{9fff}') // CJK Unified Ideographs`
			`\|\| (c >= '\u{a960}' && c <= '\u{a97f}') // Hangul Jamo Extended-A`
			`\|\| (c >= '\u{ac00}' && c <= '\u{d7a3}') // Hangul Syllables`
			`\|\| (c >= '\u{d7b0}' && c <= '\u{d7ff}') // Hangul Jamo Extended-B`
			`\|\| (c >= '\u{f900}' && c <= '\u{faff}') // CJK Compatibility Ideographs`
			`\|\| (c >= '\u{ff00}' && c <= '\u{ffef}') // Full-width roman characters and half-width katakana`
feat: Move tokenizer things into the meilidb-tokenizer workspace 2019-02-25 18:24:46 +01:00			`}`

feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`#[derive(Debug, Copy, Clone, PartialEq, Eq)]`
			`enum SeparatorCategory {`
			`Soft,`
			`Hard,`
			`}`

			`impl SeparatorCategory {`
			`fn merge(self, other: SeparatorCategory) -> SeparatorCategory {`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`if let (Soft, Soft) = (self, other) {`
			`Soft`
			`} else {`
			`Hard`
			`}`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`}`

			`fn to_usize(self) -> usize {`
			`match self {`
			`Soft => 1,`
			`Hard => 8,`
			`}`
			`}`
			`}`

			`fn is_separator(c: char) -> bool {`
			`classify_separator(c).is_some()`
			`}`

			`fn classify_separator(c: char) -> Option<SeparatorCategory> {`
			`match c {`
Make the tokenizer understand strange whitespaces/quotes 2019-11-04 16:10:13 +01:00			`c if c.is_whitespace() => Some(Soft), // whitespaces`
			`c if deunicode_char(c) == Some("'") => Some(Soft), // quotes`
			`c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes`
Add support for back/slashes 2019-11-11 21:23:08 +01:00			`'-' \| '_' \| '\'' \| ':' \| '/' \| '\\' => Some(Soft),`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`'.' \| ';' \| ',' \| '!' \| '?' \| '(' \| ')' => Some(Hard),`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`_ => None,`
			`}`
			`}`

			`#[derive(Debug, Copy, Clone, PartialEq, Eq)]`
feat: Move query splitting into the tokenizer workspace 2019-02-25 18:34:51 +01:00			`enum CharCategory {`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`Separator(SeparatorCategory),`
feat: Move query splitting into the tokenizer workspace 2019-02-25 18:34:51 +01:00			`Cjk,`
			`Other,`
			`}`

			`fn classify_char(c: char) -> CharCategory {`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`if let Some(category) = classify_separator(c) {`
			`CharCategory::Separator(category)`
			`} else if is_cjk(c) {`
			`CharCategory::Cjk`
			`} else {`
			`CharCategory::Other`
			`}`
feat: Move query splitting into the tokenizer workspace 2019-02-25 18:34:51 +01:00			`}`

feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`fn is_str_word(s: &str) -> bool {`
			`!s.chars().any(is_separator)`
feat: Move query splitting into the tokenizer workspace 2019-02-25 18:34:51 +01:00			`}`

			`fn same_group_category(a: char, b: char) -> bool {`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`match (classify_char(a), classify_char(b)) {`
			`(CharCategory::Cjk, _) \| (_, CharCategory::Cjk) => false,`
			`(CharCategory::Separator(_), CharCategory::Separator(_)) => true,`
			`(a, b) => a == b,`
			`}`
feat: Move query splitting into the tokenizer workspace 2019-02-25 18:34:51 +01:00			`}`

feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`// fold the number of chars along with the index position`
			`fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) {`
			`(n + 1, i + c.len_utf8())`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`}`

Cargo fmt pass 2019-10-18 13:05:28 +02:00			`pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`Tokenizer::new(query).map(\|t\| t.word)`
feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`}`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`#[derive(Debug, Copy, Clone, PartialEq, Eq)]`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`pub struct Token<'a> {`
			`pub word: &'a str,`
			`pub word_index: usize,`
			`pub char_index: usize,`
			`}`

feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`pub struct Tokenizer<'a> {`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`inner: &'a str,`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`word_index: usize,`
			`char_index: usize,`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`}`

feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`impl<'a> Tokenizer<'a> {`
			`pub fn new(string: &str) -> Tokenizer {`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			// skip every separator and set `char_index`
			`// to the number of char trimmed`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`let (count, index) = string`
			`.char_indices()`
			`.take_while(\|(_, c)\| is_separator(*c))`
			`.fold((0, 0), chars_count_index);`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00
feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`Tokenizer {`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`inner: &string[index..],`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`word_index: 0,`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`char_index: count,`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`}`
			`}`
			`}`

feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`impl<'a> Iterator for Tokenizer<'a> {`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`type Item = Token<'a>;`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00
			`fn next(&mut self) -> Option<Self::Item> {`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`let mut iter = self.inner.linear_group_by(same_group_category).peekable();`

			`while let (Some(string), next_string) = (iter.next(), iter.peek()) {`
			`let (count, index) = string.char_indices().fold((0, 0), chars_count_index);`

			`if !is_str_word(string) {`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`self.word_index += string`
			`.chars()`
			`.filter_map(classify_separator)`
			`.fold(Soft, \|a, x\| a.merge(x))`
			`.to_usize();`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`self.char_index += count;`
			`self.inner = &self.inner[index..];`
			`continue;`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`}`

feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`let token = Token {`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`word: string,`
			`word_index: self.word_index,`
			`char_index: self.char_index,`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`};`
feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00
			`if next_string.filter(\|s\| is_str_word(s)).is_some() {`
			`self.word_index += 1;`
			`}`

			`self.char_index += count;`
			`self.inner = &self.inner[index..];`

			`return Some(token);`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`}`

feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-02-26 12:16:10 +01:00			`self.inner = "";`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`None`
			`}`
			`}`

feat: Make the Tokenizer able to support tokenizing sequences 2019-03-18 14:42:59 +01:00			`pub struct SeqTokenizer<'a, I>`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`where`
			`I: Iterator<Item = &'a str>,`
feat: Make the Tokenizer able to support tokenizing sequences 2019-03-18 14:42:59 +01:00			`{`
			`inner: I,`
			`current: Option<Peekable<Tokenizer<'a>>>,`
			`word_offset: usize,`
			`char_offset: usize,`
			`}`

			`impl<'a, I> SeqTokenizer<'a, I>`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`where`
			`I: Iterator<Item = &'a str>,`
feat: Make the Tokenizer able to support tokenizing sequences 2019-03-18 14:42:59 +01:00			`{`
			`pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {`
			`let current = iter.next().map(\|s\| Tokenizer::new(s).peekable());`
			`SeqTokenizer {`
			`inner: iter,`
Cargo clippy pass 2019-10-18 13:21:41 +02:00			`current,`
feat: Make the Tokenizer able to support tokenizing sequences 2019-03-18 14:42:59 +01:00			`word_offset: 0,`
			`char_offset: 0,`
			`}`
			`}`
			`}`

			`impl<'a, I> Iterator for SeqTokenizer<'a, I>`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`where`
			`I: Iterator<Item = &'a str>,`
feat: Make the Tokenizer able to support tokenizing sequences 2019-03-18 14:42:59 +01:00			`{`
			`type Item = Token<'a>;`

			`fn next(&mut self) -> Option<Self::Item> {`
			`match &mut self.current {`
			`Some(current) => {`
			`match current.next() {`
			`Some(token) => {`
			`// we must apply the word and char offsets`
			`// to the token before returning it`
			`let token = Token {`
			`word: token.word,`
			`word_index: token.word_index + self.word_offset,`
			`char_index: token.char_index + self.char_offset,`
			`};`

			`// if this is the last iteration on this text`
			`// we must save the offsets for next texts`
			`if current.peek().is_none() {`
			`let hard_space = SeparatorCategory::Hard.to_usize();`
			`self.word_offset = token.word_index + hard_space;`
			`self.char_offset = token.char_index + hard_space;`
			`}`

			`Some(token)`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`}`
feat: Make the Tokenizer able to support tokenizing sequences 2019-03-18 14:42:59 +01:00			`None => {`
			`// no more words in this text we must`
			`// start tokenizing the next text`
			`self.current = self.inner.next().map(\|s\| Tokenizer::new(s).peekable());`
			`self.next()`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`}`
feat: Make the Tokenizer able to support tokenizing sequences 2019-03-18 14:42:59 +01:00			`}`
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`}`
feat: Make the Tokenizer able to support tokenizing sequences 2019-03-18 14:42:59 +01:00			`// no more texts available`
			`None => None,`
			`}`
			`}`
			`}`

feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn easy() {`
feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`let mut tokenizer = Tokenizer::new("salut");`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "salut",`
			`word_index: 0,`
			`char_index: 0`
			`})`
			`);`
feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`assert_eq!(tokenizer.next(), None);`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00
feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`let mut tokenizer = Tokenizer::new("yo ");`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "yo",`
			`word_index: 0,`
			`char_index: 0`
			`})`
			`);`
feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`assert_eq!(tokenizer.next(), None);`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`}`

			`#[test]`
			`fn hard() {`
feat: Make the tokenizer support parentheses Interpreting them as hard ponctuation (like a dot). 2019-02-22 15:40:39 +01:00			`let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");`
feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00
Cargo fmt pass 2019-10-18 13:05:28 +02:00			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "yo",`
			`word_index: 0,`
			`char_index: 4`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "lolo",`
			`word_index: 1,`
			`char_index: 7`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "aïe",`
			`word_index: 9,`
			`char_index: 13`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "ouch",`
			`word_index: 17,`
			`char_index: 18`
			`})`
			`);`
feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`assert_eq!(tokenizer.next(), None);`

			`let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");`

Cargo fmt pass 2019-10-18 13:05:28 +02:00			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "yo",`
			`word_index: 0,`
			`char_index: 0`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "lolo",`
			`word_index: 8,`
			`char_index: 5`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "wtf",`
			`word_index: 16,`
			`char_index: 12`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "lol",`
			`word_index: 17,`
			`char_index: 18`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "aïe",`
			`word_index: 25,`
			`char_index: 24`
			`})`
			`);`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`assert_eq!(tokenizer.next(), None);`
			`}`

			`#[test]`
			`fn hard_long_chars() {`
			`let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");`

Cargo fmt pass 2019-10-18 13:05:28 +02:00			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "yo",`
			`word_index: 0,`
			`char_index: 4`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "😂",`
			`word_index: 1,`
			`char_index: 7`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "aïe",`
			`word_index: 9,`
			`char_index: 10`
			`})`
			`);`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 16:46:49 +01:00			`assert_eq!(tokenizer.next(), None);`

			`let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");`

Cargo fmt pass 2019-10-18 13:05:28 +02:00			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "yo",`
			`word_index: 0,`
			`char_index: 0`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "lolo",`
			`word_index: 8,`
			`char_index: 5`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "😱",`
			`word_index: 16,`
			`char_index: 12`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "lol",`
			`word_index: 17,`
			`char_index: 16`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "😣",`
			`word_index: 25,`
			`char_index: 22`
			`})`
			`);`
feat: Introduce an Index system based on RocksDB 2018-11-15 17:55:20 +01:00			`assert_eq!(tokenizer.next(), None);`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`}`
feat: Make the Tokenizer support Kanjis 2019-02-22 18:17:43 +01:00
			`#[test]`
			`fn hard_kanjis() {`
			`let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");`

Cargo fmt pass 2019-10-18 13:05:28 +02:00			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "\u{2ec4}",`
			`word_index: 0,`
			`char_index: 0`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "lolilol",`
			`word_index: 1,`
			`char_index: 1`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "\u{2ec7}",`
			`word_index: 2,`
			`char_index: 8`
			`})`
			`);`
feat: Make the Tokenizer support Kanjis 2019-02-22 18:17:43 +01:00			`assert_eq!(tokenizer.next(), None);`

			`let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");`

Cargo fmt pass 2019-10-18 13:05:28 +02:00			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "\u{2ec4}",`
			`word_index: 0,`
			`char_index: 0`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "\u{2ed3}",`
			`word_index: 1,`
			`char_index: 1`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "\u{2ef2}",`
			`word_index: 2,`
			`char_index: 2`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "lolilol",`
			`word_index: 3,`
			`char_index: 4`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "hello",`
			`word_index: 4,`
			`char_index: 14`
			`})`
			`);`
			`assert_eq!(`
			`tokenizer.next(),`
			`Some(Token {`
			`word: "\u{2ec7}",`
			`word_index: 5,`
			`char_index: 23`
			`})`
			`);`
feat: Make the Tokenizer support Kanjis 2019-02-22 18:17:43 +01:00			`assert_eq!(tokenizer.next(), None);`
			`}`
feat: introduce a better simple word lexer 2018-09-27 16:32:17 +02:00			`}`