From abf7191eec965e569317ef7b6ab4b38eef556216 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 18 Mar 2019 14:42:59 +0100
Subject: [PATCH] feat: Make the Tokenizer able to support tokenizing sequences

---
 meilidb-tokenizer/src/lib.rs | 66 ++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs
index 48bce151b..88e389a46 100644
--- a/meilidb-tokenizer/src/lib.rs
+++ b/meilidb-tokenizer/src/lib.rs
@@ -1,3 +1,4 @@
+use std::iter::Peekable;
 use slice_group_by::StrGroupBy;
 use self::SeparatorCategory::*;
 
@@ -151,6 +152,71 @@ impl<'a> Iterator for Tokenizer<'a> {
     }
 }
 
+pub struct SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    inner: I,
+    current: Option<Peekable<Tokenizer<'a>>>,
+    word_offset: usize,
+    char_offset: usize,
+}
+
+impl<'a, I> SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
+        let current = iter.next().map(|s| Tokenizer::new(s).peekable());
+        SeqTokenizer {
+            inner: iter,
+            current: current,
+            word_offset: 0,
+            char_offset: 0,
+        }
+    }
+}
+
+impl<'a, I> Iterator for SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    type Item = Token<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.current {
+            Some(current) => {
+                match current.next() {
+                    Some(token) => {
+                        // we must apply the word and char offsets
+                        // to the token before returning it
+                        let token = Token {
+                            word: token.word,
+                            word_index: token.word_index + self.word_offset,
+                            char_index: token.char_index + self.char_offset,
+                        };
+
+                        // if this is the last iteration on this text
+                        // we must save the offsets for next texts
+                        if current.peek().is_none() {
+                            let hard_space = SeparatorCategory::Hard.to_usize();
+                            self.word_offset = token.word_index + hard_space;
+                            self.char_offset = token.char_index + hard_space;
+                        }
+
+                        Some(token)
+                    },
+                    None => {
+                        // no more words in this text we must
+                        // start tokenizing the next text
+                        self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
+                        self.next()
+                    },
+                }
+            },
+            // no more texts available
+            None => None,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;