From 5070b2772858a84670de7e20225c6d88285f5d9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 22 Feb 2019 15:40:39 +0100 Subject: [PATCH] feat: Make the tokenizer support parentheses Interpreting them as hard punctuation (like a dot). --- src/tokenizer/mod.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index a2910728d..e86e6203a 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -75,9 +75,9 @@ impl Separator { fn detect_separator(c: char) -> Option<Separator> { match c { - '.' | ';' | ',' | '!' | '?' | '-' => Some(Long), - ' ' | '\'' | '"' => Some(Short), - _ => None, + '.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Long), + ' ' | '\'' | '"' => Some(Short), + _ => None, } } @@ -150,11 +150,12 @@ mod tests { #[test] fn hard() { - let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe"); + let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)"); assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 })); assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 })); + assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 })); assert_eq!(tokenizer.next(), None); let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");