From 175b3dcb7528ce7c7a6596274cf3c4fb3ee20a49 Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Tue, 24 Sep 2019 20:14:20 +0200
Subject: [PATCH 1/2] fix: Do not consider underscores and middle dashes hard

---
 meilidb-tokenizer/src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs
index 50d577db8..25fd8af8b 100644
--- a/meilidb-tokenizer/src/lib.rs
+++ b/meilidb-tokenizer/src/lib.rs
@@ -39,8 +39,8 @@ fn is_separator(c: char) -> bool {
 
 fn classify_separator(c: char) -> Option<SeparatorCategory> {
     match c {
-        ' ' | '\'' | ':' | '"' => Some(Soft),
-        '.' | ';' | ',' | '!' | '?' | '-' | '_' | '(' | ')' => Some(Hard),
+        ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
+        '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
         _ => None,
     }
 }

From cdd69290c351551baff9d41212084c6554f1e0e9 Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Tue, 24 Sep 2019 20:49:42 +0200
Subject: [PATCH 2/2] test: Make the tests work with new separator limits

---
 meilidb-tokenizer/src/lib.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs
index 25fd8af8b..3cea72ffc 100644
--- a/meilidb-tokenizer/src/lib.rs
+++ b/meilidb-tokenizer/src/lib.rs
@@ -249,8 +249,8 @@ mod tests {
         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
         assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
         assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 18 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 25, char_index: 24 }));
         assert_eq!(tokenizer.next(), None);
     }
 
@@ -268,8 +268,8 @@ mod tests {
         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
         assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
         assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 16 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 25, char_index: 22 }));
         assert_eq!(tokenizer.next(), None);
     }
 
@@ -288,8 +288,8 @@ mod tests {
         assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
         assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
         assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 4, char_index: 14 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 5, char_index: 23 }));
         assert_eq!(tokenizer.next(), None);
     }
 }
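
Note on the behavior change: reclassifying '-' and '_' as soft separators keeps the
pieces of hyphenated and snake_case words close together in the index. The sketch
below is not the crate's actual Tokenizer implementation; it is a minimal,
self-contained illustration of the rule, assuming soft separators advance word_index
by 1 and hard ones by 8, which is the delta visible in the updated test expectations
(e.g. "wtf" at 16 followed by "lol" at 17 across a "-", then "aïe" at 25 across a ".").

    // Minimal sketch, assuming the padding rule implied by the updated tests:
    // crossing a soft separator advances word_index by 1, crossing a hard one
    // by 8. This is NOT the meilidb-tokenizer API, just an illustration.

    #[derive(Debug, Copy, Clone, PartialEq, Eq)]
    enum SeparatorCategory {
        Soft,
        Hard,
    }

    use SeparatorCategory::{Hard, Soft};

    fn classify_separator(c: char) -> Option<SeparatorCategory> {
        match c {
            ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
            '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
            _ => None,
        }
    }

    // Walk the text, emitting (word, word_index) pairs. A run of mixed
    // separators counts as hard if any separator in the run is hard.
    fn tokenize(text: &str) -> Vec<(String, usize)> {
        let mut tokens = Vec::new();
        let mut word = String::new();
        let mut word_index = 0;
        let mut pending: Option<SeparatorCategory> = None;

        for c in text.chars() {
            match classify_separator(c) {
                Some(category) => {
                    if !word.is_empty() {
                        tokens.push((std::mem::take(&mut word), word_index));
                    }
                    pending = match (pending, category) {
                        (Some(Hard), _) | (_, Hard) => Some(Hard),
                        _ => Some(Soft),
                    };
                }
                None => {
                    if let Some(category) = pending.take() {
                        // Assumed padding: soft = 1, hard = 8 (matches the
                        // word_index deltas in the patched tests).
                        word_index += if category == Hard { 8 } else { 1 };
                    }
                    word.push(c);
                }
            }
        }
        if !word.is_empty() {
            tokens.push((word, word_index));
        }
        tokens
    }

    fn main() {
        // '_' and '-' are now soft: the two halves sit one word_index apart
        // instead of eight, so they stay close for proximity purposes.
        assert_eq!(
            tokenize("snake_case"),
            vec![("snake".to_string(), 0), ("case".to_string(), 1)],
        );
        assert_eq!(
            tokenize("wtf - lol . aïe"),
            vec![
                ("wtf".to_string(), 0),
                ("lol".to_string(), 1), // soft "-": +1
                ("aïe".to_string(), 9), // hard ".": +8
            ],
        );
        println!("ok: {:?}", tokenize("wtf - lol . aïe"));
    }

Presumably the point of the smaller gap is that proximity-based ranking then treats
the halves of a hyphenated or snake_case term as near-adjacent words rather than as
words separated by a sentence-level break.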