From ccded7b4299b5923c457147b6fea53e1f583ba77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Nov 2019 16:09:32 +0100 Subject: [PATCH 1/4] Improve the indexer to not deunicode before indexing Revert of #179 --- meilidb-core/src/raw_indexer.rs | 99 ++++++++++++--------------------- 1 file changed, 37 insertions(+), 62 deletions(-) diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 85adaf750..13c87f5c3 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -37,54 +37,8 @@ impl RawIndexer { pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize { let mut number_of_words = 0; - let lowercase_text = text.to_lowercase(); - let deunicoded = deunicode_with_tofu(&lowercase_text, ""); - // TODO compute the deunicoded version after the cjk check - let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded { - Some(deunicoded) - } else { - None - }; - let iter = Some(lowercase_text).into_iter().chain(next); - - for text in iter { - // we must not count 2 times the same words - number_of_words = 0; - - for token in Tokenizer::new(&text) { - let must_continue = index_token( - token, - id, - attr, - self.word_limit, - &self.stop_words, - &mut self.words_doc_indexes, - &mut self.docs_words, - ); - - if !must_continue { - break; - } - - number_of_words += 1; - } - } - - number_of_words - } - - pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I) - where - I: IntoIterator, - IT: Iterator + Clone, - { - // TODO serialize this to one call to the SeqTokenizer loop - - let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect(); - let iter = lowercased.iter().map(|t| t.as_str()); - - for token in SeqTokenizer::new(iter) { + for token in Tokenizer::new(text) { let must_continue = index_token( token, id, attr, self.word_limit, &self.stop_words, &mut self.words_doc_indexes, &mut self.docs_words, ); + number_of_words += 1; + if 
!must_continue { break; } } - let deunicoded: Vec<_> = lowercased - .into_iter() - .map(|lowercase_text| { - if lowercase_text.contains(is_cjk) { - return lowercase_text; - } - let deunicoded = deunicode_with_tofu(&lowercase_text, ""); - if lowercase_text != deunicoded { - deunicoded - } else { - lowercase_text - } - }) - .collect(); - let iter = deunicoded.iter().map(|t| t.as_str()); + number_of_words + } + pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I) + where + I: IntoIterator, + { + let iter = iter.into_iter(); for token in SeqTokenizer::new(iter) { let must_continue = index_token( token, @@ -170,6 +118,12 @@ fn index_token( return false; } + let lower = token.word.to_lowercase(); + let token = Token { + word: &lower, + ..token + }; + if !stop_words.contains(&token.word) { match token_to_docindex(id, attr, token) { Some(docindex) => { @@ -182,6 +136,27 @@ fn index_token( } None => return false, } + + if !lower.contains(is_cjk) { + let unidecoded = deunicode_with_tofu(&lower, ""); + if unidecoded != lower { + let token = Token { + word: &unidecoded, + ..token + }; + match token_to_docindex(id, attr, token) { + Some(docindex) => { + let word = Vec::from(token.word); + words_doc_indexes + .entry(word.clone()) + .or_insert_with(Vec::new) + .push(docindex); + docs_words.entry(id).or_insert_with(Vec::new).push(word); + } + None => return false, + } + } + } } true From de2b8672d48e4d75444cd34e321ba8d14f7e6465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Nov 2019 16:10:13 +0100 Subject: [PATCH 2/4] Make the tokenizer understand strange whitespaces/quotes --- meilidb-tokenizer/Cargo.toml | 1 + meilidb-tokenizer/src/lib.rs | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml index 06a3be9ad..97220ef24 100644 --- a/meilidb-tokenizer/Cargo.toml +++ b/meilidb-tokenizer/Cargo.toml @@ -5,4 +5,5 @@ authors = ["Kerollmops "] 
edition = "2018" [dependencies] +deunicode = "1.0.0" slice-group-by = "0.2.4" diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs index 106d0f91f..6bbcbea3d 100644 --- a/meilidb-tokenizer/src/lib.rs +++ b/meilidb-tokenizer/src/lib.rs @@ -1,4 +1,5 @@ use self::SeparatorCategory::*; +use deunicode::deunicode_char; use slice_group_by::StrGroupBy; use std::iter::Peekable; @@ -43,7 +44,10 @@ fn is_separator(c: char) -> bool { fn classify_separator(c: char) -> Option { match c { - ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft), + c if c.is_whitespace() => Some(Soft), // whitespaces + c if deunicode_char(c) == Some("'") => Some(Soft), // quotes + c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes + '-' | '_' | '\'' | ':' => Some(Soft), '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard), _ => None, } From 4571b80a4945f13b812956ebdb7cd5b7719da7cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Nov 2019 16:10:31 +0100 Subject: [PATCH 3/4] Update the tests --- meilidb-core/src/raw_indexer.rs | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 13c87f5c3..988a5182e 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -199,10 +199,8 @@ mod tests { assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); - - // with the ugly apostrophe... assert!(words_doc_indexes - .get(&"l’éteindre".to_owned().into_bytes()) + .get(&"éteindre".to_owned().into_bytes()) .is_some()); } @@ -223,10 +221,8 @@ mod tests { assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some()); assert!(words_doc_indexes.get(&b"ai"[..]).is_some()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); - - // with the ugly apostrophe... 
assert!(words_doc_indexes - .get(&"l’éteindre".to_owned().into_bytes()) + .get(&"éteindre".to_owned().into_bytes()) .is_some()); } @@ -252,10 +248,8 @@ mod tests { assert!(words_doc_indexes.get(&b"ai"[..]).is_none()); assert!(words_doc_indexes.get(&b"de"[..]).is_none()); assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some()); - - // with the ugly apostrophe... assert!(words_doc_indexes - .get(&"l’éteindre".to_owned().into_bytes()) + .get(&"éteindre".to_owned().into_bytes()) .is_some()); } } From 3b1cbed2385edcc776a2b84b6c484a9fe25114a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 4 Nov 2019 16:58:02 +0100 Subject: [PATCH 4/4] Check that the unidecoded words are not empty --- meilidb-core/src/raw_indexer.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 988a5182e..3e0f212f7 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -139,11 +139,12 @@ fn index_token( if !lower.contains(is_cjk) { let unidecoded = deunicode_with_tofu(&lower, ""); - if unidecoded != lower { + if unidecoded != lower && !unidecoded.is_empty() { let token = Token { word: &unidecoded, ..token }; + match token_to_docindex(id, attr, token) { Some(docindex) => { let word = Vec::from(token.word); @@ -252,4 +253,22 @@ mod tests { .get(&"éteindre".to_owned().into_bytes()) .is_some()); } + + #[test] + fn no_empty_unidecode() { + let mut indexer = RawIndexer::new(fst::Set::default()); + + let docid = DocumentId(0); + let attr = SchemaAttr(0); + let text = "🇯🇵"; + indexer.index_text(docid, attr, text); + + let Indexed { + words_doc_indexes, .. + } = indexer.build(); + + assert!(words_doc_indexes + .get(&"🇯🇵".to_owned().into_bytes()) + .is_some()); + } }