From a3a28c56fad032cf832bc610b8051d0ffd7c76a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sat, 2 Feb 2019 14:17:50 +0100
Subject: [PATCH] feat: Replace compressed Match fields by uncompressed ones

---
 examples/query-database.rs               | 10 +++----
 src/data/doc_indexes.rs                  | 36 ++++++++++++++++--------
 src/database/serde/indexer_serializer.rs | 22 ++++++---------
 src/lib.rs                               | 26 +++++++++++------
 src/rank/query_builder.rs                |  4 ++-
 5 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/examples/query-database.rs b/examples/query-database.rs
index 0a8771a51..d1e6a0e17 100644
--- a/examples/query-database.rs
+++ b/examples/query-database.rs
@@ -70,12 +70,10 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr)
     let mut byte_indexes = BTreeMap::new();
 
     for match_ in matches {
-        let match_attribute = match_.attribute.attribute();
+        let match_attribute = match_.attribute;
         if SchemaAttr::new(match_attribute) == attribute {
-            let word_area = match_.word_area;
-
-            let char_index = word_area.char_index() as usize;
-            let char_length = word_area.length() as usize;
+            let char_index = match_.char_index as usize;
+            let char_length = match_.char_length as usize;
             let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
 
             match byte_indexes.entry(byte_index) {
@@ -151,7 +149,7 @@ fn main() -> Result<(), Box<Error>> {
 
             let mut matching_attributes = HashSet::new();
             for _match in doc.matches {
-                let attr = SchemaAttr::new(_match.attribute.attribute());
+                let attr = SchemaAttr::new(_match.attribute);
                 let name = schema.attribute_name(attr);
                 matching_attributes.insert(name);
             }
diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs
index b760765bf..4919b9fa0 100644
--- a/src/data/doc_indexes.rs
+++ b/src/data/doc_indexes.rs
@@ -158,18 +158,24 @@ mod tests {
     fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
         let a = DocIndex {
             document_id: DocumentId(0),
-            attribute: Attribute::new_faillible(3, 11),
-            word_area: WordArea::new_faillible(30, 4)
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
         };
         let b = DocIndex {
             document_id: DocumentId(1),
-            attribute: Attribute::new_faillible(4, 21),
-            word_area: WordArea::new_faillible(35, 6)
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
         };
         let c = DocIndex {
             document_id: DocumentId(2),
-            attribute: Attribute::new_faillible(8, 2),
-            word_area: WordArea::new_faillible(89, 6)
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
         };
 
         let mut builder = DocIndexesBuilder::memory();
@@ -193,18 +199,24 @@ mod tests {
     fn serialize_deserialize() -> Result<(), Box<Error>> {
         let a = DocIndex {
             document_id: DocumentId(0),
-            attribute: Attribute::new_faillible(3, 11),
-            word_area: WordArea::new_faillible(30, 4)
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
         };
         let b = DocIndex {
             document_id: DocumentId(1),
-            attribute: Attribute::new_faillible(4, 21),
-            word_area: WordArea::new_faillible(35, 6)
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
         };
         let c = DocIndex {
             document_id: DocumentId(2),
-            attribute: Attribute::new_faillible(8, 2),
-            word_area: WordArea::new_faillible(89, 6)
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
         };
 
         let mut builder = DocIndexesBuilder::memory();
diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs
index bdbfb281d..6271e1b7b 100644
--- a/src/database/serde/indexer_serializer.rs
+++ b/src/database/serde/indexer_serializer.rs
@@ -54,10 +54,8 @@ where B: TokenizerBuilder
             let document_id = self.document_id;
 
             // FIXME must u32::try_from instead
-            let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
-                Ok(attribute) => attribute,
-                Err(_) => return Ok(()),
-            };
+            let attribute = self.attribute.0;
+            let word_index = word_index as u32;
 
             // insert the exact representation
             let word_lower = word.to_lowercase();
@@ -68,21 +66,17 @@ where B: TokenizerBuilder
             // and the unidecoded lowercased version
             let word_unidecoded = unidecode::unidecode(word).to_lowercase();
             if word_lower != word_unidecoded {
-                let word_area = match WordArea::new(char_index as u32, length) {
-                    Ok(word_area) => word_area,
-                    Err(_) => return Ok(()),
-                };
+                let char_index = char_index as u32;
+                let char_length = length;
 
-                let doc_index = DocIndex { document_id, attribute, word_area };
+                let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
                 self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
             }
 
-            let word_area = match WordArea::new(char_index as u32, length) {
-                Ok(word_area) => word_area,
-                Err(_) => return Ok(()),
-            };
+            let char_index = char_index as u32;
+            let char_length = length;
 
-            let doc_index = DocIndex { document_id, attribute, word_area };
+            let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
             self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
         }
         Ok(())
diff --git a/src/lib.rs b/src/lib.rs
index 19e451f63..5f824b39a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -36,14 +36,16 @@ pub struct DocIndex {
 
     /// The attribute in the document where the word was found
     /// along with the index in it.
-    pub attribute: Attribute,
+    pub attribute: u16,
+    pub word_index: u32,
 
     /// The position in bytes where the word was found
     /// along with the length of it.
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
-    pub word_area: WordArea,
+    pub char_index: u32,
+    pub char_length: u16,
 }
 
 /// This structure represent a matching word with informations
@@ -68,7 +70,8 @@ pub struct Match {
 
     /// The attribute in the document where the word was found
     /// along with the index in it.
-    pub attribute: Attribute,
+    pub attribute: u16,
+    pub word_index: u32,
 
     /// Whether the word that match is an exact match or a prefix.
     pub is_exact: bool,
@@ -78,7 +81,8 @@
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
-    pub word_area: WordArea,
+    pub char_index: u32,
+    pub char_length: u16,
 }
 
 impl Match {
@@ -86,9 +90,11 @@ impl Match {
         Match {
             query_index: 0,
             distance: 0,
-            attribute: Attribute::new_faillible(0, 0),
+            attribute: 0,
+            word_index: 0,
             is_exact: false,
-            word_area: WordArea::new_faillible(0, 0),
+            char_index: 0,
+            char_length: 0,
         }
     }
 
@@ -96,9 +102,11 @@ impl Match {
         Match {
             query_index: u32::max_value(),
             distance: u8::max_value(),
-            attribute: Attribute::max_value(),
+            attribute: u16::max_value(),
+            word_index: u32::max_value(),
             is_exact: true,
-            word_area: WordArea::max_value(),
+            char_index: u32::max_value(),
+            char_length: u16::max_value(),
         }
     }
 }
@@ -110,6 +118,6 @@ mod tests {
 
     #[test]
     fn docindex_mem_size() {
-        assert_eq!(mem::size_of::<DocIndex>(), 16);
+        assert_eq!(mem::size_of::<DocIndex>(), 24);
     }
 }
diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs
index 8146fc7fa..91d645160 100644
--- a/src/rank/query_builder.rs
+++ b/src/rank/query_builder.rs
@@ -111,8 +111,10 @@ where D: Deref,
                     query_index: iv.index as u32,
                     distance: distance,
                     attribute: doc_index.attribute,
+                    word_index: doc_index.word_index,
                     is_exact: is_exact,
-                    word_area: doc_index.word_area,
+                    char_index: doc_index.char_index,
+                    char_length: doc_index.char_length,
                 };
                 matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
             }
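
Below the patch, a minimal standalone sketch of the new uncompressed DocIndex layout and the arithmetic behind the updated size assertion. The field names and integer types come straight from the hunks above; the DocumentId(u64) stand-in is an assumption inferred from the old 16-byte size (8 + 4 + 4), not something the patch states.

    use std::mem;

    // Stand-in for the crate's DocumentId newtype; the u64 payload is an
    // assumption inferred from the previous 16-byte DocIndex.
    struct DocumentId(u64);

    // The uncompressed layout introduced by this patch: the packed
    // Attribute and WordArea wrappers are replaced by plain integer fields.
    struct DocIndex {
        document_id: DocumentId,
        attribute: u16,
        word_index: u32,
        char_index: u32,
        char_length: u16,
    }

    fn main() {
        // 8 + 2 + 4 + 4 + 2 = 20 bytes of fields, rounded up to 24 to keep
        // the 8-byte alignment required by the u64 inside DocumentId, which
        // is why the docindex_mem_size assertion moves from 16 to 24.
        assert_eq!(mem::size_of::<DocIndex>(), 24);
        println!("DocIndex: {} bytes", mem::size_of::<DocIndex>());
    }

With the default Rust representation the compiler may reorder the fields, but the data still occupies 20 bytes and is padded to a multiple of 8, so the 24-byte result holds either way.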