feat: Base WordArea on char index and length

This commit is contained in:
Clément Renault 2019-01-09 20:14:08 +01:00
parent 86bfb173ef
commit b53ef08d05
No known key found for this signature in database
GPG key ID: 0151CDAB43460DAE
4 changed files with 73 additions and 41 deletions

View file

@ -51,24 +51,14 @@ where B: TokenizerBuilder
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
let document_id = self.document_id;
// FIXME must u32::try_from instead
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
Ok(attribute) => attribute,
Err(_) => return Ok(()),
};
// FIXME must u16/u32::try_from instead
let word_area = match WordArea::new(char_index as u32, word.len() as u16) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex {
document_id: self.document_id,
attribute,
word_area
};
// insert the exact representation
let word_lower = word.to_lowercase();
@ -77,9 +67,26 @@ where B: TokenizerBuilder
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
// FIXME must u16/u32::try_from instead
let length = word_unidecoded.chars().count() as u16;
let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area };
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
}
// FIXME must u16/u32::try_from instead
let length = word.chars().count() as u16;
let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area };
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
}
Ok(())