fix: Reduce the size of the DocIndex type

2025-07-15 13:58:36 +02:00 · 2019-03-04 14:59:58 +01:00 · 2019-03-04 14:59:58 +01:00 · a45cc4b618
commit a45cc4b618
parent aef7d7825f
6 changed files with 23 additions and 23 deletions
--- a/examples/query-database.rs
+++ b/examples/query-database.rs
@@ -126,7 +126,7 @@ fn crop_text(
            (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
        })
        .map(|match_| {
-            Match { char_index: match_.char_index - start as u32, ..match_ }
+            Match { char_index: match_.char_index - start as u16, ..match_ }
        })
        .collect();

--- a/src/database/serde/indexer_serializer.rs
+++ b/src/database/serde/indexer_serializer.rs
@@ -56,7 +56,7 @@ where B: TokenizerBuilder

            // FIXME must u32::try_from instead
            let attribute = self.attribute.0;
-            let word_index = word_index as u32;
+            let word_index = word_index as u16;

            // insert the exact representation
            let word_lower = word.to_lowercase();
@@ -69,7 +69,7 @@ where B: TokenizerBuilder
                let word_unidecoded = unidecode::unidecode(word).to_lowercase();
                let word_unidecoded = word_unidecoded.trim();
                if word_lower != word_unidecoded {
-                    let char_index = char_index as u32;
+                    let char_index = char_index as u16;
                    let char_length = length;

                    let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
@@ -77,7 +77,7 @@ where B: TokenizerBuilder
                }
            }

-            let char_index = char_index as u32;
+            let char_index = char_index as u16;
            let char_length = length;

            let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -50,14 +50,14 @@ pub struct DocIndex {
    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: u16,
-    pub word_index: u32,
+    pub word_index: u16,

    /// The position in bytes where the word was found
    /// along with the length of it.
    ///
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
-    pub char_index: u32,
+    pub char_index: u16,
    pub char_length: u16,
 }

@@ -84,7 +84,7 @@ pub struct Match {
    /// The attribute in the document where the word was found
    /// along with the index in it.
    pub attribute: u16,
-    pub word_index: u32,
+    pub word_index: u16,

    /// Whether the word that match is an exact match or a prefix.
    pub is_exact: bool,
@@ -94,7 +94,7 @@ pub struct Match {
    ///
    /// It informs on the original word area in the text indexed
    /// without needing to run the tokenizer again.
-    pub char_index: u32,
+    pub char_index: u16,
    pub char_length: u16,
 }

@@ -116,9 +116,9 @@ impl Match {
            query_index: u32::max_value(),
            distance: u8::max_value(),
            attribute: u16::max_value(),
-            word_index: u32::max_value(),
+            word_index: u16::max_value(),
            is_exact: true,
-            char_index: u32::max_value(),
+            char_index: u16::max_value(),
            char_length: u16::max_value(),
        }
    }
@@ -131,6 +131,6 @@ mod tests {

    #[test]
    fn docindex_mem_size() {
-        assert_eq!(mem::size_of::<DocIndex>(), 24);
+        assert_eq!(mem::size_of::<DocIndex>(), 16);
    }
 }
--- a/src/rank/criterion/sum_of_words_position.rs
+++ b/src/rank/criterion/sum_of_words_position.rs
@@ -6,7 +6,7 @@ use crate::rank::criterion::Criterion;
 use crate::rank::RawDocument;

 #[inline]
-fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize {
+fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
    let mut sum_word_index = 0;
    let mut index = 0;

--- a/src/rank/criterion/words_proximity.rs
+++ b/src/rank/criterion/words_proximity.rs
@@ -5,14 +5,14 @@ use slice_group_by::GroupBy;
 use crate::rank::criterion::Criterion;
 use crate::rank::RawDocument;

-const MAX_DISTANCE: u32 = 8;
+const MAX_DISTANCE: u16 = 8;

 #[inline]
 fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
    (a.clone(), b.clone())
 }

-fn index_proximity(lhs: u32, rhs: u32) -> u32 {
+fn index_proximity(lhs: u16, rhs: u16) -> u16 {
    if lhs < rhs {
        cmp::min(rhs - lhs, MAX_DISTANCE)
    } else {
@@ -20,13 +20,13 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    }
 }

-fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
+fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
    if lattr != rattr { return MAX_DISTANCE }
    index_proximity(lwi, rwi)
 }

-fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 {
-    let mut min_prox = u32::max_value();
+fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
+    let mut min_prox = u16::max_value();

    for a in lattr.iter().zip(lwi) {
        for b in rattr.iter().zip(rwi) {
@@ -43,8 +43,8 @@ fn matches_proximity(
    query_index: &[u32],
    distance: &[u8],
    attribute: &[u16],
-    word_index: &[u32],
-) -> u32
+    word_index: &[u16],
+) -> u16
 {
    let mut query_index_groups = query_index.linear_group();
    let mut proximity = 0;
--- a/src/rank/mod.rs
+++ b/src/rank/mod.rs
@@ -79,7 +79,7 @@ impl RawDocument {
        unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
    }

-    pub fn word_index(&self) -> &[u32] {
+    pub fn word_index(&self) -> &[u16] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
@@ -93,7 +93,7 @@ impl RawDocument {
        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
    }

-    pub fn char_index(&self) -> &[u32] {
+    pub fn char_index(&self) -> &[u16] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
@@ -150,9 +150,9 @@ struct Matches {
    query_index: Vec<u32>,
    distance: Vec<u8>,
    attribute: Vec<u16>,
-    word_index: Vec<u32>,
+    word_index: Vec<u16>,
    is_exact: Vec<bool>,
-    char_index: Vec<u32>,
+    char_index: Vec<u16>,
    char_length: Vec<u16>,
 }