Reduce the DocumentId size from 64 to 32 bits

2025-07-04 20:37:15 +02:00 · 2020-05-19 13:53:31 +02:00 · 2020-05-19 13:53:31 +02:00 · 788e2202c9
commit 788e2202c9
parent 3bca31856d
12 changed files with 33 additions and 32 deletions
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@ -191,6 +191,6 @@ mod tests {

    #[test]
    fn docindex_mem_size() {
-        assert_eq!(mem::size_of::<DocIndex>(), 16);
+        assert_eq!(mem::size_of::<DocIndex>(), 12);
    }
 }
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@ -228,7 +228,7 @@ mod tests {
        builder.into_inner().and_then(Set::from_bytes).unwrap()
    }

-    const fn doc_index(document_id: u64, word_index: u16) -> DocIndex {
+    const fn doc_index(document_id: u32, word_index: u16) -> DocIndex {
        DocIndex {
            document_id: DocumentId(document_id),
            attribute: 0,
@ -238,7 +238,7 @@ mod tests {
        }
    }

-    const fn doc_char_index(document_id: u64, word_index: u16, char_index: u16) -> DocIndex {
+    const fn doc_char_index(document_id: u32, word_index: u16, char_index: u16) -> DocIndex {
        DocIndex {
            document_id: DocumentId(document_id),
            attribute: 0,
--- a/meilisearch-core/src/store/docs_words.rs
+++ b/meilisearch-core/src/store/docs_words.rs
@ -1,4 +1,4 @@
-use super::BEU64;
+use super::BEU32;
 use crate::database::MainT;
 use crate::DocumentId;
 use heed::types::{ByteSlice, OwnedType};
@ -7,7 +7,7 @@ use std::sync::Arc;

 #[derive(Copy, Clone)]
 pub struct DocsWords {
-    pub(crate) docs_words: heed::Database<OwnedType<BEU64>, ByteSlice>,
+    pub(crate) docs_words: heed::Database<OwnedType<BEU32>, ByteSlice>,
 }

 impl DocsWords {
@ -17,13 +17,13 @@ impl DocsWords {
        document_id: DocumentId,
        words: &fst::Set,
    ) -> ZResult<()> {
-        let document_id = BEU64::new(document_id.0);
+        let document_id = BEU32::new(document_id.0);
        let bytes = words.as_fst().as_bytes();
        self.docs_words.put(writer, &document_id, bytes)
    }

    pub fn del_doc_words(self, writer: &mut heed::RwTxn<MainT>, document_id: DocumentId) -> ZResult<bool> {
-        let document_id = BEU64::new(document_id.0);
+        let document_id = BEU32::new(document_id.0);
        self.docs_words.delete(writer, &document_id)
    }

@ -36,7 +36,7 @@ impl DocsWords {
        reader: &heed::RoTxn<MainT>,
        document_id: DocumentId,
    ) -> ZResult<Option<fst::Set>> {
-        let document_id = BEU64::new(document_id.0);
+        let document_id = BEU32::new(document_id.0);
        match self.docs_words.get(reader, &document_id)? {
            Some(bytes) => {
                let len = bytes.len();
--- a/meilisearch-core/src/store/documents_ids.rs
+++ b/meilisearch-core/src/store/documents_ids.rs
@ -26,16 +26,16 @@ impl<'a> BytesDecode<'a> for DocumentsIds {

 pub struct DiscoverIds<'a> {
    ids_iter: std::slice::Iter<'a, DocumentId>,
-    left_id: Option<u64>,
-    right_id: Option<u64>,
-    available_range: std::ops::Range<u64>,
+    left_id: Option<u32>,
+    right_id: Option<u32>,
+    available_range: std::ops::Range<u32>,
 }

 impl DiscoverIds<'_> {
    pub fn new(ids: &Set<DocumentId>) -> DiscoverIds {
        let mut ids_iter = ids.iter();
        let right_id = ids_iter.next().map(|id| id.0);
-        let available_range = 0..right_id.unwrap_or(u64::max_value());
+        let available_range = 0..right_id.unwrap_or(u32::max_value());
        DiscoverIds { ids_iter, left_id: None, right_id, available_range }
    }
 }
@ -49,7 +49,7 @@ impl Iterator for DiscoverIds<'_> {
                // The available range gives us a new id, we return it.
                Some(id) => return Some(DocumentId(id)),
                // The available range is exhausted, we need to find the next one.
-                None if self.available_range.end == u64::max_value() => return None,
+                None if self.available_range.end == u32::max_value() => return None,
                None => loop {
                    self.left_id = self.right_id.take();
                    self.right_id = self.ids_iter.next().map(|id| id.0);
@ -61,9 +61,9 @@ impl Iterator for DiscoverIds<'_> {
                            break;
                        },
                        // The last used id has been reached, we can use all ids
-                        // until u64 MAX
+                        // until u32 MAX
                        (Some(l), None) => {
-                            self.available_range = l.saturating_add(1)..u64::max_value();
+                            self.available_range = l.saturating_add(1)..u32::max_value();
                            break;
                        },
                        _ => (),
--- a/meilisearch-core/src/store/main.rs
+++ b/meilisearch-core/src/store/main.rs
@ -153,7 +153,7 @@ impl Main {

    pub fn user_to_internal_id(self, reader: &heed::RoTxn<MainT>, userid: &str) -> ZResult<Option<DocumentId>> {
        let user_ids = self.user_ids(reader)?;
-        Ok(user_ids.get(userid).map(DocumentId))
+        Ok(user_ids.get(userid).map(|id| DocumentId(id as u32)))
    }

    pub fn put_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
--- a/meilisearch-core/src/store/mod.rs
+++ b/meilisearch-core/src/store/mod.rs
@ -45,20 +45,21 @@ use crate::serde::Deserializer;
 use crate::settings::SettingsUpdate;
 use crate::{query_builder::QueryBuilder, update, DocIndex, DocumentId, Error, MResult};

+type BEU32 = zerocopy::U32<byteorder::BigEndian>;
 type BEU64 = zerocopy::U64<byteorder::BigEndian>;
 pub type BEU16 = zerocopy::U16<byteorder::BigEndian>;

 #[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
 #[repr(C)]
 pub struct DocumentFieldIndexedKey {
-    docid: BEU64,
+    docid: BEU32,
    indexed_pos: BEU16,
 }

 impl DocumentFieldIndexedKey {
    fn new(docid: DocumentId, indexed_pos: IndexedPos) -> DocumentFieldIndexedKey {
        DocumentFieldIndexedKey {
-            docid: BEU64::new(docid.0),
+            docid: BEU32::new(docid.0),
            indexed_pos: BEU16::new(indexed_pos.0),
        }
    }
@ -67,14 +68,14 @@ impl DocumentFieldIndexedKey {
 #[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
 #[repr(C)]
 pub struct DocumentFieldStoredKey {
-    docid: BEU64,
+    docid: BEU32,
    field_id: BEU16,
 }

 impl DocumentFieldStoredKey {
    fn new(docid: DocumentId, field_id: FieldId) -> DocumentFieldStoredKey {
        DocumentFieldStoredKey {
-            docid: BEU64::new(docid.0),
+            docid: BEU32::new(docid.0),
            field_id: BEU16::new(field_id.0),
        }
    }
@ -98,7 +99,7 @@ impl<'a> BytesEncode<'a> for PostingsCodec {

        let mut buffer = Vec::with_capacity(u64_size + docids_size + matches_size);

-        let docids_len = item.docids.len();
+        let docids_len = item.docids.len() as u64;
        buffer.extend_from_slice(&docids_len.to_be_bytes());
        buffer.extend_from_slice(item.docids.as_bytes());
        buffer.extend_from_slice(item.matches.as_bytes());
--- a/meilisearch-core/src/store/prefix_documents_cache.rs
+++ b/meilisearch-core/src/store/prefix_documents_cache.rs
@ -4,7 +4,7 @@ use heed::types::{OwnedType, CowSlice};
 use heed::Result as ZResult;
 use zerocopy::{AsBytes, FromBytes};

-use super::BEU64;
+use super::{BEU64, BEU32};
 use crate::{DocumentId, Highlight};
 use crate::database::MainT;

@ -13,15 +13,15 @@ use crate::database::MainT;
 pub struct PrefixKey {
    prefix: [u8; 4],
    index: BEU64,
-    docid: BEU64,
+    docid: BEU32,
 }

 impl PrefixKey {
-    pub fn new(prefix: [u8; 4], index: u64, docid: u64) -> PrefixKey {
+    pub fn new(prefix: [u8; 4], index: u64, docid: u32) -> PrefixKey {
        PrefixKey {
            prefix,
            index: BEU64::new(index),
-            docid: BEU64::new(docid),
+            docid: BEU32::new(docid),
        }
    }
 }
@ -54,7 +54,7 @@ impl PrefixDocumentsCache {
        prefix: [u8; 4],
    ) -> ZResult<PrefixDocumentsIter<'txn>> {
        let start = PrefixKey::new(prefix, 0, 0);
-        let end = PrefixKey::new(prefix, u64::max_value(), u64::max_value());
+        let end = PrefixKey::new(prefix, u64::max_value(), u32::max_value());
        let iter = self.prefix_documents_cache.range(reader, &(start..=end))?;
        Ok(PrefixDocumentsIter { iter })
    }
--- a/meilisearch-core/src/update/documents_addition.rs
+++ b/meilisearch-core/src/update/documents_addition.rs
@ -242,7 +242,7 @@ pub fn apply_addition<'a, 'b>(

    index.main.put_schema(writer, &schema)?;

-    let new_user_ids = fst::Map::from_iter(new_user_ids)?;
+    let new_user_ids = fst::Map::from_iter(new_user_ids.iter().map(|(u, i)| (u, *i as u64)))?;
    let new_internal_ids = sdset::SetBuf::from_dirty(new_internal_ids);
    index.main.merge_user_ids(writer, &new_user_ids)?;
    index.main.merge_internal_ids(writer, &new_internal_ids)?;
--- a/meilisearch-core/src/update/documents_deletion.rs
+++ b/meilisearch-core/src/update/documents_deletion.rs
@ -80,7 +80,7 @@ pub fn apply_documents_deletion(
        let user_ids = index.main.user_ids(writer)?;
        for userid in new_user_ids.as_slice() {
            if let Some(id) = user_ids.get(userid) {
-                internal_ids.push(DocumentId(id));
+                internal_ids.push(DocumentId(id as u32));
            }
        }

--- a/meilisearch-core/src/update/helpers.rs
+++ b/meilisearch-core/src/update/helpers.rs
@ -105,7 +105,7 @@ pub fn discover_document_id(
 {
    if userid.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') {
        match user_ids.get(userid) {
-            Some(internal_id) => Ok(DocumentId(internal_id)),
+            Some(id) => Ok(DocumentId(id as u32)),
            None => {
                let internal_id = available_ids.next().expect("no more ids available");
                Ok(internal_id)
--- a/meilisearch-http/src/error.rs
+++ b/meilisearch-http/src/error.rs
@ -22,7 +22,7 @@ pub enum ResponseError {
    NotFound(String),
    OpenIndex(String),
    FilterParsing(String),
-    RetrieveDocument(u64, String),
+    RetrieveDocument(u32, String),
    SearchDocuments(String),
    PayloadTooLarge,
    UnsupportedMediaType,
@ -116,7 +116,7 @@ impl ResponseError {
        ResponseError::Maintenance
    }

-    pub fn retrieve_document(doc_id: u64, err: impl fmt::Display) -> ResponseError {
+    pub fn retrieve_document(doc_id: u32, err: impl fmt::Display) -> ResponseError {
        ResponseError::RetrieveDocument(doc_id, err.to_string())
    }

--- a/meilisearch-types/src/lib.rs
+++ b/meilisearch-types/src/lib.rs
@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize};
 #[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[repr(C)]
-pub struct DocumentId(pub u64);
+pub struct DocumentId(pub u32);

 /// This structure represent the position of a word
 /// in a document and its attributes.