From ee64f4a9367785dcafb0ac2a9738bc20d146c862 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 11 Apr 2022 15:43:18 +0200 Subject: [PATCH] Use smartstring to store the external id in our hashmap We need to store all the external ids (primary keys) in a hashmap, each associated with its internal id, during document indexing. Using smartstring reduces heap allocations and memory usage, and should improve cache locality. --- milli/Cargo.toml | 1 + milli/src/update/index_documents/mod.rs | 7 +++++-- milli/src/update/index_documents/transform.rs | 9 ++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 641fb71e8..1295c4384 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -32,6 +32,7 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] } slice-group-by = "0.3.0" smallstr = { version = "0.3.0", features = ["serde"] } smallvec = "1.8.0" +smartstring = "1.0.1" tempfile = "3.3.0" time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } uuid = { version = "0.8.2", features = ["v4"] } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index eb50a85ed..ae353b0df 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1109,8 +1109,11 @@ mod tests { let mut big_object = HashMap::new(); big_object.insert(S("id"), "wow"); - let content: String = - (0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap(); + let content: String = (0..=u16::MAX) + .into_iter() + .map(|p| p.to_string()) + .reduce(|a, b| a + " " + b.as_ref()) + .unwrap(); big_object.insert("content".to_string(), &content); let mut cursor = Cursor::new(Vec::new()); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 4413e00ca..cbb6ed428 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ 
-11,6 +11,7 @@ use itertools::Itertools; use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; use serde_json::{Map, Value}; +use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::{IndexDocumentsMethod, IndexerConfig}; @@ -55,7 +56,8 @@ pub struct Transform<'a, 'i> { flattened_sorter: grenad::Sorter, replaced_documents_ids: RoaringBitmap, new_documents_ids: RoaringBitmap, - new_external_documents_ids_builder: FxHashMap, u64>, + // To improve cache locality and reduce heap usage we use smartstring. + new_external_documents_ids_builder: FxHashMap, u64>, documents_count: usize, } @@ -254,10 +256,7 @@ impl<'a, 'i> Transform<'a, 'i> { None => { // if the document has already been inserted in this // batch we need to get its docid - match self - .new_external_documents_ids_builder - .entry(external_id.as_bytes().to_vec()) - { + match self.new_external_documents_ids_builder.entry(external_id.into()) { Entry::Occupied(entry) => (*entry.get() as u32, false), // if the document has never been encountered we give it a new docid // and push this new docid to the external documents ids builder