Use smartstring to store the external id in our hashmap

We need to store all the external ids (primary keys) in a hashmap,
each associated with its internal id, during indexing.
Using smartstring reduces heap allocations and memory usage, and should
improve cache locality.
This commit is contained in:
Tamo 2022-04-11 15:43:18 +02:00 committed by Irevoire
parent 456887a54a
commit ee64f4a936
No known key found for this signature in database
GPG Key ID: 7A6A970C96104F1B
3 changed files with 10 additions and 7 deletions

View File

@ -32,6 +32,7 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] }
slice-group-by = "0.3.0" slice-group-by = "0.3.0"
smallstr = { version = "0.3.0", features = ["serde"] } smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.8.0" smallvec = "1.8.0"
smartstring = "1.0.1"
tempfile = "3.3.0" tempfile = "3.3.0"
time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }
uuid = { version = "0.8.2", features = ["v4"] } uuid = { version = "0.8.2", features = ["v4"] }

View File

@ -1109,8 +1109,11 @@ mod tests {
let mut big_object = HashMap::new(); let mut big_object = HashMap::new();
big_object.insert(S("id"), "wow"); big_object.insert(S("id"), "wow");
let content: String = let content: String = (0..=u16::MAX)
(0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap(); .into_iter()
.map(|p| p.to_string())
.reduce(|a, b| a + " " + b.as_ref())
.unwrap();
big_object.insert("content".to_string(), &content); big_object.insert("content".to_string(), &content);
let mut cursor = Cursor::new(Vec::new()); let mut cursor = Cursor::new(Vec::new());

View File

@ -11,6 +11,7 @@ use itertools::Itertools;
use obkv::{KvReader, KvWriter}; use obkv::{KvReader, KvWriter};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::{Map, Value}; use serde_json::{Map, Value};
use smartstring::SmartString;
use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
use super::{IndexDocumentsMethod, IndexerConfig}; use super::{IndexDocumentsMethod, IndexerConfig};
@ -55,7 +56,8 @@ pub struct Transform<'a, 'i> {
flattened_sorter: grenad::Sorter<MergeFn>, flattened_sorter: grenad::Sorter<MergeFn>,
replaced_documents_ids: RoaringBitmap, replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap, new_documents_ids: RoaringBitmap,
new_external_documents_ids_builder: FxHashMap<Vec<u8>, u64>, // To increase the cache locality and reduce the heap usage we use smartstring.
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
documents_count: usize, documents_count: usize,
} }
@ -254,10 +256,7 @@ impl<'a, 'i> Transform<'a, 'i> {
None => { None => {
// if the document has already been inserted in this // if the document has already been inserted in this
// batch we need to get its docid // batch we need to get its docid
match self match self.new_external_documents_ids_builder.entry(external_id.into()) {
.new_external_documents_ids_builder
.entry(external_id.as_bytes().to_vec())
{
Entry::Occupied(entry) => (*entry.get() as u32, false), Entry::Occupied(entry) => (*entry.get() as u32, false),
// if the document has never been encountered we give it a new docid // if the document has never been encountered we give it a new docid
// and push this new docid to the external documents ids builder // and push this new docid to the external documents ids builder