493: Use smartstring to store the external id in our hashmap r=Kerollmops a=irevoire

We need to store all the external ids (primary keys) in a hashmap,
associated with their internal ids.
Using smartstring reduces heap allocations and memory usage, and should
improve cache locality.

I ran the benchmarks to measure the impact of this PR on the indexing time.
I think we should merge it whatever happens, though, because it'll decrease the memory consumption.

---------

This improves performance only very slightly, but it improves memory usage, so it should be merged.
```
group                                                             indexing_main_6b073738                 indexing_use-smartsring_3f343511
-----                                                             ----------------------                 --------------------------------
indexing/Indexing geo_point                                       1.02      25.2±0.20s        ? ?/sec    1.00      24.8±0.13s        ? ?/sec
indexing/Indexing movies in three batches                         1.00      18.2±0.10s        ? ?/sec    1.00      18.2±0.23s        ? ?/sec
indexing/Indexing movies with default settings                    1.00      17.5±0.09s        ? ?/sec    1.01      17.7±0.11s        ? ?/sec
indexing/Indexing songs in three batches with default settings    1.00      68.3±1.01s        ? ?/sec    1.00      68.0±0.95s        ? ?/sec
indexing/Indexing songs with default settings                     1.00      63.2±0.78s        ? ?/sec    1.00      63.0±0.58s        ? ?/sec
indexing/Indexing songs without any facets                        1.02      59.6±1.00s        ? ?/sec    1.00      58.5±1.03s        ? ?/sec
indexing/Indexing songs without faceted numbers                   1.00      62.8±0.38s        ? ?/sec    1.00      62.6±1.02s        ? ?/sec
indexing/Indexing wiki                                            1.01   1009.2±25.25s        ? ?/sec    1.00    998.1±11.27s        ? ?/sec
indexing/Indexing wiki in three batches                           1.01    1142.0±9.97s        ? ?/sec    1.00   1134.4±11.21s        ? ?/sec
```

Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
bors[bot] 2022-04-13 20:28:28 +00:00 committed by GitHub
commit 7791ef90e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 10 additions and 7 deletions

View File

@ -32,6 +32,7 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] }
slice-group-by = "0.3.0"
smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.8.0"
smartstring = "1.0.1"
tempfile = "3.3.0"
time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }
uuid = { version = "0.8.2", features = ["v4"] }

View File

@ -1109,8 +1109,11 @@ mod tests {
let mut big_object = HashMap::new();
big_object.insert(S("id"), "wow");
let content: String =
(0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap();
let content: String = (0..=u16::MAX)
.into_iter()
.map(|p| p.to_string())
.reduce(|a, b| a + " " + b.as_ref())
.unwrap();
big_object.insert("content".to_string(), &content);
let mut cursor = Cursor::new(Vec::new());

View File

@ -11,6 +11,7 @@ use itertools::Itertools;
use obkv::{KvReader, KvWriter};
use roaring::RoaringBitmap;
use serde_json::{Map, Value};
use smartstring::SmartString;
use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
use super::{IndexDocumentsMethod, IndexerConfig};
@ -55,7 +56,8 @@ pub struct Transform<'a, 'i> {
flattened_sorter: grenad::Sorter<MergeFn>,
replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap,
new_external_documents_ids_builder: FxHashMap<Vec<u8>, u64>,
// To increase the cache locality and reduce the heap usage we use smartstring.
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
documents_count: usize,
}
@ -254,10 +256,7 @@ impl<'a, 'i> Transform<'a, 'i> {
None => {
// if the document has already been inserted in this
// batch we need to get its docid
match self
.new_external_documents_ids_builder
.entry(external_id.as_bytes().to_vec())
{
match self.new_external_documents_ids_builder.entry(external_id.into()) {
Entry::Occupied(entry) => (*entry.get() as u32, false),
// if the document has never been encountered we give it a new docid
// and push this new docid to the external documents ids builder