Make it use random lookups before a whole scan

This commit is contained in:
Kerollmops 2025-02-13 10:14:34 +01:00
parent 081f614a5e
commit a2eb64a5de
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
3 changed files with 41 additions and 21 deletions

1
Cargo.lock generated
View File

@@ -3733,6 +3733,7 @@ dependencies = [
"indexmap", "indexmap",
"meilisearch-auth", "meilisearch-auth",
"meilisearch-types", "meilisearch-types",
"rand",
"serde", "serde",
"serde_json", "serde_json",
"tempfile", "tempfile",

View File

@@ -17,6 +17,7 @@ file-store = { path = "../file-store" }
indexmap = { version = "2.7.0", features = ["serde"] } indexmap = { version = "2.7.0", features = ["serde"] }
meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" } meilisearch-types = { path = "../meilisearch-types" }
rand = { version = "0.8", default-features = false }
serde = { version = "1.0.217", features = ["derive"] } serde = { version = "1.0.217", features = ["derive"] }
serde_json = { version = "1.0.135", features = ["preserve_order"] } serde_json = { version = "1.0.135", features = ["preserve_order"] }
tempfile = "3.15.0" tempfile = "3.15.0"

View File

@@ -1,7 +1,6 @@
use std::fs::{read_dir, read_to_string, remove_file, File}; use std::fs::{read_dir, read_to_string, remove_file, File};
use std::hint::black_box; use std::hint::black_box;
use std::io::{BufWriter, Write as _}; use std::io::{BufWriter, Write as _};
use std::ops::Bound;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::atomic::{AtomicUsize, Ordering};
use std::thread; use std::thread;
@@ -24,6 +23,8 @@ use meilisearch_types::milli::{obkv_to_json, BEU32};
use meilisearch_types::tasks::{Status, Task}; use meilisearch_types::tasks::{Status, Task};
use meilisearch_types::versioning::{get_version, parse_version}; use meilisearch_types::versioning::{get_version, parse_version};
use meilisearch_types::Index; use meilisearch_types::Index;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use serde_json::Value::Object; use serde_json::Value::Object;
use time::macros::format_description; use time::macros::format_description;
use time::OffsetDateTime; use time::OffsetDateTime;
@@ -652,6 +653,8 @@ fn hair_dryer(
let num_keys = database.len(&rtxn)? as usize; let num_keys = database.len(&rtxn)? as usize;
let first_entry = database.iter(&rtxn)?.next().transpose()?; let first_entry = database.iter(&rtxn)?.next().transpose()?;
let last_entry = database.rev_iter(&rtxn)?.next().transpose()?; let last_entry = database.rev_iter(&rtxn)?.next().transpose()?;
// Visit more keys per thread to ensure wide coverage.
let total_keys_to_visit = num_keys * 100;
let keys_by_thread = num_keys / total_threads; let keys_by_thread = num_keys / total_threads;
let Some(((first_key, _), (last_key, _))) = first_entry.zip(last_entry) let Some(((first_key, _), (last_key, _))) = first_entry.zip(last_entry)
@@ -662,24 +665,53 @@ fn hair_dryer(
let first_key_num = first_key.try_into().map(u64::from_be_bytes).unwrap(); let first_key_num = first_key.try_into().map(u64::from_be_bytes).unwrap();
let last_key_num = last_key.try_into().map(u64::from_be_bytes).unwrap(); let last_key_num = last_key.try_into().map(u64::from_be_bytes).unwrap();
eprintln!("between {first_key_num:x} and {last_key_num:x}"); eprintln!("Iterating over {keys_by_thread} entries by thread ({total_threads}x)...");
eprintln!("Iterating over {keys_by_thread} entries by thread...");
let progress = AtomicUsize::new(0); let progress = AtomicUsize::new(0);
let count = thread::scope(|s| -> anyhow::Result<usize> { thread::scope(|s| -> anyhow::Result<()> {
let mut handles = Vec::new(); let mut handles = Vec::new();
for tid in 0..total_threads { for tid in 0..total_threads {
let index = &index; let index = &index;
let progress = &progress; let progress = &progress;
let handle = s.spawn(move || -> anyhow::Result<usize> { let handle = s.spawn(move || -> anyhow::Result<()> {
let rtxn = index.read_txn()?; let rtxn = index.read_txn()?;
let start = first_key_num + (keys_by_thread * tid) as u64;
let start_bytes = start.to_be_bytes();
let range = (Bound::Included(&start_bytes[..]), Bound::Unbounded);
let mut count: usize = 0; let mut rng = StdRng::seed_from_u64(tid as u64);
for result in database.range(&rtxn, &range)?.take(keys_by_thread) { for _ in 0..keys_by_thread {
let random_key_num = rng.gen_range(first_key_num..=last_key_num);
let random_key = random_key_num.to_be_bytes();
let Some((key, value)) = database.get_greater_than(&rtxn, &random_key)? else {
continue;
};
// All of this just to avoid compiler optimizations 🤞
// We must read all the bytes to make the pages hot in cache.
// <https://doc.rust-lang.org/std/hint/fn.black_box.html>
black_box(key.iter().fold(0, |acc, _| acc + 1));
black_box(value.iter().fold(0, |acc, _| acc + 1));
let current_progress = progress.fetch_add(1, Ordering::Relaxed);
if current_progress % 10_000 == 0 {
let perc = (current_progress as f64) / (total_keys_to_visit as f64) * 100.0;
eprintln!("Visited {current_progress}/{total_keys_to_visit} ({perc:.2}%) keys");
}
}
Ok(())
});
handles.push(handle);
}
handles.into_iter().try_for_each(|h| h.join().unwrap())
})?;
eprintln!("Doing a last pass on all the keys...");
let mut count = 0;
for (i, result) in database.iter(&rtxn)?.enumerate() {
let (key, value) = result?; let (key, value) = result?;
// All of this just to avoid compiler optimizations 🤞 // All of this just to avoid compiler optimizations 🤞
@@ -688,26 +720,12 @@ fn hair_dryer(
count += black_box(key.iter().fold(0, |acc, _| acc + 1)); count += black_box(key.iter().fold(0, |acc, _| acc + 1));
count += black_box(value.iter().fold(0, |acc, _| acc + 1)); count += black_box(value.iter().fold(0, |acc, _| acc + 1));
let current_progress = progress.fetch_add(1, Ordering::Relaxed); if i % 10_000 == 0 {
if current_progress % 10_000 == 0 { let perc = (i as f64) / (total_keys_to_visit as f64) * 100.0;
let perc = (current_progress as f64) / (num_keys as f64) * 100.0; eprintln!("Visited {i}/{total_keys_to_visit} ({perc:.2}%) keys");
eprintln!("Visited {current_progress}/{num_keys} ({perc:.2}%) keys");
} }
} }
Ok(count)
});
handles.push(handle);
}
let mut count = 0usize;
for handle in handles {
count += handle.join().unwrap()?;
}
Ok(count)
})?;
eprintln!("Done hair drying a total of at least {count} bytes."); eprintln!("Done hair drying a total of at least {count} bytes.");
} }
} }