Implement a basic non-working bumpalo-based cache

This commit is contained in:
Clément Renault 2024-10-16 11:11:07 +02:00
parent c1fcb2ebc6
commit 7169dd8e37
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
5 changed files with 55 additions and 261 deletions

5
Cargo.lock generated
View File

@ -3567,6 +3567,7 @@ dependencies = [
name = "milli" name = "milli"
version = "1.11.0" version = "1.11.0"
dependencies = [ dependencies = [
"allocator-api2",
"arroy", "arroy",
"big_s", "big_s",
"bimap", "bimap",
@ -3590,7 +3591,7 @@ dependencies = [
"fxhash", "fxhash",
"geoutils", "geoutils",
"grenad", "grenad",
"hashbrown 0.14.5", "hashbrown 0.15.0",
"heed", "heed",
"hf-hub", "hf-hub",
"indexmap", "indexmap",
@ -4434,7 +4435,7 @@ dependencies = [
[[package]] [[package]]
name = "raw-collections" name = "raw-collections"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/dureuill/raw-collections.git#0ecd143c1707d237e3c4d749bc685418da2fccc2" source = "git+https://github.com/dureuill/raw-collections.git#147dfe8eee739f2638c921c83e7d64ca1d47dcb2"
dependencies = [ dependencies = [
"allocator-api2", "allocator-api2",
"bumpalo", "bumpalo",

View File

@ -94,10 +94,11 @@ tracing = "0.1.40"
ureq = { version = "2.10.0", features = ["json"] } ureq = { version = "2.10.0", features = ["json"] }
url = "2.5.2" url = "2.5.2"
rayon-par-bridge = "0.1.0" rayon-par-bridge = "0.1.0"
hashbrown = "0.14.5" hashbrown = "0.15.0"
raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" } raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
bumpalo = "3.16.0" bumpalo = "3.16.0"
thread_local = "1.1.8" thread_local = "1.1.8"
allocator-api2 = "0.2.18"
[dev-dependencies] [dev-dependencies]
mimalloc = { version = "0.1.43", default-features = false } mimalloc = { version = "0.1.43", default-features = false }

View File

@ -1,21 +1,28 @@
use std::cell::RefCell;
use std::fmt::Write as _; use std::fmt::Write as _;
use std::mem;
use std::num::NonZeroUsize;
use bumpalo::Bump;
use grenad::{MergeFunction, Sorter}; use grenad::{MergeFunction, Sorter};
use raw_collections::alloc::{RefBump, RefBytes};
use roaring::bitmap::Statistics; use roaring::bitmap::Statistics;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use smallvec::SmallVec;
use super::lru::Lru;
use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::CboRoaringBitmapCodec; use crate::CboRoaringBitmapCodec;
const KEY_SIZE: usize = 12; const KEY_SIZE: usize = 12;
#[derive(Debug)] #[derive(Debug)]
pub struct CboCachedSorter<MF> { pub struct CboCachedSorter<'extractor, MF> {
cache: Lru<SmallVec<[u8; KEY_SIZE]>, DelAddRoaringBitmap>, cache: Option<
hashbrown::HashMap<
// TODO check the size of it
RefBytes<'extractor>,
DelAddRoaringBitmap,
hashbrown::DefaultHashBuilder,
RefBump<'extractor>,
>,
>,
sorter: Sorter<MF>, sorter: Sorter<MF>,
deladd_buffer: Vec<u8>, deladd_buffer: Vec<u8>,
cbo_buffer: Vec<u8>, cbo_buffer: Vec<u8>,
@ -23,10 +30,11 @@ pub struct CboCachedSorter<MF> {
fitted_in_key: usize, fitted_in_key: usize,
} }
impl<MF> CboCachedSorter<MF> { impl<'extractor, MF> CboCachedSorter<'extractor, MF> {
pub fn new(cap: NonZeroUsize, sorter: Sorter<MF>) -> Self { /// TODO may add the capacity
pub fn new_in(sorter: Sorter<MF>, alloc: RefBump<'extractor>) -> Self {
CboCachedSorter { CboCachedSorter {
cache: Lru::new(cap), cache: Some(hashbrown::HashMap::new_in(alloc)),
sorter, sorter,
deladd_buffer: Vec::new(), deladd_buffer: Vec::new(),
cbo_buffer: Vec::new(), cbo_buffer: Vec::new(),
@ -36,9 +44,9 @@ impl<MF> CboCachedSorter<MF> {
} }
} }
impl<MF: MergeFunction> CboCachedSorter<MF> { impl<'extractor, MF: MergeFunction> CboCachedSorter<'extractor, MF> {
pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
match self.cache.get_mut(key) { match self.cache.unwrap().get_mut(key) {
Some(DelAddRoaringBitmap { del, add: _ }) => { Some(DelAddRoaringBitmap { del, add: _ }) => {
del.get_or_insert_with(RoaringBitmap::default).insert(n); del.get_or_insert_with(RoaringBitmap::default).insert(n);
} }
@ -60,7 +68,7 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
key: &[u8], key: &[u8],
bitmap: RoaringBitmap, bitmap: RoaringBitmap,
) -> grenad::Result<(), MF::Error> { ) -> grenad::Result<(), MF::Error> {
match self.cache.get_mut(key) { match self.cache.unwrap().get_mut(key) {
Some(DelAddRoaringBitmap { del, add: _ }) => { Some(DelAddRoaringBitmap { del, add: _ }) => {
*del.get_or_insert_with(RoaringBitmap::default) |= bitmap; *del.get_or_insert_with(RoaringBitmap::default) |= bitmap;
} }
@ -78,7 +86,7 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
} }
pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
match self.cache.get_mut(key) { match self.cache.unwrap().get_mut(key) {
Some(DelAddRoaringBitmap { del: _, add }) => { Some(DelAddRoaringBitmap { del: _, add }) => {
add.get_or_insert_with(RoaringBitmap::default).insert(n); add.get_or_insert_with(RoaringBitmap::default).insert(n);
} }
@ -100,7 +108,7 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
key: &[u8], key: &[u8],
bitmap: RoaringBitmap, bitmap: RoaringBitmap,
) -> grenad::Result<(), MF::Error> { ) -> grenad::Result<(), MF::Error> {
match self.cache.get_mut(key) { match self.cache.unwrap().get_mut(key) {
Some(DelAddRoaringBitmap { del: _, add }) => { Some(DelAddRoaringBitmap { del: _, add }) => {
*add.get_or_insert_with(RoaringBitmap::default) |= bitmap; *add.get_or_insert_with(RoaringBitmap::default) |= bitmap;
} }
@ -118,7 +126,7 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
} }
pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
match self.cache.get_mut(key) { match self.cache.unwrap().get_mut(key) {
Some(DelAddRoaringBitmap { del, add }) => { Some(DelAddRoaringBitmap { del, add }) => {
del.get_or_insert_with(RoaringBitmap::default).insert(n); del.get_or_insert_with(RoaringBitmap::default).insert(n);
add.get_or_insert_with(RoaringBitmap::default).insert(n); add.get_or_insert_with(RoaringBitmap::default).insert(n);
@ -174,7 +182,24 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
self.sorter.insert(key, val) self.sorter.insert(key, val)
} }
pub fn into_sorter(mut self) -> grenad::Result<Sorter<MF>, MF::Error> { pub fn spill_to_disk(&mut self, bump: &'extractor RefCell<Bump>) -> std::io::Result<()> {
let cache = self.cache.take().unwrap();
/// I want to spill to disk for real
drop(cache);
bump.borrow_mut().reset();
let alloc = RefBump::new(bump.borrow());
self.cache = Some(hashbrown::HashMap::new_in(alloc));
Ok(())
}
pub fn into_sorter(self) -> grenad::Result<Sorter<MF>, MF::Error> {
let Self { cache, sorter, total_insertions, fitted_in_key, .. } = self;
let cache = cache.unwrap();
let mut all_n_containers = Vec::new(); let mut all_n_containers = Vec::new();
let mut all_n_array_containers = Vec::new(); let mut all_n_array_containers = Vec::new();
let mut all_n_bitset_containers = Vec::new(); let mut all_n_bitset_containers = Vec::new();
@ -182,8 +207,7 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
let mut all_n_values_bitset_containers = Vec::new(); let mut all_n_values_bitset_containers = Vec::new();
let mut all_cardinality = Vec::new(); let mut all_cardinality = Vec::new();
let default_arc = Lru::new(NonZeroUsize::MIN); for (_key, deladd) in &cache {
for (key, deladd) in mem::replace(&mut self.cache, default_arc) {
for bitmap in [&deladd.del, &deladd.add].into_iter().flatten() { for bitmap in [&deladd.del, &deladd.add].into_iter().flatten() {
let Statistics { let Statistics {
n_containers, n_containers,
@ -201,8 +225,11 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
all_n_values_bitset_containers.push(n_values_bitset_containers); all_n_values_bitset_containers.push(n_values_bitset_containers);
all_cardinality.push(cardinality as u32); all_cardinality.push(cardinality as u32);
} }
}
self.write_entry(key, deladd)?; for (key, deladd) in cache {
// self.write_entry(key, deladd)?;
todo!("spill into the sorter")
} }
let mut output = String::new(); let mut output = String::new();
@ -222,14 +249,14 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
let _ = writeln!( let _ = writeln!(
&mut output, &mut output,
"LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions", "LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions",
self.fitted_in_key, fitted_in_key,
(self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0, (fitted_in_key as f32 / total_insertions as f32) * 100.0,
self.total_insertions, total_insertions,
); );
eprintln!("{output}"); eprintln!("{output}");
Ok(self.sorter) Ok(sorter)
} }
} }

View File

@ -1,234 +0,0 @@
use std::borrow::Borrow;
use std::hash::{BuildHasher, Hash};
use std::iter::repeat_with;
use std::mem;
use std::num::NonZeroUsize;
use hashbrown::hash_map::{DefaultHashBuilder, Entry};
use hashbrown::HashMap;
/// A fixed-capacity least-recently-used cache.
///
/// Keys are hashed into `lookup`, which maps each key to the index of its
/// node inside `storage`, a doubly-linked list kept in a fixed-size slab.
/// The front of the list holds the most recently touched entry.
#[derive(Debug)]
pub struct Lru<K, V, S = DefaultHashBuilder> {
    // Maps each key to the index of its node in `storage`.
    lookup: HashMap<K, usize, S>,
    // Fixed-capacity linked list ordered from most to least recently used.
    storage: FixedSizeList<LruNode<K, V>>,
}
impl<K: Eq + Hash, V> Lru<K, V> {
    /// Builds an LRU cache bounded to `capacity` entries, hashing keys with
    /// the default hash builder.
    pub fn new(capacity: NonZeroUsize) -> Self {
        let lookup = HashMap::new();
        let storage = FixedSizeList::new(capacity.get());
        Self { lookup, storage }
    }
}
impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
    /// Builds an LRU cache bounded to `capacity` entries whose keys are
    /// hashed with the caller-supplied `hash_builder`.
    pub fn with_hasher(capacity: NonZeroUsize, hash_builder: S) -> Lru<K, V, S> {
        let lookup = HashMap::with_hasher(hash_builder);
        let storage = FixedSizeList::new(capacity.get());
        Self { lookup, storage }
    }
}
impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
    /// Looks up `key` and, when present, promotes its entry to the head of
    /// the LRU list and returns a mutable reference to its value.
    ///
    /// Returns `None` when the key is not cached.
    pub fn get_mut<Q>(&mut self, key: &Q) -> Option<&mut V>
    where
        K: Borrow<Q>,
        Q: Hash + Eq + ?Sized,
    {
        let &idx = self.lookup.get(key)?;
        let node = self.storage.move_front(idx)?;
        Some(&mut node.value)
    }
}
impl<K: Clone + Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
    /// Inserts `key`/`value` as the most recently used entry.
    ///
    /// Returns the displaced pair: the previous `(key, value)` if the key was
    /// already cached, the evicted least-recently-used pair if the cache was
    /// full, or `None` when there was room and the key was new.
    pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> {
        match self.lookup.entry(key) {
            Entry::Occupied(occ) => {
                // Key already cached: promote its node and swap the value.
                // It's fine to unwrap here because:
                // * the entry already exists
                let node = self.storage.move_front(*occ.get()).unwrap();
                let old_value = mem::replace(&mut node.value, value);
                let old_key = occ.replace_key();
                Some((old_key, old_value))
            }
            Entry::Vacant(vac) => {
                let key = vac.key().clone();
                if self.storage.is_full() {
                    // Full: recycle the least-recently-used node in place,
                    // then drop the evicted key from the lookup table.
                    // It's fine to unwrap here because:
                    // * the cache capacity is non zero
                    // * the cache is full
                    let idx = self.storage.back_idx();
                    let node = self.storage.move_front(idx).unwrap();
                    let LruNode { key, value } = mem::replace(node, LruNode { key, value });
                    vac.insert(idx);
                    self.lookup.remove(&key);
                    Some((key, value))
                } else {
                    // Room left: allocate a fresh slab slot at the front.
                    // It's fine to unwrap here because:
                    // * the cache capacity is non zero
                    // * the cache is not full
                    let (idx, _) = self.storage.push_front(LruNode { key, value }).unwrap();
                    vac.insert(idx);
                    None
                }
            }
        }
    }
}
/// Consumes the cache, yielding every `(key, value)` pair.
///
/// Iteration follows the lookup map's order, which is unspecified —
/// it is NOT least/most-recently-used order.
impl<K, V, S> IntoIterator for Lru<K, V, S> {
    type Item = (K, V);
    type IntoIter = IntoIter<K, V>;
    fn into_iter(self) -> Self::IntoIter {
        IntoIter { lookup_iter: self.lookup.into_iter(), nodes: self.storage.nodes }
    }
}
/// Owning iterator over an [`Lru`]'s entries, driven by the lookup map.
pub struct IntoIter<K, V> {
    // Yields each stored key together with its node index into `nodes`.
    lookup_iter: hashbrown::hash_map::IntoIter<K, usize>,
    // Slab of list nodes; each entry is `take`n as the iterator visits it.
    nodes: Box<[Option<FixedSizeListNode<LruNode<K, V>>>]>,
}
impl<K, V> Iterator for IntoIter<K, V> {
    type Item = (K, V);
    fn next(&mut self) -> Option<Self::Item> {
        let (_key, idx) = self.lookup_iter.next()?;
        // NOTE(review): a missing/empty slot here ends iteration early rather
        // than skipping it; the lookup/storage invariant should make that
        // impossible — confirm if entries ever go missing.
        let LruNode { key, value } = self.nodes.get_mut(idx)?.take()?.data;
        Some((key, value))
    }
}
/// A key/value pair stored inside the fixed-size list.
#[derive(Debug)]
struct LruNode<K, V> {
    key: K,
    value: V,
}
/// A doubly-linked-list node addressed by slab index rather than pointers.
///
/// `usize::MAX` serves as the "no neighbor" sentinel for `prev`/`next`
/// (see `push_front`, which seeds `prev` with it).
#[derive(Debug)]
struct FixedSizeListNode<T> {
    prev: usize,
    next: usize,
    data: T,
}
/// A doubly-linked list whose nodes live in a fixed-capacity slab and are
/// addressed by index instead of by pointer.
#[derive(Debug)]
struct FixedSizeList<T> {
    nodes: Box<[Option<FixedSizeListNode<T>>]>,
    /// Also corresponds to the first `None` in the nodes.
    length: usize,
    // TODO Also, we probably do not need one of the front and back cursors.
    // Index of the head node, or `usize::MAX` while the list is empty.
    front: usize,
    // Index of the tail node, or `usize::MAX` while the list is empty.
    back: usize,
}
impl<T> FixedSizeList<T> {
    /// Creates an empty list able to hold at most `capacity` elements.
    fn new(capacity: usize) -> Self {
        Self {
            nodes: repeat_with(|| None).take(capacity).collect::<Vec<_>>().into_boxed_slice(),
            length: 0,
            // `usize::MAX` is the "no node" sentinel for both cursors.
            front: usize::MAX,
            back: usize::MAX,
        }
    }
    /// Maximum number of elements the list can hold.
    #[inline]
    fn capacity(&self) -> usize {
        self.nodes.len()
    }
    /// Number of elements currently stored.
    #[inline]
    fn len(&self) -> usize {
        self.length
    }
    #[inline]
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
    #[inline]
    fn is_full(&self) -> bool {
        self.len() == self.capacity()
    }
    /// Index of the back (tail) node.
    #[inline]
    fn back_idx(&self) -> usize {
        self.back
    }
    /// Reserves the next free slab slot, or `None` when the list is full.
    ///
    /// Slots are handed out append-only, so the first free slot is always at
    /// index `self.length`.
    #[inline]
    fn next(&mut self) -> Option<usize> {
        if self.is_full() {
            None
        } else {
            let current_free = self.length;
            self.length += 1;
            Some(current_free)
        }
    }
    /// Mutable access to the node at `idx`, if that slot is occupied.
    #[inline]
    fn node_mut(&mut self, idx: usize) -> Option<&mut FixedSizeListNode<T>> {
        self.nodes.get_mut(idx).and_then(|node| node.as_mut())
    }
    /// Shared access to the node at `idx`, if that slot is occupied.
    #[inline]
    fn node_ref(&self, idx: usize) -> Option<&FixedSizeListNode<T>> {
        self.nodes.get(idx).and_then(|node| node.as_ref())
    }
    /// Unlinks the node at `idx` and relinks it at the front of the list,
    /// returning a mutable reference to its data.
    ///
    /// Returns `None` when `idx` does not refer to an occupied slot.
    #[inline]
    fn move_front(&mut self, idx: usize) -> Option<&mut T> {
        // Temporarily take the node out of its slot so we can relink around it.
        let node = self.nodes.get_mut(idx)?.take()?;
        // Detach: point the previous neighbor (or the front cursor) past it...
        if let Some(prev) = self.node_mut(node.prev) {
            prev.next = node.next;
        } else {
            self.front = node.next;
        }
        // ...and the next neighbor (or the back cursor) back over it.
        if let Some(next) = self.node_mut(node.next) {
            next.prev = node.prev;
        } else {
            self.back = node.prev;
        }
        // Attach at the front: the current head now points back to `idx`.
        if let Some(front) = self.node_mut(self.front) {
            front.prev = idx;
        }
        // If the back cursor no longer names a live node, the list was
        // otherwise empty, so `idx` becomes the back as well.
        if self.node_ref(self.back).is_none() {
            self.back = idx;
        }
        // Re-insert the node into the same slot, now linked as the head.
        let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode {
            prev: usize::MAX,
            next: self.front,
            data: node.data,
        });
        self.front = idx;
        Some(&mut node.data)
    }
    /// Pushes `data` at the front of the list, returning its slab index and a
    /// mutable reference to it, or `None` when the list is full.
    #[inline]
    fn push_front(&mut self, data: T) -> Option<(usize, &mut T)> {
        let idx = self.next()?;
        // The previous head (if any) now points back to the new node.
        if let Some(front) = self.node_mut(self.front) {
            front.prev = idx;
        }
        // First insertion: the new node is also the back of the list.
        if self.node_ref(self.back).is_none() {
            self.back = idx;
        }
        let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode {
            prev: usize::MAX,
            next: self.front,
            data,
        });
        self.front = idx;
        Some((idx, &mut node.data))
    }
}

View File

@ -1,6 +1,5 @@
mod cache; mod cache;
mod faceted; mod faceted;
mod lru;
mod searchable; mod searchable;
use std::cell::RefCell; use std::cell::RefCell;