diff --git a/Cargo.lock b/Cargo.lock index 5cd1f3976..1e3d71c9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3567,6 +3567,7 @@ dependencies = [ name = "milli" version = "1.11.0" dependencies = [ + "allocator-api2", "arroy", "big_s", "bimap", @@ -3590,7 +3591,7 @@ dependencies = [ "fxhash", "geoutils", "grenad", - "hashbrown 0.14.5", + "hashbrown 0.15.0", "heed", "hf-hub", "indexmap", @@ -4434,7 +4435,7 @@ dependencies = [ [[package]] name = "raw-collections" version = "0.1.0" -source = "git+https://github.com/dureuill/raw-collections.git#0ecd143c1707d237e3c4d749bc685418da2fccc2" +source = "git+https://github.com/dureuill/raw-collections.git#147dfe8eee739f2638c921c83e7d64ca1d47dcb2" dependencies = [ "allocator-api2", "bumpalo", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 46633bdec..14861d887 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -94,10 +94,11 @@ tracing = "0.1.40" ureq = { version = "2.10.0", features = ["json"] } url = "2.5.2" rayon-par-bridge = "0.1.0" -hashbrown = "0.14.5" +hashbrown = "0.15.0" raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" } bumpalo = "3.16.0" thread_local = "1.1.8" +allocator-api2 = "0.2.18" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 2fbe427f3..232eba594 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -1,21 +1,28 @@ +use std::cell::RefCell; use std::fmt::Write as _; -use std::mem; -use std::num::NonZeroUsize; +use bumpalo::Bump; use grenad::{MergeFunction, Sorter}; +use raw_collections::alloc::{RefBump, RefBytes}; use roaring::bitmap::Statistics; use roaring::RoaringBitmap; -use smallvec::SmallVec; -use super::lru::Lru; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::CboRoaringBitmapCodec; const KEY_SIZE: usize = 12; #[derive(Debug)] -pub struct CboCachedSorter { - cache: 
Lru, DelAddRoaringBitmap>, +pub struct CboCachedSorter<'extractor, MF> { + cache: Option< + hashbrown::HashMap< + // TODO check the size of it + RefBytes<'extractor>, + DelAddRoaringBitmap, + hashbrown::DefaultHashBuilder, + RefBump<'extractor>, + >, + >, sorter: Sorter<MF>, deladd_buffer: Vec<u8>, cbo_buffer: Vec<u8>, @@ -23,10 +30,11 @@ pub struct CboCachedSorter { fitted_in_key: usize, } -impl CboCachedSorter { - pub fn new(cap: NonZeroUsize, sorter: Sorter<MF>) -> Self { +impl<'extractor, MF> CboCachedSorter<'extractor, MF> { + /// TODO may add the capacity + pub fn new_in(sorter: Sorter<MF>, alloc: RefBump<'extractor>) -> Self { CboCachedSorter { - cache: Lru::new(cap), + cache: Some(hashbrown::HashMap::new_in(alloc)), sorter, deladd_buffer: Vec::new(), cbo_buffer: Vec::new(), @@ -36,9 +44,9 @@ impl CboCachedSorter { } } -impl CboCachedSorter { +impl<'extractor, MF: MergeFunction> CboCachedSorter<'extractor, MF> { pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { - match self.cache.get_mut(key) { + match self.cache.as_mut().unwrap().get_mut(key) { Some(DelAddRoaringBitmap { del, add: _ }) => { del.get_or_insert_with(RoaringBitmap::default).insert(n); } @@ -60,7 +68,7 @@ impl CboCachedSorter { key: &[u8], bitmap: RoaringBitmap, ) -> grenad::Result<(), MF::Error> { - match self.cache.get_mut(key) { + match self.cache.as_mut().unwrap().get_mut(key) { Some(DelAddRoaringBitmap { del, add: _ }) => { *del.get_or_insert_with(RoaringBitmap::default) |= bitmap; } @@ -78,7 +86,7 @@ impl CboCachedSorter { } pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { - match self.cache.get_mut(key) { + match self.cache.as_mut().unwrap().get_mut(key) { Some(DelAddRoaringBitmap { del: _, add }) => { add.get_or_insert_with(RoaringBitmap::default).insert(n); } @@ -100,7 +108,7 @@ impl CboCachedSorter { key: &[u8], bitmap: RoaringBitmap, ) -> grenad::Result<(), MF::Error> { - match self.cache.get_mut(key) { + match self.cache.as_mut().unwrap().get_mut(key) {
Some(DelAddRoaringBitmap { del: _, add }) => { *add.get_or_insert_with(RoaringBitmap::default) |= bitmap; } @@ -118,7 +126,7 @@ impl CboCachedSorter { } pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> { - match self.cache.get_mut(key) { + match self.cache.as_mut().unwrap().get_mut(key) { Some(DelAddRoaringBitmap { del, add }) => { del.get_or_insert_with(RoaringBitmap::default).insert(n); add.get_or_insert_with(RoaringBitmap::default).insert(n); @@ -174,7 +182,24 @@ impl CboCachedSorter { self.sorter.insert(key, val) } - pub fn into_sorter(mut self) -> grenad::Result<Sorter<MF>, MF::Error> { + pub fn spill_to_disk(&mut self, bump: &'extractor RefCell<Bump>) -> std::io::Result<()> { + let cache = self.cache.take().unwrap(); + + // TODO: actually spill the cache entries to disk instead of just dropping them + drop(cache); + + bump.borrow_mut().reset(); + + let alloc = RefBump::new(bump.borrow()); + self.cache = Some(hashbrown::HashMap::new_in(alloc)); + + Ok(()) + } + + pub fn into_sorter(self) -> grenad::Result<Sorter<MF>, MF::Error> { + let Self { cache, sorter, total_insertions, fitted_in_key, ..
} = self; + let cache = cache.unwrap(); + let mut all_n_containers = Vec::new(); let mut all_n_array_containers = Vec::new(); let mut all_n_bitset_containers = Vec::new(); @@ -182,8 +207,7 @@ impl CboCachedSorter { let mut all_n_values_bitset_containers = Vec::new(); let mut all_cardinality = Vec::new(); - let default_arc = Lru::new(NonZeroUsize::MIN); - for (key, deladd) in mem::replace(&mut self.cache, default_arc) { + for (_key, deladd) in &cache { for bitmap in [&deladd.del, &deladd.add].into_iter().flatten() { let Statistics { n_containers, @@ -201,8 +225,11 @@ impl CboCachedSorter { all_n_values_bitset_containers.push(n_values_bitset_containers); all_cardinality.push(cardinality as u32); } + } - self.write_entry(key, deladd)?; + for (key, deladd) in cache { + // self.write_entry(key, deladd)?; + todo!("spill into the sorter") } let mut output = String::new(); @@ -222,14 +249,14 @@ impl CboCachedSorter { let _ = writeln!( &mut output, "LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions", - self.fitted_in_key, - (self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0, - self.total_insertions, + fitted_in_key, + (fitted_in_key as f32 / total_insertions as f32) * 100.0, + total_insertions, ); eprintln!("{output}"); - Ok(self.sorter) + Ok(sorter) } } diff --git a/milli/src/update/new/extract/lru.rs b/milli/src/update/new/extract/lru.rs deleted file mode 100644 index 3eca47cb2..000000000 --- a/milli/src/update/new/extract/lru.rs +++ /dev/null @@ -1,234 +0,0 @@ -use std::borrow::Borrow; -use std::hash::{BuildHasher, Hash}; -use std::iter::repeat_with; -use std::mem; -use std::num::NonZeroUsize; - -use hashbrown::hash_map::{DefaultHashBuilder, Entry}; -use hashbrown::HashMap; - -#[derive(Debug)] -pub struct Lru { - lookup: HashMap, - storage: FixedSizeList>, -} - -impl Lru { - /// Creates a new LRU cache that holds at most `capacity` elements. 
- pub fn new(capacity: NonZeroUsize) -> Self { - Self { lookup: HashMap::new(), storage: FixedSizeList::new(capacity.get()) } - } -} - -impl Lru { - /// Creates a new LRU cache that holds at most `capacity` elements - /// and uses the provided hash builder to hash keys. - pub fn with_hasher(capacity: NonZeroUsize, hash_builder: S) -> Lru { - Self { - lookup: HashMap::with_hasher(hash_builder), - storage: FixedSizeList::new(capacity.get()), - } - } -} - -impl Lru { - /// Returns a mutable reference to the value of the key in the cache or `None` if it is not present in the cache. - /// - /// Moves the key to the head of the LRU list if it exists. - pub fn get_mut(&mut self, key: &Q) -> Option<&mut V> - where - K: Borrow, - Q: Hash + Eq + ?Sized, - { - let idx = *self.lookup.get(key)?; - self.storage.move_front(idx).map(|node| &mut node.value) - } -} - -impl Lru { - pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> { - match self.lookup.entry(key) { - Entry::Occupied(occ) => { - // It's fine to unwrap here because: - // * the entry already exists - let node = self.storage.move_front(*occ.get()).unwrap(); - let old_value = mem::replace(&mut node.value, value); - let old_key = occ.replace_key(); - Some((old_key, old_value)) - } - Entry::Vacant(vac) => { - let key = vac.key().clone(); - if self.storage.is_full() { - // It's fine to unwrap here because: - // * the cache capacity is non zero - // * the cache is full - let idx = self.storage.back_idx(); - let node = self.storage.move_front(idx).unwrap(); - let LruNode { key, value } = mem::replace(node, LruNode { key, value }); - vac.insert(idx); - self.lookup.remove(&key); - Some((key, value)) - } else { - // It's fine to unwrap here because: - // * the cache capacity is non zero - // * the cache is not full - let (idx, _) = self.storage.push_front(LruNode { key, value }).unwrap(); - vac.insert(idx); - None - } - } - } - } -} - -impl IntoIterator for Lru { - type Item = (K, V); - type IntoIter = IntoIter; - - fn 
into_iter(self) -> Self::IntoIter { - IntoIter { lookup_iter: self.lookup.into_iter(), nodes: self.storage.nodes } - } -} - -pub struct IntoIter { - lookup_iter: hashbrown::hash_map::IntoIter, - nodes: Box<[Option>>]>, -} - -impl Iterator for IntoIter { - type Item = (K, V); - - fn next(&mut self) -> Option { - let (_key, idx) = self.lookup_iter.next()?; - let LruNode { key, value } = self.nodes.get_mut(idx)?.take()?.data; - Some((key, value)) - } -} - -#[derive(Debug)] -struct LruNode { - key: K, - value: V, -} - -#[derive(Debug)] -struct FixedSizeListNode { - prev: usize, - next: usize, - data: T, -} - -#[derive(Debug)] -struct FixedSizeList { - nodes: Box<[Option>]>, - /// Also corresponds to the first `None` in the nodes. - length: usize, - // TODO Also, we probably do not need one of the front and back cursors. - front: usize, - back: usize, -} - -impl FixedSizeList { - fn new(capacity: usize) -> Self { - Self { - nodes: repeat_with(|| None).take(capacity).collect::>().into_boxed_slice(), - length: 0, - front: usize::MAX, - back: usize::MAX, - } - } - - #[inline] - fn capacity(&self) -> usize { - self.nodes.len() - } - - #[inline] - fn len(&self) -> usize { - self.length - } - - #[inline] - fn is_empty(&self) -> bool { - self.len() == 0 - } - - #[inline] - fn is_full(&self) -> bool { - self.len() == self.capacity() - } - - #[inline] - fn back_idx(&self) -> usize { - self.back - } - - #[inline] - fn next(&mut self) -> Option { - if self.is_full() { - None - } else { - let current_free = self.length; - self.length += 1; - Some(current_free) - } - } - - #[inline] - fn node_mut(&mut self, idx: usize) -> Option<&mut FixedSizeListNode> { - self.nodes.get_mut(idx).and_then(|node| node.as_mut()) - } - - #[inline] - fn node_ref(&self, idx: usize) -> Option<&FixedSizeListNode> { - self.nodes.get(idx).and_then(|node| node.as_ref()) - } - - #[inline] - fn move_front(&mut self, idx: usize) -> Option<&mut T> { - let node = self.nodes.get_mut(idx)?.take()?; - if let 
Some(prev) = self.node_mut(node.prev) { - prev.next = node.next; - } else { - self.front = node.next; - } - if let Some(next) = self.node_mut(node.next) { - next.prev = node.prev; - } else { - self.back = node.prev; - } - - if let Some(front) = self.node_mut(self.front) { - front.prev = idx; - } - if self.node_ref(self.back).is_none() { - self.back = idx; - } - - let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode { - prev: usize::MAX, - next: self.front, - data: node.data, - }); - self.front = idx; - Some(&mut node.data) - } - - #[inline] - fn push_front(&mut self, data: T) -> Option<(usize, &mut T)> { - let idx = self.next()?; - if let Some(front) = self.node_mut(self.front) { - front.prev = idx; - } - if self.node_ref(self.back).is_none() { - self.back = idx; - } - let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode { - prev: usize::MAX, - next: self.front, - data, - }); - self.front = idx; - Some((idx, &mut node.data)) - } -} diff --git a/milli/src/update/new/extract/mod.rs b/milli/src/update/new/extract/mod.rs index 1c86d80af..aa69a69b1 100644 --- a/milli/src/update/new/extract/mod.rs +++ b/milli/src/update/new/extract/mod.rs @@ -1,6 +1,5 @@ mod cache; mod faceted; -mod lru; mod searchable; use std::cell::RefCell;