mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-10-30 01:38:49 +01:00
Implement a basic non working bumpalo-based cache
This commit is contained in:
parent
c1fcb2ebc6
commit
7169dd8e37
5
Cargo.lock
generated
5
Cargo.lock
generated
@ -3567,6 +3567,7 @@ dependencies = [
|
||||
name = "milli"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"allocator-api2",
|
||||
"arroy",
|
||||
"big_s",
|
||||
"bimap",
|
||||
@ -3590,7 +3591,7 @@ dependencies = [
|
||||
"fxhash",
|
||||
"geoutils",
|
||||
"grenad",
|
||||
"hashbrown 0.14.5",
|
||||
"hashbrown 0.15.0",
|
||||
"heed",
|
||||
"hf-hub",
|
||||
"indexmap",
|
||||
@ -4434,7 +4435,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "raw-collections"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/dureuill/raw-collections.git#0ecd143c1707d237e3c4d749bc685418da2fccc2"
|
||||
source = "git+https://github.com/dureuill/raw-collections.git#147dfe8eee739f2638c921c83e7d64ca1d47dcb2"
|
||||
dependencies = [
|
||||
"allocator-api2",
|
||||
"bumpalo",
|
||||
|
@ -94,10 +94,11 @@ tracing = "0.1.40"
|
||||
ureq = { version = "2.10.0", features = ["json"] }
|
||||
url = "2.5.2"
|
||||
rayon-par-bridge = "0.1.0"
|
||||
hashbrown = "0.14.5"
|
||||
hashbrown = "0.15.0"
|
||||
raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
|
||||
bumpalo = "3.16.0"
|
||||
thread_local = "1.1.8"
|
||||
allocator-api2 = "0.2.18"
|
||||
|
||||
[dev-dependencies]
|
||||
mimalloc = { version = "0.1.43", default-features = false }
|
||||
|
@ -1,21 +1,28 @@
|
||||
use std::cell::RefCell;
|
||||
use std::fmt::Write as _;
|
||||
use std::mem;
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
use bumpalo::Bump;
|
||||
use grenad::{MergeFunction, Sorter};
|
||||
use raw_collections::alloc::{RefBump, RefBytes};
|
||||
use roaring::bitmap::Statistics;
|
||||
use roaring::RoaringBitmap;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use super::lru::Lru;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::CboRoaringBitmapCodec;
|
||||
|
||||
const KEY_SIZE: usize = 12;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CboCachedSorter<MF> {
|
||||
cache: Lru<SmallVec<[u8; KEY_SIZE]>, DelAddRoaringBitmap>,
|
||||
pub struct CboCachedSorter<'extractor, MF> {
|
||||
cache: Option<
|
||||
hashbrown::HashMap<
|
||||
// TODO check the size of it
|
||||
RefBytes<'extractor>,
|
||||
DelAddRoaringBitmap,
|
||||
hashbrown::DefaultHashBuilder,
|
||||
RefBump<'extractor>,
|
||||
>,
|
||||
>,
|
||||
sorter: Sorter<MF>,
|
||||
deladd_buffer: Vec<u8>,
|
||||
cbo_buffer: Vec<u8>,
|
||||
@ -23,10 +30,11 @@ pub struct CboCachedSorter<MF> {
|
||||
fitted_in_key: usize,
|
||||
}
|
||||
|
||||
impl<MF> CboCachedSorter<MF> {
|
||||
pub fn new(cap: NonZeroUsize, sorter: Sorter<MF>) -> Self {
|
||||
impl<'extractor, MF> CboCachedSorter<'extractor, MF> {
|
||||
/// TODO may add the capacity
|
||||
pub fn new_in(sorter: Sorter<MF>, alloc: RefBump<'extractor>) -> Self {
|
||||
CboCachedSorter {
|
||||
cache: Lru::new(cap),
|
||||
cache: Some(hashbrown::HashMap::new_in(alloc)),
|
||||
sorter,
|
||||
deladd_buffer: Vec::new(),
|
||||
cbo_buffer: Vec::new(),
|
||||
@ -36,9 +44,9 @@ impl<MF> CboCachedSorter<MF> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<MF: MergeFunction> CboCachedSorter<MF> {
|
||||
impl<'extractor, MF: MergeFunction> CboCachedSorter<'extractor, MF> {
|
||||
pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
|
||||
match self.cache.get_mut(key) {
|
||||
match self.cache.unwrap().get_mut(key) {
|
||||
Some(DelAddRoaringBitmap { del, add: _ }) => {
|
||||
del.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||
}
|
||||
@ -60,7 +68,7 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
|
||||
key: &[u8],
|
||||
bitmap: RoaringBitmap,
|
||||
) -> grenad::Result<(), MF::Error> {
|
||||
match self.cache.get_mut(key) {
|
||||
match self.cache.unwrap().get_mut(key) {
|
||||
Some(DelAddRoaringBitmap { del, add: _ }) => {
|
||||
*del.get_or_insert_with(RoaringBitmap::default) |= bitmap;
|
||||
}
|
||||
@ -78,7 +86,7 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
|
||||
}
|
||||
|
||||
pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
|
||||
match self.cache.get_mut(key) {
|
||||
match self.cache.unwrap().get_mut(key) {
|
||||
Some(DelAddRoaringBitmap { del: _, add }) => {
|
||||
add.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||
}
|
||||
@ -100,7 +108,7 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
|
||||
key: &[u8],
|
||||
bitmap: RoaringBitmap,
|
||||
) -> grenad::Result<(), MF::Error> {
|
||||
match self.cache.get_mut(key) {
|
||||
match self.cache.unwrap().get_mut(key) {
|
||||
Some(DelAddRoaringBitmap { del: _, add }) => {
|
||||
*add.get_or_insert_with(RoaringBitmap::default) |= bitmap;
|
||||
}
|
||||
@ -118,7 +126,7 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
|
||||
}
|
||||
|
||||
pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
|
||||
match self.cache.get_mut(key) {
|
||||
match self.cache.unwrap().get_mut(key) {
|
||||
Some(DelAddRoaringBitmap { del, add }) => {
|
||||
del.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||
add.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||
@ -174,7 +182,24 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
|
||||
self.sorter.insert(key, val)
|
||||
}
|
||||
|
||||
pub fn into_sorter(mut self) -> grenad::Result<Sorter<MF>, MF::Error> {
|
||||
pub fn spill_to_disk(&mut self, bump: &'extractor RefCell<Bump>) -> std::io::Result<()> {
|
||||
let cache = self.cache.take().unwrap();
|
||||
|
||||
/// I want to spill to disk for real
|
||||
drop(cache);
|
||||
|
||||
bump.borrow_mut().reset();
|
||||
|
||||
let alloc = RefBump::new(bump.borrow());
|
||||
self.cache = Some(hashbrown::HashMap::new_in(alloc));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn into_sorter(self) -> grenad::Result<Sorter<MF>, MF::Error> {
|
||||
let Self { cache, sorter, total_insertions, fitted_in_key, .. } = self;
|
||||
let cache = cache.unwrap();
|
||||
|
||||
let mut all_n_containers = Vec::new();
|
||||
let mut all_n_array_containers = Vec::new();
|
||||
let mut all_n_bitset_containers = Vec::new();
|
||||
@ -182,8 +207,7 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
|
||||
let mut all_n_values_bitset_containers = Vec::new();
|
||||
let mut all_cardinality = Vec::new();
|
||||
|
||||
let default_arc = Lru::new(NonZeroUsize::MIN);
|
||||
for (key, deladd) in mem::replace(&mut self.cache, default_arc) {
|
||||
for (_key, deladd) in &cache {
|
||||
for bitmap in [&deladd.del, &deladd.add].into_iter().flatten() {
|
||||
let Statistics {
|
||||
n_containers,
|
||||
@ -201,8 +225,11 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
|
||||
all_n_values_bitset_containers.push(n_values_bitset_containers);
|
||||
all_cardinality.push(cardinality as u32);
|
||||
}
|
||||
}
|
||||
|
||||
self.write_entry(key, deladd)?;
|
||||
for (key, deladd) in cache {
|
||||
// self.write_entry(key, deladd)?;
|
||||
todo!("spill into the sorter")
|
||||
}
|
||||
|
||||
let mut output = String::new();
|
||||
@ -222,14 +249,14 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
|
||||
let _ = writeln!(
|
||||
&mut output,
|
||||
"LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions",
|
||||
self.fitted_in_key,
|
||||
(self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0,
|
||||
self.total_insertions,
|
||||
fitted_in_key,
|
||||
(fitted_in_key as f32 / total_insertions as f32) * 100.0,
|
||||
total_insertions,
|
||||
);
|
||||
|
||||
eprintln!("{output}");
|
||||
|
||||
Ok(self.sorter)
|
||||
Ok(sorter)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,234 +0,0 @@
|
||||
use std::borrow::Borrow;
|
||||
use std::hash::{BuildHasher, Hash};
|
||||
use std::iter::repeat_with;
|
||||
use std::mem;
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
use hashbrown::hash_map::{DefaultHashBuilder, Entry};
|
||||
use hashbrown::HashMap;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Lru<K, V, S = DefaultHashBuilder> {
|
||||
lookup: HashMap<K, usize, S>,
|
||||
storage: FixedSizeList<LruNode<K, V>>,
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash, V> Lru<K, V> {
|
||||
/// Creates a new LRU cache that holds at most `capacity` elements.
|
||||
pub fn new(capacity: NonZeroUsize) -> Self {
|
||||
Self { lookup: HashMap::new(), storage: FixedSizeList::new(capacity.get()) }
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
|
||||
/// Creates a new LRU cache that holds at most `capacity` elements
|
||||
/// and uses the provided hash builder to hash keys.
|
||||
pub fn with_hasher(capacity: NonZeroUsize, hash_builder: S) -> Lru<K, V, S> {
|
||||
Self {
|
||||
lookup: HashMap::with_hasher(hash_builder),
|
||||
storage: FixedSizeList::new(capacity.get()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
|
||||
/// Returns a mutable reference to the value of the key in the cache or `None` if it is not present in the cache.
|
||||
///
|
||||
/// Moves the key to the head of the LRU list if it exists.
|
||||
pub fn get_mut<Q>(&mut self, key: &Q) -> Option<&mut V>
|
||||
where
|
||||
K: Borrow<Q>,
|
||||
Q: Hash + Eq + ?Sized,
|
||||
{
|
||||
let idx = *self.lookup.get(key)?;
|
||||
self.storage.move_front(idx).map(|node| &mut node.value)
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Clone + Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
|
||||
pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> {
|
||||
match self.lookup.entry(key) {
|
||||
Entry::Occupied(occ) => {
|
||||
// It's fine to unwrap here because:
|
||||
// * the entry already exists
|
||||
let node = self.storage.move_front(*occ.get()).unwrap();
|
||||
let old_value = mem::replace(&mut node.value, value);
|
||||
let old_key = occ.replace_key();
|
||||
Some((old_key, old_value))
|
||||
}
|
||||
Entry::Vacant(vac) => {
|
||||
let key = vac.key().clone();
|
||||
if self.storage.is_full() {
|
||||
// It's fine to unwrap here because:
|
||||
// * the cache capacity is non zero
|
||||
// * the cache is full
|
||||
let idx = self.storage.back_idx();
|
||||
let node = self.storage.move_front(idx).unwrap();
|
||||
let LruNode { key, value } = mem::replace(node, LruNode { key, value });
|
||||
vac.insert(idx);
|
||||
self.lookup.remove(&key);
|
||||
Some((key, value))
|
||||
} else {
|
||||
// It's fine to unwrap here because:
|
||||
// * the cache capacity is non zero
|
||||
// * the cache is not full
|
||||
let (idx, _) = self.storage.push_front(LruNode { key, value }).unwrap();
|
||||
vac.insert(idx);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K, V, S> IntoIterator for Lru<K, V, S> {
|
||||
type Item = (K, V);
|
||||
type IntoIter = IntoIter<K, V>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
IntoIter { lookup_iter: self.lookup.into_iter(), nodes: self.storage.nodes }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct IntoIter<K, V> {
|
||||
lookup_iter: hashbrown::hash_map::IntoIter<K, usize>,
|
||||
nodes: Box<[Option<FixedSizeListNode<LruNode<K, V>>>]>,
|
||||
}
|
||||
|
||||
impl<K, V> Iterator for IntoIter<K, V> {
|
||||
type Item = (K, V);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let (_key, idx) = self.lookup_iter.next()?;
|
||||
let LruNode { key, value } = self.nodes.get_mut(idx)?.take()?.data;
|
||||
Some((key, value))
|
||||
}
|
||||
}
|
||||
|
||||
/// The payload stored in every slot of the LRU list: a key together with its value.
#[derive(Debug)]
struct LruNode<K, V> {
    key: K,
    value: V,
}
|
||||
|
||||
/// A slot of [`FixedSizeList`]: the payload plus the indices of its neighbours.
///
/// `usize::MAX` is used as the "no neighbour" marker for both links.
#[derive(Debug)]
struct FixedSizeListNode<T> {
    prev: usize,
    next: usize,
    data: T,
}

/// A doubly linked list stored in a fixed number of slots and addressed by slot index.
#[derive(Debug)]
struct FixedSizeList<T> {
    nodes: Box<[Option<FixedSizeListNode<T>>]>,
    /// Also corresponds to the first `None` in the nodes.
    length: usize,
    // TODO Also, we probably do not need one of the front and back cursors.
    front: usize,
    back: usize,
}

impl<T> FixedSizeList<T> {
    /// Creates an empty list with `capacity` free slots.
    fn new(capacity: usize) -> Self {
        Self {
            nodes: repeat_with(|| None).take(capacity).collect(),
            length: 0,
            front: usize::MAX,
            back: usize::MAX,
        }
    }

    /// Total number of slots.
    #[inline]
    fn capacity(&self) -> usize {
        self.nodes.len()
    }

    /// Number of slots that have been handed out so far.
    #[inline]
    fn len(&self) -> usize {
        self.length
    }

    #[inline]
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    #[inline]
    fn is_full(&self) -> bool {
        self.len() == self.capacity()
    }

    /// Index of the node at the back of the list (`usize::MAX` while the list is empty).
    #[inline]
    fn back_idx(&self) -> usize {
        self.back
    }

    /// Reserves the next free slot and returns its index, or `None` when the list is full.
    #[inline]
    fn next(&mut self) -> Option<usize> {
        if self.is_full() {
            None
        } else {
            let current_free = self.length;
            self.length += 1;
            Some(current_free)
        }
    }

    #[inline]
    fn node_mut(&mut self, idx: usize) -> Option<&mut FixedSizeListNode<T>> {
        self.nodes.get_mut(idx).and_then(|node| node.as_mut())
    }

    #[inline]
    fn node_ref(&self, idx: usize) -> Option<&FixedSizeListNode<T>> {
        self.nodes.get(idx).and_then(|node| node.as_ref())
    }

    /// Moves the node stored at `idx` to the front of the list and returns its payload.
    ///
    /// Returns `None`, leaving the list untouched, when `idx` is out of bounds or
    /// designates an empty slot.
    #[inline]
    fn move_front(&mut self, idx: usize) -> Option<&mut T> {
        let node = self.nodes.get_mut(idx)?.take()?;

        // Unlink the node: its neighbours, or the cursors when it sat at an end of the
        // list, now point past it.
        if let Some(prev) = self.node_mut(node.prev) {
            prev.next = node.next;
        } else {
            self.front = node.next;
        }
        if let Some(next) = self.node_mut(node.next) {
            next.prev = node.prev;
        } else {
            self.back = node.prev;
        }

        Some(self.link_front(idx, node.data))
    }

    /// Stores `data` in a free slot placed at the front of the list.
    ///
    /// Returns the index of the slot and the payload, or `None` when the list is full.
    #[inline]
    fn push_front(&mut self, data: T) -> Option<(usize, &mut T)> {
        let idx = self.next()?;
        Some((idx, self.link_front(idx, data)))
    }

    /// Writes `data` into the (currently empty) slot `idx` and links it as the new front.
    #[inline]
    fn link_front(&mut self, idx: usize, data: T) -> &mut T {
        if let Some(front) = self.node_mut(self.front) {
            front.prev = idx;
        }
        // The back cursor points at nothing when the list is (momentarily) empty: the
        // node being linked is then both the front and the back.
        if self.node_ref(self.back).is_none() {
            self.back = idx;
        }

        let next = self.front;
        self.front = idx;
        let slot = self.nodes.get_mut(idx).expect("`idx` always designates a slot of the list");
        let node = slot.insert(FixedSizeListNode { prev: usize::MAX, next, data });
        &mut node.data
    }
}
|
@ -1,6 +1,5 @@
|
||||
mod cache;
|
||||
mod faceted;
|
||||
mod lru;
|
||||
mod searchable;
|
||||
|
||||
use std::cell::RefCell;
|
||||
|
Loading…
Reference in New Issue
Block a user