From 2fcae719ad94df0fd2fdd050f1f7dcbcc783febe Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Mon, 29 Jun 2020 22:25:59 +0200
Subject: [PATCH] Use another LRU impl which uses hashbrown

---
 src/bin/indexer.rs |  10 +-
 src/cache.rs       | 479 ++++++++++++++++++++++++++++++++++-----------
 src/lib.rs         |   3 +-
 3 files changed, 381 insertions(+), 111 deletions(-)

diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index 010b0e03b..d3bf30793 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -47,7 +47,7 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
     eprintln!("Indexing into LMDB...");
 
     let mut word_positions = ArcCache::<_, RoaringBitmap>::new(100_000);
-    let mut word_position_docids = ArcCache::<_, RoaringBitmap>::new(100_000);
+    let mut word_position_docids = ArcCache::<_, RoaringBitmap>::new(3_000_000);
 
     // Write the headers into a Vec of bytes.
     let headers = rdr.headers()?;
@@ -108,14 +108,14 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
         index.documents.put(wtxn, &BEU32::new(document_id), &document)?;
     }
 
-    for (word, ids) in word_positions {
-        index.word_positions.put(wtxn, &word, &ids)?;
+    for (word, ids) in &word_positions {
+        index.word_positions.put(wtxn, word, ids)?;
     }
 
-    for ((word, position), ids) in word_position_docids {
+    for ((word, position), ids) in &word_position_docids {
         let mut key = word.as_bytes().to_vec();
         key.extend_from_slice(&position.to_be_bytes());
-        index.word_position_docids.put(wtxn, &key, &ids)?;
+        index.word_position_docids.put(wtxn, &key, ids)?;
     }
 
     // We store the words from the postings.
diff --git a/src/cache.rs b/src/cache.rs
index d9a1a293f..d90aa4294 100644
--- a/src/cache.rs
+++ b/src/cache.rs
@@ -1,150 +1,419 @@
-// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
-// file at the top-level directory of this distribution and at
-// http://rust-lang.org/COPYRIGHT.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
+// MIT License
+// Copyright (c) 2016 Jerome Froelich
 
 use std::borrow::Borrow;
-use std::collections::hash_map::RandomState;
-use std::hash::{Hash, BuildHasher};
-use std::iter::FromIterator;
+use std::fmt;
+use std::hash::{Hash, Hasher};
+use std::iter::FusedIterator;
+use std::marker::PhantomData;
+use std::mem;
+use std::ptr;
+use std::usize;
 
-use linked_hash_map::LinkedHashMap;
+use std::collections::HashMap;
 
-/// An LRU cache.
-#[derive(Clone)]
-pub struct LruCache<K: Eq + Hash, V, S: BuildHasher = RandomState> {
-    map: LinkedHashMap<K, V, S>,
-    max_size: usize,
+use crate::FastMap8;
+
+// Struct used to hold a reference to a key
+#[doc(hidden)]
+pub struct KeyRef<K> {
+    k: *const K,
 }
 
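+// Illustrative note (added; not part of the upstream `lru` source): the map key
+// borrows the `K` stored inside the heap-allocated `LruEntry`, so each key lives
+// in memory exactly once. The `Hash`/`PartialEq` impls below forward through the
+// raw pointer, so a `KeyRef` hashes and compares exactly like the `K` it points
+// to; a sketch:
+//
+//     let owned = String::from("hello");
+//     let r = KeyRef { k: &owned };
+//     // `r` can now stand in for `owned` in a HashMap lookup.
+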
-impl<K: Eq + Hash, V> LruCache<K, V> {
-    /// Creates an empty cache that can hold at most `capacity` items.
-    pub fn new(capacity: usize) -> Self {
-        LruCache {
-            map: LinkedHashMap::new(),
-            max_size: capacity,
+impl<K: Hash> Hash for KeyRef<K> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        unsafe { (*self.k).hash(state) }
+    }
+}
+
+impl<K: PartialEq> PartialEq for KeyRef<K> {
+    fn eq(&self, other: &KeyRef<K>) -> bool {
+        unsafe { (*self.k).eq(&*other.k) }
+    }
+}
+
+impl<K: Eq> Eq for KeyRef<K> {}
+
+#[cfg(feature = "nightly")]
+#[doc(hidden)]
+pub auto trait NotKeyRef {}
+
+#[cfg(feature = "nightly")]
+impl<K> !NotKeyRef for KeyRef<K> {}
+
+#[cfg(feature = "nightly")]
+impl<K, D> Borrow<D> for KeyRef<K>
+where
+    K: Borrow<D>,
+    D: NotKeyRef + ?Sized,
+{
+    fn borrow(&self) -> &D {
+        unsafe { &*self.k }.borrow()
+    }
+}
+
+#[cfg(not(feature = "nightly"))]
+impl<K> Borrow<K> for KeyRef<K> {
+    fn borrow(&self) -> &K {
+        unsafe { &*self.k }
+    }
+}
+
+// Struct used to hold a key value pair. Also contains references to previous and next entries
+// so we can maintain the entries in a linked list ordered by their use.
+struct LruEntry<K, V> {
+    key: mem::MaybeUninit<K>,
+    val: mem::MaybeUninit<V>,
+    prev: *mut LruEntry<K, V>,
+    next: *mut LruEntry<K, V>,
+}
+
+impl<K, V> LruEntry<K, V> {
+    fn new(key: K, val: V) -> Self {
+        LruEntry {
+            key: mem::MaybeUninit::new(key),
+            val: mem::MaybeUninit::new(val),
+            prev: ptr::null_mut(),
+            next: ptr::null_mut(),
+        }
+    }
+
+    fn new_sigil() -> Self {
+        LruEntry {
+            key: mem::MaybeUninit::uninit(),
+            val: mem::MaybeUninit::uninit(),
+            prev: ptr::null_mut(),
+            next: ptr::null_mut(),
         }
     }
 }
 
-impl<K: Eq + Hash, V, S: BuildHasher> LruCache<K, V, S> {
-    /// Creates an empty cache that can hold at most `capacity` items with the given hash builder.
-    pub fn with_hasher(capacity: usize, hash_builder: S) -> Self {
-        LruCache { map: LinkedHashMap::with_hasher(hash_builder), max_size: capacity }
+/// An LRU Cache
+pub struct LruCache<K, V> {
+    map: FastMap8<KeyRef<K>, Box<LruEntry<K, V>>>,
+    cap: usize,
+
+    // head and tail are sigil nodes to facilitate inserting entries
+    head: *mut LruEntry<K, V>,
+    tail: *mut LruEntry<K, V>,
+}
+
+impl<K: Hash + Eq, V> LruCache<K, V> {
+    /// Creates a new LRU Cache that holds at most `cap` items.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use lru::LruCache;
+    /// let mut cache: LruCache<isize, &str> = LruCache::new(10);
+    /// ```
+    pub fn new(cap: usize) -> LruCache<K, V> {
+        let mut map = FastMap8::default();
+        map.reserve(cap);
+        LruCache::construct(cap, map)
     }
 
-    /// Checks if the map contains the given key.
-    pub fn contains_key<Q: ?Sized>(&mut self, key: &Q) -> bool
-        where K: Borrow<Q>,
-              Q: Hash + Eq
+    /// Creates a new LRU Cache that never automatically evicts items.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use lru::LruCache;
+    /// let mut cache: LruCache<isize, &str> = LruCache::unbounded();
+    /// ```
+    pub fn unbounded() -> LruCache<K, V> {
+        LruCache::construct(usize::MAX, HashMap::default())
+    }
+}
+
+impl<K: Hash + Eq, V> LruCache<K, V> {
+    /// Creates a new LRU Cache with the given capacity.
+    fn construct(cap: usize, map: FastMap8<KeyRef<K>, Box<LruEntry<K, V>>>) -> LruCache<K, V> {
+        // NB: The compiler warns that cache does not need to be marked as mutable if we
+        // declare it as such since we only mutate it inside the unsafe block.
+        let cache = LruCache {
+            map,
+            cap,
+            head: Box::into_raw(Box::new(LruEntry::new_sigil())),
+            tail: Box::into_raw(Box::new(LruEntry::new_sigil())),
+        };
+
+        unsafe {
+            (*cache.head).next = cache.tail;
+            (*cache.tail).prev = cache.head;
+        }
+
+        cache
+    }
+
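+    // Illustrative note (added; assumes the wiring in `construct` above): right
+    // after construction the list is just `head <-> tail`. `attach` always inserts
+    // behind `head` and eviction pops the entry before `tail`, so the order is:
+    //
+    //     head <-> MRU <-> ... <-> LRU <-> tail
+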
+    /// Puts a key-value pair into the cache. If the capacity is reached the evicted entry is returned.
+    pub fn insert(&mut self, k: K, mut v: V) -> Option<(K, V)> {
+        let node_ptr = self.map.get_mut(&KeyRef { k: &k }).map(|node| {
+            let node_ptr: *mut LruEntry<K, V> = &mut **node;
+            node_ptr
+        });
+
+        match node_ptr {
+            Some(node_ptr) => {
+                // if the key is already in the cache just update its value and move it to the
+                // front of the list
+                unsafe { mem::swap(&mut v, &mut (*(*node_ptr).val.as_mut_ptr()) as &mut V) }
+                self.detach(node_ptr);
+                self.attach(node_ptr);
+                None
+            }
+            None => {
+                let (mut node, old_entry) = if self.len() == self.cap() {
+                    // if the cache is full, remove the last entry so we can use it for the new key
+                    let old_key = KeyRef {
+                        k: unsafe { &(*(*(*self.tail).prev).key.as_ptr()) },
+                    };
+                    let mut old_node = self.map.remove(&old_key).unwrap();
+
+                    // read out the node's current key and val so we can overwrite them
+                    let old_entry = unsafe { (old_node.key.assume_init(), old_node.val.assume_init()) };
+
+                    old_node.key = mem::MaybeUninit::new(k);
+                    old_node.val = mem::MaybeUninit::new(v);
+
+                    let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
+                    self.detach(node_ptr);
+
+                    (old_node, Some(old_entry))
+                } else {
+                    // if the cache is not full allocate a new LruEntry
+                    (Box::new(LruEntry::new(k, v)), None)
+                };
+
+                let node_ptr: *mut LruEntry<K, V> = &mut *node;
+                self.attach(node_ptr);
+
+                let keyref = unsafe { (*node_ptr).key.as_ptr() };
+                self.map.insert(KeyRef { k: keyref }, node);
+
+                old_entry
+            }
+        }
+    }
+
+    /// Returns a mutable reference to the value of the key in the cache or `None` if it
+    /// is not present in the cache. Moves the key to the head of the LRU list if it exists.
+    pub fn get_mut<'a, Q>(&'a mut self, k: &Q) -> Option<&'a mut V>
+    where
+        KeyRef<K>: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
     {
-        self.get_mut(key).is_some()
-    }
+        if let Some(node) = self.map.get_mut(k) {
+            let node_ptr: *mut LruEntry<K, V> = &mut **node;
 
-    /// Inserts a key-value pair into the cache. If the maximum size is reached the LRU is returned.
-    pub fn insert(&mut self, k: K, v: V) -> Option<(K, V)> {
-        self.map.insert(k, v);
-        if self.len() > self.capacity() {
-            self.remove_lru()
+            self.detach(node_ptr);
+            self.attach(node_ptr);
+
+            Some(unsafe { &mut (*(*node_ptr).val.as_mut_ptr()) as &mut V })
         } else {
             None
         }
     }
 
-    /// Returns a mutable reference to the value corresponding to the given key in the cache, if
-    /// any.
-    pub fn get_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
-        where K: Borrow<Q>,
-              Q: Hash + Eq
+    /// Returns a bool indicating whether the given key is in the cache. Does not update the
+    /// LRU list.
+    pub fn contains_key<Q>(&self, k: &Q) -> bool
+    where
+        KeyRef<K>: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
     {
-        self.map.get_refresh(k)
+        self.map.contains_key(k)
     }
 
-    pub fn peek_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
-        where K: Borrow<Q>,
-              Q: Hash + Eq
+    /// Removes and returns the value corresponding to the key from the cache or
+    /// `None` if it does not exist.
+    pub fn remove<Q>(&mut self, k: &Q) -> Option<V>
+    where
+        KeyRef<K>: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
     {
-        self.map.get_mut(k)
+        match self.map.remove(k) {
+            None => None,
+            Some(mut old_node) => {
+                let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
+                self.detach(node_ptr);
+                unsafe { Some(old_node.val.assume_init()) }
+            }
+        }
     }
 
-    /// Removes the given key from the cache and returns its corresponding value.
-    pub fn remove<Q: ?Sized>(&mut self, k: &Q) -> Option<V>
-        where K: Borrow<Q>,
-              Q: Hash + Eq
-    {
-        self.map.remove(k)
-    }
 
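+    // A hedged usage sketch of `insert`, `get_mut` and `remove` (added for
+    // illustration; the key/value types are arbitrary):
+    //
+    //     let mut cache = LruCache::new(2);
+    //     assert_eq!(cache.insert("a", 1), None);
+    //     assert_eq!(cache.insert("b", 2), None);
+    //     cache.get_mut(&"a");                               // refreshes "a"
+    //     assert_eq!(cache.insert("c", 3), Some(("b", 2)));  // "b" was the LRU
+    //     assert_eq!(cache.remove(&"c"), Some(3));
+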
+    /// Removes and returns the key and value corresponding to the least recently
+    /// used item or `None` if the cache is empty.
+    pub fn remove_lru(&mut self) -> Option<(K, V)> {
+        let node = self.remove_last()?;
+        // N.B.: Can't destructure directly because of https://github.com/rust-lang/rust/issues/28536
+        let node = *node;
+        let LruEntry { key, val, .. } = node;
+        unsafe { Some((key.assume_init(), val.assume_init())) }
+    }
+
+    /// Returns the number of key-value pairs that are currently in the cache.
+    pub fn len(&self) -> usize {
+        self.map.len()
+    }
+
+    /// Returns a bool indicating whether the cache is empty or not.
+    pub fn is_empty(&self) -> bool {
+        self.map.len() == 0
     }
 
     /// Returns the maximum number of key-value pairs the cache can hold.
-    pub fn capacity(&self) -> usize {
-        self.max_size
+    pub fn cap(&self) -> usize {
+        self.cap
     }
 
-    /// Sets the number of key-value pairs the cache can hold. Removes
-    /// least-recently-used key-value pairs if necessary.
-    pub fn set_capacity(&mut self, capacity: usize) {
-        for _ in capacity..self.len() {
-            self.remove_lru();
+    /// Resizes the cache. If the new capacity is smaller than the size of the current
+    /// cache any entries past the new capacity are discarded.
+    pub fn resize(&mut self, cap: usize) {
+        // return early if capacity doesn't change
+        if cap == self.cap {
+            return;
         }
-        self.max_size = capacity;
+
+        while self.map.len() > cap {
+            self.remove_last();
+        }
+        self.map.shrink_to_fit();
+
+        self.cap = cap;
     }
 
-    /// Removes and returns the least recently used key-value pair as a tuple.
-    #[inline]
-    pub fn remove_lru(&mut self) -> Option<(K, V)> {
-        self.map.pop_front()
+    /// Clears the contents of the cache.
+    pub fn clear(&mut self) {
+        loop {
+            match self.remove_last() {
+                Some(_) => (),
+                None => break,
+            }
+        }
     }
 
-    /// Returns the number of key-value pairs in the cache.
-    pub fn len(&self) -> usize { self.map.len() }
+    /// An iterator visiting all entries in order. The iterator element type is `(&'a K, &'a V)`.
+    pub fn iter<'a>(&'a self) -> Iter<'a, K, V> {
+        Iter {
+            len: self.len(),
+            ptr: unsafe { (*self.head).next },
+            phantom: PhantomData,
+        }
+    }
 
-    /// Returns `true` if the cache contains no key-value pairs.
-    pub fn is_empty(&self) -> bool { self.map.is_empty() }
+    fn remove_last(&mut self) -> Option<Box<LruEntry<K, V>>> {
+        let prev;
+        unsafe { prev = (*self.tail).prev }
+        if prev != self.head {
+            let old_key = KeyRef {
+                k: unsafe { &(*(*(*self.tail).prev).key.as_ptr()) },
+            };
+            let mut old_node = self.map.remove(&old_key).unwrap();
+            let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
+            self.detach(node_ptr);
+            Some(old_node)
+        } else {
+            None
+        }
+    }
 
-    /// Removes all key-value pairs from the cache.
-    pub fn clear(&mut self) { self.map.clear(); }
-}
+    fn detach(&mut self, node: *mut LruEntry<K, V>) {
+        unsafe {
+            (*(*node).prev).next = (*node).next;
+            (*(*node).next).prev = (*node).prev;
+        }
+    }
 
-impl<K: Eq + Hash, V> IntoIterator for LruCache<K, V> {
-    type Item = (K, V);
-    type IntoIter = IntoIter<K, V>;
-
-    fn into_iter(self) -> IntoIter<K, V> {
-        IntoIter(self.map.into_iter())
+    fn attach(&mut self, node: *mut LruEntry<K, V>) {
+        unsafe {
+            (*node).next = (*self.head).next;
+            (*node).prev = self.head;
+            (*self.head).next = node;
+            (*(*node).next).prev = node;
+        }
     }
 }
 
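+// Illustrative sketch (added): `iter` starts at `(*head).next`, so it yields entries
+// from most to least recently used; the same iterators back the
+// `for (word, ids) in &word_positions` loops in the indexer:
+//
+//     let mut cache = LruCache::new(2);
+//     cache.insert("a", 1);
+//     cache.insert("b", 2);
+//     let order: Vec<_> = cache.iter().map(|(k, _)| *k).collect();
+//     assert_eq!(order, ["b", "a"]); // MRU first
+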
-#[derive(Clone)]
-pub struct IntoIter<K, V>(linked_hash_map::IntoIter<K, V>);
+impl<K, V> Drop for LruCache<K, V> {
+    fn drop(&mut self) {
+        self.map.values_mut().for_each(|e| unsafe {
+            ptr::drop_in_place(e.key.as_mut_ptr());
+            ptr::drop_in_place(e.val.as_mut_ptr());
+        });
+        // We rebox the head/tail, and because these are maybe-uninit
+        // they do not have the absent k/v dropped.
+        unsafe {
+            let _head = *Box::from_raw(self.head);
+            let _tail = *Box::from_raw(self.tail);
+        }
+    }
+}
 
-impl<K, V> Iterator for IntoIter<K, V> {
-    type Item = (K, V);
+impl<'a, K: Hash + Eq, V> IntoIterator for &'a LruCache<K, V> {
+    type Item = (&'a K, &'a V);
+    type IntoIter = Iter<'a, K, V>;
 
-    fn next(&mut self) -> Option<(K, V)> {
-        self.0.next()
+    fn into_iter(self) -> Iter<'a, K, V> {
+        self.iter()
+    }
+}
+
+// The compiler does not automatically derive Send and Sync for LruCache because it contains
+// raw pointers. The raw pointers are safely encapsulated by LruCache though so we can
+// implement Send and Sync for it below.
+unsafe impl<K: Send, V: Send> Send for LruCache<K, V> {}
+unsafe impl<K: Sync, V: Sync> Sync for LruCache<K, V> {}
+
+impl<K: Hash + Eq, V> fmt::Debug for LruCache<K, V> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("LruCache")
+            .field("len", &self.len())
+            .field("cap", &self.cap())
+            .finish()
+    }
+}
+
+/// An iterator over the entries of a `LruCache`.
+pub struct Iter<'a, K: 'a, V: 'a> {
+    len: usize,
+    ptr: *const LruEntry<K, V>,
+    phantom: PhantomData<&'a K>,
+}
+
+impl<'a, K, V> Iterator for Iter<'a, K, V> {
+    type Item = (&'a K, &'a V);
+
+    fn next(&mut self) -> Option<(&'a K, &'a V)> {
+        if self.len == 0 {
+            return None;
+        }
+
+        let key = unsafe { &(*(*self.ptr).key.as_ptr()) as &K };
+        let val = unsafe { &(*(*self.ptr).val.as_ptr()) as &V };
+
+        self.len -= 1;
+        self.ptr = unsafe { (*self.ptr).next };
+
+        Some((key, val))
     }
 
     fn size_hint(&self) -> (usize, Option<usize>) {
-        self.0.size_hint()
+        (self.len, Some(self.len))
+    }
+
+    fn count(self) -> usize {
+        self.len
     }
 }
 
-impl<K, V> DoubleEndedIterator for IntoIter<K, V> {
-    fn next_back(&mut self) -> Option<(K, V)> {
-        self.0.next_back()
-    }
-}
+impl<'a, K, V> ExactSizeIterator for Iter<'a, K, V> {}
+impl<'a, K, V> FusedIterator for Iter<'a, K, V> {}
 
-impl<K, V> ExactSizeIterator for IntoIter<K, V> {
-    fn len(&self) -> usize {
-        self.0.len()
-    }
-}
 
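+// Added note: `next` relies on `len` (a snapshot of the map's size) to stop before
+// reaching the tail sigil, so the sigils' uninitialized key/val are never read and
+// the `ExactSizeIterator` impl above stays exact, e.g.:
+//
+//     assert_eq!(LruCache::<u32, u32>::new(8).iter().len(), 0);
+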
+// The compiler does not automatically derive Send and Sync for Iter because it contains
+// raw pointers.
+unsafe impl<'a, K: Send, V: Send> Send for Iter<'a, K, V> {}
+unsafe impl<'a, K: Sync, V: Sync> Sync for Iter<'a, K, V> {}
 
 pub struct ArcCache<K, V>
 where
     K: Eq + Hash,
@@ -180,8 +449,7 @@ where
             evicted.extend(self.frequent_set.insert(key, value));
             return evicted;
         }
-        if self.recent_set.contains_key(&key) {
-            self.recent_set.remove(&key);
+        if self.recent_set.remove(&key).is_some() {
             evicted.extend(self.frequent_set.insert(key, value));
             return evicted;
         }
@@ -202,7 +470,8 @@ where
                 evicted.extend(self.replace(true));
             }
             self.frequent_evicted.remove(&key);
-            return Vec::from_iter(self.frequent_set.insert(key, value));
+            evicted.extend(self.frequent_set.insert(key, value));
+            return evicted;
         }
         if self.recent_evicted.contains_key(&key) {
             let recent_evicted_len = self.recent_evicted.len();
@@ -242,7 +511,7 @@ where
     where K: Clone + Hash + Eq,
     {
-        if let Some(value) = self.recent_set.remove(&key) {
+        if let Some(value) = self.recent_set.remove(key) {
             self.frequent_set.insert((*key).clone(), value);
         }
         self.frequent_set.get_mut(key)
@@ -268,11 +537,11 @@ where
     }
 }
 
-impl<K: Eq + Hash, V> IntoIterator for ArcCache<K, V> {
-    type Item = (K, V);
-    type IntoIter = std::iter::Chain<IntoIter<K, V>, IntoIter<K, V>>;
+impl<'a, K: 'a + Eq + Hash, V: 'a> IntoIterator for &'a ArcCache<K, V> {
+    type Item = (&'a K, &'a V);
+    type IntoIter = std::iter::Chain<Iter<'a, K, V>, Iter<'a, K, V>>;
 
     fn into_iter(self) -> Self::IntoIter {
-        self.recent_set.into_iter().chain(self.frequent_set)
+        self.recent_set.iter().chain(&self.frequent_set)
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 103c252bf..113203adb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,7 +11,7 @@ use std::time::Instant;
 
 use cow_utils::CowUtils;
 use fst::{IntoStreamer, Streamer};
-use fxhash::FxHasher32;
+use fxhash::{FxHasher32, FxHasher64};
 use heed::types::*;
 use heed::{PolyDatabase, Database};
 use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
@@ -28,6 +28,7 @@ static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
 static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
 
 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
+pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
 pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
 pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>;
 pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>;