Use another LRU impl which uses hashbrown

Kerollmops 2020-06-29 22:25:59 +02:00
parent f98b615bf3
commit 2fcae719ad
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 381 additions and 111 deletions
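In short, this commit replaces the previous LruCache (a copy of the old rust-lang lru-cache code built on linked_hash_map) with a port of Jerome Froelich's lru crate whose internal map is a FastMap8, an FxHasher64-keyed std HashMap, which is hashbrown-backed on current Rust toolchains. A rough usage sketch follows; it is not part of the commit, only `new`, `insert` returning the evicted pair, and iteration over `&cache` are taken from the diff below, and the key/value types are arbitrary.

    // Sketch only; assumes the `LruCache` added in this commit is in scope
    // (its module path is not shown in this diff).
    fn main() {
        let mut cache = LruCache::new(2);                 // capacity of two entries
        assert_eq!(cache.insert("a", 1), None);           // fits, nothing evicted
        assert_eq!(cache.insert("b", 2), None);
        assert_eq!(cache.insert("c", 3), Some(("a", 1))); // full: the LRU pair is evicted and returned
        for (key, value) in &cache {                      // most recent first, cache not consumed
            println!("{} => {}", key, value);
        }
    }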

View File

@@ -47,7 +47,7 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
     eprintln!("Indexing into LMDB...");

     let mut word_positions = ArcCache::<_, RoaringBitmap>::new(100_000);
-    let mut word_position_docids = ArcCache::<_, RoaringBitmap>::new(100_000);
+    let mut word_position_docids = ArcCache::<_, RoaringBitmap>::new(3_000_000);

     // Write the headers into a Vec of bytes.
     let headers = rdr.headers()?;
@@ -108,14 +108,14 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
         index.documents.put(wtxn, &BEU32::new(document_id), &document)?;
     }

-    for (word, ids) in word_positions {
-        index.word_positions.put(wtxn, &word, &ids)?;
+    for (word, ids) in &word_positions {
+        index.word_positions.put(wtxn, word, ids)?;
     }

-    for ((word, position), ids) in word_position_docids {
+    for ((word, position), ids) in &word_position_docids {
         let mut key = word.as_bytes().to_vec();
         key.extend_from_slice(&position.to_be_bytes());
-        index.word_position_docids.put(wtxn, &key, &ids)?;
+        index.word_position_docids.put(wtxn, &key, ids)?;
     }

     // We store the words from the postings.

View File

@@ -1,150 +1,419 @@
-// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
-// file at the top-level directory of this distribution and at
-// http://rust-lang.org/COPYRIGHT.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-use std::borrow::Borrow;
-use std::collections::hash_map::RandomState;
-use std::hash::{Hash, BuildHasher};
-use std::iter::FromIterator;
-
-use linked_hash_map::LinkedHashMap;
-
-/// An LRU cache.
-#[derive(Clone)]
-pub struct LruCache<K: Eq + Hash, V, S: BuildHasher = RandomState> {
-    map: LinkedHashMap<K, V, S>,
-    max_size: usize,
-}
-
-impl<K: Eq + Hash, V> LruCache<K, V> {
-    /// Creates an empty cache that can hold at most `capacity` items.
-    pub fn new(capacity: usize) -> Self {
-        LruCache {
-            map: LinkedHashMap::new(),
-            max_size: capacity,
-        }
-    }
-}
-
-impl<K: Eq + Hash, V, S: BuildHasher> LruCache<K, V, S> {
-    /// Creates an empty cache that can hold at most `capacity` items with the given hash builder.
-    pub fn with_hasher(capacity: usize, hash_builder: S) -> Self {
-        LruCache { map: LinkedHashMap::with_hasher(hash_builder), max_size: capacity }
-    }
-
-    /// Checks if the map contains the given key.
-    pub fn contains_key<Q: ?Sized>(&mut self, key: &Q) -> bool
-        where K: Borrow<Q>,
-              Q: Hash + Eq
-    {
-        self.get_mut(key).is_some()
-    }
-
-    /// Inserts a key-value pair into the cache. If the maximum size is reached the LRU is returned.
-    pub fn insert(&mut self, k: K, v: V) -> Option<(K, V)> {
-        self.map.insert(k, v);
-        if self.len() > self.capacity() {
-            self.remove_lru()
-        } else {
-            None
-        }
-    }
-
-    /// Returns a mutable reference to the value corresponding to the given key in the cache, if
-    /// any.
-    pub fn get_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
-        where K: Borrow<Q>,
-              Q: Hash + Eq
-    {
-        self.map.get_refresh(k)
-    }
-
-    pub fn peek_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
-        where K: Borrow<Q>,
-              Q: Hash + Eq
-    {
-        self.map.get_mut(k)
-    }
-
-    /// Removes the given key from the cache and returns its corresponding value.
-    pub fn remove<Q: ?Sized>(&mut self, k: &Q) -> Option<V>
-        where K: Borrow<Q>,
-              Q: Hash + Eq
-    {
-        self.map.remove(k)
-    }
-
-    /// Returns the maximum number of key-value pairs the cache can hold.
-    pub fn capacity(&self) -> usize {
-        self.max_size
-    }
-
-    /// Sets the number of key-value pairs the cache can hold. Removes
-    /// least-recently-used key-value pairs if necessary.
-    pub fn set_capacity(&mut self, capacity: usize) {
-        for _ in capacity..self.len() {
-            self.remove_lru();
-        }
-        self.max_size = capacity;
-    }
-
-    /// Removes and returns the least recently used key-value pair as a tuple.
-    #[inline]
-    pub fn remove_lru(&mut self) -> Option<(K, V)> {
-        self.map.pop_front()
-    }
-
-    /// Returns the number of key-value pairs in the cache.
-    pub fn len(&self) -> usize { self.map.len() }
-
-    /// Returns `true` if the cache contains no key-value pairs.
-    pub fn is_empty(&self) -> bool { self.map.is_empty() }
-
-    /// Removes all key-value pairs from the cache.
-    pub fn clear(&mut self) { self.map.clear(); }
-}
-
-impl<K: Eq + Hash, V, S: BuildHasher> IntoIterator for LruCache<K, V, S> {
-    type Item = (K, V);
-    type IntoIter = IntoIter<K, V>;
-
-    fn into_iter(self) -> IntoIter<K, V> {
-        IntoIter(self.map.into_iter())
-    }
-}
-
-#[derive(Clone)]
-pub struct IntoIter<K, V>(linked_hash_map::IntoIter<K, V>);
-
-impl<K, V> Iterator for IntoIter<K, V> {
-    type Item = (K, V);
-
-    fn next(&mut self) -> Option<(K, V)> {
-        self.0.next()
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.0.size_hint()
-    }
-}
-
-impl<K, V> DoubleEndedIterator for IntoIter<K, V> {
-    fn next_back(&mut self) -> Option<(K, V)> {
-        self.0.next_back()
-    }
-}
-
-impl<K, V> ExactSizeIterator for IntoIter<K, V> {
-    fn len(&self) -> usize {
-        self.0.len()
-    }
-}
+// MIT License
+// Copyright (c) 2016 Jerome Froelich
+
+use std::borrow::Borrow;
+use std::fmt;
+use std::hash::{Hash, Hasher};
+use std::iter::FusedIterator;
+use std::marker::PhantomData;
+use std::mem;
+use std::ptr;
+use std::usize;
+
+use std::collections::HashMap;
+
+use crate::FastMap8;
+
+// Struct used to hold a reference to a key
+#[doc(hidden)]
+pub struct KeyRef<K> {
+    k: *const K,
+}
+
+impl<K: Hash> Hash for KeyRef<K> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        unsafe { (*self.k).hash(state) }
+    }
+}
+
+impl<K: PartialEq> PartialEq for KeyRef<K> {
+    fn eq(&self, other: &KeyRef<K>) -> bool {
+        unsafe { (*self.k).eq(&*other.k) }
+    }
+}
+
+impl<K: Eq> Eq for KeyRef<K> {}
+
+#[cfg(feature = "nightly")]
+#[doc(hidden)]
+pub auto trait NotKeyRef {}
+
+#[cfg(feature = "nightly")]
+impl<K> !NotKeyRef for KeyRef<K> {}
+
+#[cfg(feature = "nightly")]
+impl<K, D> Borrow<D> for KeyRef<K>
+where
+    K: Borrow<D>,
+    D: NotKeyRef + ?Sized,
+{
+    fn borrow(&self) -> &D {
+        unsafe { &*self.k }.borrow()
+    }
+}
+
+#[cfg(not(feature = "nightly"))]
+impl<K> Borrow<K> for KeyRef<K> {
+    fn borrow(&self) -> &K {
+        unsafe { &*self.k }
+    }
+}
+
+// Struct used to hold a key value pair. Also contains references to previous and next entries
+// so we can maintain the entries in a linked list ordered by their use.
+struct LruEntry<K, V> {
+    key: mem::MaybeUninit<K>,
+    val: mem::MaybeUninit<V>,
+    prev: *mut LruEntry<K, V>,
+    next: *mut LruEntry<K, V>,
+}
+
+impl<K, V> LruEntry<K, V> {
+    fn new(key: K, val: V) -> Self {
+        LruEntry {
+            key: mem::MaybeUninit::new(key),
+            val: mem::MaybeUninit::new(val),
+            prev: ptr::null_mut(),
+            next: ptr::null_mut(),
+        }
+    }
+
+    fn new_sigil() -> Self {
+        LruEntry {
+            key: mem::MaybeUninit::uninit(),
+            val: mem::MaybeUninit::uninit(),
+            prev: ptr::null_mut(),
+            next: ptr::null_mut(),
+        }
+    }
+}
+
+/// An LRU Cache
+pub struct LruCache<K, V> {
+    map: FastMap8<KeyRef<K>, Box<LruEntry<K, V>>>,
+    cap: usize,
+
+    // head and tail are sigil nodes to faciliate inserting entries
+    head: *mut LruEntry<K, V>,
+    tail: *mut LruEntry<K, V>,
+}
+
+impl<K: Hash + Eq, V> LruCache<K, V> {
+    /// Creates a new LRU Cache that holds at most `cap` items.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use lru::LruCache;
+    /// let mut cache: LruCache<isize, &str> = LruCache::new(10);
+    /// ```
+    pub fn new(cap: usize) -> LruCache<K, V> {
+        let mut map = FastMap8::default();
+        map.reserve(cap);
+        LruCache::construct(cap, map)
+    }
+
+    /// Creates a new LRU Cache that never automatically evicts items.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use lru::LruCache;
+    /// let mut cache: LruCache<isize, &str> = LruCache::unbounded();
+    /// ```
+    pub fn unbounded() -> LruCache<K, V> {
+        LruCache::construct(usize::MAX, HashMap::default())
+    }
+}
+
+impl<K: Hash + Eq, V> LruCache<K, V> {
+    /// Creates a new LRU Cache with the given capacity.
+    fn construct(cap: usize, map: FastMap8<KeyRef<K>, Box<LruEntry<K, V>>>) -> LruCache<K, V> {
+        // NB: The compiler warns that cache does not need to be marked as mutable if we
+        // declare it as such since we only mutate it inside the unsafe block.
+        let cache = LruCache {
+            map,
+            cap,
+            head: Box::into_raw(Box::new(LruEntry::new_sigil())),
+            tail: Box::into_raw(Box::new(LruEntry::new_sigil())),
+        };
+
+        unsafe {
+            (*cache.head).next = cache.tail;
+            (*cache.tail).prev = cache.head;
+        }
+
+        cache
+    }
+
+    /// Puts a key-value pair into cache. If the capacity is reached the evicted entry is returned.
+    pub fn insert(&mut self, k: K, mut v: V) -> Option<(K, V)> {
+        let node_ptr = self.map.get_mut(&KeyRef { k: &k }).map(|node| {
+            let node_ptr: *mut LruEntry<K, V> = &mut **node;
+            node_ptr
+        });
+
+        match node_ptr {
+            Some(node_ptr) => {
+                // if the key is already in the cache just update its value and move it to the
+                // front of the list
+                unsafe { mem::swap(&mut v, &mut (*(*node_ptr).val.as_mut_ptr()) as &mut V) }
+                self.detach(node_ptr);
+                self.attach(node_ptr);
+                None
+            }
+            None => {
+                let (mut node, old_entry) = if self.len() == self.cap() {
+                    // if the cache is full, remove the last entry so we can use it for the new key
+                    let old_key = KeyRef {
+                        k: unsafe { &(*(*(*self.tail).prev).key.as_ptr()) },
+                    };
+                    let mut old_node = self.map.remove(&old_key).unwrap();
+
+                    // drop the node's current key and val so we can overwrite them
+                    let old_entry = unsafe { (old_node.key.assume_init(), old_node.val.assume_init()) };
+
+                    old_node.key = mem::MaybeUninit::new(k);
+                    old_node.val = mem::MaybeUninit::new(v);
+
+                    let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
+                    self.detach(node_ptr);
+
+                    (old_node, Some(old_entry))
+                } else {
+                    // if the cache is not full allocate a new LruEntry
+                    (Box::new(LruEntry::new(k, v)), None)
+                };
+
+                let node_ptr: *mut LruEntry<K, V> = &mut *node;
+                self.attach(node_ptr);
+
+                let keyref = unsafe { (*node_ptr).key.as_ptr() };
+                self.map.insert(KeyRef { k: keyref }, node);
+
+                old_entry
+            }
+        }
+    }
+
+    /// Returns a mutable reference to the value of the key in the cache or `None` if it
+    /// is not present in the cache. Moves the key to the head of the LRU list if it exists.
+    pub fn get_mut<'a, Q>(&'a mut self, k: &Q) -> Option<&'a mut V>
+    where
+        KeyRef<K>: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
+    {
+        if let Some(node) = self.map.get_mut(k) {
+            let node_ptr: *mut LruEntry<K, V> = &mut **node;
+            self.detach(node_ptr);
+            self.attach(node_ptr);
+            Some(unsafe { &mut (*(*node_ptr).val.as_mut_ptr()) as &mut V })
+        } else {
+            None
+        }
+    }
+
+    /// Returns a bool indicating whether the given key is in the cache. Does not update the
+    /// LRU list.
+    pub fn contains_key<Q>(&self, k: &Q) -> bool
+    where
+        KeyRef<K>: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
+    {
+        self.map.contains_key(k)
+    }
+
+    /// Removes and returns the value corresponding to the key from the cache or
+    /// `None` if it does not exist.
+    pub fn remove<Q>(&mut self, k: &Q) -> Option<V>
+    where
+        KeyRef<K>: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
+    {
+        match self.map.remove(&k) {
+            None => None,
+            Some(mut old_node) => {
+                let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
+                self.detach(node_ptr);
+                unsafe { Some(old_node.val.assume_init()) }
+            }
+        }
+    }
+
+    /// Removes and returns the key and value corresponding to the least recently
+    /// used item or `None` if the cache is empty.
+    pub fn remove_lru(&mut self) -> Option<(K, V)> {
+        let node = self.remove_last()?;
+        // N.B.: Can't destructure directly because of https://github.com/rust-lang/rust/issues/28536
+        let node = *node;
+        let LruEntry { key, val, .. } = node;
+        unsafe { Some((key.assume_init(), val.assume_init())) }
+    }
+
+    /// Returns the number of key-value pairs that are currently in the the cache.
+    pub fn len(&self) -> usize {
+        self.map.len()
+    }
+
+    /// Returns a bool indicating whether the cache is empty or not.
+    pub fn is_empty(&self) -> bool {
+        self.map.len() == 0
+    }
+
+    /// Returns the maximum number of key-value pairs the cache can hold.
+    pub fn cap(&self) -> usize {
+        self.cap
+    }
+
+    /// Resizes the cache. If the new capacity is smaller than the size of the current
+    /// cache any entries past the new capacity are discarded.
+    pub fn resize(&mut self, cap: usize) {
+        // return early if capacity doesn't change
+        if cap == self.cap {
+            return;
+        }
+
+        while self.map.len() > cap {
+            self.remove_last();
+        }
+        self.map.shrink_to_fit();
+
+        self.cap = cap;
+    }
+
+    /// Clears the contents of the cache.
+    pub fn clear(&mut self) {
+        loop {
+            match self.remove_last() {
+                Some(_) => (),
+                None => break,
+            }
+        }
+    }
+
+    /// An iterator visiting all entries in order. The iterator element type is `(&'a K, &'a V)`.
+    pub fn iter<'a>(&'a self) -> Iter<'a, K, V> {
+        Iter {
+            len: self.len(),
+            ptr: unsafe { (*self.head).next },
+            phantom: PhantomData,
+        }
+    }
+
+    fn remove_last(&mut self) -> Option<Box<LruEntry<K, V>>> {
+        let prev;
+        unsafe { prev = (*self.tail).prev }
+        if prev != self.head {
+            let old_key = KeyRef {
+                k: unsafe { &(*(*(*self.tail).prev).key.as_ptr()) },
+            };
+            let mut old_node = self.map.remove(&old_key).unwrap();
+            let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
+            self.detach(node_ptr);
+            Some(old_node)
+        } else {
+            None
+        }
+    }
+
+    fn detach(&mut self, node: *mut LruEntry<K, V>) {
+        unsafe {
+            (*(*node).prev).next = (*node).next;
+            (*(*node).next).prev = (*node).prev;
+        }
+    }
+
+    fn attach(&mut self, node: *mut LruEntry<K, V>) {
+        unsafe {
+            (*node).next = (*self.head).next;
+            (*node).prev = self.head;
+            (*self.head).next = node;
+            (*(*node).next).prev = node;
+        }
+    }
+}
+
+impl<K, V> Drop for LruCache<K, V> {
+    fn drop(&mut self) {
+        self.map.values_mut().for_each(|e| unsafe {
+            ptr::drop_in_place(e.key.as_mut_ptr());
+            ptr::drop_in_place(e.val.as_mut_ptr());
+        });
+        // We rebox the head/tail, and because these are maybe-uninit
+        // they do not have the absent k/v dropped.
+        unsafe {
+            let _head = *Box::from_raw(self.head);
+            let _tail = *Box::from_raw(self.tail);
+        }
+    }
+}
+
+impl<'a, K: Hash + Eq, V> IntoIterator for &'a LruCache<K, V> {
+    type Item = (&'a K, &'a V);
+    type IntoIter = Iter<'a, K, V>;
+
+    fn into_iter(self) -> Iter<'a, K, V> {
+        self.iter()
+    }
+}
+
+// The compiler does not automatically derive Send and Sync for LruCache because it contains
+// raw pointers. The raw pointers are safely encapsulated by LruCache though so we can
+// implement Send and Sync for it below.
+unsafe impl<K: Send, V: Send> Send for LruCache<K, V> {}
+unsafe impl<K: Sync, V: Sync> Sync for LruCache<K, V> {}
+
+impl<K: Hash + Eq, V> fmt::Debug for LruCache<K, V> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("LruCache")
+            .field("len", &self.len())
+            .field("cap", &self.cap())
+            .finish()
+    }
+}
+
+/// An iterator over the entries of a `LruCache`.
+pub struct Iter<'a, K: 'a, V: 'a> {
+    len: usize,
+    ptr: *const LruEntry<K, V>,
+    phantom: PhantomData<&'a K>,
+}
+
+impl<'a, K, V> Iterator for Iter<'a, K, V> {
+    type Item = (&'a K, &'a V);
+
+    fn next(&mut self) -> Option<(&'a K, &'a V)> {
+        if self.len == 0 {
+            return None;
+        }
+
+        let key = unsafe { &(*(*self.ptr).key.as_ptr()) as &K };
+        let val = unsafe { &(*(*self.ptr).val.as_ptr()) as &V };
+
+        self.len -= 1;
+        self.ptr = unsafe { (*self.ptr).next };
+
+        Some((key, val))
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (self.len, Some(self.len))
+    }
+
+    fn count(self) -> usize {
+        self.len
+    }
+}
+
+impl<'a, K, V> ExactSizeIterator for Iter<'a, K, V> {}
+impl<'a, K, V> FusedIterator for Iter<'a, K, V> {}
+
+// The compiler does not automatically derive Send and Sync for Iter because it contains
+// raw pointers.
+unsafe impl<'a, K: Send, V: Send> Send for Iter<'a, K, V> {}
+unsafe impl<'a, K: Sync, V: Sync> Sync for Iter<'a, K, V> {}

 pub struct ArcCache<K, V>
 where
@@ -180,8 +449,7 @@ where
             evicted.extend(self.frequent_set.insert(key, value));
             return evicted;
         }
-        if self.recent_set.contains_key(&key) {
-            self.recent_set.remove(&key);
+        if self.recent_set.remove(&key).is_some() {
             evicted.extend(self.frequent_set.insert(key, value));
             return evicted;
         }
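Folding the `contains_key` check into the `remove` call means the promotion from the recent set to the frequent set costs a single hash lookup instead of two. The same idiom in isolation, with hypothetical names that are not from this file:

    use std::collections::HashMap;

    // Hypothetical stand-in for the promotion step above, not from this file.
    fn promote(recent: &mut HashMap<String, u32>, frequent: &mut HashMap<String, u32>, key: String, value: u32) {
        // `remove` already reports presence through its Option, so a separate
        // `contains_key` lookup before it is redundant.
        if recent.remove(&key).is_some() {
            frequent.insert(key, value);
        }
    }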
@@ -202,7 +470,8 @@ where
                 evicted.extend(self.replace(true));
             }
             self.frequent_evicted.remove(&key);
-            return Vec::from_iter(self.frequent_set.insert(key, value));
+            evicted.extend(self.frequent_set.insert(key, value));
+            return evicted;
         }
         if self.recent_evicted.contains_key(&key) {
             let recent_evicted_len = self.recent_evicted.len();
@@ -242,7 +511,7 @@
     where
         K: Clone + Hash + Eq,
     {
-        if let Some(value) = self.recent_set.remove(&key) {
+        if let Some(value) = self.recent_set.remove(key) {
             self.frequent_set.insert((*key).clone(), value);
         }
         self.frequent_set.get_mut(key)
@@ -268,11 +537,11 @@ where
         }
     }

-impl<K: Eq + Hash, V> IntoIterator for ArcCache<K, V>{
-    type Item = (K, V);
-    type IntoIter = std::iter::Chain<IntoIter<K, V>, IntoIter<K, V>>;
+impl<'a, K: 'a + Eq + Hash, V: 'a> IntoIterator for &'a ArcCache<K, V>{
+    type Item = (&'a K, &'a V);
+    type IntoIter = std::iter::Chain<Iter<'a, K, V>, Iter<'a, K, V>>;

     fn into_iter(self) -> Self::IntoIter {
-        self.recent_set.into_iter().chain(self.frequent_set)
+        self.recent_set.iter().chain(&self.frequent_set)
     }
 }
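This hunk turns ArcCache's consuming IntoIterator (which yielded owned `(K, V)` pairs) into one implemented for `&ArcCache` that chains borrowed iterators over the recent and frequent sets, which is why the indexer loops in the first file now iterate over `&word_positions` and hand references straight to `put`. A self-contained sketch of the same borrowed-iteration shape, using a plain HashMap as a stand-in for the cache:

    use std::collections::HashMap;

    fn main() {
        // Stand-in map; the ArcCache in this commit exposes the same shape:
        // `&cache` yields `(&K, &V)` and leaves the cache intact.
        let mut word_positions = HashMap::new();
        word_positions.insert("hello".to_string(), vec![0u32, 7]);

        for (word, ids) in &word_positions {
            // word: &String, ids: &Vec<u32>; nothing is moved out
            println!("{} => {:?}", word, ids);
        }

        // Still usable after the flush loop, unlike with the old consuming iterator.
        assert_eq!(word_positions.len(), 1);
    }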

View File

@@ -11,7 +11,7 @@ use std::time::Instant;

 use cow_utils::CowUtils;
 use fst::{IntoStreamer, Streamer};
-use fxhash::FxHasher32;
+use fxhash::{FxHasher32, FxHasher64};
 use heed::types::*;
 use heed::{PolyDatabase, Database};
 use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
@@ -28,6 +28,7 @@ static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
 static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));

 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
+pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
 pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
 pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>;
 pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>;
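Like FastMap4, the new FastMap8 alias is still the standard library HashMap (hashbrown-backed on current Rust), only with the default SipHash swapped for the faster, non-DoS-resistant FxHasher64. A small self-contained sketch; the alias is restated so it compiles on its own, and the key/value types are arbitrary:

    use std::collections::HashMap;
    use std::hash::BuildHasherDefault;

    use fxhash::FxHasher64;

    // Restated here so the sketch is self-contained.
    pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;

    fn main() {
        // `default()` builds the map with the Fx hasher; `new()` is only
        // available for the standard RandomState hasher.
        let mut positions: FastMap8<u64, Vec<u32>> = FastMap8::default();
        positions.entry(42).or_default().push(7);
        assert_eq!(positions[&42], vec![7]);
    }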