From 2fcae719ad94df0fd2fdd050f1f7dcbcc783febe Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Mon, 29 Jun 2020 22:25:59 +0200
Subject: [PATCH] Use another LRU impl which uses hashbrown

---
 src/bin/indexer.rs |  10 +-
 src/cache.rs       | 479 ++++++++++++++++++++++++++++++++++-----------
 src/lib.rs         |   3 +-
 3 files changed, 381 insertions(+), 111 deletions(-)

diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index 010b0e03b..d3bf30793 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -47,7 +47,7 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
     eprintln!("Indexing into LMDB...");
 
     let mut word_positions = ArcCache::<_, RoaringBitmap>::new(100_000);
-    let mut word_position_docids = ArcCache::<_, RoaringBitmap>::new(100_000);
+    let mut word_position_docids = ArcCache::<_, RoaringBitmap>::new(3_000_000);
 
     // Write the headers into a Vec of bytes.
     let headers = rdr.headers()?;
@@ -108,14 +108,14 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
         index.documents.put(wtxn, &BEU32::new(document_id), &document)?;
     }
 
-    for (word, ids) in word_positions {
-        index.word_positions.put(wtxn, &word, &ids)?;
+    for (word, ids) in &word_positions {
+        index.word_positions.put(wtxn, word, ids)?;
     }
 
-    for ((word, position), ids) in word_position_docids {
+    for ((word, position), ids) in &word_position_docids {
         let mut key = word.as_bytes().to_vec();
         key.extend_from_slice(&position.to_be_bytes());
-        index.word_position_docids.put(wtxn, &key, &ids)?;
+        index.word_position_docids.put(wtxn, &key, ids)?;
     }
 
     // We store the words from the postings.
diff --git a/src/cache.rs b/src/cache.rs
index d9a1a293f..d90aa4294 100644
--- a/src/cache.rs
+++ b/src/cache.rs
@@ -1,150 +1,419 @@
-// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
-// file at the top-level directory of this distribution and at
-// http://rust-lang.org/COPYRIGHT.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
+// MIT License
+// Copyright (c) 2016 Jerome Froelich
 
 use std::borrow::Borrow;
-use std::collections::hash_map::RandomState;
-use std::hash::{Hash, BuildHasher};
-use std::iter::FromIterator;
+use std::fmt;
+use std::hash::{Hash, Hasher};
+use std::iter::FusedIterator;
+use std::marker::PhantomData;
+use std::mem;
+use std::ptr;
+use std::usize;
 
-use linked_hash_map::LinkedHashMap;
+use std::collections::HashMap;
 
-/// An LRU cache.
-#[derive(Clone)]
-pub struct LruCache<K: Eq + Hash, V, S: BuildHasher = RandomState> {
-    map: LinkedHashMap<K, V, S>,
-    max_size: usize,
+use crate::FastMap8;
+
+// Struct used to hold a reference to a key
+#[doc(hidden)]
+pub struct KeyRef<K> {
+    k: *const K,
 }
 
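+// Illustrative note (added; not part of the upstream `lru` source): the map key
+// borrows the `K` stored inside the heap-allocated `LruEntry`, so each key lives
+// in memory exactly once. The `Hash`/`PartialEq` impls below forward through the
+// raw pointer, so a `KeyRef` hashes and compares exactly like the `K` it points
+// to; a sketch:
+//
+//     let owned = String::from("hello");
+//     let r = KeyRef { k: &owned };
+//     // `r` can now stand in for `owned` in a HashMap lookup.
+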
-impl<K: Eq + Hash, V> LruCache<K, V> {
-    /// Creates an empty cache that can hold at most `capacity` items.
-    pub fn new(capacity: usize) -> Self {
-        LruCache {
-            map: LinkedHashMap::new(),
-            max_size: capacity,
+impl<K: Hash> Hash for KeyRef<K> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        unsafe { (*self.k).hash(state) }
+    }
+}
+
+impl<K: PartialEq> PartialEq for KeyRef<K> {
+    fn eq(&self, other: &KeyRef<K>) -> bool {
+        unsafe { (*self.k).eq(&*other.k) }
+    }
+}
+
+impl<K: Eq> Eq for KeyRef<K> {}
+
+#[cfg(feature = "nightly")]
+#[doc(hidden)]
+pub auto trait NotKeyRef {}
+
+#[cfg(feature = "nightly")]
+impl<K> !NotKeyRef for KeyRef<K> {}
+
+#[cfg(feature = "nightly")]
+impl<K, D> Borrow<D> for KeyRef<K>
+where
+    K: Borrow<D>,
+    D: NotKeyRef + ?Sized,
+{
+    fn borrow(&self) -> &D {
+        unsafe { &*self.k }.borrow()
+    }
+}
+
+#[cfg(not(feature = "nightly"))]
+impl<K> Borrow<K> for KeyRef<K> {
+    fn borrow(&self) -> &K {
+        unsafe { &*self.k }
+    }
+}
+
+// Struct used to hold a key value pair. Also contains references to previous and next entries
+// so we can maintain the entries in a linked list ordered by their use.
+struct LruEntry<K, V> {
+    key: mem::MaybeUninit<K>,
+    val: mem::MaybeUninit<V>,
+    prev: *mut LruEntry<K, V>,
+    next: *mut LruEntry<K, V>,
+}
+
+impl<K, V> LruEntry<K, V> {
+    fn new(key: K, val: V) -> Self {
+        LruEntry {
+            key: mem::MaybeUninit::new(key),
+            val: mem::MaybeUninit::new(val),
+            prev: ptr::null_mut(),
+            next: ptr::null_mut(),
+        }
+    }
+
+    fn new_sigil() -> Self {
+        LruEntry {
+            key: mem::MaybeUninit::uninit(),
+            val: mem::MaybeUninit::uninit(),
+            prev: ptr::null_mut(),
+            next: ptr::null_mut(),
         }
     }
 }
 
-impl<K: Eq + Hash, V, S: BuildHasher> LruCache<K, V, S> {
-    /// Creates an empty cache that can hold at most `capacity` items with the given hash builder.
-    pub fn with_hasher(capacity: usize, hash_builder: S) -> Self {
-        LruCache { map: LinkedHashMap::with_hasher(hash_builder), max_size: capacity }
+/// An LRU Cache
+pub struct LruCache<K, V> {
+    map: FastMap8<KeyRef<K>, Box<LruEntry<K, V>>>,
+    cap: usize,
+
+    // head and tail are sigil nodes to facilitate inserting entries
+    head: *mut LruEntry<K, V>,
+    tail: *mut LruEntry<K, V>,
+}
+
+impl<K: Hash + Eq, V> LruCache<K, V> {
+    /// Creates a new LRU Cache that holds at most `cap` items.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use lru::LruCache;
+    /// let mut cache: LruCache<isize, &str> = LruCache::new(10);
+    /// ```
+    pub fn new(cap: usize) -> LruCache<K, V> {
+        let mut map = FastMap8::default();
+        map.reserve(cap);
+        LruCache::construct(cap, map)
     }
 
-    /// Checks if the map contains the given key.
-    pub fn contains_key<Q: ?Sized>(&mut self, key: &Q) -> bool
-        where K: Borrow<Q>,
-              Q: Hash + Eq
+    /// Creates a new LRU Cache that never automatically evicts items.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use lru::LruCache;
+    /// let mut cache: LruCache<isize, &str> = LruCache::unbounded();
+    /// ```
+    pub fn unbounded() -> LruCache<K, V> {
+        LruCache::construct(usize::MAX, HashMap::default())
+    }
+}
+
+impl<K: Hash + Eq, V> LruCache<K, V> {
+    /// Creates a new LRU Cache with the given capacity.
+    fn construct(cap: usize, map: FastMap8<KeyRef<K>, Box<LruEntry<K, V>>>) -> LruCache<K, V> {
+        // NB: The compiler warns that cache does not need to be marked as mutable if we
+        // declare it as such since we only mutate it inside the unsafe block.
+        let cache = LruCache {
+            map,
+            cap,
+            head: Box::into_raw(Box::new(LruEntry::new_sigil())),
+            tail: Box::into_raw(Box::new(LruEntry::new_sigil())),
+        };
+
+        unsafe {
+            (*cache.head).next = cache.tail;
+            (*cache.tail).prev = cache.head;
+        }
+
+        cache
+    }
+
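+    // Illustrative note (added; assumes the wiring in `construct` above): right
+    // after construction the list is just `head <-> tail`. `attach` always inserts
+    // behind `head` and eviction pops the entry before `tail`, so the order is:
+    //
+    //     head <-> MRU <-> ... <-> LRU <-> tail
+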
+    /// Puts a key-value pair into the cache. If the capacity is reached the evicted entry is returned.
+    pub fn insert(&mut self, k: K, mut v: V) -> Option<(K, V)> {
+        let node_ptr = self.map.get_mut(&KeyRef { k: &k }).map(|node| {
+            let node_ptr: *mut LruEntry<K, V> = &mut **node;
+            node_ptr
+        });
+
+        match node_ptr {
+            Some(node_ptr) => {
+                // if the key is already in the cache just update its value and move it to the
+                // front of the list
+                unsafe { mem::swap(&mut v, &mut (*(*node_ptr).val.as_mut_ptr()) as &mut V) }
+                self.detach(node_ptr);
+                self.attach(node_ptr);
+                None
+            }
+            None => {
+                let (mut node, old_entry) = if self.len() == self.cap() {
+                    // if the cache is full, remove the last entry so we can use it for the new key
+                    let old_key = KeyRef {
+                        k: unsafe { &(*(*(*self.tail).prev).key.as_ptr()) },
+                    };
+                    let mut old_node = self.map.remove(&old_key).unwrap();
+
+                    // read out the node's current key and val so we can overwrite them
+                    let old_entry = unsafe { (old_node.key.assume_init(), old_node.val.assume_init()) };
+
+                    old_node.key = mem::MaybeUninit::new(k);
+                    old_node.val = mem::MaybeUninit::new(v);
+
+                    let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
+                    self.detach(node_ptr);
+
+                    (old_node, Some(old_entry))
+                } else {
+                    // if the cache is not full allocate a new LruEntry
+                    (Box::new(LruEntry::new(k, v)), None)
+                };
+
+                let node_ptr: *mut LruEntry<K, V> = &mut *node;
+                self.attach(node_ptr);
+
+                let keyref = unsafe { (*node_ptr).key.as_ptr() };
+                self.map.insert(KeyRef { k: keyref }, node);
+
+                old_entry
+            }
+        }
+    }
+
+    /// Returns a mutable reference to the value of the key in the cache or `None` if it
+    /// is not present in the cache. Moves the key to the head of the LRU list if it exists.
+    pub fn get_mut<'a, Q>(&'a mut self, k: &Q) -> Option<&'a mut V>
+    where
+        KeyRef<K>: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
     {
-        self.get_mut(key).is_some()
-    }
+        if let Some(node) = self.map.get_mut(k) {
+            let node_ptr: *mut LruEntry<K, V> = &mut **node;
 
-    /// Inserts a key-value pair into the cache. If the maximum size is reached the LRU is returned.
-    pub fn insert(&mut self, k: K, v: V) -> Option<(K, V)> {
-        self.map.insert(k, v);
-        if self.len() > self.capacity() {
-            self.remove_lru()
+            self.detach(node_ptr);
+            self.attach(node_ptr);
+
+            Some(unsafe { &mut (*(*node_ptr).val.as_mut_ptr()) as &mut V })
         } else {
             None
         }
     }
 
-    /// Returns a mutable reference to the value corresponding to the given key in the cache, if
-    /// any.
-    pub fn get_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
-        where K: Borrow<Q>,
-              Q: Hash + Eq
+    /// Returns a bool indicating whether the given key is in the cache. Does not update the
+    /// LRU list.
+    pub fn contains_key<Q>(&self, k: &Q) -> bool
+    where
+        KeyRef<K>: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
     {
-        self.map.get_refresh(k)
+        self.map.contains_key(k)
     }
 
-    pub fn peek_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
-        where K: Borrow<Q>,
-              Q: Hash + Eq
+    /// Removes and returns the value corresponding to the key from the cache or
+    /// `None` if it does not exist.
+    pub fn remove<Q>(&mut self, k: &Q) -> Option<V>
+    where
+        KeyRef<K>: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
     {
-        self.map.get_mut(k)
+        match self.map.remove(k) {
+            None => None,
+            Some(mut old_node) => {
+                let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
+                self.detach(node_ptr);
+                unsafe { Some(old_node.val.assume_init()) }
+            }
+        }
     }
 
-    /// Removes the given key from the cache and returns its corresponding value.
-    pub fn remove<Q: ?Sized>(&mut self, k: &Q) -> Option<V>
-        where K: Borrow<Q>,
-              Q: Hash + Eq
-    {
-        self.map.remove(k)
-    }
 
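+    // A hedged usage sketch of `insert`, `get_mut` and `remove` (added for
+    // illustration; the key/value types are arbitrary):
+    //
+    //     let mut cache = LruCache::new(2);
+    //     assert_eq!(cache.insert("a", 1), None);
+    //     assert_eq!(cache.insert("b", 2), None);
+    //     cache.get_mut(&"a");                               // refreshes "a"
+    //     assert_eq!(cache.insert("c", 3), Some(("b", 2)));  // "b" was the LRU
+    //     assert_eq!(cache.remove(&"c"), Some(3));
+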
+    /// Removes and returns the key and value corresponding to the least recently
+    /// used item or `None` if the cache is empty.
+    pub fn remove_lru(&mut self) -> Option<(K, V)> {
+        let node = self.remove_last()?;
+        // N.B.: Can't destructure directly because of https://github.com/rust-lang/rust/issues/28536
+        let node = *node;
+        let LruEntry { key, val, .. } = node;
+        unsafe { Some((key.assume_init(), val.assume_init())) }
+    }
+
+    /// Returns the number of key-value pairs that are currently in the cache.
+    pub fn len(&self) -> usize {
+        self.map.len()
+    }
+
+    /// Returns a bool indicating whether the cache is empty or not.
+    pub fn is_empty(&self) -> bool {
+        self.map.len() == 0
     }
 
     /// Returns the maximum number of key-value pairs the cache can hold.
-    pub fn capacity(&self) -> usize {
-        self.max_size
+    pub fn cap(&self) -> usize {
+        self.cap
     }
 
-    /// Sets the number of key-value pairs the cache can hold. Removes
-    /// least-recently-used key-value pairs if necessary.
-    pub fn set_capacity(&mut self, capacity: usize) {
-        for _ in capacity..self.len() {
-            self.remove_lru();
+    /// Resizes the cache. If the new capacity is smaller than the size of the current
+    /// cache any entries past the new capacity are discarded.
+    pub fn resize(&mut self, cap: usize) {
+        // return early if capacity doesn't change
+        if cap == self.cap {
+            return;
         }
-        self.max_size = capacity;
+
+        while self.map.len() > cap {
+            self.remove_last();
+        }
+        self.map.shrink_to_fit();
+
+        self.cap = cap;
     }
 
-    /// Removes and returns the least recently used key-value pair as a tuple.
-    #[inline]
-    pub fn remove_lru(&mut self) -> Option<(K, V)> {
-        self.map.pop_front()
+    /// Clears the contents of the cache.
+    pub fn clear(&mut self) {
+        loop {
+            match self.remove_last() {
+                Some(_) => (),
+                None => break,
+            }
+        }
     }
 
-    /// Returns the number of key-value pairs in the cache.
-    pub fn len(&self) -> usize { self.map.len() }
+    /// An iterator visiting all entries in order. The iterator element type is `(&'a K, &'a V)`.
+    pub fn iter<'a>(&'a self) -> Iter<'a, K, V> {
+        Iter {
+            len: self.len(),
+            ptr: unsafe { (*self.head).next },
+            phantom: PhantomData,
+        }
+    }
 
-    /// Returns `true` if the cache contains no key-value pairs.
-    pub fn is_empty(&self) -> bool { self.map.is_empty() }
+    fn remove_last(&mut self) -> Option<Box<LruEntry<K, V>>> {
+        let prev;
+        unsafe { prev = (*self.tail).prev }
+        if prev != self.head {
+            let old_key = KeyRef {
+                k: unsafe { &(*(*(*self.tail).prev).key.as_ptr()) },
+            };
+            let mut old_node = self.map.remove(&old_key).unwrap();
+            let node_ptr: *mut LruEntry<K, V> = &mut *old_node;
+            self.detach(node_ptr);
+            Some(old_node)
+        } else {
+            None
+        }
+    }
 
-    /// Removes all key-value pairs from the cache.
-    pub fn clear(&mut self) { self.map.clear(); }
-}
+    fn detach(&mut self, node: *mut LruEntry<K, V>) {
+        unsafe {
+            (*(*node).prev).next = (*node).next;
+            (*(*node).next).prev = (*node).prev;
+        }
+    }
 
-impl<K: Eq + Hash, V> IntoIterator for LruCache<K, V> {
-    type Item = (K, V);
-    type IntoIter = IntoIter<K, V>;
-
-    fn into_iter(self) -> IntoIter<K, V> {
-        IntoIter(self.map.into_iter())
+    fn attach(&mut self, node: *mut LruEntry<K, V>) {
+        unsafe {
+            (*node).next = (*self.head).next;
+            (*node).prev = self.head;
+            (*self.head).next = node;
+            (*(*node).next).prev = node;
+        }
     }
 }
 
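+// Illustrative sketch (added): `iter` starts at `(*head).next`, so it yields entries
+// from most to least recently used; the same iterators back the
+// `for (word, ids) in &word_positions` loops in the indexer:
+//
+//     let mut cache = LruCache::new(2);
+//     cache.insert("a", 1);
+//     cache.insert("b", 2);
+//     let order: Vec<_> = cache.iter().map(|(k, _)| *k).collect();
+//     assert_eq!(order, ["b", "a"]); // MRU first
+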
-#[derive(Clone)]
-pub struct IntoIter<K, V>(linked_hash_map::IntoIter<K, V>);
+impl<K, V> Drop for LruCache<K, V> {
+    fn drop(&mut self) {
+        self.map.values_mut().for_each(|e| unsafe {
+            ptr::drop_in_place(e.key.as_mut_ptr());
+            ptr::drop_in_place(e.val.as_mut_ptr());
+        });
+        // We rebox the head/tail, and because these are maybe-uninit
+        // they do not have the absent k/v dropped.
+        unsafe {
+            let _head = *Box::from_raw(self.head);
+            let _tail = *Box::from_raw(self.tail);
+        }
+    }
+}
 
-impl<K, V> Iterator for IntoIter<K, V> {
-    type Item = (K, V);
+impl<'a, K: Hash + Eq, V> IntoIterator for &'a LruCache<K, V> {
+    type Item = (&'a K, &'a V);
+    type IntoIter = Iter<'a, K, V>;
 
-    fn next(&mut self) -> Option<(K, V)> {
-        self.0.next()
+    fn into_iter(self) -> Iter<'a, K, V> {
+        self.iter()
+    }
+}
+
+// The compiler does not automatically derive Send and Sync for LruCache because it contains
+// raw pointers. The raw pointers are safely encapsulated by LruCache though so we can
+// implement Send and Sync for it below.
+unsafe impl<K: Send, V: Send> Send for LruCache<K, V> {}
+unsafe impl<K: Sync, V: Sync> Sync for LruCache<K, V> {}
+
+impl<K: Hash + Eq, V> fmt::Debug for LruCache<K, V> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("LruCache")
+            .field("len", &self.len())
+            .field("cap", &self.cap())
+            .finish()
+    }
+}
+
+/// An iterator over the entries of a `LruCache`.
+pub struct Iter<'a, K: 'a, V: 'a> {
+    len: usize,
+    ptr: *const LruEntry<K, V>,
+    phantom: PhantomData<&'a K>,
+}
+
+impl<'a, K, V> Iterator for Iter<'a, K, V> {
+    type Item = (&'a K, &'a V);
+
+    fn next(&mut self) -> Option<(&'a K, &'a V)> {
+        if self.len == 0 {
+            return None;
+        }
+
+        let key = unsafe { &(*(*self.ptr).key.as_ptr()) as &K };
+        let val = unsafe { &(*(*self.ptr).val.as_ptr()) as &V };
+
+        self.len -= 1;
+        self.ptr = unsafe { (*self.ptr).next };
+
+        Some((key, val))
     }
 
     fn size_hint(&self) -> (usize, Option<usize>) {
-        self.0.size_hint()
+        (self.len, Some(self.len))
+    }
+
+    fn count(self) -> usize {
+        self.len
     }
 }
 
-impl<K, V> DoubleEndedIterator for IntoIter<K, V> {
-    fn next_back(&mut self) -> Option<(K, V)> {
-        self.0.next_back()
-    }
-}
+impl<'a, K, V> ExactSizeIterator for Iter<'a, K, V> {}
+impl<'a, K, V> FusedIterator for Iter<'a, K, V> {}
 
-impl<K, V> ExactSizeIterator for IntoIter<K, V> {
-    fn len(&self) -> usize {
-        self.0.len()
-    }
-}
 
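+// Added note: `next` relies on `len` (a snapshot of the map's size) to stop before
+// reaching the tail sigil, so the sigils' uninitialized key/val are never read and
+// the `ExactSizeIterator` impl above stays exact, e.g.:
+//
+//     assert_eq!(LruCache::<u32, u32>::new(8).iter().len(), 0);
+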
+// The compiler does not automatically derive Send and Sync for Iter because it contains
+// raw pointers.
+unsafe impl<'a, K: Send, V: Send> Send for Iter<'a, K, V> {}
+unsafe impl<'a, K: Sync, V: Sync> Sync for Iter<'a, K, V> {}
 
 pub struct ArcCache<K, V>
 where
     K: Eq + Hash,
@@ -180,8 +449,7 @@ where
             evicted.extend(self.frequent_set.insert(key, value));
             return evicted;
         }
-        if self.recent_set.contains_key(&key) {
-            self.recent_set.remove(&key);
+        if self.recent_set.remove(&key).is_some() {
             evicted.extend(self.frequent_set.insert(key, value));
             return evicted;
         }
@@ -202,7 +470,8 @@ where
                 evicted.extend(self.replace(true));
             }
             self.frequent_evicted.remove(&key);
-            return Vec::from_iter(self.frequent_set.insert(key, value));
+            evicted.extend(self.frequent_set.insert(key, value));
+            return evicted;
         }
         if self.recent_evicted.contains_key(&key) {
             let recent_evicted_len = self.recent_evicted.len();
@@ -242,7 +511,7 @@ where
     where K: Clone + Hash + Eq,
     {
-        if let Some(value) = self.recent_set.remove(&key) {
+        if let Some(value) = self.recent_set.remove(key) {
             self.frequent_set.insert((*key).clone(), value);
         }
         self.frequent_set.get_mut(key)
@@ -268,11 +537,11 @@ where
     }
 }
 
-impl<K: Eq + Hash, V> IntoIterator for ArcCache<K, V> {
-    type Item = (K, V);
-    type IntoIter = std::iter::Chain<IntoIter<K, V>, IntoIter<K, V>>;
+impl<'a, K: 'a + Eq + Hash, V: 'a> IntoIterator for &'a ArcCache<K, V> {
+    type Item = (&'a K, &'a V);
+    type IntoIter = std::iter::Chain<Iter<'a, K, V>, Iter<'a, K, V>>;
 
     fn into_iter(self) -> Self::IntoIter {
-        self.recent_set.into_iter().chain(self.frequent_set)
+        self.recent_set.iter().chain(&self.frequent_set)
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 103c252bf..113203adb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,7 +11,7 @@ use std::time::Instant;
 
 use cow_utils::CowUtils;
 use fst::{IntoStreamer, Streamer};
-use fxhash::FxHasher32;
+use fxhash::{FxHasher32, FxHasher64};
 use heed::types::*;
 use heed::{PolyDatabase, Database};
 use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
@@ -28,6 +28,7 @@ static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
 static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
 
 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
+pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
 pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
 pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>;
 pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>;