Replace the LRU by an Arc cache

2025-07-04 04:17:10 +02:00 · 2020-06-29 19:48:02 +02:00 · 2020-06-29 19:48:02 +02:00 · f98b615bf3
commit f98b615bf3
parent 07abebfc46
5 changed files with 307 additions and 78 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1,14 +1,5 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-[[package]]
-name = "ahash"
-version = "0.2.18"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f33b5018f120946c1dcf279194f238a9f146725593ead1c08fa47ff22b0b5d3"
-dependencies = [
- "const-random",
-]
-
 [[package]]
 name = "anyhow"
 version = "1.0.31"
@ -189,26 +180,6 @@ dependencies = [
 "bitflags",
 ]

-[[package]]
-name = "const-random"
-version = "0.1.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f1af9ac737b2dd2d577701e59fd09ba34822f6f2ebdb30a7647405d9e55e16a"
-dependencies = [
- "const-random-macro",
- "proc-macro-hack",
-]
-
-[[package]]
-name = "const-random-macro"
-version = "0.1.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "25e4c606eb459dd29f7c57b2e0879f2b6f14ee130918c2b78ccb58a9624e6c7a"
-dependencies = [
- "getrandom",
- "proc-macro-hack",
-]
-
 [[package]]
 name = "cow-utils"
 version = "0.1.2"
@ -535,16 +506,6 @@ dependencies = [
 "tokio-util",
 ]

-[[package]]
-name = "hashbrown"
-version = "0.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e6073d0ca812575946eb5f35ff68dbe519907b25c42530389ff946dc84c6ead"
-dependencies = [
- "ahash",
- "autocfg 0.1.7",
-]
-
 [[package]]
 name = "headers"
 version = "0.3.2"
@ -789,6 +750,12 @@ version = "0.2.70"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f"

+[[package]]
+name = "linked-hash-map"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8dd5a6d5999d9907cda8ed67bbd137d3af8085216c2ac62de5be860bd41f304a"
+
 [[package]]
 name = "lmdb-rkv-sys"
 version = "0.11.0"
@ -818,15 +785,6 @@ dependencies = [
 "cfg-if",
 ]

-[[package]]
-name = "lru"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "297efb9401445cf7b6986a583d7ac194023334b46b294ff7da0d36662c1251c2"
-dependencies = [
- "hashbrown",
-]
-
 [[package]]
 name = "matches"
 version = "0.1.8"
@ -856,7 +814,7 @@ dependencies = [
 "itertools",
 "jemallocator",
 "levenshtein_automata",
- "lru",
+ "linked-hash-map",
 "memmap",
 "once_cell",
 "roaring",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -16,7 +16,7 @@ fxhash = "0.2.1"
 heed = { version = "0.8.0", default-features = false, features = ["lmdb"] }
 jemallocator = "0.3.2"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
-lru = "0.5.2"
+linked-hash-map = "0.5.3"
 memmap = "0.7.0"
 once_cell = "1.4.0"
 roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" }
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@ -10,10 +10,11 @@ use cow_utils::CowUtils;
 use fst::Streamer;
 use heed::EnvOpenOptions;
 use heed::types::*;
-use lru::LruCache;
+use roaring::RoaringBitmap;
 use slice_group_by::StrGroupBy;
 use structopt::StructOpt;

+use mega_mini_indexer::cache::ArcCache;
 use mega_mini_indexer::{BEU32, Index, DocumentId};

 const MAX_POSITION: usize = 1000;
@ -45,9 +46,8 @@ struct Opt {
 fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index: &Index) -> anyhow::Result<()> {
    eprintln!("Indexing into LMDB...");

-    let cache_size = 3_000_000;
-    let mut word_positions = LruCache::new(cache_size + 1);
-    let mut word_position_docids = LruCache::new(cache_size + 1);
+    let mut word_positions = ArcCache::<_, RoaringBitmap>::new(100_000);
+    let mut word_position_docids = ArcCache::<_, RoaringBitmap>::new(100_000);

    // Write the headers into a Vec of bytes.
    let headers = rdr.headers()?;
@ -69,42 +69,34 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index

                    // ------ merge word positions --------

-                    let ids = match word_positions.get_mut(&word) {
-                        Some(ids) => ids,
+                    match word_positions.get_mut(&word) {
+                        Some(ids) => { ids.insert(position); },
                        None => {
-                            let ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default();
-                            word_positions.put(word.clone(), ids);
-                            if word_positions.len() > cache_size {
-                                let (word, ids) = word_positions.pop_lru().unwrap();
+                            let mut ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default();
+                            ids.insert(position);
+                            for (word, ids) in word_positions.insert(word.clone(), ids) {
                                index.word_positions.put(wtxn, &word, &ids)?;
                            }
-                            word_positions.get_mut(&word).unwrap()
                        }
-                    };
-
-                    ids.insert(position);
+                    }

                    // ------ merge word position documents ids --------

                    let mut key = word.as_bytes().to_vec();
                    key.extend_from_slice(&position.to_be_bytes());

-                    let ids = match word_position_docids.get_mut(&(word.clone(), position)) {
-                        Some(ids) => ids,
+                    match word_position_docids.get_mut(&(word.clone(), position)) {
+                        Some(ids) => { ids.insert(position); },
                        None => {
-                            let ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default();
-                            word_position_docids.put((word.clone(), position), ids);
-                            if word_position_docids.len() > cache_size {
-                                let ((word, position), ids) = word_position_docids.pop_lru().unwrap();
+                            let mut ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default();
+                            ids.insert(position);
+                            for ((word, position), ids) in word_position_docids.insert((word.clone(), position), ids) {
                                let mut key = word.as_bytes().to_vec();
                                key.extend_from_slice(&position.to_be_bytes());
                                index.word_position_docids.put(wtxn, &key, &ids)?;
                            }
-                            word_position_docids.get_mut(&(word, position)).unwrap()
                        }
-                    };
-
-                    ids.insert(position);
+                    }
                }
            }
        }
@ -116,14 +108,14 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
        index.documents.put(wtxn, &BEU32::new(document_id), &document)?;
    }

-    for (word, ids) in &word_positions {
-        index.word_positions.put(wtxn, word, ids)?;
+    for (word, ids) in word_positions {
+        index.word_positions.put(wtxn, &word, &ids)?;
    }

-    for ((word, position), ids) in &word_position_docids {
+    for ((word, position), ids) in word_position_docids {
        let mut key = word.as_bytes().to_vec();
        key.extend_from_slice(&position.to_be_bytes());
-        index.word_position_docids.put(wtxn, &key, ids)?;
+        index.word_position_docids.put(wtxn, &key, &ids)?;
    }

    // We store the words from the postings.
--- a/src/cache.rs
+++ b/src/cache.rs
@ -0,0 +1,278 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::borrow::Borrow;
+use std::collections::hash_map::RandomState;
+use std::hash::{Hash, BuildHasher};
+use std::iter::FromIterator;
+
+use linked_hash_map::LinkedHashMap;
+
+/// An LRU cache.
+///
+/// Entries live in a `LinkedHashMap`; `remove_lru` pops from the front of
+/// that map, so the front entry is treated as the least recently used one.
+#[derive(Clone)]
+pub struct LruCache<K: Eq + Hash, V, S: BuildHasher = RandomState> {
+    // Key-value storage whose internal ordering doubles as the recency order.
+    map: LinkedHashMap<K, V, S>,
+    // Number of entries `insert` tolerates before it evicts one.
+    max_size: usize,
+}
+
+impl<K: Eq + Hash, V> LruCache<K, V> {
+    /// Builds an empty cache, backed by the default hasher, that holds at most `capacity` items.
+    pub fn new(capacity: usize) -> Self {
+        let map = LinkedHashMap::new();
+        Self { map, max_size: capacity }
+    }
+}
+
+impl<K: Eq + Hash, V, S: BuildHasher> LruCache<K, V, S> {
+    /// Creates an empty cache that can hold at most `capacity` items with the given hash builder.
+    pub fn with_hasher(capacity: usize, hash_builder: S) -> Self {
+        LruCache { map: LinkedHashMap::with_hasher(hash_builder), max_size: capacity }
+    }
+
+    /// Checks if the map contains the given key.
+    ///
+    /// The check is implemented on top of `get_mut` (and therefore
+    /// `LinkedHashMap::get_refresh`), which is why it needs `&mut self`.
+    pub fn contains_key<Q: ?Sized>(&mut self, key: &Q) -> bool
+        where K: Borrow<Q>,
+              Q: Hash + Eq
+    {
+        self.get_mut(key).is_some()
+    }
+
+    /// Inserts a key-value pair into the cache. If the cache then holds more
+    /// than `capacity()` entries, the least recently used one is removed and
+    /// returned; otherwise `None` is returned.
+    pub fn insert(&mut self, k: K, v: V) -> Option<(K, V)> {
+        self.map.insert(k, v);
+        if self.len() > self.capacity() {
+            self.remove_lru()
+        } else {
+            None
+        }
+    }
+
+    /// Returns a mutable reference to the value corresponding to the given key in the cache, if
+    /// any.
+    ///
+    /// The lookup goes through `LinkedHashMap::get_refresh`; per the
+    /// linked-hash-map documentation that call also moves the entry to the
+    /// end of the list, i.e. marks it as most recently used.
+    pub fn get_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
+        where K: Borrow<Q>,
+              Q: Hash + Eq
+    {
+        self.map.get_refresh(k)
+    }
+
+    /// Returns a mutable reference to the value corresponding to the given
+    /// key, if any, using a plain `LinkedHashMap::get_mut` lookup instead of
+    /// the `get_refresh` call made by `get_mut`.
+    pub fn peek_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
+        where K: Borrow<Q>,
+              Q: Hash + Eq
+    {
+        self.map.get_mut(k)
+    }
+
+    /// Removes the given key from the cache and returns its corresponding value.
+    pub fn remove<Q: ?Sized>(&mut self, k: &Q) -> Option<V>
+        where K: Borrow<Q>,
+              Q: Hash + Eq
+    {
+        self.map.remove(k)
+    }
+
+    /// Returns the maximum number of key-value pairs the cache can hold.
+    pub fn capacity(&self) -> usize {
+        self.max_size
+    }
+
+    /// Sets the number of key-value pairs the cache can hold. Removes
+    /// least-recently-used key-value pairs if necessary.
+    ///
+    /// The removed pairs are dropped, not returned.
+    pub fn set_capacity(&mut self, capacity: usize) {
+        // One eviction per entry above the new limit; the range is empty
+        // when `capacity >= self.len()`.
+        for _ in capacity..self.len() {
+            self.remove_lru();
+        }
+        self.max_size = capacity;
+    }
+
+    /// Removes and returns the least recently used key-value pair as a tuple.
+    ///
+    /// Returns `None` when the cache is empty.
+    #[inline]
+    pub fn remove_lru(&mut self) -> Option<(K, V)> {
+        self.map.pop_front()
+    }
+
+    /// Returns the number of key-value pairs in the cache.
+    pub fn len(&self) -> usize { self.map.len() }
+
+    /// Returns `true` if the cache contains no key-value pairs.
+    pub fn is_empty(&self) -> bool { self.map.is_empty() }
+
+    /// Removes all key-value pairs from the cache.
+    pub fn clear(&mut self) { self.map.clear(); }
+}
+
+/// Consuming iteration over the cached pairs, in the order of the underlying
+/// `LinkedHashMap`.
+impl<K: Eq + Hash, V, S: BuildHasher> IntoIterator for LruCache<K, V, S> {
+    type Item = (K, V);
+    type IntoIter = IntoIter<K, V>;
+
+    /// Consumes the cache and hands its map over to the `IntoIter` wrapper.
+    fn into_iter(self) -> IntoIter<K, V> {
+        IntoIter(self.map.into_iter())
+    }
+}
+
+/// Owning iterator over the `(key, value)` pairs of an `LruCache`.
+///
+/// Thin newtype around `linked_hash_map::IntoIter`.
+#[derive(Clone)]
+pub struct IntoIter<K, V>(linked_hash_map::IntoIter<K, V>);
+
+// Forwards iteration to the wrapped `linked_hash_map::IntoIter`.
+impl<K, V> Iterator for IntoIter<K, V> {
+    type Item = (K, V);
+
+    /// Returns the next pair of the wrapped iterator.
+    fn next(&mut self) -> Option<(K, V)> {
+        self.0.next()
+    }
+
+    /// Reports the size hint of the wrapped iterator unchanged.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.0.size_hint()
+    }
+}
+
+// Allows walking the pairs from the back by delegating to the wrapped iterator.
+impl<K, V> DoubleEndedIterator for IntoIter<K, V> {
+    /// Returns the next pair from the back of the wrapped iterator.
+    fn next_back(&mut self) -> Option<(K, V)> {
+        self.0.next_back()
+    }
+}
+
+// The remaining length is whatever the wrapped iterator reports.
+impl<K, V> ExactSizeIterator for IntoIter<K, V> {
+    /// Returns the number of pairs still to be yielded.
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+}
+
+/// A cache built from two resident LRU sets plus two key-only histories of
+/// what was evicted from each of them.
+///
+/// NOTE(review): the layout (recent/frequent sets, evicted-key lists and the
+/// adaptive `p`) looks like the Adaptive Replacement Cache scheme — confirm.
+pub struct ArcCache<K, V>
+where
+    K: Eq + Hash,
+{
+    // Values that were inserted but not yet looked up or re-inserted.
+    recent_set: LruCache<K, V>,
+    // Keys (no values) of entries evicted from `recent_set`.
+    recent_evicted: LruCache<K, ()>,
+    // Values that were looked up or re-inserted while resident.
+    frequent_set: LruCache<K, V>,
+    // Keys (no values) of entries evicted from `frequent_set`.
+    frequent_evicted: LruCache<K, ()>,
+    // Combined size of the two resident sets at which `insert` starts evicting.
+    capacity: usize,
+    // Adaptive threshold: `replace` evicts from `recent_set` once it is larger than `p`.
+    p: usize,
+}
+
+impl<K, V> ArcCache<K, V>
+where
+    K: Eq + Hash + Clone,
+{
+    /// Creates an empty cache.
+    ///
+    /// `capacity` is the combined size of the recent and frequent sets at
+    /// which `insert` starts evicting; each internal list is also created
+    /// with that same limit.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `capacity` is zero.
+    pub fn new(capacity: usize) -> ArcCache<K, V> {
+        assert_ne!(capacity, 0, "cache length cannot be zero");
+        ArcCache {
+            recent_set: LruCache::new(capacity),
+            recent_evicted: LruCache::new(capacity),
+            frequent_set: LruCache::new(capacity),
+            frequent_evicted: LruCache::new(capacity),
+            capacity,
+            p: 0,
+        }
+    }
+
+    /// Inserts `value` under `key` and returns every `(key, value)` pair that
+    /// was pushed out of the resident sets as a consequence.
+    ///
+    /// The returned pairs are no longer stored in the cache, so a caller that
+    /// uses it as a write-back buffer must persist them.
+    pub fn insert(&mut self, key: K, value: V) -> Vec<(K, V)> {
+        let mut evicted = Vec::new();
+
+        // Already in the frequent set: overwrite the value there.
+        if self.frequent_set.contains_key(&key) {
+            evicted.extend(self.frequent_set.insert(key, value));
+            return evicted;
+        }
+
+        // Resident in the recent set: the old value is discarded and the key
+        // moves to the frequent set with the new value.
+        if self.recent_set.contains_key(&key) {
+            self.recent_set.remove(&key);
+            evicted.extend(self.frequent_set.insert(key, value));
+            return evicted;
+        }
+
+        // The key was evicted from the frequent set earlier: lower `p`.
+        if self.frequent_evicted.contains_key(&key) {
+            let recent_evicted_len = self.recent_evicted.len();
+            let frequent_evicted_len = self.frequent_evicted.len();
+            // `frequent_evicted` holds `key`, so the divisor is never zero.
+            let delta = if recent_evicted_len > frequent_evicted_len {
+                recent_evicted_len / frequent_evicted_len
+            } else {
+                1
+            };
+            self.p = self.p.saturating_sub(delta);
+            if self.recent_set.len() + self.frequent_set.len() >= self.capacity {
+                evicted.extend(self.replace(true));
+            }
+            self.frequent_evicted.remove(&key);
+            // Keep the pair displaced by `replace` in the result: returning
+            // only the outcome of this insert would silently lose it.
+            evicted.extend(self.frequent_set.insert(key, value));
+            return evicted;
+        }
+
+        // The key was evicted from the recent set earlier: raise `p`,
+        // never beyond `capacity`.
+        if self.recent_evicted.contains_key(&key) {
+            let recent_evicted_len = self.recent_evicted.len();
+            let frequent_evicted_len = self.frequent_evicted.len();
+            // `recent_evicted` holds `key`, so the divisor is never zero.
+            let delta = if frequent_evicted_len > recent_evicted_len {
+                frequent_evicted_len / recent_evicted_len
+            } else {
+                1
+            };
+            self.p = self.p.saturating_add(delta).min(self.capacity);
+            if self.recent_set.len() + self.frequent_set.len() >= self.capacity {
+                evicted.extend(self.replace(false));
+            }
+            self.recent_evicted.remove(&key);
+            evicted.extend(self.frequent_set.insert(key, value));
+            return evicted;
+        }
+
+        // Completely unknown key: make room, trim the evicted-key histories
+        // and store the value in the recent set.
+        if self.recent_set.len() + self.frequent_set.len() >= self.capacity {
+            evicted.extend(self.replace(false));
+        }
+        // `p` never exceeds `capacity`, so the subtraction cannot underflow.
+        if self.recent_evicted.len() > self.capacity - self.p {
+            self.recent_evicted.remove_lru();
+        }
+        if self.frequent_evicted.len() > self.p {
+            self.frequent_evicted.remove_lru();
+        }
+        evicted.extend(self.recent_set.insert(key, value));
+        evicted
+    }
+
+    /// Returns a mutable reference to the value stored under `key`, if it is
+    /// resident.
+    ///
+    /// A value found in the recent set is first moved to the frequent set.
+    pub fn get_mut(&mut self, key: &K) -> Option<&mut V> {
+        if let Some(value) = self.recent_set.remove(key) {
+            // NOTE(review): a pair evicted by this insert would be dropped here;
+            // that needs `frequent_set` to be full already — confirm it cannot happen.
+            self.frequent_set.insert(key.clone(), value);
+        }
+        self.frequent_set.get_mut(key)
+    }
+
+    /// Evicts the least recently used pair from one of the two resident sets,
+    /// records its key in the matching evicted-key history and returns it.
+    ///
+    /// The recent set is chosen when it is non-empty and larger than `p`, or
+    /// exactly `p` long while `frequent_evicted_contains_key` is set;
+    /// otherwise the frequent set is used. Returns `None` when the chosen set
+    /// is empty.
+    fn replace(&mut self, frequent_evicted_contains_key: bool) -> Option<(K, V)> {
+        let recent_set_len = self.recent_set.len();
+        let evict_recent = recent_set_len > 0
+            && (recent_set_len > self.p
+                || (recent_set_len == self.p && frequent_evicted_contains_key));
+
+        if evict_recent {
+            let (old_key, old_val) = self.recent_set.remove_lru()?;
+            self.recent_evicted.insert(old_key.clone(), ());
+            Some((old_key, old_val))
+        } else {
+            let (old_key, old_val) = self.frequent_set.remove_lru()?;
+            self.frequent_evicted.insert(old_key.clone(), ());
+            Some((old_key, old_val))
+        }
+    }
+}
+
+impl<K: Eq + Hash, V> IntoIterator for ArcCache<K, V> {
+    type Item = (K, V);
+    type IntoIter = std::iter::Chain<IntoIter<K, V>, IntoIter<K, V>>;
+
+    /// Consumes the cache and yields every resident pair: those of the recent
+    /// set first, then those of the frequent set. The evicted-key histories
+    /// are dropped.
+    fn into_iter(self) -> Self::IntoIter {
+        let ArcCache { recent_set, frequent_set, .. } = self;
+        let recent = recent_set.into_iter();
+        let frequent = frequent_set.into_iter();
+        recent.chain(frequent)
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -2,6 +2,7 @@ mod best_proximity;
 mod heed_codec;
 mod iter_shortest_paths;
 mod query_tokens;
+pub mod cache;

 use std::borrow::Cow;
 use std::collections::HashMap;