Replace the LRU cache with an ARC (Adaptive Replacement Cache)

This commit is contained in:
Kerollmops 2020-06-29 19:48:02 +02:00
parent 07abebfc46
commit f98b615bf3
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
5 changed files with 307 additions and 78 deletions

56
Cargo.lock generated
View File

@ -1,14 +1,5 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "ahash"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f33b5018f120946c1dcf279194f238a9f146725593ead1c08fa47ff22b0b5d3"
dependencies = [
"const-random",
]
[[package]]
name = "anyhow"
version = "1.0.31"
@ -189,26 +180,6 @@ dependencies = [
"bitflags",
]
[[package]]
name = "const-random"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f1af9ac737b2dd2d577701e59fd09ba34822f6f2ebdb30a7647405d9e55e16a"
dependencies = [
"const-random-macro",
"proc-macro-hack",
]
[[package]]
name = "const-random-macro"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25e4c606eb459dd29f7c57b2e0879f2b6f14ee130918c2b78ccb58a9624e6c7a"
dependencies = [
"getrandom",
"proc-macro-hack",
]
[[package]]
name = "cow-utils"
version = "0.1.2"
@ -535,16 +506,6 @@ dependencies = [
"tokio-util",
]
[[package]]
name = "hashbrown"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e6073d0ca812575946eb5f35ff68dbe519907b25c42530389ff946dc84c6ead"
dependencies = [
"ahash",
"autocfg 0.1.7",
]
[[package]]
name = "headers"
version = "0.3.2"
@ -789,6 +750,12 @@ version = "0.2.70"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f"
[[package]]
name = "linked-hash-map"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8dd5a6d5999d9907cda8ed67bbd137d3af8085216c2ac62de5be860bd41f304a"
[[package]]
name = "lmdb-rkv-sys"
version = "0.11.0"
@ -818,15 +785,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "lru"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "297efb9401445cf7b6986a583d7ac194023334b46b294ff7da0d36662c1251c2"
dependencies = [
"hashbrown",
]
[[package]]
name = "matches"
version = "0.1.8"
@ -856,7 +814,7 @@ dependencies = [
"itertools",
"jemallocator",
"levenshtein_automata",
"lru",
"linked-hash-map",
"memmap",
"once_cell",
"roaring",

View File

@ -16,7 +16,7 @@ fxhash = "0.2.1"
heed = { version = "0.8.0", default-features = false, features = ["lmdb"] }
jemallocator = "0.3.2"
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
lru = "0.5.2"
linked-hash-map = "0.5.3"
memmap = "0.7.0"
once_cell = "1.4.0"
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" }

View File

@ -10,10 +10,11 @@ use cow_utils::CowUtils;
use fst::Streamer;
use heed::EnvOpenOptions;
use heed::types::*;
use lru::LruCache;
use roaring::RoaringBitmap;
use slice_group_by::StrGroupBy;
use structopt::StructOpt;
use mega_mini_indexer::cache::ArcCache;
use mega_mini_indexer::{BEU32, Index, DocumentId};
const MAX_POSITION: usize = 1000;
@ -45,9 +46,8 @@ struct Opt {
fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index: &Index) -> anyhow::Result<()> {
eprintln!("Indexing into LMDB...");
let cache_size = 3_000_000;
let mut word_positions = LruCache::new(cache_size + 1);
let mut word_position_docids = LruCache::new(cache_size + 1);
let mut word_positions = ArcCache::<_, RoaringBitmap>::new(100_000);
let mut word_position_docids = ArcCache::<_, RoaringBitmap>::new(100_000);
// Write the headers into a Vec of bytes.
let headers = rdr.headers()?;
@ -69,42 +69,34 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
// ------ merge word positions --------
let ids = match word_positions.get_mut(&word) {
Some(ids) => ids,
match word_positions.get_mut(&word) {
Some(ids) => { ids.insert(position); },
None => {
let ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default();
word_positions.put(word.clone(), ids);
if word_positions.len() > cache_size {
let (word, ids) = word_positions.pop_lru().unwrap();
let mut ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default();
ids.insert(position);
for (word, ids) in word_positions.insert(word.clone(), ids) {
index.word_positions.put(wtxn, &word, &ids)?;
}
word_positions.get_mut(&word).unwrap()
}
};
ids.insert(position);
}
// ------ merge word position documents ids --------
let mut key = word.as_bytes().to_vec();
key.extend_from_slice(&position.to_be_bytes());
let ids = match word_position_docids.get_mut(&(word.clone(), position)) {
Some(ids) => ids,
match word_position_docids.get_mut(&(word.clone(), position)) {
Some(ids) => { ids.insert(position); },
None => {
let ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default();
word_position_docids.put((word.clone(), position), ids);
if word_position_docids.len() > cache_size {
let ((word, position), ids) = word_position_docids.pop_lru().unwrap();
let mut ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default();
ids.insert(position);
for ((word, position), ids) in word_position_docids.insert((word.clone(), position), ids) {
let mut key = word.as_bytes().to_vec();
key.extend_from_slice(&position.to_be_bytes());
index.word_position_docids.put(wtxn, &key, &ids)?;
}
word_position_docids.get_mut(&(word, position)).unwrap()
}
};
ids.insert(position);
}
}
}
}
@ -116,14 +108,14 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
index.documents.put(wtxn, &BEU32::new(document_id), &document)?;
}
for (word, ids) in &word_positions {
index.word_positions.put(wtxn, word, ids)?;
for (word, ids) in word_positions {
index.word_positions.put(wtxn, &word, &ids)?;
}
for ((word, position), ids) in &word_position_docids {
for ((word, position), ids) in word_position_docids {
let mut key = word.as_bytes().to_vec();
key.extend_from_slice(&position.to_be_bytes());
index.word_position_docids.put(wtxn, &key, ids)?;
index.word_position_docids.put(wtxn, &key, &ids)?;
}
// We store the words from the postings.

278
src/cache.rs Normal file
View File

@ -0,0 +1,278 @@
// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::borrow::Borrow;
use std::collections::hash_map::RandomState;
use std::hash::{Hash, BuildHasher};
use std::iter::FromIterator;
use linked_hash_map::LinkedHashMap;
/// A bounded least-recently-used cache.
///
/// Entries are kept in a [`LinkedHashMap`] whose front is the least recently
/// used entry and whose back is the most recently used one. `max_size` is the
/// number of entries the cache is allowed to hold before `insert` starts
/// handing back the front entry.
#[derive(Clone)]
pub struct LruCache<K, V, S = RandomState>
where
    K: Eq + Hash,
    S: BuildHasher,
{
    /// Backing storage, ordered from least to most recently used.
    map: LinkedHashMap<K, V, S>,
    /// Maximum number of entries kept before the LRU entry is removed.
    max_size: usize,
}
impl<K: Eq + Hash, V> LruCache<K, V> {
    /// Builds an empty cache, using the default hasher, that keeps at most
    /// `capacity` entries.
    pub fn new(capacity: usize) -> Self {
        let map = LinkedHashMap::new();
        Self { map, max_size: capacity }
    }
}
impl<K: Eq + Hash, V, S: BuildHasher> LruCache<K, V, S> {
/// Creates an empty cache that can hold at most `capacity` items with the given hash builder.
pub fn with_hasher(capacity: usize, hash_builder: S) -> Self {
LruCache { map: LinkedHashMap::with_hasher(hash_builder), max_size: capacity }
}
/// Checks if the map contains the given key.
pub fn contains_key<Q: ?Sized>(&mut self, key: &Q) -> bool
where K: Borrow<Q>,
Q: Hash + Eq
{
self.get_mut(key).is_some()
}
/// Inserts a key-value pair into the cache. If the maximum size is reached the LRU is returned.
pub fn insert(&mut self, k: K, v: V) -> Option<(K, V)> {
self.map.insert(k, v);
if self.len() > self.capacity() {
self.remove_lru()
} else {
None
}
}
/// Returns a mutable reference to the value corresponding to the given key in the cache, if
/// any.
pub fn get_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
where K: Borrow<Q>,
Q: Hash + Eq
{
self.map.get_refresh(k)
}
pub fn peek_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
where K: Borrow<Q>,
Q: Hash + Eq
{
self.map.get_mut(k)
}
/// Removes the given key from the cache and returns its corresponding value.
pub fn remove<Q: ?Sized>(&mut self, k: &Q) -> Option<V>
where K: Borrow<Q>,
Q: Hash + Eq
{
self.map.remove(k)
}
/// Returns the maximum number of key-value pairs the cache can hold.
pub fn capacity(&self) -> usize {
self.max_size
}
/// Sets the number of key-value pairs the cache can hold. Removes
/// least-recently-used key-value pairs if necessary.
pub fn set_capacity(&mut self, capacity: usize) {
for _ in capacity..self.len() {
self.remove_lru();
}
self.max_size = capacity;
}
/// Removes and returns the least recently used key-value pair as a tuple.
#[inline]
pub fn remove_lru(&mut self) -> Option<(K, V)> {
self.map.pop_front()
}
/// Returns the number of key-value pairs in the cache.
pub fn len(&self) -> usize { self.map.len() }
/// Returns `true` if the cache contains no key-value pairs.
pub fn is_empty(&self) -> bool { self.map.is_empty() }
/// Removes all key-value pairs from the cache.
pub fn clear(&mut self) { self.map.clear(); }
}
impl<K: Eq + Hash, V, S: BuildHasher> IntoIterator for LruCache<K, V, S> {
type Item = (K, V);
type IntoIter = IntoIter<K, V>;
fn into_iter(self) -> IntoIter<K, V> {
IntoIter(self.map.into_iter())
}
}
/// Owning iterator over the `(key, value)` pairs of an `LruCache`.
///
/// Thin newtype around `linked_hash_map::IntoIter`; every iterator method of
/// this type forwards to the wrapped iterator.
#[derive(Clone)]
pub struct IntoIter<K, V>(linked_hash_map::IntoIter<K, V>);
impl<K, V> Iterator for IntoIter<K, V> {
type Item = (K, V);
fn next(&mut self) -> Option<(K, V)> {
self.0.next()
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.0.size_hint()
}
}
impl<K, V> DoubleEndedIterator for IntoIter<K, V> {
fn next_back(&mut self) -> Option<(K, V)> {
self.0.next_back()
}
}
impl<K, V> ExactSizeIterator for IntoIter<K, V> {
fn len(&self) -> usize {
self.0.len()
}
}
/// A cache made of two resident LRU lists plus two key-only "evicted" lists,
/// balanced by the adaptive target `p`.
///
/// The field layout follows the Adaptive Replacement Cache scheme: entries
/// seen once live in `recent_set`, entries touched again live in
/// `frequent_set`, and the keys of entries pushed out of either list are
/// remembered (without their values) so that a later re-insertion can shift
/// `p` toward the list that would have kept them.
pub struct ArcCache<K: Eq + Hash, V> {
    /// Resident entries that were inserted and not touched again since.
    recent_set: LruCache<K, V>,
    /// Keys (no values) of entries evicted from `recent_set`.
    recent_evicted: LruCache<K, ()>,
    /// Resident entries that were re-inserted or fetched through `get_mut`.
    frequent_set: LruCache<K, V>,
    /// Keys (no values) of entries evicted from `frequent_set`.
    frequent_evicted: LruCache<K, ()>,
    /// Number of resident entries at which `insert` starts evicting.
    capacity: usize,
    /// Adaptive target size of `recent_set`, kept between 0 and `capacity`.
    p: usize,
}
impl<K, V> ArcCache<K, V>
where
    K: Eq + Hash + Clone,
{
    /// Creates an empty cache that keeps at most `capacity` resident entries.
    ///
    /// # Panics
    ///
    /// Panics if `capacity` is zero.
    pub fn new(capacity: usize) -> ArcCache<K, V> {
        assert_ne!(capacity, 0, "cache length cannot be zero");
        ArcCache {
            recent_set: LruCache::new(capacity),
            recent_evicted: LruCache::new(capacity),
            frequent_set: LruCache::new(capacity),
            frequent_evicted: LruCache::new(capacity),
            capacity,
            p: 0,
        }
    }

    /// Inserts `value` under `key` and returns every `(key, value)` entry that
    /// had to leave the cache to make room for it.
    ///
    /// The returned entries are no longer held by the cache: a caller using the
    /// cache as a write-back buffer must persist all of them. A value already
    /// stored under `key` is replaced and dropped, it is not part of the
    /// returned entries.
    pub fn insert(&mut self, key: K, value: V) -> Vec<(K, V)> {
        let mut evicted = Vec::new();

        // Already a frequent entry: overwrite the value in place.
        if self.frequent_set.contains_key(&key) {
            evicted.extend(self.frequent_set.insert(key, value));
            return evicted;
        }

        // Second sighting of a recent entry: promote it to the frequent list.
        if self.recent_set.remove(&key).is_some() {
            evicted.extend(self.frequent_set.insert(key, value));
            return evicted;
        }

        // The key was evicted from the frequent list not long ago: shrink the
        // target size of the recent list in favor of the frequent one.
        if self.frequent_evicted.contains_key(&key) {
            let recent_evicted_len = self.recent_evicted.len();
            // `frequent_evicted` contains `key`, so this length is at least 1
            // and the division below cannot be by zero.
            let frequent_evicted_len = self.frequent_evicted.len();
            let delta = if recent_evicted_len > frequent_evicted_len {
                recent_evicted_len / frequent_evicted_len
            } else {
                1
            };
            self.p = self.p.saturating_sub(delta);

            self.make_room(true, &mut evicted);
            self.frequent_evicted.remove(&key);
            // Keep whatever `make_room` evicted: returning only the result of
            // this insertion would silently lose that entry.
            evicted.extend(self.frequent_set.insert(key, value));
            return evicted;
        }

        // The key was evicted from the recent list not long ago: grow the
        // target size of the recent list.
        if self.recent_evicted.contains_key(&key) {
            // `recent_evicted` contains `key`, so this length is at least 1
            // and the division below cannot be by zero.
            let recent_evicted_len = self.recent_evicted.len();
            let frequent_evicted_len = self.frequent_evicted.len();
            let delta = if frequent_evicted_len > recent_evicted_len {
                frequent_evicted_len / recent_evicted_len
            } else {
                1
            };
            self.p = self.capacity.min(self.p.saturating_add(delta));

            self.make_room(false, &mut evicted);
            self.recent_evicted.remove(&key);
            evicted.extend(self.frequent_set.insert(key, value));
            return evicted;
        }

        // Brand new key: it starts its life in the recent list.
        self.make_room(false, &mut evicted);

        // Trim the key-only lists so they follow the current balance `p`.
        if self.recent_evicted.len() > self.capacity - self.p {
            self.recent_evicted.remove_lru();
        }
        if self.frequent_evicted.len() > self.p {
            self.frequent_evicted.remove_lru();
        }

        evicted.extend(self.recent_set.insert(key, value));
        evicted
    }

    /// Returns a mutable reference to the value stored under `key`, if it is
    /// resident in the cache.
    ///
    /// An entry found in the recent list is first promoted to the frequent
    /// list. Keys that are only remembered in the evicted lists yield `None`.
    pub fn get_mut(&mut self, key: &K) -> Option<&mut V> {
        if let Some(value) = self.recent_set.remove(key) {
            // Moving an entry between the two resident lists does not change
            // the total number of resident entries, which never exceeds
            // `capacity`, so this insertion cannot push an entry out.
            self.frequent_set.insert(key.clone(), value);
        }
        self.frequent_set.get_mut(key)
    }

    /// Evicts one resident entry into `evicted` when the cache is full.
    fn make_room(&mut self, frequent_evicted_contains_key: bool, evicted: &mut Vec<(K, V)>) {
        if self.recent_set.len() + self.frequent_set.len() >= self.capacity {
            evicted.extend(self.replace(frequent_evicted_contains_key));
        }
    }

    /// Evicts one resident entry, choosing the list according to the balance
    /// `p`, and remembers its key in the matching evicted list.
    ///
    /// When the frequent list is chosen but is empty, the entry is taken from
    /// the recent list instead, so that a non-empty cache always gives up
    /// exactly one entry. Returns `None` only when both lists are empty.
    fn replace(&mut self, frequent_evicted_contains_key: bool) -> Option<(K, V)> {
        let recent_set_len = self.recent_set.len();
        let prefer_recent = recent_set_len > 0
            && (recent_set_len > self.p
                || (recent_set_len == self.p && frequent_evicted_contains_key));

        if prefer_recent {
            self.evict_recent()
        } else {
            match self.evict_frequent() {
                Some(entry) => Some(entry),
                None => self.evict_recent(),
            }
        }
    }

    /// Pops the LRU entry of the recent list and records its key as evicted.
    fn evict_recent(&mut self) -> Option<(K, V)> {
        let (old_key, old_val) = self.recent_set.remove_lru()?;
        self.recent_evicted.insert(old_key.clone(), ());
        Some((old_key, old_val))
    }

    /// Pops the LRU entry of the frequent list and records its key as evicted.
    fn evict_frequent(&mut self) -> Option<(K, V)> {
        let (old_key, old_val) = self.frequent_set.remove_lru()?;
        self.frequent_evicted.insert(old_key.clone(), ());
        Some((old_key, old_val))
    }
}
impl<K: Eq + Hash, V> IntoIterator for ArcCache<K, V>{
type Item = (K, V);
type IntoIter = std::iter::Chain<IntoIter<K, V>, IntoIter<K, V>>;
fn into_iter(self) -> Self::IntoIter {
self.recent_set.into_iter().chain(self.frequent_set)
}
}

View File

@ -2,6 +2,7 @@ mod best_proximity;
mod heed_codec;
mod iter_shortest_paths;
mod query_tokens;
pub mod cache;
use std::borrow::Cow;
use std::collections::HashMap;