diff --git a/Cargo.lock b/Cargo.lock
index c33e73502..e3c89585f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,14 +1,5 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-[[package]]
-name = "ahash"
-version = "0.2.18"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f33b5018f120946c1dcf279194f238a9f146725593ead1c08fa47ff22b0b5d3"
-dependencies = [
- "const-random",
-]
-
 [[package]]
 name = "anyhow"
 version = "1.0.31"
@@ -189,26 +180,6 @@ dependencies = [
  "bitflags",
 ]

-[[package]]
-name = "const-random"
-version = "0.1.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f1af9ac737b2dd2d577701e59fd09ba34822f6f2ebdb30a7647405d9e55e16a"
-dependencies = [
- "const-random-macro",
- "proc-macro-hack",
-]
-
-[[package]]
-name = "const-random-macro"
-version = "0.1.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "25e4c606eb459dd29f7c57b2e0879f2b6f14ee130918c2b78ccb58a9624e6c7a"
-dependencies = [
- "getrandom",
- "proc-macro-hack",
-]
-
 [[package]]
 name = "cow-utils"
 version = "0.1.2"
@@ -535,16 +506,6 @@ dependencies = [
  "tokio-util",
 ]

-[[package]]
-name = "hashbrown"
-version = "0.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e6073d0ca812575946eb5f35ff68dbe519907b25c42530389ff946dc84c6ead"
-dependencies = [
- "ahash",
- "autocfg 0.1.7",
-]
-
 [[package]]
 name = "headers"
 version = "0.3.2"
@@ -789,6 +750,12 @@ version = "0.2.70"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f"

+[[package]]
+name = "linked-hash-map"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8dd5a6d5999d9907cda8ed67bbd137d3af8085216c2ac62de5be860bd41f304a"
+
 [[package]]
 name = "lmdb-rkv-sys"
 version = "0.11.0"
@@ -818,15 +785,6 @@ dependencies = [
  "cfg-if",
 ]

-[[package]]
-name = "lru"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "297efb9401445cf7b6986a583d7ac194023334b46b294ff7da0d36662c1251c2"
-dependencies = [
- "hashbrown",
-]
-
 [[package]]
 name = "matches"
 version = "0.1.8"
@@ -856,7 +814,7 @@ dependencies = [
  "itertools",
  "jemallocator",
  "levenshtein_automata",
- "lru",
+ "linked-hash-map",
  "memmap",
  "once_cell",
  "roaring",
diff --git a/Cargo.toml b/Cargo.toml
index 73e9a7f2a..714b1b158 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,7 +16,7 @@ fxhash = "0.2.1"
 heed = { version = "0.8.0", default-features = false, features = ["lmdb"] }
 jemallocator = "0.3.2"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
-lru = "0.5.2"
+linked-hash-map = "0.5.3"
 memmap = "0.7.0"
 once_cell = "1.4.0"
 roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" }
diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index 1465013fa..010b0e03b 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -10,10 +10,11 @@ use cow_utils::CowUtils;
 use fst::Streamer;
 use heed::EnvOpenOptions;
 use heed::types::*;
-use lru::LruCache;
+use roaring::RoaringBitmap;
 use slice_group_by::StrGroupBy;
 use structopt::StructOpt;

+use mega_mini_indexer::cache::ArcCache;
 use mega_mini_indexer::{BEU32, Index, DocumentId};

 const MAX_POSITION: usize = 1000;
@@ -45,9 +46,8 @@ struct Opt {
 fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index: &Index) -> anyhow::Result<()> {
     eprintln!("Indexing into LMDB...");

-    let cache_size = 3_000_000;
-    let mut word_positions = LruCache::new(cache_size + 1);
-    let mut word_position_docids = LruCache::new(cache_size + 1);
+    let mut word_positions = ArcCache::<_, RoaringBitmap>::new(100_000);
+    let mut word_position_docids = ArcCache::<_, RoaringBitmap>::new(100_000);

     // Write the headers into a Vec of bytes.
     let headers = rdr.headers()?;
@@ -69,42 +69,34 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
                 // ------ merge word positions --------

-                let ids = match word_positions.get_mut(&word) {
-                    Some(ids) => ids,
+                match word_positions.get_mut(&word) {
+                    Some(ids) => { ids.insert(position); },
                     None => {
-                        let ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default();
-                        word_positions.put(word.clone(), ids);
-                        if word_positions.len() > cache_size {
-                            let (word, ids) = word_positions.pop_lru().unwrap();
+                        let mut ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default();
+                        ids.insert(position);
+                        for (word, ids) in word_positions.insert(word.clone(), ids) {
                             index.word_positions.put(wtxn, &word, &ids)?;
                         }
-                        word_positions.get_mut(&word).unwrap()
                     }
-                };
-
-                ids.insert(position);
+                }

                 // ------ merge word position documents ids --------

                 let mut key = word.as_bytes().to_vec();
                 key.extend_from_slice(&position.to_be_bytes());

-                let ids = match word_position_docids.get_mut(&(word.clone(), position)) {
-                    Some(ids) => ids,
+                match word_position_docids.get_mut(&(word.clone(), position)) {
+                    Some(ids) => { ids.insert(position); },
                     None => {
-                        let ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default();
-                        word_position_docids.put((word.clone(), position), ids);
-                        if word_position_docids.len() > cache_size {
-                            let ((word, position), ids) = word_position_docids.pop_lru().unwrap();
+                        let mut ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default();
+                        ids.insert(position);
+                        for ((word, position), ids) in word_position_docids.insert((word.clone(), position), ids) {
                             let mut key = word.as_bytes().to_vec();
                             key.extend_from_slice(&position.to_be_bytes());
                             index.word_position_docids.put(wtxn, &key, &ids)?;
                         }
-                        word_position_docids.get_mut(&(word, position)).unwrap()
                     }
-                };
-
-                ids.insert(position);
+                }
             }
         }
     }
@@ -116,14 +108,14 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
         index.documents.put(wtxn, &BEU32::new(document_id), &document)?;
     }

-    for (word, ids) in &word_positions {
-        index.word_positions.put(wtxn, word, ids)?;
+    for (word, ids) in word_positions {
+        index.word_positions.put(wtxn, &word, &ids)?;
     }

-    for ((word, position), ids) in &word_position_docids {
+    for ((word, position), ids) in word_position_docids {
         let mut key = word.as_bytes().to_vec();
         key.extend_from_slice(&position.to_be_bytes());
-        index.word_position_docids.put(wtxn, &key, ids)?;
+        index.word_position_docids.put(wtxn, &key, &ids)?;
     }

     // We store the words from the postings.
diff --git a/src/cache.rs b/src/cache.rs
new file mode 100644
index 000000000..d9a1a293f
--- /dev/null
+++ b/src/cache.rs
@@ -0,0 +1,278 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
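+
+// The `LruCache` below appears to be adapted from the `lru-cache` crate, rebuilt on
+// top of `linked-hash-map`; `ArcCache` combines two such LRU lists with two "ghost"
+// lists of recently evicted keys, in the spirit of an Adaptive Replacement Cache
+// (ARC) whose target size for the recent list is tracked in `p`.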
+
+use std::borrow::Borrow;
+use std::collections::hash_map::RandomState;
+use std::hash::{Hash, BuildHasher};
+use std::iter::FromIterator;
+
+use linked_hash_map::LinkedHashMap;
+
+/// An LRU cache.
+#[derive(Clone)]
+pub struct LruCache<K: Eq + Hash, V, S: BuildHasher = RandomState> {
+    map: LinkedHashMap<K, V, S>,
+    max_size: usize,
+}
+
+impl<K: Eq + Hash, V> LruCache<K, V> {
+    /// Creates an empty cache that can hold at most `capacity` items.
+    pub fn new(capacity: usize) -> Self {
+        LruCache {
+            map: LinkedHashMap::new(),
+            max_size: capacity,
+        }
+    }
+}
+
+impl<K: Eq + Hash, V, S: BuildHasher> LruCache<K, V, S> {
+    /// Creates an empty cache that can hold at most `capacity` items with the given hash builder.
+    pub fn with_hasher(capacity: usize, hash_builder: S) -> Self {
+        LruCache { map: LinkedHashMap::with_hasher(hash_builder), max_size: capacity }
+    }
+
+    /// Checks if the map contains the given key.
+    pub fn contains_key<Q: ?Sized>(&mut self, key: &Q) -> bool
+        where K: Borrow<Q>,
+              Q: Hash + Eq
+    {
+        self.get_mut(key).is_some()
+    }
+
+    /// Inserts a key-value pair into the cache. If the maximum size is reached the LRU is returned.
+    pub fn insert(&mut self, k: K, v: V) -> Option<(K, V)> {
+        self.map.insert(k, v);
+        if self.len() > self.capacity() {
+            self.remove_lru()
+        } else {
+            None
+        }
+    }
+
+    /// Returns a mutable reference to the value corresponding to the given key in the cache, if
+    /// any.
+    pub fn get_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
+        where K: Borrow<Q>,
+              Q: Hash + Eq
+    {
+        self.map.get_refresh(k)
+    }
+
+    pub fn peek_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
+        where K: Borrow<Q>,
+              Q: Hash + Eq
+    {
+        self.map.get_mut(k)
+    }
+
+    /// Removes the given key from the cache and returns its corresponding value.
+    pub fn remove<Q: ?Sized>(&mut self, k: &Q) -> Option<V>
+        where K: Borrow<Q>,
+              Q: Hash + Eq
+    {
+        self.map.remove(k)
+    }
+
+    /// Returns the maximum number of key-value pairs the cache can hold.
+    pub fn capacity(&self) -> usize {
+        self.max_size
+    }
+
+    /// Sets the number of key-value pairs the cache can hold. Removes
+    /// least-recently-used key-value pairs if necessary.
+    pub fn set_capacity(&mut self, capacity: usize) {
+        for _ in capacity..self.len() {
+            self.remove_lru();
+        }
+        self.max_size = capacity;
+    }
+
+    /// Removes and returns the least recently used key-value pair as a tuple.
+    #[inline]
+    pub fn remove_lru(&mut self) -> Option<(K, V)> {
+        self.map.pop_front()
+    }
+
+    /// Returns the number of key-value pairs in the cache.
+    pub fn len(&self) -> usize { self.map.len() }
+
+    /// Returns `true` if the cache contains no key-value pairs.
+    pub fn is_empty(&self) -> bool { self.map.is_empty() }
+
+    /// Removes all key-value pairs from the cache.
+    pub fn clear(&mut self) { self.map.clear(); }
+}
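+
+// Consuming iteration yields entries from least recently used to most recently used;
+// indexer.rs drains whatever is still cached this way and writes it to LMDB at the
+// end of indexing.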
+
+impl<K: Eq + Hash, V, S: BuildHasher> IntoIterator for LruCache<K, V, S> {
+    type Item = (K, V);
+    type IntoIter = IntoIter<K, V>;
+
+    fn into_iter(self) -> IntoIter<K, V> {
+        IntoIter(self.map.into_iter())
+    }
+}
+
+#[derive(Clone)]
+pub struct IntoIter<K, V>(linked_hash_map::IntoIter<K, V>);
+
+impl<K, V> Iterator for IntoIter<K, V> {
+    type Item = (K, V);
+
+    fn next(&mut self) -> Option<(K, V)> {
+        self.0.next()
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.0.size_hint()
+    }
+}
+
+impl<K, V> DoubleEndedIterator for IntoIter<K, V> {
+    fn next_back(&mut self) -> Option<(K, V)> {
+        self.0.next_back()
+    }
+}
+
+impl<K, V> ExactSizeIterator for IntoIter<K, V> {
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+}
+
+pub struct ArcCache<K, V>
+where
+    K: Eq + Hash,
+{
+    recent_set: LruCache<K, V>,
+    recent_evicted: LruCache<K, ()>,
+    frequent_set: LruCache<K, V>,
+    frequent_evicted: LruCache<K, ()>,
+    capacity: usize,
+    p: usize,
+}
+
+impl<K, V> ArcCache<K, V>
+where
+    K: Eq + Hash + Clone,
+{
+    pub fn new(capacity: usize) -> ArcCache<K, V> {
+        assert_ne!(capacity, 0, "cache length cannot be zero");
+        ArcCache {
+            recent_set: LruCache::new(capacity),
+            recent_evicted: LruCache::new(capacity),
+            frequent_set: LruCache::new(capacity),
+            frequent_evicted: LruCache::new(capacity),
+            capacity: capacity,
+            p: 0,
+        }
+    }
+
+    pub fn insert(&mut self, key: K, value: V) -> Vec<(K, V)> {
+        let mut evicted = Vec::new();
+        if self.frequent_set.contains_key(&key) {
+            evicted.extend(self.frequent_set.insert(key, value));
+            return evicted;
+        }
+        if self.recent_set.contains_key(&key) {
+            self.recent_set.remove(&key);
+            evicted.extend(self.frequent_set.insert(key, value));
+            return evicted;
+        }
+        if self.frequent_evicted.contains_key(&key) {
+            let recent_evicted_len = self.recent_evicted.len();
+            let frequent_evicted_len = self.frequent_evicted.len();
+            let delta = if recent_evicted_len > frequent_evicted_len {
+                recent_evicted_len / frequent_evicted_len
+            } else {
+                1
+            };
+            if delta < self.p {
+                self.p -= delta;
+            } else {
+                self.p = 0
+            }
+            if self.recent_set.len() + self.frequent_set.len() >= self.capacity {
+                evicted.extend(self.replace(true));
+            }
+            self.frequent_evicted.remove(&key);
+            return Vec::from_iter(self.frequent_set.insert(key, value));
+        }
+        if self.recent_evicted.contains_key(&key) {
+            let recent_evicted_len = self.recent_evicted.len();
+            let frequent_evicted_len = self.frequent_evicted.len();
+            let delta = if frequent_evicted_len > recent_evicted_len {
+                frequent_evicted_len / recent_evicted_len
+            } else {
+                1
+            };
+            if delta <= self.capacity - self.p {
+                self.p += delta;
+            } else {
+                self.p = self.capacity;
+            }
+            if self.recent_set.len() + self.frequent_set.len() >= self.capacity {
+                evicted.extend(self.replace(false));
+            }
+            self.recent_evicted.remove(&key);
+            evicted.extend(self.frequent_set.insert(key, value));
+            return evicted;
+        }
+        let mut evicted = Vec::with_capacity(2);
+        if self.recent_set.len() + self.frequent_set.len() >= self.capacity {
+            evicted.extend(self.replace(false));
+        }
+        if self.recent_evicted.len() > self.capacity - self.p {
+            self.recent_evicted.remove_lru();
+        }
+        if self.frequent_evicted.len() > self.p {
+            self.frequent_evicted.remove_lru();
+        }
+        evicted.extend(self.recent_set.insert(key, value));
+        evicted
+    }
+
+    pub fn get_mut(&mut self, key: &K) -> Option<&mut V>
+    where
+        K: Clone + Hash + Eq,
+    {
+        if let Some(value) = self.recent_set.remove(key) {
+            self.frequent_set.insert((*key).clone(), value);
+        }
+        self.frequent_set.get_mut(key)
+    }
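+
+    /// Rebalances the resident lists when the cache is full: depending on the adaptive
+    /// target `p`, evicts the LRU entry of either the recent or the frequent list,
+    /// records its key in the matching ghost list, and returns the evicted pair so the
+    /// caller can persist it.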
+    fn replace(&mut self, frequent_evicted_contains_key: bool) -> Option<(K, V)> {
+        let recent_set_len = self.recent_set.len();
+        if recent_set_len > 0
+            && (recent_set_len > self.p
+                || (recent_set_len == self.p && frequent_evicted_contains_key))
+        {
+            if let Some((old_key, old_val)) = self.recent_set.remove_lru() {
+                self.recent_evicted.insert(old_key.clone(), ());
+                return Some((old_key, old_val));
+            }
+        } else {
+            if let Some((old_key, old_val)) = self.frequent_set.remove_lru() {
+                self.frequent_evicted.insert(old_key.clone(), ());
+                return Some((old_key, old_val));
+            }
+        }
+        None
+    }
+}
+
+impl<K: Eq + Hash, V> IntoIterator for ArcCache<K, V> {
+    type Item = (K, V);
+    type IntoIter = std::iter::Chain<IntoIter<K, V>, IntoIter<K, V>>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.recent_set.into_iter().chain(self.frequent_set)
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index e003a0677..103c252bf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,6 +2,7 @@ mod best_proximity;
 mod heed_codec;
 mod iter_shortest_paths;
 mod query_tokens;
+pub mod cache;

 use std::borrow::Cow;
 use std::collections::HashMap;
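Below is a minimal, hypothetical usage sketch (not part of the patch) of the eviction contract that indexer.rs relies on: ArcCache::insert returns the (key, value) pairs evicted by that insertion so the caller can persist them immediately, and consuming the cache afterwards yields whatever is still resident. The String/u32 types, the cache size, and the main driver are illustrative only.

use mega_mini_indexer::cache::ArcCache;

fn main() {
    // A deliberately tiny cache: keys are words, values are occurrence counts.
    let mut cache = ArcCache::<String, u32>::new(2);

    for word in &["hello", "world", "hello", "again"] {
        let word = word.to_string();
        match cache.get_mut(&word) {
            Some(count) => *count += 1,
            None => {
                // On a miss, insert the new entry and persist whatever this insertion
                // evicted, the way indexer.rs writes evicted entries to LMDB.
                for (evicted_word, evicted_count) in cache.insert(word, 1) {
                    println!("flush {:?} = {}", evicted_word, evicted_count);
                }
            }
        }
    }

    // Draining the cache at the end yields everything still resident.
    for (word, count) in cache {
        println!("final {:?} = {}", word, count);
    }
}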