mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 21:04:27 +01:00
Replace the LRU by an Arc cache
This commit is contained in:
parent
07abebfc46
commit
f98b615bf3
56
Cargo.lock
generated
56
Cargo.lock
generated
@ -1,14 +1,5 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.2.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f33b5018f120946c1dcf279194f238a9f146725593ead1c08fa47ff22b0b5d3"
|
||||
dependencies = [
|
||||
"const-random",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.31"
|
||||
@ -189,26 +180,6 @@ dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const-random"
|
||||
version = "0.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f1af9ac737b2dd2d577701e59fd09ba34822f6f2ebdb30a7647405d9e55e16a"
|
||||
dependencies = [
|
||||
"const-random-macro",
|
||||
"proc-macro-hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const-random-macro"
|
||||
version = "0.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "25e4c606eb459dd29f7c57b2e0879f2b6f14ee130918c2b78ccb58a9624e6c7a"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
"proc-macro-hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cow-utils"
|
||||
version = "0.1.2"
|
||||
@ -535,16 +506,6 @@ dependencies = [
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e6073d0ca812575946eb5f35ff68dbe519907b25c42530389ff946dc84c6ead"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"autocfg 0.1.7",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "headers"
|
||||
version = "0.3.2"
|
||||
@ -789,6 +750,12 @@ version = "0.2.70"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f"
|
||||
|
||||
[[package]]
|
||||
name = "linked-hash-map"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8dd5a6d5999d9907cda8ed67bbd137d3af8085216c2ac62de5be860bd41f304a"
|
||||
|
||||
[[package]]
|
||||
name = "lmdb-rkv-sys"
|
||||
version = "0.11.0"
|
||||
@ -818,15 +785,6 @@ dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "297efb9401445cf7b6986a583d7ac194023334b46b294ff7da0d36662c1251c2"
|
||||
dependencies = [
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matches"
|
||||
version = "0.1.8"
|
||||
@ -856,7 +814,7 @@ dependencies = [
|
||||
"itertools",
|
||||
"jemallocator",
|
||||
"levenshtein_automata",
|
||||
"lru",
|
||||
"linked-hash-map",
|
||||
"memmap",
|
||||
"once_cell",
|
||||
"roaring",
|
||||
|
@ -16,7 +16,7 @@ fxhash = "0.2.1"
|
||||
heed = { version = "0.8.0", default-features = false, features = ["lmdb"] }
|
||||
jemallocator = "0.3.2"
|
||||
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||
lru = "0.5.2"
|
||||
linked-hash-map = "0.5.3"
|
||||
memmap = "0.7.0"
|
||||
once_cell = "1.4.0"
|
||||
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" }
|
||||
|
@ -10,10 +10,11 @@ use cow_utils::CowUtils;
|
||||
use fst::Streamer;
|
||||
use heed::EnvOpenOptions;
|
||||
use heed::types::*;
|
||||
use lru::LruCache;
|
||||
use roaring::RoaringBitmap;
|
||||
use slice_group_by::StrGroupBy;
|
||||
use structopt::StructOpt;
|
||||
|
||||
use mega_mini_indexer::cache::ArcCache;
|
||||
use mega_mini_indexer::{BEU32, Index, DocumentId};
|
||||
|
||||
const MAX_POSITION: usize = 1000;
|
||||
@ -45,9 +46,8 @@ struct Opt {
|
||||
fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index: &Index) -> anyhow::Result<()> {
|
||||
eprintln!("Indexing into LMDB...");
|
||||
|
||||
let cache_size = 3_000_000;
|
||||
let mut word_positions = LruCache::new(cache_size + 1);
|
||||
let mut word_position_docids = LruCache::new(cache_size + 1);
|
||||
let mut word_positions = ArcCache::<_, RoaringBitmap>::new(100_000);
|
||||
let mut word_position_docids = ArcCache::<_, RoaringBitmap>::new(100_000);
|
||||
|
||||
// Write the headers into a Vec of bytes.
|
||||
let headers = rdr.headers()?;
|
||||
@ -69,42 +69,34 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
|
||||
|
||||
// ------ merge word positions --------
|
||||
|
||||
let ids = match word_positions.get_mut(&word) {
|
||||
Some(ids) => ids,
|
||||
match word_positions.get_mut(&word) {
|
||||
Some(ids) => { ids.insert(position); },
|
||||
None => {
|
||||
let ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default();
|
||||
word_positions.put(word.clone(), ids);
|
||||
if word_positions.len() > cache_size {
|
||||
let (word, ids) = word_positions.pop_lru().unwrap();
|
||||
let mut ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default();
|
||||
ids.insert(position);
|
||||
for (word, ids) in word_positions.insert(word.clone(), ids) {
|
||||
index.word_positions.put(wtxn, &word, &ids)?;
|
||||
}
|
||||
word_positions.get_mut(&word).unwrap()
|
||||
}
|
||||
};
|
||||
|
||||
ids.insert(position);
|
||||
}
|
||||
|
||||
// ------ merge word position documents ids --------
|
||||
|
||||
let mut key = word.as_bytes().to_vec();
|
||||
key.extend_from_slice(&position.to_be_bytes());
|
||||
|
||||
let ids = match word_position_docids.get_mut(&(word.clone(), position)) {
|
||||
Some(ids) => ids,
|
||||
match word_position_docids.get_mut(&(word.clone(), position)) {
|
||||
Some(ids) => { ids.insert(position); },
|
||||
None => {
|
||||
let ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default();
|
||||
word_position_docids.put((word.clone(), position), ids);
|
||||
if word_position_docids.len() > cache_size {
|
||||
let ((word, position), ids) = word_position_docids.pop_lru().unwrap();
|
||||
let mut ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default();
|
||||
ids.insert(position);
|
||||
for ((word, position), ids) in word_position_docids.insert((word.clone(), position), ids) {
|
||||
let mut key = word.as_bytes().to_vec();
|
||||
key.extend_from_slice(&position.to_be_bytes());
|
||||
index.word_position_docids.put(wtxn, &key, &ids)?;
|
||||
}
|
||||
word_position_docids.get_mut(&(word, position)).unwrap()
|
||||
}
|
||||
};
|
||||
|
||||
ids.insert(position);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -116,14 +108,14 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
|
||||
index.documents.put(wtxn, &BEU32::new(document_id), &document)?;
|
||||
}
|
||||
|
||||
for (word, ids) in &word_positions {
|
||||
index.word_positions.put(wtxn, word, ids)?;
|
||||
for (word, ids) in word_positions {
|
||||
index.word_positions.put(wtxn, &word, &ids)?;
|
||||
}
|
||||
|
||||
for ((word, position), ids) in &word_position_docids {
|
||||
for ((word, position), ids) in word_position_docids {
|
||||
let mut key = word.as_bytes().to_vec();
|
||||
key.extend_from_slice(&position.to_be_bytes());
|
||||
index.word_position_docids.put(wtxn, &key, ids)?;
|
||||
index.word_position_docids.put(wtxn, &key, &ids)?;
|
||||
}
|
||||
|
||||
// We store the words from the postings.
|
||||
|
278
src/cache.rs
Normal file
278
src/cache.rs
Normal file
@ -0,0 +1,278 @@
|
||||
// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use std::borrow::Borrow;
|
||||
use std::collections::hash_map::RandomState;
|
||||
use std::hash::{Hash, BuildHasher};
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use linked_hash_map::LinkedHashMap;
|
||||
|
||||
/// An LRU cache.
|
||||
#[derive(Clone)]
|
||||
pub struct LruCache<K: Eq + Hash, V, S: BuildHasher = RandomState> {
|
||||
map: LinkedHashMap<K, V, S>,
|
||||
max_size: usize,
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash, V> LruCache<K, V> {
|
||||
/// Creates an empty cache that can hold at most `capacity` items.
|
||||
pub fn new(capacity: usize) -> Self {
|
||||
LruCache {
|
||||
map: LinkedHashMap::new(),
|
||||
max_size: capacity,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash, V, S: BuildHasher> LruCache<K, V, S> {
|
||||
/// Creates an empty cache that can hold at most `capacity` items with the given hash builder.
|
||||
pub fn with_hasher(capacity: usize, hash_builder: S) -> Self {
|
||||
LruCache { map: LinkedHashMap::with_hasher(hash_builder), max_size: capacity }
|
||||
}
|
||||
|
||||
/// Checks if the map contains the given key.
|
||||
pub fn contains_key<Q: ?Sized>(&mut self, key: &Q) -> bool
|
||||
where K: Borrow<Q>,
|
||||
Q: Hash + Eq
|
||||
{
|
||||
self.get_mut(key).is_some()
|
||||
}
|
||||
|
||||
/// Inserts a key-value pair into the cache. If the maximum size is reached the LRU is returned.
|
||||
pub fn insert(&mut self, k: K, v: V) -> Option<(K, V)> {
|
||||
self.map.insert(k, v);
|
||||
if self.len() > self.capacity() {
|
||||
self.remove_lru()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a mutable reference to the value corresponding to the given key in the cache, if
|
||||
/// any.
|
||||
pub fn get_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
|
||||
where K: Borrow<Q>,
|
||||
Q: Hash + Eq
|
||||
{
|
||||
self.map.get_refresh(k)
|
||||
}
|
||||
|
||||
pub fn peek_mut<Q: ?Sized>(&mut self, k: &Q) -> Option<&mut V>
|
||||
where K: Borrow<Q>,
|
||||
Q: Hash + Eq
|
||||
{
|
||||
self.map.get_mut(k)
|
||||
}
|
||||
|
||||
/// Removes the given key from the cache and returns its corresponding value.
|
||||
pub fn remove<Q: ?Sized>(&mut self, k: &Q) -> Option<V>
|
||||
where K: Borrow<Q>,
|
||||
Q: Hash + Eq
|
||||
{
|
||||
self.map.remove(k)
|
||||
}
|
||||
|
||||
/// Returns the maximum number of key-value pairs the cache can hold.
|
||||
pub fn capacity(&self) -> usize {
|
||||
self.max_size
|
||||
}
|
||||
|
||||
/// Sets the number of key-value pairs the cache can hold. Removes
|
||||
/// least-recently-used key-value pairs if necessary.
|
||||
pub fn set_capacity(&mut self, capacity: usize) {
|
||||
for _ in capacity..self.len() {
|
||||
self.remove_lru();
|
||||
}
|
||||
self.max_size = capacity;
|
||||
}
|
||||
|
||||
/// Removes and returns the least recently used key-value pair as a tuple.
|
||||
#[inline]
|
||||
pub fn remove_lru(&mut self) -> Option<(K, V)> {
|
||||
self.map.pop_front()
|
||||
}
|
||||
|
||||
/// Returns the number of key-value pairs in the cache.
|
||||
pub fn len(&self) -> usize { self.map.len() }
|
||||
|
||||
/// Returns `true` if the cache contains no key-value pairs.
|
||||
pub fn is_empty(&self) -> bool { self.map.is_empty() }
|
||||
|
||||
/// Removes all key-value pairs from the cache.
|
||||
pub fn clear(&mut self) { self.map.clear(); }
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash, V, S: BuildHasher> IntoIterator for LruCache<K, V, S> {
|
||||
type Item = (K, V);
|
||||
type IntoIter = IntoIter<K, V>;
|
||||
|
||||
fn into_iter(self) -> IntoIter<K, V> {
|
||||
IntoIter(self.map.into_iter())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct IntoIter<K, V>(linked_hash_map::IntoIter<K, V>);
|
||||
|
||||
impl<K, V> Iterator for IntoIter<K, V> {
|
||||
type Item = (K, V);
|
||||
|
||||
fn next(&mut self) -> Option<(K, V)> {
|
||||
self.0.next()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.0.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<K, V> DoubleEndedIterator for IntoIter<K, V> {
|
||||
fn next_back(&mut self) -> Option<(K, V)> {
|
||||
self.0.next_back()
|
||||
}
|
||||
}
|
||||
|
||||
impl<K, V> ExactSizeIterator for IntoIter<K, V> {
|
||||
fn len(&self) -> usize {
|
||||
self.0.len()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ArcCache<K, V>
|
||||
where
|
||||
K: Eq + Hash,
|
||||
{
|
||||
recent_set: LruCache<K, V>,
|
||||
recent_evicted: LruCache<K, ()>,
|
||||
frequent_set: LruCache<K, V>,
|
||||
frequent_evicted: LruCache<K, ()>,
|
||||
capacity: usize,
|
||||
p: usize,
|
||||
}
|
||||
|
||||
impl<K, V> ArcCache<K, V>
|
||||
where
|
||||
K: Eq + Hash + Clone,
|
||||
{
|
||||
pub fn new(capacity: usize) -> ArcCache<K, V> {
|
||||
assert_ne!(capacity, 0, "cache length cannot be zero");
|
||||
ArcCache {
|
||||
recent_set: LruCache::new(capacity),
|
||||
recent_evicted: LruCache::new(capacity),
|
||||
frequent_set: LruCache::new(capacity),
|
||||
frequent_evicted: LruCache::new(capacity),
|
||||
capacity: capacity,
|
||||
p: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, key: K, value: V) -> Vec<(K, V)> {
|
||||
let mut evicted = Vec::new();
|
||||
if self.frequent_set.contains_key(&key) {
|
||||
evicted.extend(self.frequent_set.insert(key, value));
|
||||
return evicted;
|
||||
}
|
||||
if self.recent_set.contains_key(&key) {
|
||||
self.recent_set.remove(&key);
|
||||
evicted.extend(self.frequent_set.insert(key, value));
|
||||
return evicted;
|
||||
}
|
||||
if self.frequent_evicted.contains_key(&key) {
|
||||
let recent_evicted_len = self.recent_evicted.len();
|
||||
let frequent_evicted_len = self.frequent_evicted.len();
|
||||
let delta = if recent_evicted_len > frequent_evicted_len {
|
||||
recent_evicted_len / frequent_evicted_len
|
||||
} else {
|
||||
1
|
||||
};
|
||||
if delta < self.p {
|
||||
self.p -= delta;
|
||||
} else {
|
||||
self.p = 0
|
||||
}
|
||||
if self.recent_set.len() + self.frequent_set.len() >= self.capacity {
|
||||
evicted.extend(self.replace(true));
|
||||
}
|
||||
self.frequent_evicted.remove(&key);
|
||||
return Vec::from_iter(self.frequent_set.insert(key, value));
|
||||
}
|
||||
if self.recent_evicted.contains_key(&key) {
|
||||
let recent_evicted_len = self.recent_evicted.len();
|
||||
let frequent_evicted_len = self.frequent_evicted.len();
|
||||
let delta = if frequent_evicted_len > recent_evicted_len {
|
||||
frequent_evicted_len / recent_evicted_len
|
||||
} else {
|
||||
1
|
||||
};
|
||||
if delta <= self.capacity - self.p {
|
||||
self.p += delta;
|
||||
} else {
|
||||
self.p = self.capacity;
|
||||
}
|
||||
if self.recent_set.len() + self.frequent_set.len() >= self.capacity {
|
||||
evicted.extend(self.replace(false));
|
||||
}
|
||||
self.recent_evicted.remove(&key);
|
||||
evicted.extend(self.frequent_set.insert(key, value));
|
||||
return evicted;
|
||||
}
|
||||
let mut evicted = Vec::with_capacity(2);
|
||||
if self.recent_set.len() + self.frequent_set.len() >= self.capacity {
|
||||
evicted.extend(self.replace(false));
|
||||
}
|
||||
if self.recent_evicted.len() > self.capacity - self.p {
|
||||
self.recent_evicted.remove_lru();
|
||||
}
|
||||
if self.frequent_evicted.len() > self.p {
|
||||
self.frequent_evicted.remove_lru();
|
||||
}
|
||||
evicted.extend(self.recent_set.insert(key, value));
|
||||
evicted
|
||||
}
|
||||
|
||||
pub fn get_mut(&mut self, key: &K) -> Option<&mut V>
|
||||
where
|
||||
K: Clone + Hash + Eq,
|
||||
{
|
||||
if let Some(value) = self.recent_set.remove(&key) {
|
||||
self.frequent_set.insert((*key).clone(), value);
|
||||
}
|
||||
self.frequent_set.get_mut(key)
|
||||
}
|
||||
|
||||
fn replace(&mut self, frequent_evicted_contains_key: bool) -> Option<(K, V)> {
|
||||
let recent_set_len = self.recent_set.len();
|
||||
if recent_set_len > 0
|
||||
&& (recent_set_len > self.p
|
||||
|| (recent_set_len == self.p && frequent_evicted_contains_key))
|
||||
{
|
||||
if let Some((old_key, old_val)) = self.recent_set.remove_lru() {
|
||||
self.recent_evicted.insert(old_key.clone(), ());
|
||||
return Some((old_key, old_val));
|
||||
}
|
||||
} else {
|
||||
if let Some((old_key, old_val)) = self.frequent_set.remove_lru() {
|
||||
self.frequent_evicted.insert(old_key.clone(), ());
|
||||
return Some((old_key, old_val));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash, V> IntoIterator for ArcCache<K, V>{
|
||||
type Item = (K, V);
|
||||
type IntoIter = std::iter::Chain<IntoIter<K, V>, IntoIter<K, V>>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.recent_set.into_iter().chain(self.frequent_set)
|
||||
}
|
||||
}
|
@ -2,6 +2,7 @@ mod best_proximity;
|
||||
mod heed_codec;
|
||||
mod iter_shortest_paths;
|
||||
mod query_tokens;
|
||||
pub mod cache;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
|
Loading…
Reference in New Issue
Block a user