mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Replace the arc cache by a simple linked hash map
This commit is contained in:
parent
4d22d80281
commit
ed05999f63
18
Cargo.lock
generated
18
Cargo.lock
generated
@ -12,14 +12,6 @@ version = "1.0.31"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f"
|
checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "arc-cache"
|
|
||||||
version = "0.2.4"
|
|
||||||
source = "git+https://github.com/Kerollmops/rust-arc-cache.git?rev=56530f2#56530f2d219823f8f88dc03851f8fe057bd72564"
|
|
||||||
dependencies = [
|
|
||||||
"xlru-cache",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "arc-swap"
|
name = "arc-swap"
|
||||||
version = "0.4.6"
|
version = "0.4.6"
|
||||||
@ -957,7 +949,6 @@ name = "milli"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"arc-cache",
|
|
||||||
"askama",
|
"askama",
|
||||||
"askama_warp",
|
"askama_warp",
|
||||||
"bstr",
|
"bstr",
|
||||||
@ -971,6 +962,7 @@ dependencies = [
|
|||||||
"itertools",
|
"itertools",
|
||||||
"jemallocator",
|
"jemallocator",
|
||||||
"levenshtein_automata",
|
"levenshtein_automata",
|
||||||
|
"linked-hash-map",
|
||||||
"log 0.4.11",
|
"log 0.4.11",
|
||||||
"memmap",
|
"memmap",
|
||||||
"near-proximity",
|
"near-proximity",
|
||||||
@ -2356,14 +2348,6 @@ dependencies = [
|
|||||||
"winapi-build",
|
"winapi-build",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "xlru-cache"
|
|
||||||
version = "0.1.2"
|
|
||||||
source = "git+https://github.com/Kerollmops/rust-xlru-cache.git?rev=3c90f49#3c90f49e11758ee0cc4ff145b2606ba143188b77"
|
|
||||||
dependencies = [
|
|
||||||
"linked-hash-map",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zerocopy"
|
name = "zerocopy"
|
||||||
version = "0.3.0"
|
version = "0.3.0"
|
||||||
|
@ -7,7 +7,6 @@ default-run = "indexer"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow = "1.0.28"
|
anyhow = "1.0.28"
|
||||||
arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" }
|
|
||||||
bstr = "0.2.13"
|
bstr = "0.2.13"
|
||||||
byteorder = "1.3.4"
|
byteorder = "1.3.4"
|
||||||
csv = "1.1.3"
|
csv = "1.1.3"
|
||||||
@ -17,6 +16,7 @@ fxhash = "0.2.1"
|
|||||||
heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
|
heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
|
||||||
jemallocator = "0.3.2"
|
jemallocator = "0.3.2"
|
||||||
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||||
|
linked-hash-map = "0.5.3"
|
||||||
memmap = "0.7.0"
|
memmap = "0.7.0"
|
||||||
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" }
|
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" }
|
||||||
once_cell = "1.4.0"
|
once_cell = "1.4.0"
|
||||||
|
@ -8,12 +8,12 @@ use std::{iter, thread};
|
|||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use arc_cache::ArcCache;
|
|
||||||
use bstr::ByteSlice as _;
|
use bstr::ByteSlice as _;
|
||||||
use csv::StringRecord;
|
use csv::StringRecord;
|
||||||
use flate2::read::GzDecoder;
|
use flate2::read::GzDecoder;
|
||||||
use fst::IntoStreamer;
|
use fst::IntoStreamer;
|
||||||
use heed::{EnvOpenOptions, BytesEncode, types::*};
|
use heed::{EnvOpenOptions, BytesEncode, types::*};
|
||||||
|
use linked_hash_map::LinkedHashMap;
|
||||||
use log::{debug, info};
|
use log::{debug, info};
|
||||||
use memmap::Mmap;
|
use memmap::Mmap;
|
||||||
use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
|
use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
|
||||||
@ -89,9 +89,10 @@ struct IndexerOpt {
|
|||||||
#[structopt(long, default_value = "1610612736")] // 1.5 GB
|
#[structopt(long, default_value = "1610612736")] // 1.5 GB
|
||||||
max_memory: usize,
|
max_memory: usize,
|
||||||
|
|
||||||
/// Size of the ARC cache when indexing.
|
/// Size of the linked hash map cache when indexing.
|
||||||
#[structopt(long, default_value = "43690")]
|
/// The bigger it is, the faster the indexing is but the more memory it takes.
|
||||||
arc_cache_size: usize,
|
#[structopt(long, default_value = "4096")]
|
||||||
|
linked_hash_map_size: usize,
|
||||||
|
|
||||||
/// The name of the compression algorithm to use when compressing intermediate
|
/// The name of the compression algorithm to use when compressing intermediate
|
||||||
/// chunks during indexing documents.
|
/// chunks during indexing documents.
|
||||||
@ -159,7 +160,7 @@ fn compute_words_pair_proximities(
|
|||||||
type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;
|
type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;
|
||||||
|
|
||||||
struct Store {
|
struct Store {
|
||||||
word_docids: ArcCache<SmallVec32<u8>, RoaringBitmap>,
|
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
|
||||||
documents_ids: RoaringBitmap,
|
documents_ids: RoaringBitmap,
|
||||||
sorter: Sorter<MergeFn>,
|
sorter: Sorter<MergeFn>,
|
||||||
documents_sorter: Sorter<MergeFn>,
|
documents_sorter: Sorter<MergeFn>,
|
||||||
@ -169,7 +170,7 @@ struct Store {
|
|||||||
|
|
||||||
impl Store {
|
impl Store {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
arc_cache_size: usize,
|
linked_hash_map_size: usize,
|
||||||
max_nb_chunks: Option<usize>,
|
max_nb_chunks: Option<usize>,
|
||||||
max_memory: Option<usize>,
|
max_memory: Option<usize>,
|
||||||
chunk_compression_type: CompressionType,
|
chunk_compression_type: CompressionType,
|
||||||
@ -195,7 +196,8 @@ impl Store {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Store {
|
Store {
|
||||||
word_docids: ArcCache::new(arc_cache_size),
|
// We overflow by one before popping the LRU element.
|
||||||
|
word_docids: LinkedHashMap::with_capacity(linked_hash_map_size + 1),
|
||||||
documents_ids: RoaringBitmap::new(),
|
documents_ids: RoaringBitmap::new(),
|
||||||
sorter: builder.build(),
|
sorter: builder.build(),
|
||||||
documents_sorter: documents_builder.build(),
|
documents_sorter: documents_builder.build(),
|
||||||
@ -207,9 +209,21 @@ impl Store {
|
|||||||
// Save the documents ids under the position and word we have seen it.
|
// Save the documents ids under the position and word we have seen it.
|
||||||
fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> anyhow::Result<()> {
|
fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> anyhow::Result<()> {
|
||||||
let word_vec = SmallVec32::from(word.as_bytes());
|
let word_vec = SmallVec32::from(word.as_bytes());
|
||||||
let ids = RoaringBitmap::from_iter(Some(id));
|
// If get_refresh finds the element, it is guaranteed to be at the end of the linked hash map.
|
||||||
let (_, lrus) = self.word_docids.insert(word_vec, ids, |old, new| old.union_with(&new));
|
match self.word_docids.get_refresh(&word_vec) {
|
||||||
Self::write_word_docids(&mut self.sorter, lrus)?;
|
Some(old) => { old.insert(id); },
|
||||||
|
None => {
|
||||||
|
// A newly inserted element is appended at the end of the linked hash map.
|
||||||
|
self.word_docids.insert(word_vec, RoaringBitmap::from_iter(Some(id)));
|
||||||
|
// If the word docids just reached its capacity we must make sure to remove
|
||||||
|
// one element, this way next time we insert we don't grow the capacity.
|
||||||
|
if self.word_docids.len() == self.word_docids.capacity() {
|
||||||
|
// Removing the front element is equivalent to removing the LRU element.
|
||||||
|
let lru = self.word_docids.pop_front();
|
||||||
|
Self::write_word_docids(&mut self.sorter, lru)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -600,7 +614,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
let index = Index::new(&env)?;
|
let index = Index::new(&env)?;
|
||||||
|
|
||||||
let num_threads = rayon::current_num_threads();
|
let num_threads = rayon::current_num_threads();
|
||||||
let arc_cache_size = opt.indexer.arc_cache_size;
|
let linked_hash_map_size = opt.indexer.linked_hash_map_size;
|
||||||
let max_nb_chunks = opt.indexer.max_nb_chunks;
|
let max_nb_chunks = opt.indexer.max_nb_chunks;
|
||||||
let max_memory = opt.indexer.max_memory;
|
let max_memory = opt.indexer.max_memory;
|
||||||
let chunk_compression_type = compression_type_from_str(&opt.indexer.chunk_compression_type);
|
let chunk_compression_type = compression_type_from_str(&opt.indexer.chunk_compression_type);
|
||||||
@ -611,7 +625,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.enumerate()
|
.enumerate()
|
||||||
.map(|(i, rdr)| {
|
.map(|(i, rdr)| {
|
||||||
Store::new(
|
Store::new(
|
||||||
arc_cache_size,
|
linked_hash_map_size,
|
||||||
max_nb_chunks,
|
max_nb_chunks,
|
||||||
Some(max_memory),
|
Some(max_memory),
|
||||||
chunk_compression_type,
|
chunk_compression_type,
|
||||||
|
Loading…
Reference in New Issue
Block a user