diff --git a/Cargo.lock b/Cargo.lock index 57a02e8e2..b7f33cd88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12,14 +12,6 @@ version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f" -[[package]] -name = "arc-cache" -version = "0.2.4" -source = "git+https://github.com/Kerollmops/rust-arc-cache.git?rev=56530f2#56530f2d219823f8f88dc03851f8fe057bd72564" -dependencies = [ - "xlru-cache", -] - [[package]] name = "arc-swap" version = "0.4.6" @@ -957,7 +949,6 @@ name = "milli" version = "0.1.0" dependencies = [ "anyhow", - "arc-cache", "askama", "askama_warp", "bstr", @@ -971,6 +962,7 @@ dependencies = [ "itertools", "jemallocator", "levenshtein_automata", + "linked-hash-map", "log 0.4.11", "memmap", "near-proximity", @@ -2356,14 +2348,6 @@ dependencies = [ "winapi-build", ] -[[package]] -name = "xlru-cache" -version = "0.1.2" -source = "git+https://github.com/Kerollmops/rust-xlru-cache.git?rev=3c90f49#3c90f49e11758ee0cc4ff145b2606ba143188b77" -dependencies = [ - "linked-hash-map", -] - [[package]] name = "zerocopy" version = "0.3.0" diff --git a/Cargo.toml b/Cargo.toml index b24845ada..60777ee34 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,6 @@ default-run = "indexer" [dependencies] anyhow = "1.0.28" -arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" } bstr = "0.2.13" byteorder = "1.3.4" csv = "1.1.3" @@ -17,6 +16,7 @@ fxhash = "0.2.1" heed = { version = "0.8.1", default-features = false, features = ["lmdb"] } jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } +linked-hash-map = "0.5.3" memmap = "0.7.0" near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } once_cell = "1.4.0" diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 73494be64..37a7bedd4 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -8,12 +8,12 @@ use 
std::{iter, thread}; use std::time::Instant; use anyhow::Context; -use arc_cache::ArcCache; use bstr::ByteSlice as _; use csv::StringRecord; use flate2::read::GzDecoder; use fst::IntoStreamer; use heed::{EnvOpenOptions, BytesEncode, types::*}; +use linked_hash_map::LinkedHashMap; use log::{debug, info}; use memmap::Mmap; use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType}; @@ -89,9 +89,10 @@ struct IndexerOpt { #[structopt(long, default_value = "1610612736")] // 1.5 GB max_memory: usize, - /// Size of the ARC cache when indexing. - #[structopt(long, default_value = "43690")] - arc_cache_size: usize, + /// Size of the linked hash map cache when indexing. + /// The bigger it is, the faster the indexing is but the more memory it takes. + #[structopt(long, default_value = "4096")] + linked_hash_map_size: usize, /// The name of the compression algorithm to use when compressing intermediate /// chunks during indexing documents. @@ -159,7 +160,7 @@ fn compute_words_pair_proximities( type MergeFn = fn(&[u8], &[Vec]) -> Result, ()>; struct Store { - word_docids: ArcCache, RoaringBitmap>, + word_docids: LinkedHashMap, RoaringBitmap>, documents_ids: RoaringBitmap, sorter: Sorter, documents_sorter: Sorter, @@ -169,7 +170,7 @@ struct Store { impl Store { pub fn new( - arc_cache_size: usize, + linked_hash_map_size: usize, max_nb_chunks: Option, max_memory: Option, chunk_compression_type: CompressionType, @@ -195,7 +196,8 @@ impl Store { } Store { - word_docids: ArcCache::new(arc_cache_size), + // We overflow by one before popping the LRU element. + word_docids: LinkedHashMap::with_capacity(linked_hash_map_size + 1), documents_ids: RoaringBitmap::new(), sorter: builder.build(), documents_sorter: documents_builder.build(), @@ -207,9 +209,21 @@ impl Store { // Save the documents ids under the position and word we have seen it.
fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> anyhow::Result<()> { let word_vec = SmallVec32::from(word.as_bytes()); - let ids = RoaringBitmap::from_iter(Some(id)); - let (_, lrus) = self.word_docids.insert(word_vec, ids, |old, new| old.union_with(&new)); - Self::write_word_docids(&mut self.sorter, lrus)?; + // If get_refresh finds the element it is assured to be at the end of the linked hash map. + match self.word_docids.get_refresh(&word_vec) { + Some(old) => { old.insert(id); }, + None => { + // A newly inserted element is appended at the end of the linked hash map. + self.word_docids.insert(word_vec, RoaringBitmap::from_iter(Some(id))); + // If the word docids just reached its capacity we must make sure to remove + // one element, this way next time we insert we don't grow the capacity. + if self.word_docids.len() == self.word_docids.capacity() { + // Removing the front element is equivalent to removing the LRU element. + let lru = self.word_docids.pop_front(); + Self::write_word_docids(&mut self.sorter, lru)?; + } + } + } Ok(()) } @@ -600,7 +614,7 @@ fn main() -> anyhow::Result<()> { let index = Index::new(&env)?; let num_threads = rayon::current_num_threads(); - let arc_cache_size = opt.indexer.arc_cache_size; + let linked_hash_map_size = opt.indexer.linked_hash_map_size; let max_nb_chunks = opt.indexer.max_nb_chunks; let max_memory = opt.indexer.max_memory; let chunk_compression_type = compression_type_from_str(&opt.indexer.chunk_compression_type); @@ -611,7 +625,7 @@ fn main() -> anyhow::Result<()> { .enumerate() .map(|(i, rdr)| { Store::new( - arc_cache_size, + linked_hash_map_size, max_nb_chunks, Some(max_memory), chunk_compression_type,