From 07abebfc462c769ac859145109be04708d85b107 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 29 Jun 2020 18:15:03 +0200 Subject: [PATCH] Introduce a (too big) LRU cache --- Cargo.lock | 49 ++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/bin/indexer.rs | 53 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 92 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 35ace21fc..c33e73502 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,14 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +[[package]] +name = "ahash" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f33b5018f120946c1dcf279194f238a9f146725593ead1c08fa47ff22b0b5d3" +dependencies = [ + "const-random", +] + [[package]] name = "anyhow" version = "1.0.31" @@ -180,6 +189,26 @@ dependencies = [ "bitflags", ] +[[package]] +name = "const-random" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f1af9ac737b2dd2d577701e59fd09ba34822f6f2ebdb30a7647405d9e55e16a" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25e4c606eb459dd29f7c57b2e0879f2b6f14ee130918c2b78ccb58a9624e6c7a" +dependencies = [ + "getrandom", + "proc-macro-hack", +] + [[package]] name = "cow-utils" version = "0.1.2" @@ -506,6 +535,16 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "hashbrown" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e6073d0ca812575946eb5f35ff68dbe519907b25c42530389ff946dc84c6ead" +dependencies = [ + "ahash", + "autocfg 0.1.7", +] + [[package]] name = "headers" version = "0.3.2" @@ -779,6 +818,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "lru" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "297efb9401445cf7b6986a583d7ac194023334b46b294ff7da0d36662c1251c2" +dependencies = [ + "hashbrown", +] + [[package]] name = "matches" version = "0.1.8" @@ -808,6 +856,7 @@ dependencies = [ "itertools", "jemallocator", "levenshtein_automata", + "lru", "memmap", "once_cell", "roaring", diff --git a/Cargo.toml b/Cargo.toml index 220f2653a..73e9a7f2a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ fxhash = "0.2.1" heed = { version = "0.8.0", default-features = false, features = ["lmdb"] } jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } +lru = "0.5.2" memmap = "0.7.0" once_cell = "1.4.0" roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" } diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 5b0d8f1b4..1465013fa 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -2,7 +2,6 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, BTreeSet}; use std::convert::{TryFrom, TryInto}; use std::io; -use std::iter::FromIterator; use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -11,7 +10,7 @@ use cow_utils::CowUtils; use fst::Streamer; use heed::EnvOpenOptions; use heed::types::*; -use roaring::RoaringBitmap; +use lru::LruCache; use slice_group_by::StrGroupBy; use structopt::StructOpt; @@ -46,6 +45,10 @@ struct Opt { fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index: &Index) -> anyhow::Result<()> { eprintln!("Indexing into LMDB..."); + let cache_size = 3_000_000; + let mut word_positions = LruCache::new(cache_size + 1); + let mut word_position_docids = LruCache::new(cache_size + 1); + // Write the headers into a Vec of bytes. 
let headers = rdr.headers()?; let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new()); @@ -61,29 +64,47 @@ fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { for (pos, word) in simple_alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) { if !word.is_empty() && word.len() < 500 { // LMDB limits - let word = word.cow_to_lowercase(); + let word = word.to_lowercase(); // TODO cow_to_lowercase let position = (attr * 1000 + pos) as u32; // ------ merge word positions -------- - let ids = match index.word_positions.get(wtxn, &word)? { - Some(mut ids) => { ids.insert(position); ids }, - None => RoaringBitmap::from_iter(Some(position)), + let ids = match word_positions.get_mut(&word) { + Some(ids) => ids, + None => { + let ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default(); + word_positions.put(word.clone(), ids); + if word_positions.len() > cache_size { + let (word, ids) = word_positions.pop_lru().unwrap(); + index.word_positions.put(wtxn, &word, &ids)?; + } + word_positions.get_mut(&word).unwrap() + } }; - index.word_positions.put(wtxn, &word, &ids)?; + ids.insert(position); // ------ merge word position documents ids -------- let mut key = word.as_bytes().to_vec(); key.extend_from_slice(&position.to_be_bytes()); - let ids = match index.word_position_docids.get(wtxn, &key)? 
{ - Some(mut ids) => { ids.insert(document_id); ids }, - None => RoaringBitmap::from_iter(Some(document_id)), + let ids = match word_position_docids.get_mut(&(word.clone(), position)) { + Some(ids) => ids, + None => { + let ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default(); + word_position_docids.put((word.clone(), position), ids); + if word_position_docids.len() > cache_size { + let ((word, position), ids) = word_position_docids.pop_lru().unwrap(); + let mut key = word.as_bytes().to_vec(); + key.extend_from_slice(&position.to_be_bytes()); + index.word_position_docids.put(wtxn, &key, &ids)?; + } + word_position_docids.get_mut(&(word, position)).unwrap() + } }; - index.word_position_docids.put(wtxn, &key, &ids)?; + ids.insert(document_id); } } } @@ -95,6 +116,16 @@ fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index index.documents.put(wtxn, &BEU32::new(document_id), &document)?; } + for (word, ids) in &word_positions { + index.word_positions.put(wtxn, word, ids)?; + } + + for ((word, position), ids) in &word_position_docids { + let mut key = word.as_bytes().to_vec(); + key.extend_from_slice(&position.to_be_bytes()); + index.word_position_docids.put(wtxn, &key, ids)?; + } + // We store the words from the postings. let mut new_words = BTreeSet::default(); let iter = index.word_positions.as_polymorph().iter::<_, Str, DecodeIgnore>(wtxn)?;