From 07abebfc462c769ac859145109be04708d85b107 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 29 Jun 2020 18:15:03 +0200 Subject: [PATCH] Introduce a (too big) LRU cache --- Cargo.lock | 49 ++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/bin/indexer.rs | 53 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 92 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 35ace21fc..c33e73502 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,14 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +[[package]] +name = "ahash" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f33b5018f120946c1dcf279194f238a9f146725593ead1c08fa47ff22b0b5d3" +dependencies = [ + "const-random", +] + [[package]] name = "anyhow" version = "1.0.31" @@ -180,6 +189,26 @@ dependencies = [ "bitflags", ] +[[package]] +name = "const-random" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f1af9ac737b2dd2d577701e59fd09ba34822f6f2ebdb30a7647405d9e55e16a" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25e4c606eb459dd29f7c57b2e0879f2b6f14ee130918c2b78ccb58a9624e6c7a" +dependencies = [ + "getrandom", + "proc-macro-hack", +] + [[package]] name = "cow-utils" version = "0.1.2" @@ -506,6 +535,16 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "hashbrown" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e6073d0ca812575946eb5f35ff68dbe519907b25c42530389ff946dc84c6ead" +dependencies = [ + "ahash", + "autocfg 0.1.7", +] + [[package]] name = "headers" version = "0.3.2" @@ -779,6 +818,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "lru" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "297efb9401445cf7b6986a583d7ac194023334b46b294ff7da0d36662c1251c2" +dependencies = [ + "hashbrown", +] + [[package]] name = "matches" version = "0.1.8" @@ -808,6 +856,7 @@ dependencies = [ "itertools", "jemallocator", "levenshtein_automata", + "lru", "memmap", "once_cell", "roaring", diff --git a/Cargo.toml b/Cargo.toml index 220f2653a..73e9a7f2a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ fxhash = "0.2.1" heed = { version = "0.8.0", default-features = false, features = ["lmdb"] } jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } +lru = "0.5.2" memmap = "0.7.0" once_cell = "1.4.0" roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" } diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 5b0d8f1b4..1465013fa 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -2,7 +2,6 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, BTreeSet}; use std::convert::{TryFrom, TryInto}; use std::io; -use std::iter::FromIterator; use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -11,7 +10,7 @@ use cow_utils::CowUtils; use fst::Streamer; use heed::EnvOpenOptions; use heed::types::*; -use roaring::RoaringBitmap; +use lru::LruCache; use slice_group_by::StrGroupBy; use structopt::StructOpt; @@ -46,6 +45,10 @@ struct Opt { fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index: &Index) -> anyhow::Result<()> { eprintln!("Indexing into LMDB..."); + let cache_size = 3_000_000; + let mut word_positions = LruCache::new(cache_size + 1); + let mut word_position_docids = LruCache::new(cache_size + 1); + // Write the headers into a Vec of bytes. 
let headers = rdr.headers()?; let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new()); @@ -61,29 +64,47 @@ fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { for (pos, word) in simple_alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) { if !word.is_empty() && word.len() < 500 { // LMDB limits - let word = word.cow_to_lowercase(); + let word = word.to_lowercase(); // TODO cow_to_lowercase let position = (attr * 1000 + pos) as u32; // ------ merge word positions -------- - let ids = match index.word_positions.get(wtxn, &word)? { - Some(mut ids) => { ids.insert(position); ids }, - None => RoaringBitmap::from_iter(Some(position)), + let ids = match word_positions.get_mut(&word) { + Some(ids) => ids, + None => { + let ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default(); + word_positions.put(word.clone(), ids); + if word_positions.len() > cache_size { + let (word, ids) = word_positions.pop_lru().unwrap(); + index.word_positions.put(wtxn, &word, &ids)?; + } + word_positions.get_mut(&word).unwrap() + } }; - index.word_positions.put(wtxn, &word, &ids)?; + ids.insert(position); // ------ merge word position documents ids -------- let mut key = word.as_bytes().to_vec(); key.extend_from_slice(&position.to_be_bytes()); - let ids = match index.word_position_docids.get(wtxn, &key)? 
{ - Some(mut ids) => { ids.insert(document_id); ids }, - None => RoaringBitmap::from_iter(Some(document_id)), + let ids = match word_position_docids.get_mut(&(word.clone(), position)) { + Some(ids) => ids, + None => { + let ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default(); + word_position_docids.put((word.clone(), position), ids); + if word_position_docids.len() > cache_size { + let ((word, position), ids) = word_position_docids.pop_lru().unwrap(); + let mut key = word.as_bytes().to_vec(); + key.extend_from_slice(&position.to_be_bytes()); + index.word_position_docids.put(wtxn, &key, &ids)?; + } + word_position_docids.get_mut(&(word, position)).unwrap() + } }; - index.word_position_docids.put(wtxn, &key, &ids)?; + ids.insert(document_id); } } } @@ -95,6 +116,16 @@ fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index index.documents.put(wtxn, &BEU32::new(document_id), &document)?; } + for (word, ids) in &word_positions { + index.word_positions.put(wtxn, word, ids)?; + } + + for ((word, position), ids) in &word_position_docids { + let mut key = word.as_bytes().to_vec(); + key.extend_from_slice(&position.to_be_bytes()); + index.word_position_docids.put(wtxn, &key, ids)?; + } + // We store the words from the postings. let mut new_words = BTreeSet::default(); let iter = index.word_positions.as_polymorph().iter::<_, Str, DecodeIgnore>(wtxn)?;