mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-10 23:18:55 +01:00
Introduce a (too big) LRU cache
This commit is contained in:
parent
5f0088594b
commit
07abebfc46
49
Cargo.lock
generated
49
Cargo.lock
generated
@ -1,5 +1,14 @@
|
|||||||
# This file is automatically @generated by Cargo.
|
# This file is automatically @generated by Cargo.
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
|
[[package]]
|
||||||
|
name = "ahash"
|
||||||
|
version = "0.2.18"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6f33b5018f120946c1dcf279194f238a9f146725593ead1c08fa47ff22b0b5d3"
|
||||||
|
dependencies = [
|
||||||
|
"const-random",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anyhow"
|
name = "anyhow"
|
||||||
version = "1.0.31"
|
version = "1.0.31"
|
||||||
@ -180,6 +189,26 @@ dependencies = [
|
|||||||
"bitflags",
|
"bitflags",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "const-random"
|
||||||
|
version = "0.1.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2f1af9ac737b2dd2d577701e59fd09ba34822f6f2ebdb30a7647405d9e55e16a"
|
||||||
|
dependencies = [
|
||||||
|
"const-random-macro",
|
||||||
|
"proc-macro-hack",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "const-random-macro"
|
||||||
|
version = "0.1.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "25e4c606eb459dd29f7c57b2e0879f2b6f14ee130918c2b78ccb58a9624e6c7a"
|
||||||
|
dependencies = [
|
||||||
|
"getrandom",
|
||||||
|
"proc-macro-hack",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cow-utils"
|
name = "cow-utils"
|
||||||
version = "0.1.2"
|
version = "0.1.2"
|
||||||
@ -506,6 +535,16 @@ dependencies = [
|
|||||||
"tokio-util",
|
"tokio-util",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hashbrown"
|
||||||
|
version = "0.6.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8e6073d0ca812575946eb5f35ff68dbe519907b25c42530389ff946dc84c6ead"
|
||||||
|
dependencies = [
|
||||||
|
"ahash",
|
||||||
|
"autocfg 0.1.7",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "headers"
|
name = "headers"
|
||||||
version = "0.3.2"
|
version = "0.3.2"
|
||||||
@ -779,6 +818,15 @@ dependencies = [
|
|||||||
"cfg-if",
|
"cfg-if",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lru"
|
||||||
|
version = "0.5.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "297efb9401445cf7b6986a583d7ac194023334b46b294ff7da0d36662c1251c2"
|
||||||
|
dependencies = [
|
||||||
|
"hashbrown",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "matches"
|
name = "matches"
|
||||||
version = "0.1.8"
|
version = "0.1.8"
|
||||||
@ -808,6 +856,7 @@ dependencies = [
|
|||||||
"itertools",
|
"itertools",
|
||||||
"jemallocator",
|
"jemallocator",
|
||||||
"levenshtein_automata",
|
"levenshtein_automata",
|
||||||
|
"lru",
|
||||||
"memmap",
|
"memmap",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"roaring",
|
"roaring",
|
||||||
|
@ -16,6 +16,7 @@ fxhash = "0.2.1"
|
|||||||
heed = { version = "0.8.0", default-features = false, features = ["lmdb"] }
|
heed = { version = "0.8.0", default-features = false, features = ["lmdb"] }
|
||||||
jemallocator = "0.3.2"
|
jemallocator = "0.3.2"
|
||||||
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||||
|
lru = "0.5.2"
|
||||||
memmap = "0.7.0"
|
memmap = "0.7.0"
|
||||||
once_cell = "1.4.0"
|
once_cell = "1.4.0"
|
||||||
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" }
|
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" }
|
||||||
|
@ -2,7 +2,6 @@ use std::collections::hash_map::Entry;
|
|||||||
use std::collections::{HashMap, BTreeSet};
|
use std::collections::{HashMap, BTreeSet};
|
||||||
use std::convert::{TryFrom, TryInto};
|
use std::convert::{TryFrom, TryInto};
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::iter::FromIterator;
|
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
|
|
||||||
@ -11,7 +10,7 @@ use cow_utils::CowUtils;
|
|||||||
use fst::Streamer;
|
use fst::Streamer;
|
||||||
use heed::EnvOpenOptions;
|
use heed::EnvOpenOptions;
|
||||||
use heed::types::*;
|
use heed::types::*;
|
||||||
use roaring::RoaringBitmap;
|
use lru::LruCache;
|
||||||
use slice_group_by::StrGroupBy;
|
use slice_group_by::StrGroupBy;
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
@ -46,6 +45,10 @@ struct Opt {
|
|||||||
fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index: &Index) -> anyhow::Result<()> {
|
fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index: &Index) -> anyhow::Result<()> {
|
||||||
eprintln!("Indexing into LMDB...");
|
eprintln!("Indexing into LMDB...");
|
||||||
|
|
||||||
|
let cache_size = 3_000_000;
|
||||||
|
let mut word_positions = LruCache::new(cache_size + 1);
|
||||||
|
let mut word_position_docids = LruCache::new(cache_size + 1);
|
||||||
|
|
||||||
// Write the headers into a Vec of bytes.
|
// Write the headers into a Vec of bytes.
|
||||||
let headers = rdr.headers()?;
|
let headers = rdr.headers()?;
|
||||||
let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
|
let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
|
||||||
@ -61,29 +64,47 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
|
|||||||
for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
|
for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
|
||||||
for (pos, word) in simple_alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
|
for (pos, word) in simple_alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
|
||||||
if !word.is_empty() && word.len() < 500 { // LMDB limits
|
if !word.is_empty() && word.len() < 500 { // LMDB limits
|
||||||
let word = word.cow_to_lowercase();
|
let word = word.to_lowercase(); // TODO cow_to_lowercase
|
||||||
let position = (attr * 1000 + pos) as u32;
|
let position = (attr * 1000 + pos) as u32;
|
||||||
|
|
||||||
// ------ merge word positions --------
|
// ------ merge word positions --------
|
||||||
|
|
||||||
let ids = match index.word_positions.get(wtxn, &word)? {
|
let ids = match word_positions.get_mut(&word) {
|
||||||
Some(mut ids) => { ids.insert(position); ids },
|
Some(ids) => ids,
|
||||||
None => RoaringBitmap::from_iter(Some(position)),
|
None => {
|
||||||
|
let ids = index.word_positions.get(wtxn, &word)?.unwrap_or_default();
|
||||||
|
word_positions.put(word.clone(), ids);
|
||||||
|
if word_positions.len() > cache_size {
|
||||||
|
let (word, ids) = word_positions.pop_lru().unwrap();
|
||||||
|
index.word_positions.put(wtxn, &word, &ids)?;
|
||||||
|
}
|
||||||
|
word_positions.get_mut(&word).unwrap()
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
index.word_positions.put(wtxn, &word, &ids)?;
|
ids.insert(position);
|
||||||
|
|
||||||
// ------ merge word position documents ids --------
|
// ------ merge word position documents ids --------
|
||||||
|
|
||||||
let mut key = word.as_bytes().to_vec();
|
let mut key = word.as_bytes().to_vec();
|
||||||
key.extend_from_slice(&position.to_be_bytes());
|
key.extend_from_slice(&position.to_be_bytes());
|
||||||
|
|
||||||
let ids = match index.word_position_docids.get(wtxn, &key)? {
|
let ids = match word_position_docids.get_mut(&(word.clone(), position)) {
|
||||||
Some(mut ids) => { ids.insert(document_id); ids },
|
Some(ids) => ids,
|
||||||
None => RoaringBitmap::from_iter(Some(document_id)),
|
None => {
|
||||||
|
let ids = index.word_position_docids.get(wtxn, &key)?.unwrap_or_default();
|
||||||
|
word_position_docids.put((word.clone(), position), ids);
|
||||||
|
if word_position_docids.len() > cache_size {
|
||||||
|
let ((word, position), ids) = word_position_docids.pop_lru().unwrap();
|
||||||
|
let mut key = word.as_bytes().to_vec();
|
||||||
|
key.extend_from_slice(&position.to_be_bytes());
|
||||||
|
index.word_position_docids.put(wtxn, &key, &ids)?;
|
||||||
|
}
|
||||||
|
word_position_docids.get_mut(&(word, position)).unwrap()
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
index.word_position_docids.put(wtxn, &key, &ids)?;
|
ids.insert(position);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -95,6 +116,16 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
|
|||||||
index.documents.put(wtxn, &BEU32::new(document_id), &document)?;
|
index.documents.put(wtxn, &BEU32::new(document_id), &document)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (word, ids) in &word_positions {
|
||||||
|
index.word_positions.put(wtxn, word, ids)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
for ((word, position), ids) in &word_position_docids {
|
||||||
|
let mut key = word.as_bytes().to_vec();
|
||||||
|
key.extend_from_slice(&position.to_be_bytes());
|
||||||
|
index.word_position_docids.put(wtxn, &key, ids)?;
|
||||||
|
}
|
||||||
|
|
||||||
// We store the words from the postings.
|
// We store the words from the postings.
|
||||||
let mut new_words = BTreeSet::default();
|
let mut new_words = BTreeSet::default();
|
||||||
let iter = index.word_positions.as_polymorph().iter::<_, Str, DecodeIgnore>(wtxn)?;
|
let iter = index.word_positions.as_polymorph().iter::<_, Str, DecodeIgnore>(wtxn)?;
|
||||||
|
Loading…
Reference in New Issue
Block a user