mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-10-30 01:38:49 +01:00
Rewrite the cache to fill multiple caches
This commit is contained in:
parent
437940d053
commit
b7e106b34a
@ -1,8 +1,11 @@
|
|||||||
|
use std::borrow::BorrowMut;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::hash::BuildHasher;
|
||||||
use std::io::{self, BufReader, BufWriter, Read as _, Seek, Write as _};
|
use std::io::{self, BufReader, BufWriter, Read as _, Seek, Write as _};
|
||||||
use std::vec;
|
use std::vec;
|
||||||
|
|
||||||
use hashbrown::hash_map::RawEntryMut;
|
use hashbrown::hash_map::RawEntryMut;
|
||||||
|
use hashbrown::{DefaultHashBuilder, HashMap};
|
||||||
use raw_collections::alloc::{RefBump, RefBytes};
|
use raw_collections::alloc::{RefBump, RefBytes};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use tempfile::tempfile;
|
use tempfile::tempfile;
|
||||||
@ -11,22 +14,6 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
|||||||
use crate::update::new::indexer::document_changes::MostlySend;
|
use crate::update::new::indexer::document_changes::MostlySend;
|
||||||
use crate::CboRoaringBitmapCodec;
|
use crate::CboRoaringBitmapCodec;
|
||||||
|
|
||||||
const KEY_SIZE: usize = 12;
|
|
||||||
|
|
||||||
// #[derive(Debug)]
|
|
||||||
pub struct CboCachedSorter<'extractor> {
|
|
||||||
cache: hashbrown::HashMap<
|
|
||||||
RefBytes<'extractor>,
|
|
||||||
DelAddRoaringBitmap,
|
|
||||||
hashbrown::DefaultHashBuilder,
|
|
||||||
RefBump<'extractor>,
|
|
||||||
>,
|
|
||||||
alloc: RefBump<'extractor>,
|
|
||||||
spilled_entries: UnorderedEntries,
|
|
||||||
deladd_buffer: Vec<u8>,
|
|
||||||
cbo_buffer: Vec<u8>,
|
|
||||||
}
|
|
||||||
|
|
||||||
// # How the Merge Algorithm works
|
// # How the Merge Algorithm works
|
||||||
//
|
//
|
||||||
// Each extractor create #Threads caches and balances the entries
|
// Each extractor create #Threads caches and balances the entries
|
||||||
@ -71,80 +58,205 @@ pub struct CboCachedSorter<'extractor> {
|
|||||||
// For now we can use a grenad sorter for spilling even thought I think
|
// For now we can use a grenad sorter for spilling even thought I think
|
||||||
// it's not the most efficient way (too many files open, sorting entries).
|
// it's not the most efficient way (too many files open, sorting entries).
|
||||||
|
|
||||||
impl<'extractor> CboCachedSorter<'extractor> {
|
pub struct CboCachedSorter<'extractor> {
|
||||||
/// TODO may add the capacity
|
hasher: DefaultHashBuilder,
|
||||||
pub fn new_in(alloc: RefBump<'extractor>) -> io::Result<Self> {
|
alloc: RefBump<'extractor>,
|
||||||
Ok(CboCachedSorter {
|
caches: InnerCaches<'extractor>,
|
||||||
cache: hashbrown::HashMap::new_in(RefBump::clone(&alloc)),
|
}
|
||||||
alloc,
|
|
||||||
spilled_entries: tempfile().map(UnorderedEntries::new)?,
|
enum InnerCaches<'extractor> {
|
||||||
deladd_buffer: Vec::new(),
|
Normal(NormalCaches<'extractor>),
|
||||||
cbo_buffer: Vec::new(),
|
Spilling(SpillingCaches<'extractor>),
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'extractor> CboCachedSorter<'extractor> {
|
impl<'extractor> CboCachedSorter<'extractor> {
|
||||||
pub fn insert_del_u32(&mut self, key: &[u8], n: u32) {
|
pub fn new_in(buckets: usize, alloc: RefBump<'extractor>) -> Self {
|
||||||
match self.cache.raw_entry_mut().from_key(key) {
|
Self {
|
||||||
|
hasher: DefaultHashBuilder::default(),
|
||||||
|
caches: InnerCaches::Normal(NormalCaches {
|
||||||
|
caches: std::iter::repeat_with(|| RefBump::clone(&alloc))
|
||||||
|
.map(HashMap::new_in)
|
||||||
|
.take(buckets)
|
||||||
|
.collect(),
|
||||||
|
}),
|
||||||
|
alloc,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn buckets(&self) -> usize {
|
||||||
|
match &self.caches {
|
||||||
|
InnerCaches::Normal(caches) => caches.caches.len(),
|
||||||
|
InnerCaches::Spilling(caches) => caches.caches.len(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> io::Result<()> {
|
||||||
|
let buckets = self.buckets();
|
||||||
|
match &mut self.caches {
|
||||||
|
InnerCaches::Normal(normal) => {
|
||||||
|
normal.insert_del_u32(&self.hasher, &self.alloc, buckets, key, n);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
InnerCaches::Spilling(spilling) => {
|
||||||
|
spilling.insert_del_u32(&self.hasher, buckets, key, n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> io::Result<()> {
|
||||||
|
let buckets = self.buckets();
|
||||||
|
match &mut self.caches {
|
||||||
|
InnerCaches::Normal(normal) => {
|
||||||
|
normal.insert_add_u32(&self.hasher, &self.alloc, buckets, key, n);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
InnerCaches::Spilling(spilling) => {
|
||||||
|
spilling.insert_add_u32(&self.hasher, buckets, key, n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn freeze(&mut self) -> grenad::Result<()> {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct NormalCaches<'extractor> {
|
||||||
|
caches: Vec<
|
||||||
|
HashMap<RefBytes<'extractor>, DelAddRoaringBitmap, DefaultHashBuilder, RefBump<'extractor>>,
|
||||||
|
>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'extractor> NormalCaches<'extractor> {
|
||||||
|
pub fn insert_del_u32(
|
||||||
|
&mut self,
|
||||||
|
hasher: &DefaultHashBuilder,
|
||||||
|
alloc: &RefBump<'extractor>,
|
||||||
|
buckets: usize,
|
||||||
|
key: &[u8],
|
||||||
|
n: u32,
|
||||||
|
) {
|
||||||
|
let hash = compute_bytes_hash(hasher, key);
|
||||||
|
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||||
|
|
||||||
|
match self.caches[bucket].raw_entry_mut().from_hash(hash, |k| k.as_ref() == key) {
|
||||||
RawEntryMut::Occupied(mut entry) => {
|
RawEntryMut::Occupied(mut entry) => {
|
||||||
let DelAddRoaringBitmap { del, add: _ } = entry.get_mut();
|
entry.get_mut().del.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||||
del.get_or_insert_with(RoaringBitmap::default).insert(n);
|
|
||||||
}
|
}
|
||||||
RawEntryMut::Vacant(entry) => {
|
RawEntryMut::Vacant(entry) => {
|
||||||
let alloc = RefBump::clone(&self.alloc);
|
let alloc = RefBump::clone(&alloc);
|
||||||
let key = RefBump::map(alloc, |b| b.alloc_slice_copy(key));
|
let key = RefBump::map(alloc, |a| a.alloc_slice_copy(key));
|
||||||
entry.insert(RefBytes(key), DelAddRoaringBitmap::new_del_u32(n));
|
entry.insert_hashed_nocheck(
|
||||||
|
hash,
|
||||||
|
RefBytes(key),
|
||||||
|
DelAddRoaringBitmap::new_del_u32(n),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn insert_add_u32(&mut self, key: &[u8], n: u32) {
|
pub fn insert_add_u32(
|
||||||
match self.cache.raw_entry_mut().from_key(key) {
|
&mut self,
|
||||||
|
hasher: &DefaultHashBuilder,
|
||||||
|
alloc: &RefBump<'extractor>,
|
||||||
|
buckets: usize,
|
||||||
|
key: &[u8],
|
||||||
|
n: u32,
|
||||||
|
) {
|
||||||
|
let hash = compute_bytes_hash(hasher, key);
|
||||||
|
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||||
|
match self.caches[bucket].raw_entry_mut().from_hash(hash, |k| k.as_ref() == key) {
|
||||||
RawEntryMut::Occupied(mut entry) => {
|
RawEntryMut::Occupied(mut entry) => {
|
||||||
let DelAddRoaringBitmap { del: _, add } = entry.get_mut();
|
entry.get_mut().add.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||||
add.get_or_insert_with(RoaringBitmap::default).insert(n);
|
|
||||||
}
|
}
|
||||||
RawEntryMut::Vacant(entry) => {
|
RawEntryMut::Vacant(entry) => {
|
||||||
let alloc = RefBump::clone(&self.alloc);
|
let alloc = RefBump::clone(&alloc);
|
||||||
let key = RefBump::map(alloc, |b| b.alloc_slice_copy(key));
|
let key = RefBump::map(alloc, |a| a.alloc_slice_copy(key));
|
||||||
entry.insert(RefBytes(key), DelAddRoaringBitmap::new_add_u32(n));
|
entry.insert_hashed_nocheck(
|
||||||
|
hash,
|
||||||
|
RefBytes(key),
|
||||||
|
DelAddRoaringBitmap::new_add_u32(n),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct SpillingCaches<'extractor> {
|
||||||
|
caches: Vec<
|
||||||
|
HashMap<RefBytes<'extractor>, DelAddRoaringBitmap, DefaultHashBuilder, RefBump<'extractor>>,
|
||||||
|
>,
|
||||||
|
// TODO it must be a grenad Sorter with a DelAddCboRoaringBitmapCodec
|
||||||
|
spilled_entries: Vec<UnorderedEntries>,
|
||||||
|
deladd_buffer: Vec<u8>,
|
||||||
|
cbo_buffer: Vec<u8>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'extractor> SpillingCaches<'extractor> {
|
||||||
|
pub fn insert_del_u32(
|
||||||
|
&mut self,
|
||||||
|
hasher: &DefaultHashBuilder,
|
||||||
|
buckets: usize,
|
||||||
|
key: &[u8],
|
||||||
|
n: u32,
|
||||||
|
) -> io::Result<()> {
|
||||||
|
let hash = compute_bytes_hash(hasher, key);
|
||||||
|
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||||
|
match self.caches[bucket].raw_entry_mut().from_hash(hash, |k| k.as_ref() == key) {
|
||||||
|
RawEntryMut::Occupied(mut entry) => {
|
||||||
|
entry.get_mut().del.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
RawEntryMut::Vacant(_entry) => {
|
||||||
|
let deladd = DelAddRoaringBitmap::new_del_u32(n);
|
||||||
|
spill_entry_to_disk(
|
||||||
|
&mut self.spilled_entries[bucket],
|
||||||
|
&mut self.deladd_buffer,
|
||||||
|
&mut self.cbo_buffer,
|
||||||
|
key,
|
||||||
|
deladd,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn spill_to_disk(self) -> io::Result<SpilledCache> {
|
pub fn insert_add_u32(
|
||||||
let Self { cache, alloc: _, mut spilled_entries, mut deladd_buffer, mut cbo_buffer } = self;
|
&mut self,
|
||||||
|
hasher: &DefaultHashBuilder,
|
||||||
for (key, deladd) in cache {
|
buckets: usize,
|
||||||
spill_entry_to_disk(
|
key: &[u8],
|
||||||
&mut spilled_entries,
|
n: u32,
|
||||||
&mut deladd_buffer,
|
) -> io::Result<()> {
|
||||||
&mut cbo_buffer,
|
let hash = compute_bytes_hash(hasher, key);
|
||||||
&key,
|
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||||
deladd,
|
match self.caches[bucket].raw_entry_mut().from_hash(hash, |k| k.as_ref() == key) {
|
||||||
)?;
|
RawEntryMut::Occupied(mut entry) => {
|
||||||
|
entry.get_mut().add.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
RawEntryMut::Vacant(_entry) => {
|
||||||
|
let deladd = DelAddRoaringBitmap::new_add_u32(n);
|
||||||
|
spill_entry_to_disk(
|
||||||
|
&mut self.spilled_entries[bucket],
|
||||||
|
&mut self.deladd_buffer,
|
||||||
|
&mut self.cbo_buffer,
|
||||||
|
key,
|
||||||
|
deladd,
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(SpilledCache { spilled_entries, deladd_buffer, cbo_buffer })
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TODO Do not spill to disk if not necessary
|
fn compute_bytes_hash<S: BuildHasher>(hash_builder: &S, key: &[u8]) -> u64 {
|
||||||
pub fn into_unordered_entries(self) -> io::Result<UnorderedEntriesIntoIter> {
|
use std::hash::{Hash, Hasher};
|
||||||
let Self { cache, alloc: _, mut spilled_entries, mut cbo_buffer, mut deladd_buffer } = self;
|
let mut state = hash_builder.build_hasher();
|
||||||
|
key.hash(&mut state);
|
||||||
|
state.finish()
|
||||||
|
}
|
||||||
|
|
||||||
for (key, deladd) in cache {
|
fn compute_bucket_from_hash(buckets: usize, hash: u64) -> usize {
|
||||||
spill_entry_to_disk(
|
hash as usize % buckets
|
||||||
&mut spilled_entries,
|
|
||||||
&mut deladd_buffer,
|
|
||||||
&mut cbo_buffer,
|
|
||||||
&key,
|
|
||||||
deladd,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
spilled_entries.into_iter_bitmap()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn spill_entry_to_disk(
|
fn spill_entry_to_disk(
|
||||||
@ -182,27 +294,6 @@ fn spill_entry_to_disk(
|
|||||||
spilled_entries.push(key, bytes)
|
spilled_entries.push(key, bytes)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct SpilledCache {
|
|
||||||
spilled_entries: UnorderedEntries,
|
|
||||||
deladd_buffer: Vec<u8>,
|
|
||||||
cbo_buffer: Vec<u8>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SpilledCache {
|
|
||||||
pub fn reconstruct(self, alloc: RefBump<'_>) -> CboCachedSorter<'_> {
|
|
||||||
let SpilledCache { spilled_entries, deladd_buffer, cbo_buffer } = self;
|
|
||||||
CboCachedSorter {
|
|
||||||
cache: hashbrown::HashMap::new_in(RefBump::clone(&alloc)),
|
|
||||||
alloc,
|
|
||||||
spilled_entries,
|
|
||||||
deladd_buffer,
|
|
||||||
cbo_buffer,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
unsafe impl<'extractor> MostlySend for CboCachedSorter<'extractor> {}
|
|
||||||
|
|
||||||
pub struct UnorderedEntries {
|
pub struct UnorderedEntries {
|
||||||
entry_sizes: Vec<(u32, u32)>,
|
entry_sizes: Vec<(u32, u32)>,
|
||||||
file: BufWriter<File>,
|
file: BufWriter<File>,
|
||||||
@ -291,40 +382,6 @@ impl UnorderedEntriesIntoIter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default, Debug)]
|
|
||||||
struct Stats {
|
|
||||||
pub len: usize,
|
|
||||||
pub average: f32,
|
|
||||||
pub mean: u32,
|
|
||||||
pub min: u32,
|
|
||||||
pub max: u32,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Stats {
|
|
||||||
fn from_slice(slice: &mut [u32]) -> Stats {
|
|
||||||
slice.sort_unstable();
|
|
||||||
Self::from_sorted_slice(slice)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn from_slice_p99(slice: &mut [u32]) -> Stats {
|
|
||||||
slice.sort_unstable();
|
|
||||||
let new_len = slice.len() - (slice.len() as f32 / 100.0) as usize;
|
|
||||||
match slice.get(..new_len) {
|
|
||||||
Some(slice) => Self::from_sorted_slice(slice),
|
|
||||||
None => Stats::default(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn from_sorted_slice(slice: &[u32]) -> Stats {
|
|
||||||
let sum: f64 = slice.iter().map(|i| *i as f64).sum();
|
|
||||||
let average = (sum / slice.len() as f64) as f32;
|
|
||||||
let mean = *slice.len().checked_div(2).and_then(|middle| slice.get(middle)).unwrap_or(&0);
|
|
||||||
let min = *slice.first().unwrap_or(&0);
|
|
||||||
let max = *slice.last().unwrap_or(&0);
|
|
||||||
Stats { len: slice.len(), average, mean, min, max }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct DelAddRoaringBitmap {
|
pub struct DelAddRoaringBitmap {
|
||||||
pub(crate) del: Option<RoaringBitmap>,
|
pub(crate) del: Option<RoaringBitmap>,
|
||||||
|
@ -221,7 +221,7 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
|
|||||||
type Data = RefCell<Option<WordDocidsCachedSorters<'extractor>>>;
|
type Data = RefCell<Option<WordDocidsCachedSorters<'extractor>>>;
|
||||||
|
|
||||||
fn init_data(&self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data> {
|
fn init_data(&self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data> {
|
||||||
Ok(RefCell::new(Some(WordDocidsCachedSorters::new_in(extractor_alloc))))
|
Ok(RefCell::new(Some(WordDocidsCachedSorters::new_in(extractor_alloc)?)))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process(
|
fn process(
|
||||||
|
Loading…
Reference in New Issue
Block a user