mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-10-30 01:38:49 +01:00
Make progress on the new cache system
This commit is contained in:
parent
f18fed9e32
commit
3a76ccb6e1
@ -1,55 +1,68 @@
|
||||
use std::cell::RefCell;
|
||||
use std::fmt::Write as _;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter, Read as _, Write as _};
|
||||
use std::io::{self, BufReader, BufWriter, Read as _, Seek, Write as _};
|
||||
use std::vec;
|
||||
|
||||
use bumpalo::Bump;
|
||||
use grenad::{MergeFunction, Sorter};
|
||||
use hashbrown::hash_map::RawEntryMut;
|
||||
use raw_collections::alloc::{RefBump, RefBytes};
|
||||
use roaring::bitmap::Statistics;
|
||||
use roaring::RoaringBitmap;
|
||||
use tempfile::tempfile;
|
||||
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::new::indexer::document_changes::MostlySend;
|
||||
use crate::CboRoaringBitmapCodec;
|
||||
|
||||
const KEY_SIZE: usize = 12;
|
||||
|
||||
// #[derive(Debug)]
|
||||
pub struct CboCachedSorter<'extractor, MF> {
|
||||
pub struct CboCachedSorter<'extractor> {
|
||||
cache: hashbrown::HashMap<
|
||||
// TODO check the size of it
|
||||
RefBytes<'extractor>,
|
||||
DelAddRoaringBitmap,
|
||||
hashbrown::DefaultHashBuilder,
|
||||
RefBump<'extractor>,
|
||||
>,
|
||||
alloc: RefBump<'extractor>,
|
||||
sorter: Sorter<MF>,
|
||||
spilled_entries: UnorderedEntries,
|
||||
deladd_buffer: Vec<u8>,
|
||||
cbo_buffer: Vec<u8>,
|
||||
total_insertions: usize,
|
||||
fitted_in_key: usize,
|
||||
}
|
||||
|
||||
impl<'extractor, MF> CboCachedSorter<'extractor, MF> {
|
||||
// # How the Merge Algorithm works
|
||||
//
|
||||
// - Collect all hashmaps to the main thread
|
||||
// - Iterate over all the hashmaps in the different threads
|
||||
// - Each thread must take care of its own keys (regarding a hash number)
|
||||
// - Also read the spilled content which is inside
|
||||
// - Each thread must populate a local hashmap with the entries
|
||||
// - Every thread sends the merged content to the main writing thread
|
||||
//
|
||||
// ## Next Step
|
||||
//
|
||||
// - Define the size of the buckets in advance to make sure everything fits in memory.
|
||||
// ```
|
||||
// let total_buckets = 32;
|
||||
// (0..total_buckets).par_iter().for_each(|n| {
|
||||
// let hash = todo!();
|
||||
// if hash % total_buckets == n {
|
||||
// // take care of this key
|
||||
// }
|
||||
// });
|
||||
// ```
|
||||
|
||||
impl<'extractor> CboCachedSorter<'extractor> {
|
||||
/// TODO may add the capacity
|
||||
pub fn new_in(sorter: Sorter<MF>, alloc: RefBump<'extractor>) -> Self {
|
||||
CboCachedSorter {
|
||||
pub fn new_in(alloc: RefBump<'extractor>) -> io::Result<Self> {
|
||||
Ok(CboCachedSorter {
|
||||
cache: hashbrown::HashMap::new_in(RefBump::clone(&alloc)),
|
||||
alloc,
|
||||
sorter,
|
||||
spilled_entries: tempfile().map(UnorderedEntries::new)?,
|
||||
deladd_buffer: Vec::new(),
|
||||
cbo_buffer: Vec::new(),
|
||||
total_insertions: 0,
|
||||
fitted_in_key: 0,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'extractor, MF: MergeFunction> CboCachedSorter<'extractor, MF> {
|
||||
impl<'extractor> CboCachedSorter<'extractor> {
|
||||
pub fn insert_del_u32(&mut self, key: &[u8], n: u32) {
|
||||
match self.cache.raw_entry_mut().from_key(key) {
|
||||
RawEntryMut::Occupied(mut entry) => {
|
||||
@ -57,8 +70,6 @@ impl<'extractor, MF: MergeFunction> CboCachedSorter<'extractor, MF> {
|
||||
del.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||
}
|
||||
RawEntryMut::Vacant(entry) => {
|
||||
self.total_insertions += 1;
|
||||
self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
|
||||
let alloc = RefBump::clone(&self.alloc);
|
||||
let key = RefBump::map(alloc, |b| b.alloc_slice_copy(key));
|
||||
entry.insert(RefBytes(key), DelAddRoaringBitmap::new_del_u32(n));
|
||||
@ -73,8 +84,6 @@ impl<'extractor, MF: MergeFunction> CboCachedSorter<'extractor, MF> {
|
||||
add.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||
}
|
||||
RawEntryMut::Vacant(entry) => {
|
||||
self.total_insertions += 1;
|
||||
self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
|
||||
let alloc = RefBump::clone(&self.alloc);
|
||||
let key = RefBump::map(alloc, |b| b.alloc_slice_copy(key));
|
||||
entry.insert(RefBytes(key), DelAddRoaringBitmap::new_add_u32(n));
|
||||
@ -82,164 +91,104 @@ impl<'extractor, MF: MergeFunction> CboCachedSorter<'extractor, MF> {
|
||||
}
|
||||
}
|
||||
|
||||
fn write_entry<A: AsRef<[u8]>>(
|
||||
sorter: &mut Sorter<MF>,
|
||||
deladd_buffer: &mut Vec<u8>,
|
||||
cbo_buffer: &mut Vec<u8>,
|
||||
key: A,
|
||||
deladd: DelAddRoaringBitmap,
|
||||
) -> grenad::Result<(), MF::Error> {
|
||||
deladd_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(deladd_buffer);
|
||||
match deladd {
|
||||
DelAddRoaringBitmap { del: Some(del), add: None } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: None, add: Some(add) } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: Some(del), add: Some(add) } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
|
||||
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: None, add: None } => return Ok(()),
|
||||
}
|
||||
let bytes = value_writer.into_inner().unwrap();
|
||||
sorter.insert(key, bytes)
|
||||
}
|
||||
|
||||
pub fn spill_to_disk(self) -> grenad::Result<SpilledCache<MF>, MF::Error> {
|
||||
let Self {
|
||||
cache,
|
||||
alloc: _,
|
||||
mut sorter,
|
||||
mut deladd_buffer,
|
||||
mut cbo_buffer,
|
||||
total_insertions,
|
||||
fitted_in_key,
|
||||
} = self;
|
||||
pub fn spill_to_disk(self) -> io::Result<SpilledCache> {
|
||||
let Self { cache, alloc: _, mut spilled_entries, mut deladd_buffer, mut cbo_buffer } = self;
|
||||
|
||||
for (key, deladd) in cache {
|
||||
Self::write_entry(&mut sorter, &mut deladd_buffer, &mut cbo_buffer, key, deladd)?;
|
||||
spill_entry_to_disk(
|
||||
&mut spilled_entries,
|
||||
&mut deladd_buffer,
|
||||
&mut cbo_buffer,
|
||||
&key,
|
||||
deladd,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(SpilledCache { sorter, deladd_buffer, cbo_buffer, total_insertions, fitted_in_key })
|
||||
Ok(SpilledCache { spilled_entries, deladd_buffer, cbo_buffer })
|
||||
}
|
||||
|
||||
pub fn into_sorter(self) -> grenad::Result<Sorter<MF>, MF::Error> {
|
||||
let Self {
|
||||
cache,
|
||||
alloc: _,
|
||||
mut sorter,
|
||||
mut deladd_buffer,
|
||||
mut cbo_buffer,
|
||||
total_insertions,
|
||||
fitted_in_key,
|
||||
} = self;
|
||||
|
||||
let mut all_n_containers = Vec::new();
|
||||
let mut all_n_array_containers = Vec::new();
|
||||
let mut all_n_bitset_containers = Vec::new();
|
||||
let mut all_n_values_array_containers = Vec::new();
|
||||
let mut all_n_values_bitset_containers = Vec::new();
|
||||
let mut all_cardinality = Vec::new();
|
||||
|
||||
for (_key, deladd) in &cache {
|
||||
for bitmap in [&deladd.del, &deladd.add].into_iter().flatten() {
|
||||
let Statistics {
|
||||
n_containers,
|
||||
n_array_containers,
|
||||
n_bitset_containers,
|
||||
n_values_array_containers,
|
||||
n_values_bitset_containers,
|
||||
cardinality,
|
||||
..
|
||||
} = bitmap.statistics();
|
||||
all_n_containers.push(n_containers);
|
||||
all_n_array_containers.push(n_array_containers);
|
||||
all_n_bitset_containers.push(n_bitset_containers);
|
||||
all_n_values_array_containers.push(n_values_array_containers);
|
||||
all_n_values_bitset_containers.push(n_values_bitset_containers);
|
||||
all_cardinality.push(cardinality as u32);
|
||||
}
|
||||
}
|
||||
// TODO Do not spill to disk if not necessary
|
||||
pub fn into_unordered_entries(self) -> io::Result<UnorderedEntriesIntoIter> {
|
||||
let Self { cache, alloc: _, mut spilled_entries, mut cbo_buffer, mut deladd_buffer } = self;
|
||||
|
||||
for (key, deladd) in cache {
|
||||
Self::write_entry(&mut sorter, &mut deladd_buffer, &mut cbo_buffer, key, deladd)?;
|
||||
spill_entry_to_disk(
|
||||
&mut spilled_entries,
|
||||
&mut deladd_buffer,
|
||||
&mut cbo_buffer,
|
||||
&key,
|
||||
deladd,
|
||||
)?;
|
||||
}
|
||||
|
||||
let mut output = String::new();
|
||||
|
||||
for (name, mut slice) in [
|
||||
("n_containers", all_n_containers),
|
||||
("n_array_containers", all_n_array_containers),
|
||||
("n_bitset_containers", all_n_bitset_containers),
|
||||
("n_values_array_containers", all_n_values_array_containers),
|
||||
("n_values_bitset_containers", all_n_values_bitset_containers),
|
||||
("cardinality", all_cardinality),
|
||||
] {
|
||||
let _ = writeln!(&mut output, "{name} (p100) {:?}", Stats::from_slice(&mut slice));
|
||||
// let _ = writeln!(&mut output, "{name} (p99) {:?}", Stats::from_slice_p99(&mut slice));
|
||||
}
|
||||
|
||||
let _ = writeln!(
|
||||
&mut output,
|
||||
"LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions",
|
||||
fitted_in_key,
|
||||
(fitted_in_key as f32 / total_insertions as f32) * 100.0,
|
||||
total_insertions,
|
||||
);
|
||||
|
||||
eprintln!("{output}");
|
||||
|
||||
Ok(sorter)
|
||||
spilled_entries.into_iter_bitmap()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SpilledCache<MF> {
|
||||
sorter: Sorter<MF>,
|
||||
fn spill_entry_to_disk(
|
||||
spilled_entries: &mut UnorderedEntries,
|
||||
deladd_buffer: &mut Vec<u8>,
|
||||
cbo_buffer: &mut Vec<u8>,
|
||||
key: &[u8],
|
||||
deladd: DelAddRoaringBitmap,
|
||||
) -> io::Result<()> {
|
||||
deladd_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(deladd_buffer);
|
||||
match deladd {
|
||||
DelAddRoaringBitmap { del: Some(del), add: None } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: None, add: Some(add) } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: Some(del), add: Some(add) } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
|
||||
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: None, add: None } => return Ok(()),
|
||||
}
|
||||
let bytes = value_writer.into_inner().unwrap();
|
||||
spilled_entries.push(key, bytes)
|
||||
}
|
||||
|
||||
pub struct SpilledCache {
|
||||
spilled_entries: UnorderedEntries,
|
||||
deladd_buffer: Vec<u8>,
|
||||
cbo_buffer: Vec<u8>,
|
||||
total_insertions: usize,
|
||||
fitted_in_key: usize,
|
||||
}
|
||||
|
||||
impl<MF> SpilledCache<MF> {
|
||||
pub fn reconstruct(self, alloc: RefBump<'_>) -> CboCachedSorter<'_, MF> {
|
||||
let SpilledCache { sorter, deladd_buffer, cbo_buffer, total_insertions, fitted_in_key } =
|
||||
self;
|
||||
|
||||
impl SpilledCache {
|
||||
pub fn reconstruct(self, alloc: RefBump<'_>) -> CboCachedSorter<'_> {
|
||||
let SpilledCache { spilled_entries, deladd_buffer, cbo_buffer } = self;
|
||||
CboCachedSorter {
|
||||
cache: hashbrown::HashMap::new_in(RefBump::clone(&alloc)),
|
||||
alloc,
|
||||
sorter,
|
||||
spilled_entries,
|
||||
deladd_buffer,
|
||||
cbo_buffer,
|
||||
total_insertions,
|
||||
fitted_in_key,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl<'extractor, MF: Send> MostlySend for CboCachedSorter<'extractor, MF> {}
|
||||
unsafe impl<'extractor> MostlySend for CboCachedSorter<'extractor> {}
|
||||
|
||||
pub struct UnorderedEntries {
|
||||
entry_offsets: Vec<(u32, u32)>,
|
||||
entry_sizes: Vec<(u32, u32)>,
|
||||
file: BufWriter<File>,
|
||||
}
|
||||
|
||||
impl UnorderedEntries {
|
||||
pub fn new(file: File) -> Self {
|
||||
UnorderedEntries { entry_offsets: Vec::new(), file: BufWriter::new(file) }
|
||||
fn new(file: File) -> Self {
|
||||
UnorderedEntries { entry_sizes: Vec::new(), file: BufWriter::new(file) }
|
||||
}
|
||||
|
||||
/// Pushes a new tuple of key/value into a file.
|
||||
@ -249,33 +198,41 @@ impl UnorderedEntries {
|
||||
/// # Panics
|
||||
///
|
||||
/// - Panics if the key or value length does not fit in a `u32` (2^32 bytes or more).
|
||||
pub fn push(&mut self, key: &[u8], value: &[u8]) -> io::Result<()> {
|
||||
fn push(&mut self, key: &[u8], value: &[u8]) -> io::Result<()> {
|
||||
let key_len = key.len().try_into().unwrap();
|
||||
let value_len = key.len().try_into().unwrap();
|
||||
let value_len = value.len().try_into().unwrap();
|
||||
|
||||
self.file.write_all(key)?;
|
||||
self.file.write_all(value)?;
|
||||
|
||||
self.entry_offsets.push((key_len, value_len));
|
||||
self.entry_sizes.push((key_len, value_len));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn into_iter_ref(self) -> IntoIterRef {
|
||||
let Self { entry_offsets, file } = self;
|
||||
IntoIterRef { entry_offsets: entry_offsets.into_iter(), file, buffer: Vec::new() }
|
||||
fn into_iter_bitmap(self) -> io::Result<UnorderedEntriesIntoIter> {
|
||||
let Self { entry_sizes, file } = self;
|
||||
|
||||
let mut file = file.into_inner().map_err(|e| e.into_error())?;
|
||||
file.rewind()?;
|
||||
|
||||
Ok(UnorderedEntriesIntoIter {
|
||||
entry_sizes: entry_sizes.into_iter(),
|
||||
file: BufReader::new(file),
|
||||
buffer: Vec::new(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct IntoIterRef {
|
||||
entry_offsets: vec::IntoIter<(u32, u32)>,
|
||||
pub struct UnorderedEntriesIntoIter {
|
||||
entry_sizes: vec::IntoIter<(u32, u32)>,
|
||||
file: BufReader<File>,
|
||||
buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
impl IntoIterRef {
|
||||
pub fn next(&mut self) -> io::Result<Option<(&[u8], &[u8])>> {
|
||||
match self.entry_offsets.next() {
|
||||
impl UnorderedEntriesIntoIter {
|
||||
fn next_ref(&mut self) -> io::Result<Option<(&[u8], &[u8])>> {
|
||||
match self.entry_sizes.next() {
|
||||
Some((key_len, value_len)) => {
|
||||
let key_len = key_len as usize;
|
||||
let value_len = value_len as usize;
|
||||
@ -287,10 +244,25 @@ impl IntoIterRef {
|
||||
self.file.read_exact(buffer)?;
|
||||
let buffer = &self.buffer[..total_len];
|
||||
|
||||
let (key, value) = buffer.split_at(key_len);
|
||||
debug_assert_eq!(value.len(), value_len);
|
||||
Ok(Some(buffer.split_at(key_len)))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Some((key, value)))
|
||||
pub fn next_deladd_bitmap(&mut self) -> io::Result<Option<(&[u8], DelAddRoaringBitmap)>> {
|
||||
match self.next_ref()? {
|
||||
Some((key, value_bytes)) => {
|
||||
let reader = KvReaderDelAdd::from_slice(value_bytes);
|
||||
let del = match reader.get(DelAdd::Deletion) {
|
||||
Some(del_bytes) => Some(CboRoaringBitmapCodec::deserialize_from(del_bytes)?),
|
||||
None => None,
|
||||
};
|
||||
let add = match reader.get(DelAdd::Addition) {
|
||||
Some(add_bytes) => Some(CboRoaringBitmapCodec::deserialize_from(add_bytes)?),
|
||||
None => None,
|
||||
};
|
||||
Ok(Some((key, DelAddRoaringBitmap { del, add })))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
|
@ -1,14 +1,12 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::Debug;
|
||||
use std::fs::File;
|
||||
use std::ops::DerefMut as _;
|
||||
|
||||
use bumpalo::Bump;
|
||||
use grenad::{MergeFunction, Merger};
|
||||
use grenad::Merger;
|
||||
use heed::RoTxn;
|
||||
use raw_collections::alloc::RefBump;
|
||||
use rayon::iter::{ParallelBridge as _, ParallelIterator as _};
|
||||
use serde_json::Value;
|
||||
|
||||
use super::super::cache::CboCachedSorter;
|
||||
@ -21,7 +19,7 @@ use crate::update::new::indexer::document_changes::{
|
||||
IndexingContext, RefCellExt, ThreadLocal,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
pub struct FacetedExtractorData<'extractor> {
|
||||
@ -31,24 +29,10 @@ pub struct FacetedExtractorData<'extractor> {
|
||||
}
|
||||
|
||||
impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> {
|
||||
type Data = RefCell<CboCachedSorter<'extractor, MergeDeladdCboRoaringBitmaps>>;
|
||||
type Data = RefCell<CboCachedSorter<'extractor>>;
|
||||
|
||||
fn init_data(&self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(CboCachedSorter::new_in(
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
self.grenad_parameters.chunk_compression_type,
|
||||
self.grenad_parameters.chunk_compression_level,
|
||||
self.grenad_parameters.max_nb_chunks,
|
||||
self.max_memory,
|
||||
// *NOTE*: this must not be set to true:
|
||||
// 1. we're already using max parallelism in the pool, so it wouldn't help
|
||||
// 2. it creates correctness issues if it causes to yield a borrow-mut wielding task
|
||||
false,
|
||||
),
|
||||
extractor_alloc,
|
||||
)))
|
||||
Ok(RefCell::new(CboCachedSorter::new_in(extractor_alloc)?))
|
||||
}
|
||||
|
||||
fn process(
|
||||
@ -64,7 +48,7 @@ pub struct FacetedDocidsExtractor;
|
||||
|
||||
impl FacetedDocidsExtractor {
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
|
||||
context: &DocumentChangeContext<RefCell<CboCachedSorter>>,
|
||||
attributes_to_extract: &[&str],
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
@ -143,10 +127,10 @@ impl FacetedDocidsExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
fn facet_fn_with_options<'extractor, MF>(
|
||||
fn facet_fn_with_options<'extractor>(
|
||||
doc_alloc: &Bump,
|
||||
cached_sorter: &mut CboCachedSorter<'extractor, MF>,
|
||||
cache_fn: impl Fn(&mut CboCachedSorter<'extractor, MF>, &[u8], u32),
|
||||
cached_sorter: &mut CboCachedSorter<'extractor>,
|
||||
cache_fn: impl Fn(&mut CboCachedSorter<'extractor>, &[u8], u32),
|
||||
docid: DocumentId,
|
||||
fid: FieldId,
|
||||
value: &Value,
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::num::NonZero;
|
||||
use std::io;
|
||||
use std::ops::DerefMut as _;
|
||||
|
||||
use bumpalo::Bump;
|
||||
@ -17,17 +17,16 @@ use crate::update::new::indexer::document_changes::{
|
||||
IndexingContext, MostlySend, RefCellExt, ThreadLocal,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
pub struct WordDocidsCachedSorters<'indexer> {
|
||||
word_fid_docids: CboCachedSorter<'indexer, MergeDeladdCboRoaringBitmaps>,
|
||||
word_docids: CboCachedSorter<'indexer, MergeDeladdCboRoaringBitmaps>,
|
||||
exact_word_docids: CboCachedSorter<'indexer, MergeDeladdCboRoaringBitmaps>,
|
||||
word_position_docids: CboCachedSorter<'indexer, MergeDeladdCboRoaringBitmaps>,
|
||||
fid_word_count_docids: CboCachedSorter<'indexer, MergeDeladdCboRoaringBitmaps>,
|
||||
word_fid_docids: CboCachedSorter<'indexer>,
|
||||
word_docids: CboCachedSorter<'indexer>,
|
||||
exact_word_docids: CboCachedSorter<'indexer>,
|
||||
word_position_docids: CboCachedSorter<'indexer>,
|
||||
fid_word_count_docids: CboCachedSorter<'indexer>,
|
||||
fid_word_count: HashMap<FieldId, (usize, usize)>,
|
||||
current_docid: Option<DocumentId>,
|
||||
}
|
||||
@ -35,83 +34,16 @@ pub struct WordDocidsCachedSorters<'indexer> {
|
||||
unsafe impl<'indexer> MostlySend for WordDocidsCachedSorters<'indexer> {}
|
||||
|
||||
impl<'indexer> WordDocidsCachedSorters<'indexer> {
|
||||
pub fn new_in(
|
||||
indexer: GrenadParameters,
|
||||
max_memory: Option<usize>,
|
||||
alloc: RefBump<'indexer>,
|
||||
) -> Self {
|
||||
let max_memory = max_memory.map(|max_memory| max_memory / 4);
|
||||
|
||||
let word_fid_docids = CboCachedSorter::new_in(
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
false,
|
||||
),
|
||||
RefBump::clone(&alloc),
|
||||
);
|
||||
let word_docids = CboCachedSorter::new_in(
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
false,
|
||||
),
|
||||
RefBump::clone(&alloc),
|
||||
);
|
||||
let exact_word_docids = CboCachedSorter::new_in(
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
false,
|
||||
),
|
||||
RefBump::clone(&alloc),
|
||||
);
|
||||
let word_position_docids = CboCachedSorter::new_in(
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
false,
|
||||
),
|
||||
RefBump::clone(&alloc),
|
||||
);
|
||||
let fid_word_count_docids = CboCachedSorter::new_in(
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
false,
|
||||
),
|
||||
alloc,
|
||||
);
|
||||
|
||||
Self {
|
||||
word_fid_docids,
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
fid_word_count_docids,
|
||||
pub fn new_in(alloc: RefBump<'indexer>) -> io::Result<Self> {
|
||||
Ok(Self {
|
||||
word_fid_docids: CboCachedSorter::new_in(RefBump::clone(&alloc))?,
|
||||
word_docids: CboCachedSorter::new_in(RefBump::clone(&alloc))?,
|
||||
exact_word_docids: CboCachedSorter::new_in(RefBump::clone(&alloc))?,
|
||||
word_position_docids: CboCachedSorter::new_in(RefBump::clone(&alloc))?,
|
||||
fid_word_count_docids: CboCachedSorter::new_in(alloc)?,
|
||||
fid_word_count: HashMap::new(),
|
||||
current_docid: None,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn insert_add_u32(
|
||||
@ -253,21 +185,17 @@ impl WordDocidsMergerBuilders {
|
||||
current_docid: _,
|
||||
} = other;
|
||||
|
||||
let word_fid_docids_readers =
|
||||
word_fid_docids.into_sorter().and_then(|s| s.into_reader_cursors());
|
||||
let word_docids_readers = word_docids.into_sorter().and_then(|s| s.into_reader_cursors());
|
||||
let exact_word_docids_readers =
|
||||
exact_word_docids.into_sorter().and_then(|s| s.into_reader_cursors());
|
||||
let word_position_docids_readers =
|
||||
word_position_docids.into_sorter().and_then(|s| s.into_reader_cursors());
|
||||
let fid_word_count_docids_readers =
|
||||
fid_word_count_docids.into_sorter().and_then(|s| s.into_reader_cursors());
|
||||
let word_fid_docids_entries = word_fid_docids.into_unordered_entries()?;
|
||||
let word_docids_entries = word_docids.into_unordered_entries()?;
|
||||
let exact_word_docids_entries = exact_word_docids.into_unordered_entries()?;
|
||||
let word_position_docids_entries = word_position_docids.into_unordered_entries()?;
|
||||
let fid_word_count_docids_entries = fid_word_count_docids.into_unordered_entries()?;
|
||||
|
||||
self.word_fid_docids.extend(word_fid_docids_readers?);
|
||||
self.word_docids.extend(word_docids_readers?);
|
||||
self.exact_word_docids.extend(exact_word_docids_readers?);
|
||||
self.word_position_docids.extend(word_position_docids_readers?);
|
||||
self.fid_word_count_docids.extend(fid_word_count_docids_readers?);
|
||||
self.word_fid_docids.push(word_fid_docids_entries);
|
||||
self.word_docids.push(word_docids_entries);
|
||||
self.exact_word_docids.push(exact_word_docids_entries);
|
||||
self.word_position_docids.push(word_position_docids_entries);
|
||||
self.fid_word_count_docids.push(fid_word_count_docids_entries);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@ -293,11 +221,7 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
|
||||
type Data = RefCell<Option<WordDocidsCachedSorters<'extractor>>>;
|
||||
|
||||
fn init_data(&self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(Some(WordDocidsCachedSorters::new_in(
|
||||
self.grenad_parameters,
|
||||
self.max_memory,
|
||||
extractor_alloc,
|
||||
))))
|
||||
Ok(RefCell::new(Some(WordDocidsCachedSorters::new_in(extractor_alloc))))
|
||||
}
|
||||
|
||||
fn process(
|
||||
@ -357,7 +281,6 @@ pub struct WordDocidsExtractors;
|
||||
|
||||
impl WordDocidsExtractors {
|
||||
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
||||
extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
|
||||
|
@ -31,7 +31,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
// and to store the docids of the documents that have a number of words in a given field
|
||||
// equal to or under MAX_COUNTED_WORDS.
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
|
||||
context: &DocumentChangeContext<RefCell<CboCachedSorter>>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
|
@ -12,7 +12,6 @@ pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
|
||||
use grenad::Merger;
|
||||
use heed::RoTxn;
|
||||
use raw_collections::alloc::RefBump;
|
||||
use rayon::iter::{ParallelBridge, ParallelIterator};
|
||||
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||
|
||||
use super::cache::CboCachedSorter;
|
||||
@ -22,7 +21,7 @@ use crate::update::new::indexer::document_changes::{
|
||||
IndexingContext, ThreadLocal,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
|
||||
pub struct SearchableExtractorData<'extractor, EX: SearchableExtractor> {
|
||||
@ -35,21 +34,10 @@ pub struct SearchableExtractorData<'extractor, EX: SearchableExtractor> {
|
||||
impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
|
||||
for SearchableExtractorData<'extractor, EX>
|
||||
{
|
||||
type Data = RefCell<CboCachedSorter<'extractor, MergeDeladdCboRoaringBitmaps>>;
|
||||
type Data = RefCell<CboCachedSorter<'extractor>>;
|
||||
|
||||
fn init_data(&self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(CboCachedSorter::new_in(
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
self.grenad_parameters.chunk_compression_type,
|
||||
self.grenad_parameters.chunk_compression_level,
|
||||
self.grenad_parameters.max_nb_chunks,
|
||||
self.max_memory,
|
||||
false,
|
||||
),
|
||||
extractor_alloc,
|
||||
)))
|
||||
Ok(RefCell::new(CboCachedSorter::new_in(extractor_alloc)?))
|
||||
}
|
||||
|
||||
fn process(
|
||||
|
Loading…
Reference in New Issue
Block a user