Fix more errors around the cache

This commit is contained in:
Clément Renault 2024-10-16 15:57:06 +02:00
parent 05a015b27c
commit 495742e113
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
4 changed files with 79 additions and 92 deletions

View File

@ -9,6 +9,7 @@ use roaring::bitmap::Statistics;
use roaring::RoaringBitmap;
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::new::indexer::document_changes::MostlySend;
use crate::CboRoaringBitmapCodec;
const KEY_SIZE: usize = 12;
@ -273,6 +274,8 @@ impl<MF> SpilledCache<MF> {
}
}
// SAFETY: NOTE(review): no justification is written for this impl. The only bound
// visible here is `MF: Send`; confirm that everything else stored in
// `CboCachedSorter` upholds the `MostlySend` contract and state the invariant here.
unsafe impl<'extractor, MF: Send> MostlySend for CboCachedSorter<'extractor, MF> {}
#[derive(Default, Debug)]
struct Stats {
pub len: usize,

View File

@ -7,6 +7,7 @@ use std::ops::DerefMut as _;
use bumpalo::Bump;
use grenad::{MergeFunction, Merger};
use heed::RoTxn;
use raw_collections::alloc::RefBump;
use rayon::iter::{ParallelBridge as _, ParallelIterator as _};
use serde_json::Value;
@ -30,15 +31,10 @@ pub struct FacetedExtractorData<'extractor> {
}
impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> {
type Data = FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>;
type Data = RefCell<CboCachedSorter<'extractor, MergeDeladdCboRoaringBitmaps>>;
fn init_data(
&self,
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
) -> Result<Self::Data> {
Ok(FullySend(RefCell::new(CboCachedSorter::new(
// TODO use a better value
1_000_000.try_into().unwrap(),
fn init_data(&self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data> {
Ok(RefCell::new(CboCachedSorter::new_in(
create_sorter(
grenad::SortAlgorithm::Stable,
MergeDeladdCboRoaringBitmaps,
@ -51,13 +47,14 @@ impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> {
// 2. it creates correctness issues if it causes to yield a borrow-mut wielding task
false,
),
))))
extractor_alloc,
)))
}
/// Handles a single document change by forwarding it, together with this
/// extractor's `attributes_to_extract`, to
/// `FacetedDocidsExtractor::extract_document_change`.
///
/// Returns whatever that call returns.
// NOTE(review): this text is a rendered diff. The two `context` lines below are the
// same parameter, first with its fully qualified path and then with the imported name.
fn process(
&self,
change: DocumentChange,
context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
context: &DocumentChangeContext<Self::Data>,
) -> Result<()> {
FacetedDocidsExtractor::extract_document_change(context, self.attributes_to_extract, change)
}
@ -67,16 +64,14 @@ pub struct FacetedDocidsExtractor;
impl FacetedDocidsExtractor {
fn extract_document_change(
context: &DocumentChangeContext<
FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
>,
context: &DocumentChangeContext<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
attributes_to_extract: &[&str],
document_change: DocumentChange,
) -> Result<()> {
let index = &context.index;
let rtxn = &context.txn;
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
let mut cached_sorter = context.data.0.borrow_mut_or_yield();
let mut cached_sorter = context.data.borrow_mut_or_yield();
match document_change {
DocumentChange::Deletion(inner) => extract_document_facets(
attributes_to_extract,
@ -90,7 +85,8 @@ impl FacetedDocidsExtractor {
inner.docid(),
fid,
value,
)
);
Ok(())
},
),
DocumentChange::Update(inner) => {
@ -106,7 +102,8 @@ impl FacetedDocidsExtractor {
inner.docid(),
fid,
value,
)
);
Ok(())
},
)?;
@ -122,7 +119,8 @@ impl FacetedDocidsExtractor {
inner.docid(),
fid,
value,
)
);
Ok(())
},
)
}
@ -138,31 +136,27 @@ impl FacetedDocidsExtractor {
inner.docid(),
fid,
value,
)
);
Ok(())
},
),
}
}
fn facet_fn_with_options<MF>(
fn facet_fn_with_options<'extractor, MF>(
doc_alloc: &Bump,
cached_sorter: &mut CboCachedSorter<MF>,
cache_fn: impl Fn(&mut CboCachedSorter<MF>, &[u8], u32) -> grenad::Result<(), MF::Error>,
cached_sorter: &mut CboCachedSorter<'extractor, MF>,
cache_fn: impl Fn(&mut CboCachedSorter<'extractor, MF>, &[u8], u32),
docid: DocumentId,
fid: FieldId,
value: &Value,
) -> Result<()>
where
MF: MergeFunction,
MF::Error: Debug,
grenad::Error<MF::Error>: Into<crate::Error>,
{
) {
let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
// Exists
// key: fid
buffer.push(FacetKind::Exists as u8);
buffer.extend_from_slice(&fid.to_be_bytes());
cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)?;
cache_fn(cached_sorter, &buffer, docid);
match value {
// Number
@ -177,10 +171,7 @@ impl FacetedDocidsExtractor {
buffer.push(0); // level 0
buffer.extend_from_slice(&ordered);
buffer.extend_from_slice(&n.to_be_bytes());
cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)
} else {
Ok(())
cache_fn(cached_sorter, &buffer, docid);
}
}
// String
@ -193,7 +184,7 @@ impl FacetedDocidsExtractor {
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.push(0); // level 0
buffer.extend_from_slice(truncated.as_bytes());
cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)
cache_fn(cached_sorter, &buffer, docid);
}
// Null
// key: fid
@ -201,7 +192,7 @@ impl FacetedDocidsExtractor {
buffer.clear();
buffer.push(FacetKind::Null as u8);
buffer.extend_from_slice(&fid.to_be_bytes());
cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)
cache_fn(cached_sorter, &buffer, docid);
}
// Empty
// key: fid
@ -209,17 +200,17 @@ impl FacetedDocidsExtractor {
buffer.clear();
buffer.push(FacetKind::Empty as u8);
buffer.extend_from_slice(&fid.to_be_bytes());
cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)
cache_fn(cached_sorter, &buffer, docid);
}
Value::Object(o) if o.is_empty() => {
buffer.clear();
buffer.push(FacetKind::Empty as u8);
buffer.extend_from_slice(&fid.to_be_bytes());
cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)
cache_fn(cached_sorter, &buffer, docid);
}
// Otherwise, do nothing
/// TODO: What about Value::Bool?
_ => Ok(()),
_ => (),
}
}

View File

@ -14,7 +14,7 @@ use crate::update::new::extract::cache::CboCachedSorter;
use crate::update::new::extract::perm_json_p::contained_in;
use crate::update::new::indexer::document_changes::{
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
IndexingContext, RefCellExt, ThreadLocal,
IndexingContext, MostlySend, RefCellExt, ThreadLocal,
};
use crate::update::new::DocumentChange;
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
@ -22,26 +22,27 @@ use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_
const MAX_COUNTED_WORDS: usize = 30;
pub struct WordDocidsCachedSorters {
word_fid_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
word_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
exact_word_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
word_position_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
fid_word_count_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
/// Set of cached sorters filled by the word docids extraction, plus the
/// bookkeeping needed to maintain per-field word counts for the document
/// currently being processed.
pub struct WordDocidsCachedSorters<'indexer> {
/// Keyed by the word bytes, a `0` byte, then the big-endian field id.
word_fid_docids: CboCachedSorter<'indexer, MergeDeladdCboRoaringBitmaps>,
/// Keyed by the word bytes; used when the word is not flagged as exact.
word_docids: CboCachedSorter<'indexer, MergeDeladdCboRoaringBitmaps>,
/// Keyed by the word bytes; used when the word is flagged as exact.
exact_word_docids: CboCachedSorter<'indexer, MergeDeladdCboRoaringBitmaps>,
/// Keyed by the word bytes, a `0` byte, then the big-endian bucketed position.
word_position_docids: CboCachedSorter<'indexer, MergeDeladdCboRoaringBitmaps>,
/// Keyed by the big-endian field id followed by the word count as a single byte.
fid_word_count_docids: CboCachedSorter<'indexer, MergeDeladdCboRoaringBitmaps>,
/// Per-field `(current_count, new_count)` pair: deletions increment the first
/// counter, additions increment the second.
fid_word_count: HashMap<FieldId, (usize, usize)>,
/// Docid passed to the most recent insertion; a different docid triggers a
/// flush of `fid_word_count` before counting resumes.
current_docid: Option<DocumentId>,
}
impl WordDocidsCachedSorters {
pub fn new(
// SAFETY: NOTE(review): no justification is written for this impl. Confirm that
// every field of `WordDocidsCachedSorters` upholds the `MostlySend` contract and
// state the invariant here.
unsafe impl<'indexer> MostlySend for WordDocidsCachedSorters<'indexer> {}
impl<'indexer> WordDocidsCachedSorters<'indexer> {
pub fn new_in(
indexer: GrenadParameters,
max_memory: Option<usize>,
capacity: NonZero<usize>,
alloc: RefBump<'indexer>,
) -> Self {
let max_memory = max_memory.map(|max_memory| max_memory / 4);
let word_fid_docids = CboCachedSorter::new(
capacity,
let word_fid_docids = CboCachedSorter::new_in(
create_sorter(
grenad::SortAlgorithm::Stable,
MergeDeladdCboRoaringBitmaps,
@ -51,9 +52,9 @@ impl WordDocidsCachedSorters {
max_memory,
false,
),
RefBump::clone(&alloc),
);
let word_docids = CboCachedSorter::new(
capacity,
let word_docids = CboCachedSorter::new_in(
create_sorter(
grenad::SortAlgorithm::Stable,
MergeDeladdCboRoaringBitmaps,
@ -63,9 +64,9 @@ impl WordDocidsCachedSorters {
max_memory,
false,
),
RefBump::clone(&alloc),
);
let exact_word_docids = CboCachedSorter::new(
capacity,
let exact_word_docids = CboCachedSorter::new_in(
create_sorter(
grenad::SortAlgorithm::Stable,
MergeDeladdCboRoaringBitmaps,
@ -75,9 +76,9 @@ impl WordDocidsCachedSorters {
max_memory,
false,
),
RefBump::clone(&alloc),
);
let word_position_docids = CboCachedSorter::new(
capacity,
let word_position_docids = CboCachedSorter::new_in(
create_sorter(
grenad::SortAlgorithm::Stable,
MergeDeladdCboRoaringBitmaps,
@ -87,9 +88,9 @@ impl WordDocidsCachedSorters {
max_memory,
false,
),
RefBump::clone(&alloc),
);
let fid_word_count_docids = CboCachedSorter::new(
capacity,
let fid_word_count_docids = CboCachedSorter::new_in(
create_sorter(
grenad::SortAlgorithm::Stable,
MergeDeladdCboRoaringBitmaps,
@ -99,6 +100,7 @@ impl WordDocidsCachedSorters {
max_memory,
false,
),
alloc,
);
Self {
@ -120,29 +122,29 @@ impl WordDocidsCachedSorters {
exact: bool,
docid: u32,
buffer: &mut Vec<u8>,
) -> Result<()> {
) {
let key = word.as_bytes();
if exact {
self.exact_word_docids.insert_add_u32(key, docid)?;
self.exact_word_docids.insert_add_u32(key, docid);
} else {
self.word_docids.insert_add_u32(key, docid)?;
self.word_docids.insert_add_u32(key, docid);
}
buffer.clear();
buffer.extend_from_slice(word.as_bytes());
buffer.push(0);
buffer.extend_from_slice(&field_id.to_be_bytes());
self.word_fid_docids.insert_add_u32(buffer, docid)?;
self.word_fid_docids.insert_add_u32(buffer, docid);
let position = bucketed_position(position);
buffer.clear();
buffer.extend_from_slice(word.as_bytes());
buffer.push(0);
buffer.extend_from_slice(&position.to_be_bytes());
self.word_position_docids.insert_add_u32(buffer, docid)?;
self.word_position_docids.insert_add_u32(buffer, docid);
if self.current_docid.map_or(false, |id| docid != id) {
self.flush_fid_word_count(buffer)?;
self.flush_fid_word_count(buffer);
}
self.fid_word_count
@ -150,8 +152,6 @@ impl WordDocidsCachedSorters {
.and_modify(|(_current_count, new_count)| *new_count += 1)
.or_insert((0, 1));
self.current_docid = Some(docid);
Ok(())
}
fn insert_del_u32(
@ -162,61 +162,56 @@ impl WordDocidsCachedSorters {
exact: bool,
docid: u32,
buffer: &mut Vec<u8>,
) -> Result<()> {
) {
let key = word.as_bytes();
if exact {
self.exact_word_docids.insert_del_u32(key, docid)?;
self.exact_word_docids.insert_del_u32(key, docid);
} else {
self.word_docids.insert_del_u32(key, docid)?;
self.word_docids.insert_del_u32(key, docid);
}
buffer.clear();
buffer.extend_from_slice(word.as_bytes());
buffer.push(0);
buffer.extend_from_slice(&field_id.to_be_bytes());
self.word_fid_docids.insert_del_u32(buffer, docid)?;
self.word_fid_docids.insert_del_u32(buffer, docid);
let position = bucketed_position(position);
buffer.clear();
buffer.extend_from_slice(word.as_bytes());
buffer.push(0);
buffer.extend_from_slice(&position.to_be_bytes());
self.word_position_docids.insert_del_u32(buffer, docid)?;
self.word_position_docids.insert_del_u32(buffer, docid);
if self.current_docid.map_or(false, |id| docid != id) {
self.flush_fid_word_count(buffer)?;
self.flush_fid_word_count(buffer);
}
self.fid_word_count
.entry(field_id)
.and_modify(|(current_count, _new_count)| *current_count += 1)
.or_insert((1, 0));
self.current_docid = Some(docid);
Ok(())
self.current_docid = Some(docid);
}
/// Drains `fid_word_count` and, for every field whose two counters differ,
/// records the change in `fid_word_count_docids` for `self.current_docid`:
/// a deletion under the old count and an addition under the new count.
///
/// `buffer` is scratch space; it is cleared and refilled for each key, which
/// is the big-endian field id followed by the count as one byte.
///
/// # Panics
///
/// Unwraps `self.current_docid`, so it panics if that is `None` when an entry
/// has to be written.
// NOTE(review): this text is a rendered diff. The `-> Result<()>` signature, the
// `?`-terminated two-line calls and the trailing `Ok(())` are the pre-change lines;
// the infallible signature and the one-line calls are their replacements.
fn flush_fid_word_count(&mut self, buffer: &mut Vec<u8>) -> Result<()> {
fn flush_fid_word_count(&mut self, buffer: &mut Vec<u8>) {
for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
// Equal counters mean the field's word count is unchanged: nothing to write.
if current_count != new_count {
// Counts above MAX_COUNTED_WORDS are not recorded; the guard also keeps
// the `as u8` cast below lossless while that constant fits in a byte.
if current_count <= MAX_COUNTED_WORDS {
buffer.clear();
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.push(current_count as u8);
self.fid_word_count_docids
.insert_del_u32(buffer, self.current_docid.unwrap())?;
self.fid_word_count_docids.insert_del_u32(buffer, self.current_docid.unwrap());
}
if new_count <= MAX_COUNTED_WORDS {
buffer.clear();
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.push(new_count as u8);
self.fid_word_count_docids
.insert_add_u32(buffer, self.current_docid.unwrap())?;
self.fid_word_count_docids.insert_add_u32(buffer, self.current_docid.unwrap());
}
}
}
Ok(())
}
}
@ -312,24 +307,20 @@ pub struct WordDocidsExtractorData<'extractor> {
}
impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
type Data = FullySend<RefCell<WordDocidsCachedSorters>>;
type Data = RefCell<WordDocidsCachedSorters<'extractor>>;
/// Builds the extractor data: a `RefCell` around a fresh
/// `WordDocidsCachedSorters` created from this extractor's
/// `grenad_parameters` and `max_memory`.
// NOTE(review): this text is a rendered diff. The multi-line signature, the
// `FullySend(...)` / `WordDocidsCachedSorters::new` call, the `200_000` capacity and
// the four-paren closer are the pre-change lines. The one-line signature, the
// `new_in` call, the `extractor_alloc` argument and the three-paren closer replace them.
fn init_data(
&self,
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
) -> Result<Self::Data> {
Ok(FullySend(RefCell::new(WordDocidsCachedSorters::new(
fn init_data(&self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data> {
Ok(RefCell::new(WordDocidsCachedSorters::new_in(
self.grenad_parameters,
self.max_memory,
// TODO use a better value
200_000.try_into().unwrap(),
))))
extractor_alloc,
)))
}
/// Handles a single document change by forwarding it, together with this
/// extractor's `tokenizer`, to `WordDocidsExtractors::extract_document_change`.
///
/// Returns whatever that call returns.
// NOTE(review): this text is a rendered diff. The two `context` lines below are the
// same parameter, first with its fully qualified path and then with the imported name.
fn process(
&self,
change: DocumentChange,
context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
context: &DocumentChangeContext<Self::Data>,
) -> Result<()> {
WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)
}
@ -343,7 +334,7 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
return Ok(());
}
let mut data = data.0.borrow_mut();
let mut data = data.borrow_mut();
let WordDocidsCachedSorters {
word_fid_docids,
word_docids,
@ -454,7 +445,7 @@ impl WordDocidsExtractors {
}
fn extract_document_change(
context: &DocumentChangeContext<FullySend<RefCell<WordDocidsCachedSorters>>>,
context: &DocumentChangeContext<RefCell<WordDocidsCachedSorters>>,
document_tokenizer: &DocumentTokenizer,
document_change: DocumentChange,
) -> Result<()> {

View File

@ -104,6 +104,8 @@ pub struct FullySend<T>(pub T);
// SAFETY: a type **fully** send is always mostly send as well.
unsafe impl<T> MostlySend for FullySend<T> where T: Send {}
// SAFETY: NOTE(review): no justification is written for this impl. Presumably a
// `RefCell<T>` is as sendable as the `T` it wraps — confirm and state the invariant here.
unsafe impl<T> MostlySend for RefCell<T> where T: MostlySend {}
impl<T> FullySend<T> {
pub fn into(self) -> T {
self.0