Fix even more errors around the cache

Clément Renault 2024-10-16 16:21:19 +02:00
parent 495742e113
commit 27fc50c476
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
6 changed files with 78 additions and 93 deletions
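
The common thread in these changes: the per-thread word-docids cache becomes a RefCell<Option<WordDocidsCachedSorters>> so it can be taken out by value, spilled to disk, and rebuilt inside a freshly reset bump arena, while the cache insert methods and token callbacks stop returning Result. Below is a minimal sketch of that take-and-rebuild pattern; Cache, spill, and reconstruct are stand-in names, not the actual Meilisearch types, which spill several grenad sorters.

use std::cell::RefCell;

struct Cache {
    entries: Vec<u32>,
}

impl Cache {
    fn spill(self) -> Vec<u32> {
        // Pretend the entries were written to disk and a handle was kept.
        self.entries
    }

    fn reconstruct(spilled: Vec<u32>) -> Cache {
        // Rebuild an in-memory cache from the spilled data.
        Cache { entries: spilled }
    }
}

fn flush(data: &RefCell<Option<Cache>>) {
    // Move the cache out of the RefCell by value so it can be consumed...
    let cache = data.borrow_mut().take().unwrap();
    let spilled = cache.spill();
    // ...then put a rebuilt cache back so the slot stays usable.
    *data.borrow_mut() = Some(Cache::reconstruct(spilled));
}

fn main() {
    let data = RefCell::new(Some(Cache { entries: vec![1, 2, 3] }));
    flush(&data);
    assert!(data.borrow().is_some());
}

Taking the value out of the RefCell avoids holding a mutable borrow across the spill, and writing a rebuilt cache back keeps the extractor's Data slot ready for the next batch.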

View File

@@ -307,14 +307,14 @@ pub struct WordDocidsExtractorData<'extractor> {
 }
 impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
-    type Data = RefCell<WordDocidsCachedSorters<'extractor>>;
+    type Data = RefCell<Option<WordDocidsCachedSorters<'extractor>>>;
     fn init_data(&self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data> {
-        Ok(RefCell::new(WordDocidsCachedSorters::new_in(
+        Ok(RefCell::new(Some(WordDocidsCachedSorters::new_in(
             self.grenad_parameters,
             self.max_memory,
             extractor_alloc,
-        )))
+        ))))
     }
     fn process(
@@ -334,7 +334,7 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
             return Ok(());
         }
-        let mut data = data.borrow_mut();
+        let sorters = data.borrow_mut().take().unwrap();
         let WordDocidsCachedSorters {
             word_fid_docids,
             word_docids,
@@ -343,7 +343,7 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
             fid_word_count_docids,
             fid_word_count,
             current_docid,
-        } = &mut *data;
+        } = sorters;
         let spilled_word_fid_docids = word_fid_docids.spill_to_disk()?;
         let spilled_word_docids = word_docids.spill_to_disk()?;
@@ -356,14 +356,17 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
         extractor_alloc.borrow_mut().reset();
         let alloc = RefBump::new(extractor_alloc.borrow());
-        data.word_fid_docids = spilled_word_fid_docids.reconstruct(RefBump::clone(&alloc));
-        data.word_docids = spilled_word_docids.reconstruct(RefBump::clone(&alloc));
-        data.exact_word_docids = spilled_exact_word_docids.reconstruct(RefBump::clone(&alloc));
-        data.word_position_docids =
-            spilled_word_position_docids.reconstruct(RefBump::clone(&alloc));
-        data.fid_word_count_docids = spilled_fid_word_count_docids.reconstruct(alloc);
-        // data.fid_word_count = spilled_fid_word_count.reconstruct();
-        // data.current_docid = spilled_current_docid.reconstruct();
+        *data.borrow_mut() = Some(WordDocidsCachedSorters {
+            word_fid_docids: spilled_word_fid_docids.reconstruct(RefBump::clone(&alloc)),
+            word_docids: spilled_word_docids.reconstruct(RefBump::clone(&alloc)),
+            exact_word_docids: spilled_exact_word_docids.reconstruct(RefBump::clone(&alloc)),
+            word_position_docids: spilled_word_position_docids.reconstruct(RefBump::clone(&alloc)),
+            fid_word_count_docids: spilled_fid_word_count_docids.reconstruct(alloc),
+            // fid_word_count: spilled_fid_word_count.reconstruct(),
+            // current_docid: spilled_current_docid.reconstruct(),
+            fid_word_count,
+            current_docid,
+        });
         Ok(())
     }
@@ -436,7 +439,7 @@ impl WordDocidsExtractors {
                 tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
             let _entered = span.enter();
             let mut builder = WordDocidsMergerBuilders::new();
-            for cache in datastore.into_iter().map(|cache| cache.0.into_inner()) {
+            for cache in datastore.into_iter().flat_map(RefCell::into_inner) {
                 builder.add_sorters(cache)?;
             }
@@ -445,14 +448,14 @@ impl WordDocidsExtractors {
    }
    fn extract_document_change(
-        context: &DocumentChangeContext<RefCell<WordDocidsCachedSorters>>,
+        context: &DocumentChangeContext<RefCell<Option<WordDocidsCachedSorters>>>,
        document_tokenizer: &DocumentTokenizer,
        document_change: DocumentChange,
    ) -> Result<()> {
        let index = &context.index;
        let rtxn = &context.txn;
-        let mut cached_sorter = context.data.0.borrow_mut_or_yield();
-        let cached_sorter = cached_sorter.deref_mut();
+        let mut cached_sorter_ref = context.data.borrow_mut_or_yield();
+        let cached_sorter = cached_sorter_ref.as_mut().unwrap();
        let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
        let new_fields_ids_map = new_fields_ids_map.deref_mut();
@@ -463,16 +466,14 @@ impl WordDocidsExtractors {
        match document_change {
            DocumentChange::Deletion(inner) => {
                let mut token_fn = |fname: &str, fid, pos, word: &str| {
-                    cached_sorter
-                        .insert_del_u32(
-                            fid,
-                            pos,
-                            word,
-                            is_exact_attribute(fname),
-                            inner.docid(),
-                            &mut buffer,
-                        )
-                        .map_err(crate::Error::from)
+                    cached_sorter.insert_del_u32(
+                        fid,
+                        pos,
+                        word,
+                        is_exact_attribute(fname),
+                        inner.docid(),
+                        &mut buffer,
+                    );
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index, context.db_fields_ids_map)?,
@@ -482,16 +483,14 @@ impl WordDocidsExtractors {
            }
            DocumentChange::Update(inner) => {
                let mut token_fn = |fname: &str, fid, pos, word: &str| {
-                    cached_sorter
-                        .insert_del_u32(
-                            fid,
-                            pos,
-                            word,
-                            is_exact_attribute(fname),
-                            inner.docid(),
-                            &mut buffer,
-                        )
-                        .map_err(crate::Error::from)
+                    cached_sorter.insert_del_u32(
+                        fid,
+                        pos,
+                        word,
+                        is_exact_attribute(fname),
+                        inner.docid(),
+                        &mut buffer,
+                    );
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index, context.db_fields_ids_map)?,
@@ -500,16 +499,14 @@ impl WordDocidsExtractors {
                )?;
                let mut token_fn = |fname: &str, fid, pos, word: &str| {
-                    cached_sorter
-                        .insert_add_u32(
-                            fid,
-                            pos,
-                            word,
-                            is_exact_attribute(fname),
-                            inner.docid(),
-                            &mut buffer,
-                        )
-                        .map_err(crate::Error::from)
+                    cached_sorter.insert_add_u32(
+                        fid,
+                        pos,
+                        word,
+                        is_exact_attribute(fname),
+                        inner.docid(),
+                        &mut buffer,
+                    );
                };
                document_tokenizer.tokenize_document(
                    inner.new(rtxn, index, context.db_fields_ids_map)?,
@@ -519,16 +516,14 @@ impl WordDocidsExtractors {
            }
            DocumentChange::Insertion(inner) => {
                let mut token_fn = |fname: &str, fid, pos, word: &str| {
-                    cached_sorter
-                        .insert_add_u32(
-                            fid,
-                            pos,
-                            word,
-                            is_exact_attribute(fname),
-                            inner.docid(),
-                            &mut buffer,
-                        )
-                        .map_err(crate::Error::from)
+                    cached_sorter.insert_add_u32(
+                        fid,
+                        pos,
+                        word,
+                        is_exact_attribute(fname),
+                        inner.docid(),
+                        &mut buffer,
+                    );
                };
                document_tokenizer.tokenize_document(
                    inner.new(),
@@ -538,7 +533,9 @@ impl WordDocidsExtractors {
            }
        }
-        cached_sorter.flush_fid_word_count(&mut buffer)
+        cached_sorter.flush_fid_word_count(&mut buffer);
+        Ok(())
    }
    fn attributes_to_extract<'a>(

View File

@@ -28,11 +28,10 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
    }
    // This method is reimplemented to count the number of words in the document in each field
-    // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS.
+    // and to store the docids of the documents that have a number of words in a given field
+    // equal to or under than MAX_COUNTED_WORDS.
    fn extract_document_change(
-        context: &DocumentChangeContext<
-            FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
-        >,
+        context: &DocumentChangeContext<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
        document_tokenizer: &DocumentTokenizer,
        document_change: DocumentChange,
    ) -> Result<()> {
@@ -48,7 +47,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
        let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
        let new_fields_ids_map = &mut *new_fields_ids_map;
-        let mut cached_sorter = context.data.0.borrow_mut_or_yield();
+        let mut cached_sorter = context.data.borrow_mut_or_yield();
        let cached_sorter = &mut *cached_sorter;
        // is a vecdequeue, and will be smol, so can stay on the heap for now
@@ -109,14 +108,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
        del_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
        for ((w1, w2), prox) in del_word_pair_proximity.iter() {
            let key = build_key(*prox, w1, w2, &mut key_buffer);
-            cached_sorter.insert_del_u32(key, docid)?;
+            cached_sorter.insert_del_u32(key, docid);
        }
        add_word_pair_proximity.sort_unstable();
        add_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
        for ((w1, w2), prox) in add_word_pair_proximity.iter() {
            let key = build_key(*prox, w1, w2, &mut key_buffer);
-            cached_sorter.insert_add_u32(key, docid)?;
+            cached_sorter.insert_add_u32(key, docid);
        }
        Ok(())
    }
@@ -139,7 +138,7 @@ fn build_key<'a>(
fn word_positions_into_word_pair_proximity(
    word_positions: &mut VecDeque<(Rc<str>, u16)>,
    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
-) -> Result<()> {
+) {
    let (head_word, head_position) = word_positions.pop_front().unwrap();
    for (word, position) in word_positions.iter() {
        let prox = index_proximity(head_position as u32, *position as u32) as u8;
@@ -147,7 +146,6 @@ fn word_positions_into_word_pair_proximity(
            word_pair_proximity((head_word.clone(), word.clone()), prox);
        }
    }
-    Ok(())
}
fn process_document_tokens<'doc>(
@@ -163,17 +161,16 @@ fn process_document_tokens<'doc>(
            .front()
            .map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
        {
-            word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
+            word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)
        }
        // insert the new word.
        word_positions.push_back((Rc::from(word), pos));
-        Ok(())
    };
    document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
    while !word_positions.is_empty() {
-        word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
+        word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
    }
    Ok(())

View File

@@ -11,6 +11,7 @@ pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers};
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
use grenad::Merger;
use heed::RoTxn;
+use raw_collections::alloc::RefBump;
use rayon::iter::{ParallelBridge, ParallelIterator};
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
@@ -34,15 +35,10 @@ pub struct SearchableExtractorData<'extractor, EX: SearchableExtractor> {
impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
    for SearchableExtractorData<'extractor, EX>
{
-    type Data = FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>;
-    fn init_data(
-        &self,
-        _extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
-    ) -> Result<Self::Data> {
-        Ok(FullySend(RefCell::new(CboCachedSorter::new(
-            // TODO use a better value
-            1_000_000.try_into().unwrap(),
+    type Data = RefCell<CboCachedSorter<'extractor, MergeDeladdCboRoaringBitmaps>>;
+    fn init_data(&self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data> {
+        Ok(RefCell::new(CboCachedSorter::new_in(
            create_sorter(
                grenad::SortAlgorithm::Stable,
                MergeDeladdCboRoaringBitmaps,
@@ -52,13 +48,14 @@ impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
                self.max_memory,
                false,
            ),
-        ))))
+            extractor_alloc,
+        )))
    }
    fn process(
        &self,
        change: DocumentChange,
-        context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
+        context: &DocumentChangeContext<Self::Data>,
    ) -> Result<()> {
        EX::extract_document_change(context, self.tokenizer, change)
    }
@@ -150,9 +147,7 @@ pub trait SearchableExtractor: Sized + Sync {
    }
    fn extract_document_change(
-        context: &DocumentChangeContext<
-            FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
-        >,
+        context: &DocumentChangeContext<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
        document_tokenizer: &DocumentTokenizer,
        document_change: DocumentChange,
    ) -> Result<()>;
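
With init_data now receiving the extractor's bump allocator, CboCachedSorter is built with new_in and carries the 'extractor lifetime instead of the hard-coded 1_000_000-entry capacity that was marked "TODO use a better value". A rough sketch of the new_in idea follows; it uses plain bumpalo (with its collections feature) as a stand-in for the raw_collections RefBump wrapper used by the real code, and CachedSorter is a hypothetical name.

use bumpalo::{collections::Vec as BumpVec, Bump};

struct CachedSorter<'extractor> {
    // All internal storage lives in the per-extractor arena.
    keys: BumpVec<'extractor, u32>,
}

impl<'extractor> CachedSorter<'extractor> {
    fn new_in(extractor_alloc: &'extractor Bump) -> Self {
        CachedSorter { keys: BumpVec::new_in(extractor_alloc) }
    }
}

fn main() {
    let extractor_alloc = Bump::new();
    let mut cache = CachedSorter::new_in(&extractor_alloc);
    cache.keys.push(42);
    assert_eq!(cache.keys.len(), 1);
}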

View File

@@ -26,7 +26,7 @@ impl<'a> DocumentTokenizer<'a> {
        &self,
        document: impl Document<'doc>,
        field_id_map: &mut GlobalFieldsIdsMap,
-        token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
+        token_fn: &mut impl FnMut(&str, FieldId, u16, &str),
    ) -> Result<()> {
        let mut field_position = HashMap::new();
@@ -50,7 +50,7 @@ impl<'a> DocumentTokenizer<'a> {
                Value::Number(n) => {
                    let token = n.to_string();
                    if let Ok(position) = (*position).try_into() {
-                        token_fn(name, field_id, position, token.as_str())?;
+                        token_fn(name, field_id, position, token.as_str());
                    }
                    Ok(())
@@ -74,7 +74,7 @@ impl<'a> DocumentTokenizer<'a> {
                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                        *position = index;
                        if let Ok(position) = (*position).try_into() {
-                            token_fn(name, field_id, position, token)?;
+                            token_fn(name, field_id, position, token);
                        }
                    }
                }
@@ -171,7 +171,6 @@ mod test {
    use bumpalo::Bump;
    use charabia::TokenizerBuilder;
    use meili_snap::snapshot;
    use raw_collections::RawMap;
    use serde_json::json;
    use serde_json::value::RawValue;
@@ -230,7 +229,6 @@ mod test {
                &mut global_fields_ids_map,
                &mut |_fname, fid, pos, word| {
                    words.insert([fid, pos], word.to_string());
-                    Ok(())
                },
            )
            .unwrap();
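
The tokenizer's per-token callback no longer returns a Result, so fallibility stays inside tokenize_document itself and the call sites drop their ? operators. A simplified sketch of that callback shape; tokenize and its position handling are stand-ins, not the actual DocumentTokenizer API.

fn tokenize(text: &str, token_fn: &mut impl FnMut(u16, &str)) -> Result<(), String> {
    for (index, word) in text.split_whitespace().enumerate() {
        // Only the tokenizer itself can fail, e.g. on position overflow.
        let position: u16 = index.try_into().map_err(|_| "position overflow".to_string())?;
        // The callback is infallible: it just records the token.
        token_fn(position, word);
    }
    Ok(())
}

fn main() {
    let mut words = Vec::new();
    tokenize("fix even more errors", &mut |pos, word| {
        words.push((pos, word.to_string()));
    })
    .unwrap();
    assert_eq!(words.len(), 4);
}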

View File

@@ -106,6 +106,8 @@ unsafe impl<T> MostlySend for FullySend<T> where T: Send {}
unsafe impl<T> MostlySend for RefCell<T> where T: MostlySend {}
+unsafe impl<T> MostlySend for Option<T> where T: MostlySend {}
impl<T> FullySend<T> {
    pub fn into(self) -> T {
        self.0
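
The new blanket impl lets Option<T> inherit MostlySend, which is what allows the extractor data above to become RefCell<Option<...>>. A reduced sketch of how the marker composes; MostlySend is stripped down to an empty marker trait and Cache is a stand-in type.

use std::cell::RefCell;

unsafe trait MostlySend {}

struct FullySend<T>(T);
struct Cache;

unsafe impl<T> MostlySend for FullySend<T> where T: Send {}
unsafe impl<T> MostlySend for RefCell<T> where T: MostlySend {}
unsafe impl<T> MostlySend for Option<T> where T: MostlySend {}
unsafe impl MostlySend for Cache {}

fn assert_mostly_send<T: MostlySend>() {}

fn main() {
    // This only type-checks because of the Option<T> impl above.
    assert_mostly_send::<RefCell<Option<Cache>>>();
}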

View File

@@ -95,11 +95,7 @@ mod test {
    fn test_deletions() {
        struct DeletionWithData<'extractor> {
            deleted: RefCell<
-                hashbrown::HashSet<
-                    DocumentId,
-                    hashbrown::hash_map::DefaultHashBuilder,
-                    RefBump<'extractor>,
-                >,
+                hashbrown::HashSet<DocumentId, hashbrown::DefaultHashBuilder, RefBump<'extractor>>,
            >,
        }