Move the arroy building after the writing loop

This commit is contained in:
Clément Renault 2024-11-27 10:19:59 +01:00
parent 8442db8101
commit 2094ce8a9a
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F

View File

@ -76,7 +76,7 @@ where
MSP: Fn() -> bool + Sync, MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync, SP: Fn(Progress) + Sync,
{ {
/// TODO restrict memory and remove this memory from the extractors bum allocators /// TODO restrict memory and remove this memory from the extractors bump allocators
let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) let bbbuffers: Vec<_> = (0..rayon::current_num_threads())
.map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread
.collect(); .collect();
@ -100,6 +100,7 @@ where
send_progress, send_progress,
}; };
let mut index_embeddings = index.embedding_configs(wtxn)?;
let mut field_distribution = index.field_distribution(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?;
let mut document_ids = index.documents_ids(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?;
@ -296,7 +297,6 @@ where
'vectors: { 'vectors: {
let mut index_embeddings = index.embedding_configs(&rtxn)?;
if index_embeddings.is_empty() { if index_embeddings.is_empty() {
break 'vectors; break 'vectors;
} }
@ -322,8 +322,6 @@ where
} }
} }
} }
embedding_sender.finish(index_embeddings).unwrap();
} }
'geo: { 'geo: {
@ -457,46 +455,47 @@ where
embeddings.append(embedding).unwrap(); embeddings.append(embedding).unwrap();
} }
writer.del_items(wtxn, *dimensions, docid)?; writer.del_items(wtxn, *dimensions, docid)?;
writer.add_items(wtxn, docid, &embeddings)?; writer.add_items(wtxn, docid, &embeddings)?;
} }
ArroyOperation::SetVector { docid, embedder_id, embedding } => { ArroyOperation::SetVector { docid, embedder_id, embedding } => {
let (_, _, writer, dimensions) = arroy_writers let (_, _, writer, dimensions) =
.get(&embedder_id) arroy_writers.get(&embedder_id).expect("requested a missing embedder");
.expect("requested a missing embedder"); writer.del_items(wtxn, *dimensions, docid)?;
writer.del_items(wtxn, *dimensions, docid)?; writer.add_item(wtxn, docid, &embedding)?;
writer.add_item(wtxn, docid, &embedding)?; }
} _otherwise => unreachable!(),
ArroyOperation::Finish { configs } => { },
let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build");
let _entered = span.enter();
(indexing_context.send_progress)(Progress::from_step(
Step::WritingEmbeddingsToDatabase,
));
for (
_embedder_index,
(_embedder_name, _embedder, writer, dimensions),
) in &mut arroy_writers
{
let dimensions = *dimensions;
writer.build_and_quantize(
wtxn,
&mut rng,
dimensions,
false,
&indexing_context.must_stop_processing,
)?;
}
index.put_embedding_configs(wtxn, configs)?;
}
},
}
} }
} }
'vectors: {
let span =
tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build");
let _entered = span.enter();
if index_embeddings.is_empty() {
break 'vectors;
}
(indexing_context.send_progress)(Progress::from_step(
Step::WritingEmbeddingsToDatabase,
));
for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers {
let dimensions = *dimensions;
writer.build_and_quantize(
wtxn,
&mut rng,
dimensions,
false,
&indexing_context.must_stop_processing,
)?;
}
index.put_embedding_configs(wtxn, index_embeddings)?;
}
(indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors));
let facet_field_ids_delta = extractor_handle.join().unwrap()?; let facet_field_ids_delta = extractor_handle.join().unwrap()?;