Remove `Option` wrappers: make `embedder_stats` a required `Arc<EmbedderStats>`

This commit is contained in:
Mubelotix 2025-06-24 15:10:15 +02:00
parent 695877043a
commit d08e89ea3d
9 changed files with 31 additions and 33 deletions

View File

@ -684,12 +684,10 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
embedder: Arc<Embedder>,
embedder_name: &str,
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
unused_vectors_distribution: &UnusedVectorsDistribution,
request_threads: &ThreadPoolNoAbort,
) -> Result<grenad::Reader<BufReader<File>>> {
println!("Extract embedder stats {}:", embedder_stats.is_some());
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk
@ -791,7 +789,7 @@ fn embed_chunks(
text_chunks: Vec<Vec<String>>,
embedder_name: &str,
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
unused_vectors_distribution: &UnusedVectorsDistribution,
request_threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embedding>>> {

View File

@ -274,7 +274,7 @@ fn send_original_documents_data(
embedder.clone(),
&embedder_name,
&possible_embedding_mistakes,
Some(embedder_stats.clone()),
embedder_stats.clone(),
&unused_vectors_distribution,
request_threads(),
) {

View File

@ -23,7 +23,7 @@ pub struct EmbeddingExtractor<'a, 'b> {
embedders: &'a EmbeddingConfigs,
sender: EmbeddingSender<'a, 'b>,
possible_embedding_mistakes: PossibleEmbeddingMistakes,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
threads: &'a ThreadPoolNoAbort,
}
@ -32,7 +32,7 @@ impl<'a, 'b> EmbeddingExtractor<'a, 'b> {
embedders: &'a EmbeddingConfigs,
sender: EmbeddingSender<'a, 'b>,
field_distribution: &'a FieldDistribution,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
threads: &'a ThreadPoolNoAbort,
) -> Self {
let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution);
@ -311,7 +311,7 @@ struct Chunks<'a, 'b, 'extractor> {
dimensions: usize,
prompt: &'a Prompt,
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
threads: &'a ThreadPoolNoAbort,
sender: EmbeddingSender<'a, 'b>,
@ -327,7 +327,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
prompt: &'a Prompt,
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
threads: &'a ThreadPoolNoAbort,
sender: EmbeddingSender<'a, 'b>,
doc_alloc: &'a Bump,
@ -416,7 +416,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
embedder_id: u8,
embedder_name: &str,
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
unused_vectors_distribution: &UnusedVectorsDistributionBump,
threads: &ThreadPoolNoAbort,
sender: EmbeddingSender<'a, 'b>,

View File

@ -248,7 +248,7 @@ where
embedders,
embedding_sender,
field_distribution,
Some(embedder_stats),
embedder_stats,
request_threads(),
);
let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads());

View File

@ -196,7 +196,7 @@ impl SubEmbedder {
&self,
text_chunks: Vec<Vec<String>>,
threads: &ThreadPoolNoAbort,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
match self {
SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks),
@ -218,7 +218,7 @@ impl SubEmbedder {
&self,
texts: &[&str],
threads: &ThreadPoolNoAbort,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
) -> std::result::Result<Vec<Embedding>, EmbedError> {
match self {
SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts),

View File

@ -749,7 +749,7 @@ impl Embedder {
&self,
text_chunks: Vec<Vec<String>>,
threads: &ThreadPoolNoAbort,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
match self {
Embedder::HuggingFace(embedder) => embedder.embed_index(text_chunks),
@ -772,7 +772,7 @@ impl Embedder {
&self,
texts: &[&str],
threads: &ThreadPoolNoAbort,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
) -> std::result::Result<Vec<Embedding>, EmbedError> {
match self {
Embedder::HuggingFace(embedder) => embedder.embed_index_ref(texts),

View File

@ -121,21 +121,21 @@ impl Embedder {
&self,
text_chunks: Vec<Vec<String>>,
threads: &ThreadPoolNoAbort,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
) -> Result<Vec<Vec<Embedding>>, EmbedError> {
// This condition helps reduce the number of active rayon jobs
// so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
if threads.active_operations() >= REQUEST_PARALLELISM {
text_chunks
.into_iter()
.map(move |chunk| self.embed(&chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone())))
.collect()
} else {
threads
.install(move || {
text_chunks
.into_par_iter()
.map(move |chunk| self.embed(&chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone())))
.collect()
})
.map_err(|error| EmbedError {
@ -149,14 +149,14 @@ impl Embedder {
&self,
texts: &[&str],
threads: &ThreadPoolNoAbort,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
) -> Result<Vec<Vec<f32>>, EmbedError> {
// This condition helps reduce the number of active rayon jobs
// so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
if threads.active_operations() >= REQUEST_PARALLELISM {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed(chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone())))
.collect();
let embeddings = embeddings?;
@ -166,7 +166,7 @@ impl Embedder {
.install(move || {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.par_chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed(chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone())))
.collect();
let embeddings = embeddings?;

View File

@ -262,21 +262,21 @@ impl Embedder {
&self,
text_chunks: Vec<Vec<String>>,
threads: &ThreadPoolNoAbort,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
) -> Result<Vec<Vec<Embedding>>, EmbedError> {
// This condition helps reduce the number of active rayon jobs
// so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
if threads.active_operations() >= REQUEST_PARALLELISM {
text_chunks
.into_iter()
.map(move |chunk| self.embed(&chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone())))
.collect()
} else {
threads
.install(move || {
text_chunks
.into_par_iter()
.map(move |chunk| self.embed(&chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed(&chunk, None, Some(embedder_stats.clone())))
.collect()
})
.map_err(|error| EmbedError {
@ -290,14 +290,14 @@ impl Embedder {
&self,
texts: &[&str],
threads: &ThreadPoolNoAbort,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
) -> Result<Vec<Vec<f32>>, EmbedError> {
// This condition helps reduce the number of active rayon jobs
// so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
if threads.active_operations() >= REQUEST_PARALLELISM {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed(chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone())))
.collect();
let embeddings = embeddings?;
Ok(embeddings.into_iter().flatten().collect())
@ -306,7 +306,7 @@ impl Embedder {
.install(move || {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.par_chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed(chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone())))
.collect();
let embeddings = embeddings?;

View File

@ -208,21 +208,21 @@ impl Embedder {
&self,
text_chunks: Vec<Vec<String>>,
threads: &ThreadPoolNoAbort,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
) -> Result<Vec<Vec<Embedding>>, EmbedError> {
// This condition helps reduce the number of active rayon jobs
// so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
if threads.active_operations() >= REQUEST_PARALLELISM {
text_chunks
.into_iter()
.map(move |chunk| self.embed(chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone())))
.collect()
} else {
threads
.install(move || {
text_chunks
.into_par_iter()
.map(move |chunk| self.embed(chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed(chunk, None, Some(embedder_stats.clone())))
.collect()
})
.map_err(|error| EmbedError {
@ -236,14 +236,14 @@ impl Embedder {
&self,
texts: &[&str],
threads: &ThreadPoolNoAbort,
embedder_stats: Option<Arc<EmbedderStats>>,
embedder_stats: Arc<EmbedderStats>,
) -> Result<Vec<Embedding>, EmbedError> {
// This condition helps reduce the number of active rayon jobs
// so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
if threads.active_operations() >= REQUEST_PARALLELISM {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed_ref(chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed_ref(chunk, None, Some(embedder_stats.clone())))
.collect();
let embeddings = embeddings?;
@ -253,7 +253,7 @@ impl Embedder {
.install(move || {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.par_chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed_ref(chunk, None, embedder_stats.clone()))
.map(move |chunk| self.embed_ref(chunk, None, Some(embedder_stats.clone())))
.collect();
let embeddings = embeddings?;