OpenAI: embed only the first MAX_TOKENS tokens

Louis Dureuil 2024-07-15 16:27:26 +02:00
parent 65d0c32aa7
commit 5adacf2f45

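This change replaces the previous behavior, where a prompt longer than the model's token limit was embedded as a series of overlapping token windows (OVERLAP_SIZE = 200) whose embeddings were appended together, with a simpler rule: keep only the first max_token_count tokens and embed those once. A minimal standalone sketch of that truncation rule, assuming a placeholder Token type and a hypothetical tokens_to_embed helper (neither is part of the Meilisearch code shown in the diff):

// Sketch of the truncation rule this commit switches to: embed only the
// first `max_token_count` tokens instead of overlapping windows.
type Token = u32;

fn tokens_to_embed(encoded: &[Token], max_token_count: usize) -> &[Token] {
    // Mirrors `&encoded.as_slice()[0..max_token_count]` in the new code,
    // but guarded so the sketch also handles inputs shorter than the limit
    // (the real code only reaches the slice when `len >= max_token_count`).
    &encoded[..encoded.len().min(max_token_count)]
}

fn main() {
    let encoded: Vec<Token> = (0..10_000).collect();
    let max_token_count = 8_192; // e.g. the token limit of an OpenAI embedding model
    let tokens = tokens_to_embed(&encoded, max_token_count);
    assert_eq!(tokens.len(), max_token_count);
    // `tokens` would then be sent to the embedder once, yielding a single
    // embedding per prompt instead of one embedding per overlapping window.
    println!("embedding {} of {} tokens", tokens.len(), encoded.len());
}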

@@ -210,7 +210,6 @@ impl Embedder {
     }

     fn try_embed_tokenized(&self, text: &[String]) -> Result<Vec<Embeddings<f32>>, EmbedError> {
-        pub const OVERLAP_SIZE: usize = 200;
         let mut all_embeddings = Vec::with_capacity(text.len());
         for text in text {
             let max_token_count = self.options.embedding_model.max_token();
@@ -221,21 +220,10 @@ impl Embedder {
                 continue;
             }

-            let mut tokens = encoded.as_slice();
+            let tokens = &encoded.as_slice()[0..max_token_count];
             let mut embeddings_for_prompt = Embeddings::new(self.dimensions());
-            while tokens.len() > max_token_count {
-                let window = &tokens[..max_token_count];
-                let embedding = self.rest_embedder.embed_tokens(window)?;
-                embeddings_for_prompt.append(embedding.into_inner()).map_err(|got| {
-                    EmbedError::openai_unexpected_dimension(self.dimensions(), got.len())
-                })?;
-
-                tokens = &tokens[max_token_count - OVERLAP_SIZE..];
-            }
-
-            // end of text
             let embedding = self.rest_embedder.embed_tokens(tokens)?;
             embeddings_for_prompt.append(embedding.into_inner()).map_err(|got| {
                 EmbedError::openai_unexpected_dimension(self.dimensions(), got.len())
             })?;