
Improved AES performance.

Werner Koch 2008-03-22 17:01:37 +00:00
parent a614eabba9
commit 537cada38e
10 changed files with 392 additions and 242 deletions


@@ -1,5 +1,5 @@
/* Rijndael (AES) for GnuPG
* Copyright (C) 2000, 2001 Free Software Foundation, Inc.
* Copyright (C) 2000, 2001, 2008 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@@ -1866,86 +1866,119 @@ prepare_decryption( RIJNDAEL_context *ctx )
/* Encrypt one block. A and B may be the same. */
/* Encrypt one block. A and B need to be aligned on a 4 byte
boundary. A and B may be the same. */
static void
do_encrypt (const RIJNDAEL_context *ctx, byte *b, const byte *a)
do_encrypt_aligned (const RIJNDAEL_context *ctx,
unsigned char *b, const unsigned char *a)
{
int r;
byte temp[4][4];
int ROUNDS = ctx->ROUNDS;
#define rk (ctx->keySched)
int ROUNDS = ctx->ROUNDS;
int r;
union
{
u32 tempu32[4]; /* Force correct alignment. */
byte temp[4][4];
} u;
*((u32*)temp[0]) = *((u32*)(a )) ^ *((u32*)rk[0][0]);
*((u32*)temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[0][1]);
*((u32*)temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[0][2]);
*((u32*)temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[0][3]);
*((u32*)(b )) = *((u32*)T1[temp[0][0]])
^ *((u32*)T2[temp[1][1]])
^ *((u32*)T3[temp[2][2]])
^ *((u32*)T4[temp[3][3]]);
*((u32*)(b + 4)) = *((u32*)T1[temp[1][0]])
^ *((u32*)T2[temp[2][1]])
^ *((u32*)T3[temp[3][2]])
^ *((u32*)T4[temp[0][3]]);
*((u32*)(b + 8)) = *((u32*)T1[temp[2][0]])
^ *((u32*)T2[temp[3][1]])
^ *((u32*)T3[temp[0][2]])
^ *((u32*)T4[temp[1][3]]);
*((u32*)(b +12)) = *((u32*)T1[temp[3][0]])
^ *((u32*)T2[temp[0][1]])
^ *((u32*)T3[temp[1][2]])
^ *((u32*)T4[temp[2][3]]);
for (r = 1; r < ROUNDS-1; r++) {
*((u32*)temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]);
*((u32*)temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]);
*((u32*)temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]);
*((u32*)temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]);
*((u32*)u.temp[0]) = *((u32*)(a )) ^ *((u32*)rk[0][0]);
*((u32*)u.temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[0][1]);
*((u32*)u.temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[0][2]);
*((u32*)u.temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[0][3]);
*((u32*)(b )) = (*((u32*)T1[u.temp[0][0]])
^ *((u32*)T2[u.temp[1][1]])
^ *((u32*)T3[u.temp[2][2]])
^ *((u32*)T4[u.temp[3][3]]));
*((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]])
^ *((u32*)T2[u.temp[2][1]])
^ *((u32*)T3[u.temp[3][2]])
^ *((u32*)T4[u.temp[0][3]]));
*((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]])
^ *((u32*)T2[u.temp[3][1]])
^ *((u32*)T3[u.temp[0][2]])
^ *((u32*)T4[u.temp[1][3]]));
*((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]])
^ *((u32*)T2[u.temp[0][1]])
^ *((u32*)T3[u.temp[1][2]])
^ *((u32*)T4[u.temp[2][3]]));
*((u32*)(b )) = *((u32*)T1[temp[0][0]])
^ *((u32*)T2[temp[1][1]])
^ *((u32*)T3[temp[2][2]])
^ *((u32*)T4[temp[3][3]]);
*((u32*)(b + 4)) = *((u32*)T1[temp[1][0]])
^ *((u32*)T2[temp[2][1]])
^ *((u32*)T3[temp[3][2]])
^ *((u32*)T4[temp[0][3]]);
*((u32*)(b + 8)) = *((u32*)T1[temp[2][0]])
^ *((u32*)T2[temp[3][1]])
^ *((u32*)T3[temp[0][2]])
^ *((u32*)T4[temp[1][3]]);
*((u32*)(b +12)) = *((u32*)T1[temp[3][0]])
^ *((u32*)T2[temp[0][1]])
^ *((u32*)T3[temp[1][2]])
^ *((u32*)T4[temp[2][3]]);
for (r = 1; r < ROUNDS-1; r++)
{
*((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]);
*((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]);
*((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]);
*((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]);
*((u32*)(b )) = (*((u32*)T1[u.temp[0][0]])
^ *((u32*)T2[u.temp[1][1]])
^ *((u32*)T3[u.temp[2][2]])
^ *((u32*)T4[u.temp[3][3]]));
*((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]])
^ *((u32*)T2[u.temp[2][1]])
^ *((u32*)T3[u.temp[3][2]])
^ *((u32*)T4[u.temp[0][3]]));
*((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]])
^ *((u32*)T2[u.temp[3][1]])
^ *((u32*)T3[u.temp[0][2]])
^ *((u32*)T4[u.temp[1][3]]));
*((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]])
^ *((u32*)T2[u.temp[0][1]])
^ *((u32*)T3[u.temp[1][2]])
^ *((u32*)T4[u.temp[2][3]]));
}
/* last round is special */
*((u32*)temp[0]) = *((u32*)(b )) ^ *((u32*)rk[ROUNDS-1][0]);
*((u32*)temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[ROUNDS-1][1]);
*((u32*)temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[ROUNDS-1][2]);
*((u32*)temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[ROUNDS-1][3]);
b[ 0] = T1[temp[0][0]][1];
b[ 1] = T1[temp[1][1]][1];
b[ 2] = T1[temp[2][2]][1];
b[ 3] = T1[temp[3][3]][1];
b[ 4] = T1[temp[1][0]][1];
b[ 5] = T1[temp[2][1]][1];
b[ 6] = T1[temp[3][2]][1];
b[ 7] = T1[temp[0][3]][1];
b[ 8] = T1[temp[2][0]][1];
b[ 9] = T1[temp[3][1]][1];
b[10] = T1[temp[0][2]][1];
b[11] = T1[temp[1][3]][1];
b[12] = T1[temp[3][0]][1];
b[13] = T1[temp[0][1]][1];
b[14] = T1[temp[1][2]][1];
b[15] = T1[temp[2][3]][1];
*((u32*)(b )) ^= *((u32*)rk[ROUNDS][0]);
*((u32*)(b+ 4)) ^= *((u32*)rk[ROUNDS][1]);
*((u32*)(b+ 8)) ^= *((u32*)rk[ROUNDS][2]);
*((u32*)(b+12)) ^= *((u32*)rk[ROUNDS][3]);
/* Last round is special. */
*((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[ROUNDS-1][0]);
*((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[ROUNDS-1][1]);
*((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[ROUNDS-1][2]);
*((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[ROUNDS-1][3]);
b[ 0] = T1[u.temp[0][0]][1];
b[ 1] = T1[u.temp[1][1]][1];
b[ 2] = T1[u.temp[2][2]][1];
b[ 3] = T1[u.temp[3][3]][1];
b[ 4] = T1[u.temp[1][0]][1];
b[ 5] = T1[u.temp[2][1]][1];
b[ 6] = T1[u.temp[3][2]][1];
b[ 7] = T1[u.temp[0][3]][1];
b[ 8] = T1[u.temp[2][0]][1];
b[ 9] = T1[u.temp[3][1]][1];
b[10] = T1[u.temp[0][2]][1];
b[11] = T1[u.temp[1][3]][1];
b[12] = T1[u.temp[3][0]][1];
b[13] = T1[u.temp[0][1]][1];
b[14] = T1[u.temp[1][2]][1];
b[15] = T1[u.temp[2][3]][1];
*((u32*)(b )) ^= *((u32*)rk[ROUNDS][0]);
*((u32*)(b+ 4)) ^= *((u32*)rk[ROUNDS][1]);
*((u32*)(b+ 8)) ^= *((u32*)rk[ROUNDS][2]);
*((u32*)(b+12)) ^= *((u32*)rk[ROUNDS][3]);
#undef rk
}
static void
do_encrypt (const RIJNDAEL_context *ctx,
unsigned char *bx, const unsigned char *ax)
{
/* BX and AX are not necessarily correctly aligned. Thus we need to
copy them here. */
union
{
u32 dummy[4];
byte a[16];
} a;
union
{
u32 dummy[4];
byte b[16];
} b;
memcpy (a.a, ax, 16);
do_encrypt_aligned (ctx, b.b, a.a);
memcpy (bx, b.b, 16);
}
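The copy-through-a-union pattern used by do_encrypt above is worth seeing in isolation. The following standalone sketch (names such as aes_block_t and encrypt_any are illustrative only, not part of this file) shows the same idea, assuming u32 is a 32-bit type as in the GnuPG sources:

#include <string.h>

typedef unsigned char byte;
typedef unsigned int  u32;             /* assumed 32-bit, as in GnuPG */

/* A 16-byte block overlaid with a u32 array so the compiler
   guarantees at least 4-byte alignment of the byte view. */
typedef union
{
  u32  aligned[4];
  byte bytes[16];
} aes_block_t;

/* Hypothetical wrapper: copy a possibly unaligned input block into an
   aligned buffer, run a fast routine that requires alignment, then
   copy the result back to the possibly unaligned destination. */
static void
encrypt_any (const void *ctx, byte *out, const byte *in,
             void (*encrypt_aligned)(const void *, byte *, const byte *))
{
  aes_block_t a, b;

  memcpy (a.bytes, in, 16);
  encrypt_aligned (ctx, b.bytes, a.bytes);
  memcpy (out, b.bytes, 16);
}

The two memcpy calls cost a little per block, which is presumably why the new bulk CFB functions below keep the IV in an aligned buffer and call do_encrypt_aligned directly.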
static void
rijndael_encrypt (void *ctx, byte *b, const byte *a)
{
@@ -1953,56 +1986,6 @@ rijndael_encrypt (void *ctx, byte *b, const byte *a)
burn_stack (16 + 2*sizeof(int));
}
#if 0
/* Experimental code. Needs to be generalized and we might want to
have variants for all possible sizes of the largest scalar type.
Also need to make sure that INBUF and OUTBUF are properly
aligned. */
void
rijndael_cfb_encrypt (void *ctx, byte *iv,
byte *outbuf, const byte *inbuf, size_t nbytes)
{
/* if ( ((unsigned long)inbuf & 3) || ((unsigned long)outbuf & 3) ) */
/* { */
Not properly aligned, use the slow version. Actually the
compiler might even optimize this pretty well if the
target CPU has relaxed alignment requirements. Thus it is
questionable whether we should go into the hassle of
doing alignment-wise optimizations ourselves at all. A quick test
with gcc 4.0 on ia32 did not show any advantages.
byte *ivp;
int i;
while (nbytes >= 16)
{
do_encrypt (ctx, iv, iv);
for (i=0, ivp = iv; i < 16; i++)
*outbuf++ = (*ivp++ ^= *inbuf++);
nbytes -= 16;
}
/* } */
/* else */
/* { */
/* u32 *ivp; */
/* u32 *ob = (u32*)outbuf; */
/* const u32 *ib = (const u32*)inbuf; */
/* while (nbytes >= 16) */
/* { */
/* do_encrypt (ctx, iv, iv); */
/* ivp = iv; */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp ^= *ib++); */
/* nbytes -= 16; */
/* } */
/* } */
burn_stack (16 + 2*sizeof(int));
}
#endif
/* Decrypt one block. a and b may be the same. */
@@ -2097,6 +2080,67 @@ rijndael_decrypt (void *ctx, byte *b, const byte *a)
do_decrypt (ctx, b, a);
burn_stack (16+2*sizeof(int));
}
/* Bulk encryption of complete blocks in CFB mode. Caller needs to
make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk encryption feature of
cipher.c. */
void
rijndael_cfb_enc (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned char *ivp;
int i;
for ( ;nblocks; nblocks-- )
{
/* Encrypt the IV. */
do_encrypt_aligned (ctx, iv, iv);
/* XOR the input with the IV and store input into IV. */
for (ivp=iv,i=0; i < 16; i++ )
*outbuf++ = (*ivp++ ^= *inbuf++);
}
burn_stack (16 + 2*sizeof(int));
}
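For readers unfamiliar with CFB mode, the loop above follows the standard construction: the IV is encrypted, XORed with the plaintext, and the resulting ciphertext block becomes the next IV. A minimal standalone sketch of the same pattern (block_encrypt_fn and cfb_encrypt_blocks are hypothetical names, not GnuPG API):

#include <stddef.h>

typedef void (*block_encrypt_fn) (void *ctx, unsigned char *out,
                                  const unsigned char *in);

/* CFB encryption of NBLOCKS complete 16-byte blocks.  IV holds
   16 bytes and is updated in place so calls can be chained. */
static void
cfb_encrypt_blocks (void *ctx, block_encrypt_fn enc, unsigned char *iv,
                    unsigned char *out, const unsigned char *in,
                    size_t nblocks)
{
  size_t n;
  int i;

  for (n = 0; n < nblocks; n++)
    {
      enc (ctx, iv, iv);              /* iv = E(iv) */
      for (i = 0; i < 16; i++)
        out[i] = iv[i] ^= in[i];      /* ciphertext also feeds back as IV */
      in  += 16;
      out += 16;
    }
}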
/* Bulk decryption of complete blocks in CFB mode. Caller needs to
make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk encryption feature of
cipher.c. */
void
rijndael_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned char *ivp;
unsigned char temp;
int i;
for ( ;nblocks; nblocks-- )
{
do_encrypt_aligned (ctx, iv, iv);
for (ivp=iv,i=0; i < 16; i++ )
{
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
}
burn_stack (16 + 2*sizeof(int));
}
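The decryption loop mirrors the encryption loop, and the temporary variable above is what allows in-place operation: the ciphertext byte must be saved before the output overwrites it, because the same byte is needed as the next IV. A sketch using the same hypothetical names as above:

/* CFB decryption of NBLOCKS complete 16-byte blocks.  Note that the
   block cipher is used in the encryption direction here as well. */
static void
cfb_decrypt_blocks (void *ctx, block_encrypt_fn enc, unsigned char *iv,
                    unsigned char *out, const unsigned char *in,
                    size_t nblocks)
{
  size_t n;
  int i;
  unsigned char temp;

  for (n = 0; n < nblocks; n++)
    {
      enc (ctx, iv, iv);              /* iv = E(iv) */
      for (i = 0; i < 16; i++)
        {
          temp   = in[i];             /* save ciphertext before overwriting */
          out[i] = iv[i] ^ temp;      /* recover plaintext */
          iv[i]  = temp;              /* ciphertext becomes the next IV */
        }
      in  += 16;
      out += 16;
    }
}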
/* Test a single encryption and decryption with each key size. */