diff --git a/cipher/ChangeLog b/cipher/ChangeLog
index b5732524d..f0e7efb0f 100644
--- a/cipher/ChangeLog
+++ b/cipher/ChangeLog
@@ -1,3 +1,9 @@
+2005-08-11  Werner Koch
+
+	* rijndael.c (rijndael_cfb_encrypt): Experimental code to improve
+	AES performance.  Got about 25% on ia32.
+	* cipher.c (do_cfb_encrypt): Ditto.
+
 2005-06-07  David Shaw
 
 	* random.c: Fix prototype of the fast random gatherer.  Noted by
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 591ce208e..311919fe1 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -536,7 +536,25 @@ do_cfb_encrypt( CIPHER_HANDLE c, byte *outbuf, byte *inbuf, unsigned nbytes )
 	    *outbuf++ = (*ivp++ ^= *inbuf++);
     }
 
-    /* now we can process complete blocks */
+    /* Now we can process complete blocks. */
+#if 0
+    /* Experimental code.  We may use this only for standard CFB
+       because for Phil's mode we need to save the IV from before
+       the last encryption - we don't want to do this in the fast
+       CFB encryption routine. */
+    if (c->algo == CIPHER_ALGO_AES
+        && nbytes >= blocksize
+        && c->mode != CIPHER_MODE_PHILS_CFB) {
+        size_t n;
+
+        memcpy( c->lastiv, c->iv, blocksize );
+        n = (nbytes / blocksize) * blocksize;
+        rijndael_cfb_encrypt (&c->context.c, c->iv, outbuf, inbuf, n);
+        inbuf += n;
+        outbuf += n;
+        nbytes -= n;
+    }
+#endif
     while( nbytes >= blocksize ) {
 	int i;
 	/* encrypt the IV (and save the current one) */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index adf276531..e52e01e96 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1955,6 +1955,57 @@ rijndael_encrypt (void *ctx, byte *b, const byte *a)
   burn_stack (16 + 2*sizeof(int));
 }
 
+#if 0
+/* Experimental code.  Needs to be generalized and we might want to
+   have variants for all possible sizes of the largest scalar type.
+   Also need to make sure that INBUF and OUTBUF are properly
+   aligned.  */
+void
+rijndael_cfb_encrypt (void *ctx, byte *iv,
+                      byte *outbuf, const byte *inbuf, size_t nbytes)
+{
+/*   if ( ((unsigned long)inbuf & 3) || ((unsigned long)outbuf & 3) ) */
+/*     { */
+      /* Not properly aligned, use the slow version.  Actually the
+         compiler might even optimize this pretty well if the target
+         CPU has relaxed alignment requirements.  Thus it is
+         questionable whether we should bother with alignment-wise
+         optimizations ourselves at all.  A quick test with gcc 4.0
+         on ia32 did not show any advantage.  */
+      byte *ivp;
+      int i;
+
+      while (nbytes >= 16)
+        {
+          do_encrypt (ctx, iv, iv);
+          for (i=0, ivp = iv; i < 16; i++)
+            *outbuf++ = (*ivp++ ^= *inbuf++);
+          nbytes -= 16;
+        }
+/*     } */
+/*   else */
+/*     { */
+/*       u32 *ivp; */
+/*       u32 *ob = (u32*)outbuf; */
+/*       const u32 *ib = (const u32*)inbuf; */
+
+/*       while (nbytes >= 16) */
+/*         { */
+/*           do_encrypt (ctx, iv, iv); */
+/*           ivp = iv; */
+/*           *ob++ = (*ivp++ ^= *ib++); */
+/*           *ob++ = (*ivp++ ^= *ib++); */
+/*           *ob++ = (*ivp++ ^= *ib++); */
+/*           *ob++ = (*ivp ^= *ib++); */
+/*           nbytes -= 16; */
+/*         } */
+/*     } */
+  burn_stack (16 + 2*sizeof(int));
+}
+#endif
+
+
+
 
 /* Decrypt one block.  a and b may be the same. */
 static void
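Note: the disabled code above is plain full-block CFB encryption: encrypt the IV in place, XOR the plaintext into it, and the result is both the ciphertext and the feedback value for the next block.  The following standalone sketch shows that loop outside the GnuPG internals; block_encrypt() and BLOCKSIZE are hypothetical stand-ins for do_encrypt() and the AES block size and are not part of the patched sources.

#include <stddef.h>

#define BLOCKSIZE 16   /* AES block size in bytes. */

/* Hypothetical stand-in for GnuPG's do_encrypt(): encrypt one
   BLOCKSIZE block from IN to OUT with the key schedule in CTX. */
void block_encrypt (void *ctx, unsigned char *out, const unsigned char *in);

/* Encrypt NBYTES (assumed to be a multiple of BLOCKSIZE) from INBUF
   to OUTBUF in CFB mode, updating IV in place - the same loop as the
   slow path of rijndael_cfb_encrypt above. */
void
cfb_encrypt_blocks (void *ctx, unsigned char *iv,
                    unsigned char *outbuf, const unsigned char *inbuf,
                    size_t nbytes)
{
  unsigned char *ivp;
  int i;

  while (nbytes >= BLOCKSIZE)
    {
      /* IV := E_k(IV). */
      block_encrypt (ctx, iv, iv);
      /* C := IV XOR P; the ciphertext is written to OUTBUF and also
         kept in IV as the feedback value for the next block. */
      for (i = 0, ivp = iv; i < BLOCKSIZE; i++)
        *outbuf++ = (*ivp++ ^= *inbuf++);
      nbytes -= BLOCKSIZE;
    }
}

The commented-out u32 variant in the patch performs the same XOR four words at a time, which is only safe when INBUF and OUTBUF are word aligned - hence the alignment check discussed in the rijndael.c comment.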