Experimental code to improve AES performance. Got about a 25% speedup on ia32.

Werner Koch 2005-08-11 16:57:29 +00:00
parent 2e38eab93c
commit 0a3eda24ee
3 changed files with 76 additions and 1 deletion

cipher/ChangeLog

@@ -1,3 +1,9 @@
2005-08-11  Werner Koch  <wk@g10code.com>

	* rijndael.c (rijndael_cfb_encrypt): Experimental code to improve
	AES performance.  Got about a 25% speedup on ia32.
	* cipher.c (do_cfb_encrypt): Ditto.

2005-06-07  David Shaw  <dshaw@jabberwocky.com>

	* random.c: Fix prototype of the fast random gatherer.  Noted by

cipher/cipher.c

@@ -536,7 +536,25 @@ do_cfb_encrypt( CIPHER_HANDLE c, byte *outbuf, byte *inbuf, unsigned nbytes )
*outbuf++ = (*ivp++ ^= *inbuf++);
}
/* Now we can process complete blocks. */
#if 0
/* Experimental code.  We may only use this for standard CFB
   because for Phil's mode we need to save the IV from before
   the last encryption - we don't want to do that in the fast
   CFB encryption routine. */
if (c->algo == CIPHER_ALGO_AES
&& nbytes >= blocksize
&& c->mode != CIPHER_MODE_PHILS_CFB) {
size_t n;
memcpy( c->lastiv, c->iv, blocksize );
n = (nbytes / blocksize) * blocksize;
rijndael_cfb_encrypt (&c->context.c, c->iv, outbuf, inbuf, n);
inbuf += n;
outbuf += n;
nbytes -= n;
}
#endif
while( nbytes >= blocksize ) {
int i;
/* encrypt the IV (and save the current one) */
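The comment in the #if 0 block above hints at why Phil's CFB mode is excluded from the fast path: that mode's resync step needs the IV as it stood before the most recent block encryption, and the generic loop preserves it block by block. The following is a minimal sketch of that invariant, not GnuPG's actual code; struct cfb_state, encrypt_block, and cfb_encrypt_generic are hypothetical names standing in for the handle fields and calls used above.

#include <string.h>

#define BLOCKSIZE 16

struct cfb_state {
  unsigned char iv[BLOCKSIZE];      /* current chaining value        */
  unsigned char lastiv[BLOCKSIZE];  /* IV from before the last encrypt */
};

/* Assumed helper: ECB-encrypt one block in place. */
extern void encrypt_block (void *ctx, unsigned char *buf);

static void
cfb_encrypt_generic (void *ctx, struct cfb_state *s,
                     unsigned char *out, const unsigned char *in,
                     size_t nblocks)
{
  size_t k;
  int i;

  for (k = 0; k < nblocks; k++)
    {
      /* Save the IV before encrypting it, so a later CFB resync
         (Phil's mode) can rewind to it.  */
      memcpy (s->lastiv, s->iv, BLOCKSIZE);
      encrypt_block (ctx, s->iv);
      /* XOR the keystream into the input; the ciphertext also
         becomes the next chaining value.  */
      for (i = 0; i < BLOCKSIZE; i++)
        *out++ = (s->iv[i] ^= *in++);
    }
}

A bulk routine like rijndael_cfb_encrypt processes many blocks per call and can only save the IV once up front (as the memcpy into c->lastiv above does), which is fine for standard CFB but loses the per-block lastiv that Phil's mode relies on - hence the c->mode != CIPHER_MODE_PHILS_CFB guard.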

cipher/rijndael.c

@@ -1955,6 +1955,57 @@ rijndael_encrypt (void *ctx, byte *b, const byte *a)
burn_stack (16 + 2*sizeof(int));
}
#if 0
/* Experimental code.  Needs to be generalized and we might want
   to have variants for all possible sizes of the largest scalar
   type.  Also need to make sure that INBUF and OUTBUF are
   properly aligned. */
void
rijndael_cfb_encrypt (void *ctx, byte *iv,
byte *outbuf, const byte *inbuf, size_t nbytes)
{
/* if ( ((unsigned long)inbuf & 3) || ((unsigned long)outbuf & 3) ) */
/*   { */
      /* Not properly aligned, use the slow version.  Actually the
         compiler might even optimize this pretty well if the
         target CPU has relaxed alignment requirements.  Thus it is
         questionable whether we should go to the hassle of doing
         alignment-wise optimizations ourselves at all.  A quick
         test with gcc 4.0 on ia32 did not show any advantages. */
byte *ivp;
int i;
while (nbytes >= 16)
{
do_encrypt (ctx, iv, iv);
for (i=0, ivp = iv; i < 16; i++)
*outbuf++ = (*ivp++ ^= *inbuf++);
nbytes -= 16;
}
/* } */
/* else */
/* { */
/* u32 *ivp; */
/* u32 *ob = (u32*)outbuf; */
/* const u32 *ib = (const u32*)inbuf; */
/* while (nbytes >= 16) */
/* { */
/* do_encrypt (ctx, iv, iv); */
/* ivp = iv; */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp ^= *ib++); */
/* nbytes -= 16; */
/* } */
/* } */
burn_stack (16 + 2*sizeof(int));
}
#endif
/* Decrypt one block. a and b may be the same. */
static void
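The commented-out u32 path above casts byte pointers to word pointers, which is exactly the alignment hazard the function's opening comment worries about. One portable middle ground, sketched below, is to route word accesses through memcpy; the helper cfb_xor_words is not part of this commit, and C99 <stdint.h> is assumed where the file would use its own u32.

#include <stdint.h>
#include <string.h>

static void
cfb_xor_words (unsigned char *iv, unsigned char *outbuf,
               const unsigned char *inbuf)
{
  int i;
  uint32_t w, x;

  for (i = 0; i < 16; i += 4)
    {
      memcpy (&w, iv + i, 4);      /* load IV word, alignment-safe  */
      memcpy (&x, inbuf + i, 4);   /* load input word               */
      w ^= x;                      /* ciphertext word = IV ^ input  */
      memcpy (iv + i, &w, 4);      /* chain: feeds the next block   */
      memcpy (outbuf + i, &w, 4);  /* emit ciphertext               */
    }
}

On targets with relaxed alignment such as ia32, gcc lowers each fixed four-byte memcpy to a single 32-bit move, giving much the same code as the pointer casts without their undefined behaviour on stricter CPUs.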