
Improved AES performance.

Werner Koch 2008-03-22 17:01:37 +00:00
parent a614eabba9
commit 537cada38e
10 changed files with 392 additions and 242 deletions


@@ -1,5 +1,5 @@
/* Rijndael (AES) for GnuPG
* Copyright (C) 2000, 2001 Free Software Foundation, Inc.
* Copyright (C) 2000, 2001, 2008 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@@ -1866,86 +1866,119 @@ prepare_decryption( RIJNDAEL_context *ctx )
/* Encrypt one block. A and B may be the same. */
/* Encrypt one block. A and B need to be aligned on a 4 byte
boundary. A and B may be the same. */
static void
do_encrypt (const RIJNDAEL_context *ctx, byte *b, const byte *a)
do_encrypt_aligned (const RIJNDAEL_context *ctx,
unsigned char *b, const unsigned char *a)
{
int r;
byte temp[4][4];
int ROUNDS = ctx->ROUNDS;
#define rk (ctx->keySched)
int ROUNDS = ctx->ROUNDS;
int r;
union
{
u32 tempu32[4]; /* Force correct alignment. */
byte temp[4][4];
} u;
*((u32*)temp[0]) = *((u32*)(a )) ^ *((u32*)rk[0][0]);
*((u32*)temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[0][1]);
*((u32*)temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[0][2]);
*((u32*)temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[0][3]);
*((u32*)(b )) = *((u32*)T1[temp[0][0]])
^ *((u32*)T2[temp[1][1]])
^ *((u32*)T3[temp[2][2]])
^ *((u32*)T4[temp[3][3]]);
*((u32*)(b + 4)) = *((u32*)T1[temp[1][0]])
^ *((u32*)T2[temp[2][1]])
^ *((u32*)T3[temp[3][2]])
^ *((u32*)T4[temp[0][3]]);
*((u32*)(b + 8)) = *((u32*)T1[temp[2][0]])
^ *((u32*)T2[temp[3][1]])
^ *((u32*)T3[temp[0][2]])
^ *((u32*)T4[temp[1][3]]);
*((u32*)(b +12)) = *((u32*)T1[temp[3][0]])
^ *((u32*)T2[temp[0][1]])
^ *((u32*)T3[temp[1][2]])
^ *((u32*)T4[temp[2][3]]);
for (r = 1; r < ROUNDS-1; r++) {
*((u32*)temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]);
*((u32*)temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]);
*((u32*)temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]);
*((u32*)temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]);
*((u32*)u.temp[0]) = *((u32*)(a )) ^ *((u32*)rk[0][0]);
*((u32*)u.temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[0][1]);
*((u32*)u.temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[0][2]);
*((u32*)u.temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[0][3]);
*((u32*)(b )) = (*((u32*)T1[u.temp[0][0]])
^ *((u32*)T2[u.temp[1][1]])
^ *((u32*)T3[u.temp[2][2]])
^ *((u32*)T4[u.temp[3][3]]));
*((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]])
^ *((u32*)T2[u.temp[2][1]])
^ *((u32*)T3[u.temp[3][2]])
^ *((u32*)T4[u.temp[0][3]]));
*((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]])
^ *((u32*)T2[u.temp[3][1]])
^ *((u32*)T3[u.temp[0][2]])
^ *((u32*)T4[u.temp[1][3]]));
*((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]])
^ *((u32*)T2[u.temp[0][1]])
^ *((u32*)T3[u.temp[1][2]])
^ *((u32*)T4[u.temp[2][3]]));
*((u32*)(b )) = *((u32*)T1[temp[0][0]])
^ *((u32*)T2[temp[1][1]])
^ *((u32*)T3[temp[2][2]])
^ *((u32*)T4[temp[3][3]]);
*((u32*)(b + 4)) = *((u32*)T1[temp[1][0]])
^ *((u32*)T2[temp[2][1]])
^ *((u32*)T3[temp[3][2]])
^ *((u32*)T4[temp[0][3]]);
*((u32*)(b + 8)) = *((u32*)T1[temp[2][0]])
^ *((u32*)T2[temp[3][1]])
^ *((u32*)T3[temp[0][2]])
^ *((u32*)T4[temp[1][3]]);
*((u32*)(b +12)) = *((u32*)T1[temp[3][0]])
^ *((u32*)T2[temp[0][1]])
^ *((u32*)T3[temp[1][2]])
^ *((u32*)T4[temp[2][3]]);
for (r = 1; r < ROUNDS-1; r++)
{
*((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]);
*((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]);
*((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]);
*((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]);
*((u32*)(b )) = (*((u32*)T1[u.temp[0][0]])
^ *((u32*)T2[u.temp[1][1]])
^ *((u32*)T3[u.temp[2][2]])
^ *((u32*)T4[u.temp[3][3]]));
*((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]])
^ *((u32*)T2[u.temp[2][1]])
^ *((u32*)T3[u.temp[3][2]])
^ *((u32*)T4[u.temp[0][3]]));
*((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]])
^ *((u32*)T2[u.temp[3][1]])
^ *((u32*)T3[u.temp[0][2]])
^ *((u32*)T4[u.temp[1][3]]));
*((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]])
^ *((u32*)T2[u.temp[0][1]])
^ *((u32*)T3[u.temp[1][2]])
^ *((u32*)T4[u.temp[2][3]]));
}
/* last round is special */
*((u32*)temp[0]) = *((u32*)(b )) ^ *((u32*)rk[ROUNDS-1][0]);
*((u32*)temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[ROUNDS-1][1]);
*((u32*)temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[ROUNDS-1][2]);
*((u32*)temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[ROUNDS-1][3]);
b[ 0] = T1[temp[0][0]][1];
b[ 1] = T1[temp[1][1]][1];
b[ 2] = T1[temp[2][2]][1];
b[ 3] = T1[temp[3][3]][1];
b[ 4] = T1[temp[1][0]][1];
b[ 5] = T1[temp[2][1]][1];
b[ 6] = T1[temp[3][2]][1];
b[ 7] = T1[temp[0][3]][1];
b[ 8] = T1[temp[2][0]][1];
b[ 9] = T1[temp[3][1]][1];
b[10] = T1[temp[0][2]][1];
b[11] = T1[temp[1][3]][1];
b[12] = T1[temp[3][0]][1];
b[13] = T1[temp[0][1]][1];
b[14] = T1[temp[1][2]][1];
b[15] = T1[temp[2][3]][1];
*((u32*)(b )) ^= *((u32*)rk[ROUNDS][0]);
*((u32*)(b+ 4)) ^= *((u32*)rk[ROUNDS][1]);
*((u32*)(b+ 8)) ^= *((u32*)rk[ROUNDS][2]);
*((u32*)(b+12)) ^= *((u32*)rk[ROUNDS][3]);
/* Last round is special. */
*((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[ROUNDS-1][0]);
*((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[ROUNDS-1][1]);
*((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[ROUNDS-1][2]);
*((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[ROUNDS-1][3]);
b[ 0] = T1[u.temp[0][0]][1];
b[ 1] = T1[u.temp[1][1]][1];
b[ 2] = T1[u.temp[2][2]][1];
b[ 3] = T1[u.temp[3][3]][1];
b[ 4] = T1[u.temp[1][0]][1];
b[ 5] = T1[u.temp[2][1]][1];
b[ 6] = T1[u.temp[3][2]][1];
b[ 7] = T1[u.temp[0][3]][1];
b[ 8] = T1[u.temp[2][0]][1];
b[ 9] = T1[u.temp[3][1]][1];
b[10] = T1[u.temp[0][2]][1];
b[11] = T1[u.temp[1][3]][1];
b[12] = T1[u.temp[3][0]][1];
b[13] = T1[u.temp[0][1]][1];
b[14] = T1[u.temp[1][2]][1];
b[15] = T1[u.temp[2][3]][1];
*((u32*)(b )) ^= *((u32*)rk[ROUNDS][0]);
*((u32*)(b+ 4)) ^= *((u32*)rk[ROUNDS][1]);
*((u32*)(b+ 8)) ^= *((u32*)rk[ROUNDS][2]);
*((u32*)(b+12)) ^= *((u32*)rk[ROUNDS][3]);
#undef rk
}
static void
do_encrypt (const RIJNDAEL_context *ctx,
unsigned char *bx, const unsigned char *ax)
{
/* BX and AX are not necessarily correctly aligned. Thus we need to
copy them here. */
union
{
u32 dummy[4];
byte a[16];
} a;
union
{
u32 dummy[4];
byte b[16];
} b;
memcpy (a.a, ax, 16);
do_encrypt_aligned (ctx, b.b, a.a);
memcpy (bx, b.b, 16);
}
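The copy-through-a-union pattern used by do_encrypt above is worth seeing in isolation. The following standalone sketch (names such as aes_block_t and encrypt_any are illustrative only, not part of this file) shows the same idea, assuming u32 is a 32-bit type as in the GnuPG sources:

#include <string.h>

typedef unsigned char byte;
typedef unsigned int  u32;             /* assumed 32-bit, as in GnuPG */

/* A 16-byte block overlaid with a u32 array so the compiler
   guarantees at least 4-byte alignment of the byte view. */
typedef union
{
  u32  aligned[4];
  byte bytes[16];
} aes_block_t;

/* Hypothetical wrapper: copy a possibly unaligned input block into an
   aligned buffer, run a fast routine that requires alignment, then
   copy the result back to the possibly unaligned destination. */
static void
encrypt_any (const void *ctx, byte *out, const byte *in,
             void (*encrypt_aligned)(const void *, byte *, const byte *))
{
  aes_block_t a, b;

  memcpy (a.bytes, in, 16);
  encrypt_aligned (ctx, b.bytes, a.bytes);
  memcpy (out, b.bytes, 16);
}

The two memcpy calls cost a little per block, which is presumably why the new bulk CFB functions below keep the IV in an aligned buffer and call do_encrypt_aligned directly.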
static void
rijndael_encrypt (void *ctx, byte *b, const byte *a)
{
@@ -1953,56 +1986,6 @@ rijndael_encrypt (void *ctx, byte *b, const byte *a)
burn_stack (16 + 2*sizeof(int));
}
#if 0
/* Experimental code. Needs to be generalized and we might want to
have variants for all possible sizes of the largest scalar type.
Also need to make sure that INBUF and OUTBUF are properly
aligned. */
void
rijndael_cfb_encrypt (void *ctx, byte *iv,
byte *outbuf, const byte *inbuf, size_t nbytes)
{
/* if ( ((unsigned long)inbuf & 3) || ((unsigned long)outbuf & 3) ) */
/* { */
Not properly aligned, use the slow version. Actually the
compiler might even optimize this pretty well if the
target CPU has relaxed alignment requirements. Thus it is
questionable whether we should go into the hassle of
doing alignment-wise optimizations ourselves at all. A quick test
with gcc 4.0 on ia32 did not show any advantages.
byte *ivp;
int i;
while (nbytes >= 16)
{
do_encrypt (ctx, iv, iv);
for (i=0, ivp = iv; i < 16; i++)
*outbuf++ = (*ivp++ ^= *inbuf++);
nbytes -= 16;
}
/* } */
/* else */
/* { */
/* u32 *ivp; */
/* u32 *ob = (u32*)outbuf; */
/* const u32 *ib = (const u32*)inbuf; */
/* while (nbytes >= 16) */
/* { */
/* do_encrypt (ctx, iv, iv); */
/* ivp = iv; */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp ^= *ib++); */
/* nbytes -= 16; */
/* } */
/* } */
burn_stack (16 + 2*sizeof(int));
}
#endif
/* Decrypt one block. a and b may be the same. */
@@ -2097,6 +2080,67 @@ rijndael_decrypt (void *ctx, byte *b, const byte *a)
do_decrypt (ctx, b, a);
burn_stack (16+2*sizeof(int));
}
/* Bulk encryption of complete blocks in CFB mode. Caller needs to
make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk encryption feature of
cipher.c. */
void
rijndael_cfb_enc (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned char *ivp;
int i;
for ( ;nblocks; nblocks-- )
{
/* Encrypt the IV. */
do_encrypt_aligned (ctx, iv, iv);
/* XOR the input with the IV and store input into IV. */
for (ivp=iv,i=0; i < 16; i++ )
*outbuf++ = (*ivp++ ^= *inbuf++);
}
burn_stack (16 + 2*sizeof(int));
}
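For readers unfamiliar with CFB mode, the loop above follows the standard construction: the IV is encrypted, XORed with the plaintext, and the resulting ciphertext block becomes the next IV. A minimal standalone sketch of the same pattern (block_encrypt_fn and cfb_encrypt_blocks are hypothetical names, not GnuPG API):

#include <stddef.h>

typedef void (*block_encrypt_fn) (void *ctx, unsigned char *out,
                                  const unsigned char *in);

/* CFB encryption of NBLOCKS complete 16-byte blocks.  IV holds
   16 bytes and is updated in place so calls can be chained. */
static void
cfb_encrypt_blocks (void *ctx, block_encrypt_fn enc, unsigned char *iv,
                    unsigned char *out, const unsigned char *in,
                    size_t nblocks)
{
  size_t n;
  int i;

  for (n = 0; n < nblocks; n++)
    {
      enc (ctx, iv, iv);              /* iv = E(iv) */
      for (i = 0; i < 16; i++)
        out[i] = iv[i] ^= in[i];      /* ciphertext also feeds back as IV */
      in  += 16;
      out += 16;
    }
}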
/* Bulk decryption of complete blocks in CFB mode. Caller needs to
make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk encryption feature of
cipher.c. */
void
rijndael_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned char *ivp;
unsigned char temp;
int i;
for ( ;nblocks; nblocks-- )
{
do_encrypt_aligned (ctx, iv, iv);
for (ivp=iv,i=0; i < 16; i++ )
{
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
}
burn_stack (16 + 2*sizeof(int));
}
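The decryption loop mirrors the encryption loop, and the temporary variable above is what allows in-place operation: the ciphertext byte must be saved before the output overwrites it, because the same byte is needed as the next IV. A sketch using the same hypothetical names as above:

/* CFB decryption of NBLOCKS complete 16-byte blocks.  Note that the
   block cipher is used in the encryption direction here as well. */
static void
cfb_decrypt_blocks (void *ctx, block_encrypt_fn enc, unsigned char *iv,
                    unsigned char *out, const unsigned char *in,
                    size_t nblocks)
{
  size_t n;
  int i;
  unsigned char temp;

  for (n = 0; n < nblocks; n++)
    {
      enc (ctx, iv, iv);              /* iv = E(iv) */
      for (i = 0; i < 16; i++)
        {
          temp   = in[i];             /* save ciphertext before overwriting */
          out[i] = iv[i] ^ temp;      /* recover plaintext */
          iv[i]  = temp;              /* ciphertext becomes the next IV */
        }
      in  += 16;
      out += 16;
    }
}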
/* Test a single encryption and decryption with each key size. */