Improved AES performance.

Werner Koch 2008-03-22 17:01:37 +00:00
parent a614eabba9
commit 537cada38e
10 changed files with 392 additions and 242 deletions

NEWS
View File

@ -1,3 +1,11 @@
Noteworthy changes in version 1.4.9 (unreleased)
------------------------------------------------
* Improved AES encryption performance by more than 20% (on ia32).
Decryption is also a bit faster.
Noteworthy changes in version 1.4.8 (2007-12-20)
------------------------------------------------

View File

@ -1,3 +1,13 @@
2008-03-22 Werner Koch <wk@g10code.com>
* cipher.c (struct cipher_handle_s): Make sure IV is u32
aligned. Change all users of IV.
(do_cfb_encrypt): Optimize and use bulk code for AES.
(do_cfb_decrypt): Ditto.
* rijndael.c (do_encrypt): Remove.
(do_encrypt_aligned, do_encrypt): New. Taken from libgcrypt.
(rijndael_cfb_enc, rijndael_cfb_dec): New.
2007-12-12 Werner Koch <wk@g10code.com>
* pubkey.c (pubkey_encrypt, pubkey_decrypt): Allow type 20 keys.
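
The IV alignment change recorded above uses a standard C idiom: the byte array is wrapped in a union with wider integer members so that the array inherits their alignment. A minimal, self-contained sketch of that idiom (the union and variable names here are illustrative, not taken from the GnuPG sources):

#include <stdio.h>
#include <string.h>

typedef unsigned int u32;   /* Assumes int is 32 bits, as GnuPG's u32 is. */

/* A 16-byte buffer forced onto an unsigned long and u32 boundary by
   the union, mirroring the u_iv union added to cipher_handle_s.  */
union aligned_iv
{
  unsigned long dummy_ul;
  u32 dummy_u32;
  unsigned char iv[16];
};

int
main (void)
{
  union aligned_iv u;

  memset (u.iv, 0, sizeof u.iv);
  /* Safe even on strict-alignment CPUs because the union guarantees
     the alignment; this is what lets optimized code touch the IV a
     word at a time instead of byte by byte.  */
  *(u32 *)u.iv ^= 0x01020304;
  printf ("IV lives at %p\n", (void *)u.iv);
  return 0;
}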

View File

@ -118,8 +118,6 @@ twofish_get_info( int algo, size_t *keylen,
void (**decryptf)( void *c, byte *outbuf, const byte *inbuf )
);
/* this is just a kludge for the time we have not yet changed the cipher
* stuff to the scheme we use for random and digests */
const char *
rijndael_get_info( int algo, size_t *keylen,
size_t *blocksize, size_t *contextsize,
@ -127,6 +125,12 @@ rijndael_get_info( int algo, size_t *keylen,
void (**encryptf)(void *c, byte *outbuf, const byte *inbuf),
void (**decryptf)(void *c, byte *outbuf, const byte *inbuf)
);
void rijndael_cfb_enc (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks);
void rijndael_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks);
const char *
idea_get_info( int algo, size_t *keylen,
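
These *_get_info declarations share one contract: the dispatcher asks an algorithm module for its parameters and receives the work functions through pointer out-arguments. A rough sketch of that contract with an invented module (algorithm id 42 and every toy_* name are hypothetical, and the setkey argument of the real interface is omitted for brevity):

#include <stddef.h>
#include <stdio.h>

typedef unsigned char byte;

static void
toy_encrypt (void *c, byte *outbuf, const byte *inbuf)
{
  (void)c;
  *outbuf = *inbuf ^ 0xff;      /* Stand-in for a real cipher. */
}

static void
toy_decrypt (void *c, byte *outbuf, const byte *inbuf)
{
  (void)c;
  *outbuf = *inbuf ^ 0xff;
}

/* Shape of a *_get_info function: report key, block and context sizes
   and hand back the encrypt/decrypt entry points, or return NULL for
   an unknown algorithm id.  */
static const char *
toy_get_info (int algo, size_t *keylen, size_t *blocksize,
              size_t *contextsize,
              void (**encryptf)(void *c, byte *outbuf, const byte *inbuf),
              void (**decryptf)(void *c, byte *outbuf, const byte *inbuf))
{
  if (algo != 42)
    return NULL;
  *keylen = 128;
  *blocksize = 16;
  *contextsize = 64;
  *encryptf = toy_encrypt;
  *decryptf = toy_decrypt;
  return "TOY";
}

int
main (void)
{
  size_t k, b, c;
  void (*enc)(void *c, byte *outbuf, const byte *inbuf);
  void (*dec)(void *c, byte *outbuf, const byte *inbuf);
  const char *name = toy_get_info (42, &k, &b, &c, &enc, &dec);
  byte in = 'x', out;

  if (name)
    {
      enc (NULL, &out, &in);
      printf ("%s encrypts %c to %02x\n", name, in, out);
    }
  return 0;
}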

View File

@ -1,6 +1,6 @@
/* cipher.c - cipher dispatcher
* Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
* 2007 Free Software Foundation, Inc.
* 2007, 2008 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@ -52,17 +52,26 @@ static struct cipher_table_s cipher_table[TABLE_SIZE];
static int disabled_algos[TABLE_SIZE];
struct cipher_handle_s {
int algo;
int mode;
size_t blocksize;
byte iv[MAX_BLOCKSIZE]; /* (this should be ulong aligned) */
byte lastiv[MAX_BLOCKSIZE];
int unused; /* in IV */
int (*setkey)( void *c, const byte *key, unsigned keylen );
void (*encrypt)( void *c, byte *outbuf, const byte *inbuf );
void (*decrypt)( void *c, byte *outbuf, const byte *inbuf );
PROPERLY_ALIGNED_TYPE context;
struct cipher_handle_s
{
int algo;
int mode;
size_t blocksize;
/* The initialization vector. To help code optimization we make
sure that it is aligned on an unsigned long and u32 boundary. */
union {
unsigned long dummy_ul_iv;
u32 dummy_u32_iv;
unsigned char iv[MAX_BLOCKSIZE];
} u_iv;
byte lastiv[MAX_BLOCKSIZE];
int unused; /* in IV */
int (*setkey)( void *c, const byte *key, unsigned keylen );
void (*encrypt)( void *c, byte *outbuf, const byte *inbuf );
void (*decrypt)( void *c, byte *outbuf, const byte *inbuf );
PROPERLY_ALIGNED_TYPE context;
};
@ -459,14 +468,14 @@ cipher_setkey( CIPHER_HANDLE c, byte *key, unsigned keylen )
void
cipher_setiv( CIPHER_HANDLE c, const byte *iv, unsigned ivlen )
{
memset( c->iv, 0, c->blocksize );
memset( c->u_iv.iv, 0, c->blocksize );
if( iv ) {
if( ivlen != c->blocksize )
log_info("WARNING: cipher_setiv: ivlen=%u blklen=%u\n",
ivlen, (unsigned)c->blocksize );
if( ivlen > c->blocksize )
ivlen = c->blocksize;
memcpy( c->iv, iv, ivlen );
memcpy( c->u_iv.iv, iv, ivlen );
}
c->unused = 0;
}
@ -507,10 +516,10 @@ do_cbc_encrypt( CIPHER_HANDLE c, byte *outbuf, byte *inbuf, unsigned nblocks )
/* fixme: the xor should work on words and not on
* bytes. Maybe it is a good idea to enhance the cipher backend
* API to allow for CBC handling in the backend */
for(ivp=c->iv,i=0; i < blocksize; i++ )
for(ivp=c->u_iv.iv,i=0; i < blocksize; i++ )
outbuf[i] = inbuf[i] ^ *ivp++;
(*c->encrypt)( &c->context.c, outbuf, outbuf );
memcpy(c->iv, outbuf, blocksize );
memcpy(c->u_iv.iv, outbuf, blocksize );
inbuf += c->blocksize;
outbuf += c->blocksize;
}
@ -530,9 +539,9 @@ do_cbc_decrypt( CIPHER_HANDLE c, byte *outbuf, byte *inbuf, unsigned nblocks )
* for this here because it is not used otherwise */
memcpy(c->lastiv, inbuf, blocksize );
(*c->decrypt)( &c->context.c, outbuf, inbuf );
for(ivp=c->iv,i=0; i < blocksize; i++ )
for(ivp=c->u_iv.iv,i=0; i < blocksize; i++ )
outbuf[i] ^= *ivp++;
memcpy(c->iv, c->lastiv, blocksize );
memcpy(c->u_iv.iv, c->lastiv, blocksize );
inbuf += c->blocksize;
outbuf += c->blocksize;
}
@ -542,119 +551,181 @@ do_cbc_decrypt( CIPHER_HANDLE c, byte *outbuf, byte *inbuf, unsigned nblocks )
static void
do_cfb_encrypt( CIPHER_HANDLE c, byte *outbuf, byte *inbuf, unsigned nbytes )
{
byte *ivp;
size_t blocksize = c->blocksize;
byte *ivp;
size_t blocksize = c->blocksize;
size_t blocksize_x_2 = blocksize + blocksize;
if( nbytes <= c->unused ) {
/* short enough to be encoded by the remaining XOR mask */
/* XOR the input with the IV and store input into IV */
for(ivp=c->iv+c->blocksize - c->unused; nbytes; nbytes--, c->unused-- )
if ( nbytes <= c->unused )
{
/* Short enough to be encoded by the remaining XOR mask. XOR
the input with the IV and store input into IV. */
for (ivp=c->u_iv.iv+c->blocksize - c->unused; nbytes;
nbytes--, c->unused-- )
*outbuf++ = (*ivp++ ^= *inbuf++);
return;
}
if( c->unused ) {
/* XOR the input with the IV and store input into IV */
nbytes -= c->unused;
for(ivp=c->iv+blocksize - c->unused; c->unused; c->unused-- )
*outbuf++ = (*ivp++ ^= *inbuf++);
if ( c->unused )
{
/* XOR the input with the IV and store input into IV. */
nbytes -= c->unused;
for (ivp=c->u_iv.iv+blocksize - c->unused; c->unused; c->unused-- )
*outbuf++ = (*ivp++ ^= *inbuf++);
}
/* Now we can process complete blocks. */
#if 0
/* Experimental code. We may only use this for standard CFB
because for Phil's mode we need to save the IV from before the
last encryption - we don't want to do this in the fast CFB
encryption routine. */
if (c->algo == CIPHER_ALGO_AES
&& nbytes >= blocksize
&& c->mode != CIPHER_MODE_PHILS_CFB) {
size_t n;
/* Now we can process complete blocks. We use a loop as long as we
have at least 2 blocks and use conditions for the rest. This
also allows us to use a bulk encryption function if available. */
#ifdef USE_AES
if (nbytes >= blocksize_x_2
&& (c->algo == CIPHER_ALGO_AES
|| c->algo == CIPHER_ALGO_AES256
|| c->algo == CIPHER_ALGO_AES192))
{
unsigned int nblocks = nbytes / blocksize;
rijndael_cfb_enc (&c->context.c, c->u_iv.iv, outbuf, inbuf, nblocks);
outbuf += nblocks * blocksize;
inbuf += nblocks * blocksize;
nbytes -= nblocks * blocksize;
}
else
#endif /*USE_AES*/
{
while ( nbytes >= blocksize_x_2 )
{
int i;
/* Encrypt the IV. */
c->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
/* XOR the input with the IV and store input into IV. */
for(ivp=c->u_iv.iv,i=0; i < blocksize; i++ )
*outbuf++ = (*ivp++ ^= *inbuf++);
nbytes -= blocksize;
}
}
memcpy( c->lastiv, c->iv, blocksize );
n = (nbytes / blocksize) * blocksize;
rijndael_cfb_encrypt (&c->context.c, c->iv, outbuf, inbuf, n);
inbuf += n;
outbuf += n;
nbytes -= n;
if ( nbytes >= blocksize )
{
int i;
/* Save the current IV and then encrypt the IV. */
memcpy( c->lastiv, c->u_iv.iv, blocksize );
c->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
/* XOR the input with the IV and store input into IV */
for(ivp=c->u_iv.iv,i=0; i < blocksize; i++ )
*outbuf++ = (*ivp++ ^= *inbuf++);
nbytes -= blocksize;
}
#endif
while( nbytes >= blocksize ) {
int i;
/* encrypt the IV (and save the current one) */
memcpy( c->lastiv, c->iv, blocksize );
(*c->encrypt)( &c->context.c, c->iv, c->iv );
/* XOR the input with the IV and store input into IV */
for(ivp=c->iv,i=0; i < blocksize; i++ )
*outbuf++ = (*ivp++ ^= *inbuf++);
nbytes -= blocksize;
}
if( nbytes ) { /* process the remaining bytes */
/* encrypt the IV (and save the current one) */
memcpy( c->lastiv, c->iv, blocksize );
(*c->encrypt)( &c->context.c, c->iv, c->iv );
c->unused = blocksize;
/* and apply the xor */
c->unused -= nbytes;
for(ivp=c->iv; nbytes; nbytes-- )
*outbuf++ = (*ivp++ ^= *inbuf++);
if ( nbytes )
{
/* Save the current IV and then encrypt the IV. */
memcpy (c->lastiv, c->u_iv.iv, blocksize );
c->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
c->unused = blocksize;
/* Apply the XOR. */
c->unused -= nbytes;
for(ivp=c->u_iv.iv; nbytes; nbytes-- )
*outbuf++ = (*ivp++ ^= *inbuf++);
}
}
static void
do_cfb_decrypt( CIPHER_HANDLE c, byte *outbuf, byte *inbuf, unsigned nbytes )
{
byte *ivp;
ulong temp;
size_t blocksize = c->blocksize;
if( nbytes <= c->unused ) {
/* short enough to be encoded by the remaining XOR mask */
/* XOR the input with the IV and store input into IV */
for(ivp=c->iv+blocksize - c->unused; nbytes; nbytes--,c->unused--){
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
return;
unsigned char *ivp;
unsigned long temp;
int i;
size_t blocksize = c->blocksize;
size_t blocksize_x_2 = blocksize + blocksize;
if (nbytes <= c->unused)
{
/* Short enough to be encoded by the remaining XOR mask. */
/* XOR the input with the IV and store input into IV. */
for (ivp=c->u_iv.iv+blocksize - c->unused;
nbytes;
nbytes--, c->unused--)
{
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
return;
}
if (c->unused)
{
/* XOR the input with the IV and store input into IV. */
nbytes -= c->unused;
for (ivp=c->u_iv.iv+blocksize - c->unused; c->unused; c->unused-- )
{
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
}
/* Now we can process complete blocks. We use a loop as long as we
have at least 2 blocks and use conditions for the rest. This
also allows us to use a bulk encryption function if available. */
#ifdef USE_AES
if (nbytes >= blocksize_x_2
&& (c->algo == CIPHER_ALGO_AES
|| c->algo == CIPHER_ALGO_AES256
|| c->algo == CIPHER_ALGO_AES192))
{
unsigned int nblocks = nbytes / blocksize;
rijndael_cfb_dec (&c->context.c, c->u_iv.iv, outbuf, inbuf, nblocks);
outbuf += nblocks * blocksize;
inbuf += nblocks * blocksize;
nbytes -= nblocks * blocksize;
}
else
#endif /*USE_AES*/
{
while (nbytes >= blocksize_x_2 )
{
/* Encrypt the IV. */
c->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
/* XOR the input with the IV and store input into IV. */
for (ivp=c->u_iv.iv,i=0; i < blocksize; i++ )
{
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
nbytes -= blocksize;
}
}
if( c->unused ) {
/* XOR the input with the IV and store input into IV */
nbytes -= c->unused;
for(ivp=c->iv+blocksize - c->unused; c->unused; c->unused-- ) {
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
if (nbytes >= blocksize )
{
/* Save the current IV and then encrypt the IV. */
memcpy ( c->lastiv, c->u_iv.iv, blocksize);
c->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
/* XOR the input with the IV and store input into IV */
for (ivp=c->u_iv.iv,i=0; i < blocksize; i++ )
{
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
nbytes -= blocksize;
}
/* now we can process complete blocks */
while( nbytes >= blocksize ) {
int i;
/* encrypt the IV (and save the current one) */
memcpy( c->lastiv, c->iv, blocksize );
(*c->encrypt)( &c->context.c, c->iv, c->iv );
/* XOR the input with the IV and store input into IV */
for(ivp=c->iv,i=0; i < blocksize; i++ ) {
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
nbytes -= blocksize;
}
if( nbytes ) { /* process the remaining bytes */
/* encrypt the IV (and save the current one) */
memcpy( c->lastiv, c->iv, blocksize );
(*c->encrypt)( &c->context.c, c->iv, c->iv );
c->unused = blocksize;
/* and apply the xor */
c->unused -= nbytes;
for(ivp=c->iv; nbytes; nbytes-- ) {
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
if (nbytes)
{
/* Save the current IV and then encrypt the IV. */
memcpy ( c->lastiv, c->u_iv.iv, blocksize );
c->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
c->unused = blocksize;
/* Apply the XOR. */
c->unused -= nbytes;
for (ivp=c->u_iv.iv; nbytes; nbytes-- )
{
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
}
}
@ -732,8 +803,8 @@ void
cipher_sync( CIPHER_HANDLE c )
{
if( c->mode == CIPHER_MODE_PHILS_CFB && c->unused ) {
memmove(c->iv + c->unused, c->iv, c->blocksize - c->unused );
memcpy(c->iv, c->lastiv + c->blocksize - c->unused, c->unused);
memmove(c->u_iv.iv + c->unused, c->u_iv.iv, c->blocksize - c->unused );
memcpy(c->u_iv.iv, c->lastiv + c->blocksize - c->unused, c->unused);
c->unused = 0;
}
}
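
To see the control flow of do_cfb_encrypt and do_cfb_decrypt without the partial-block and Phil's-mode bookkeeping, here is a self-contained sketch of CFB over whole blocks only. toy_encrypt is a placeholder so the sketch runs without AES; only the feedback structure corresponds to the code above:

#include <stdio.h>
#include <string.h>

#define BLOCKSIZE 16

/* Placeholder block cipher: XOR with a fixed pad.  Not secure; it
   merely gives the CFB loops something invertible to call.  */
static void
toy_encrypt (unsigned char *out, const unsigned char *in)
{
  int i;
  for (i = 0; i < BLOCKSIZE; i++)
    out[i] = in[i] ^ 0x5a;
}

/* CFB encryption of whole blocks: encrypt the IV, XOR with the input,
   and feed the resulting ciphertext back as the next IV.  */
static void
cfb_encrypt_blocks (unsigned char *iv, unsigned char *out,
                    const unsigned char *in, unsigned int nblocks)
{
  int i;
  for (; nblocks; nblocks--)
    {
      toy_encrypt (iv, iv);
      for (i = 0; i < BLOCKSIZE; i++)
        out[i] = (iv[i] ^= in[i]);
      out += BLOCKSIZE;
      in += BLOCKSIZE;
    }
}

/* CFB decryption also runs the cipher forward; the old ciphertext
   byte must be saved in a temporary because it becomes the next IV.  */
static void
cfb_decrypt_blocks (unsigned char *iv, unsigned char *out,
                    const unsigned char *in, unsigned int nblocks)
{
  unsigned char t;
  int i;
  for (; nblocks; nblocks--)
    {
      toy_encrypt (iv, iv);
      for (i = 0; i < BLOCKSIZE; i++)
        {
          t = in[i];
          out[i] = iv[i] ^ t;
          iv[i] = t;
        }
      out += BLOCKSIZE;
      in += BLOCKSIZE;
    }
}

int
main (void)
{
  unsigned char iv1[BLOCKSIZE] = {0}, iv2[BLOCKSIZE] = {0};
  unsigned char msg[2*BLOCKSIZE] = "two blocks of sample text......";
  unsigned char ct[2*BLOCKSIZE], pt[2*BLOCKSIZE];

  cfb_encrypt_blocks (iv1, ct, msg, 2);
  cfb_decrypt_blocks (iv2, pt, ct, 2);
  printf ("round trip %s\n", memcmp (msg, pt, sizeof msg) ? "FAILED" : "ok");
  return 0;
}

This is the shape that rijndael_cfb_enc and rijndael_cfb_dec below give to AES: the whole loop runs inside one call, instead of re-entering the dispatcher and copying IVs for every 16-byte block.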

View File

@ -1,5 +1,5 @@
/* Rijndael (AES) for GnuPG
* Copyright (C) 2000, 2001 Free Software Foundation, Inc.
* Copyright (C) 2000, 2001, 2008 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
*
@ -1866,86 +1866,119 @@ prepare_decryption( RIJNDAEL_context *ctx )
/* Encrypt one block. A and B may be the same. */
/* Encrypt one block. A and B need to be aligned on a 4 byte
boundary. A and B may be the same. */
static void
do_encrypt (const RIJNDAEL_context *ctx, byte *b, const byte *a)
do_encrypt_aligned (const RIJNDAEL_context *ctx,
unsigned char *b, const unsigned char *a)
{
int r;
byte temp[4][4];
int ROUNDS = ctx->ROUNDS;
#define rk (ctx->keySched)
int ROUNDS = ctx->ROUNDS;
int r;
union
{
u32 tempu32[4]; /* Force correct alignment. */
byte temp[4][4];
} u;
*((u32*)temp[0]) = *((u32*)(a )) ^ *((u32*)rk[0][0]);
*((u32*)temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[0][1]);
*((u32*)temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[0][2]);
*((u32*)temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[0][3]);
*((u32*)(b )) = *((u32*)T1[temp[0][0]])
^ *((u32*)T2[temp[1][1]])
^ *((u32*)T3[temp[2][2]])
^ *((u32*)T4[temp[3][3]]);
*((u32*)(b + 4)) = *((u32*)T1[temp[1][0]])
^ *((u32*)T2[temp[2][1]])
^ *((u32*)T3[temp[3][2]])
^ *((u32*)T4[temp[0][3]]);
*((u32*)(b + 8)) = *((u32*)T1[temp[2][0]])
^ *((u32*)T2[temp[3][1]])
^ *((u32*)T3[temp[0][2]])
^ *((u32*)T4[temp[1][3]]);
*((u32*)(b +12)) = *((u32*)T1[temp[3][0]])
^ *((u32*)T2[temp[0][1]])
^ *((u32*)T3[temp[1][2]])
^ *((u32*)T4[temp[2][3]]);
for (r = 1; r < ROUNDS-1; r++) {
*((u32*)temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]);
*((u32*)temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]);
*((u32*)temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]);
*((u32*)temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]);
*((u32*)u.temp[0]) = *((u32*)(a )) ^ *((u32*)rk[0][0]);
*((u32*)u.temp[1]) = *((u32*)(a+ 4)) ^ *((u32*)rk[0][1]);
*((u32*)u.temp[2]) = *((u32*)(a+ 8)) ^ *((u32*)rk[0][2]);
*((u32*)u.temp[3]) = *((u32*)(a+12)) ^ *((u32*)rk[0][3]);
*((u32*)(b )) = (*((u32*)T1[u.temp[0][0]])
^ *((u32*)T2[u.temp[1][1]])
^ *((u32*)T3[u.temp[2][2]])
^ *((u32*)T4[u.temp[3][3]]));
*((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]])
^ *((u32*)T2[u.temp[2][1]])
^ *((u32*)T3[u.temp[3][2]])
^ *((u32*)T4[u.temp[0][3]]));
*((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]])
^ *((u32*)T2[u.temp[3][1]])
^ *((u32*)T3[u.temp[0][2]])
^ *((u32*)T4[u.temp[1][3]]));
*((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]])
^ *((u32*)T2[u.temp[0][1]])
^ *((u32*)T3[u.temp[1][2]])
^ *((u32*)T4[u.temp[2][3]]));
*((u32*)(b )) = *((u32*)T1[temp[0][0]])
^ *((u32*)T2[temp[1][1]])
^ *((u32*)T3[temp[2][2]])
^ *((u32*)T4[temp[3][3]]);
*((u32*)(b + 4)) = *((u32*)T1[temp[1][0]])
^ *((u32*)T2[temp[2][1]])
^ *((u32*)T3[temp[3][2]])
^ *((u32*)T4[temp[0][3]]);
*((u32*)(b + 8)) = *((u32*)T1[temp[2][0]])
^ *((u32*)T2[temp[3][1]])
^ *((u32*)T3[temp[0][2]])
^ *((u32*)T4[temp[1][3]]);
*((u32*)(b +12)) = *((u32*)T1[temp[3][0]])
^ *((u32*)T2[temp[0][1]])
^ *((u32*)T3[temp[1][2]])
^ *((u32*)T4[temp[2][3]]);
for (r = 1; r < ROUNDS-1; r++)
{
*((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[r][0]);
*((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[r][1]);
*((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[r][2]);
*((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[r][3]);
*((u32*)(b )) = (*((u32*)T1[u.temp[0][0]])
^ *((u32*)T2[u.temp[1][1]])
^ *((u32*)T3[u.temp[2][2]])
^ *((u32*)T4[u.temp[3][3]]));
*((u32*)(b + 4)) = (*((u32*)T1[u.temp[1][0]])
^ *((u32*)T2[u.temp[2][1]])
^ *((u32*)T3[u.temp[3][2]])
^ *((u32*)T4[u.temp[0][3]]));
*((u32*)(b + 8)) = (*((u32*)T1[u.temp[2][0]])
^ *((u32*)T2[u.temp[3][1]])
^ *((u32*)T3[u.temp[0][2]])
^ *((u32*)T4[u.temp[1][3]]));
*((u32*)(b +12)) = (*((u32*)T1[u.temp[3][0]])
^ *((u32*)T2[u.temp[0][1]])
^ *((u32*)T3[u.temp[1][2]])
^ *((u32*)T4[u.temp[2][3]]));
}
/* last round is special */
*((u32*)temp[0]) = *((u32*)(b )) ^ *((u32*)rk[ROUNDS-1][0]);
*((u32*)temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[ROUNDS-1][1]);
*((u32*)temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[ROUNDS-1][2]);
*((u32*)temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[ROUNDS-1][3]);
b[ 0] = T1[temp[0][0]][1];
b[ 1] = T1[temp[1][1]][1];
b[ 2] = T1[temp[2][2]][1];
b[ 3] = T1[temp[3][3]][1];
b[ 4] = T1[temp[1][0]][1];
b[ 5] = T1[temp[2][1]][1];
b[ 6] = T1[temp[3][2]][1];
b[ 7] = T1[temp[0][3]][1];
b[ 8] = T1[temp[2][0]][1];
b[ 9] = T1[temp[3][1]][1];
b[10] = T1[temp[0][2]][1];
b[11] = T1[temp[1][3]][1];
b[12] = T1[temp[3][0]][1];
b[13] = T1[temp[0][1]][1];
b[14] = T1[temp[1][2]][1];
b[15] = T1[temp[2][3]][1];
*((u32*)(b )) ^= *((u32*)rk[ROUNDS][0]);
*((u32*)(b+ 4)) ^= *((u32*)rk[ROUNDS][1]);
*((u32*)(b+ 8)) ^= *((u32*)rk[ROUNDS][2]);
*((u32*)(b+12)) ^= *((u32*)rk[ROUNDS][3]);
/* Last round is special. */
*((u32*)u.temp[0]) = *((u32*)(b )) ^ *((u32*)rk[ROUNDS-1][0]);
*((u32*)u.temp[1]) = *((u32*)(b+ 4)) ^ *((u32*)rk[ROUNDS-1][1]);
*((u32*)u.temp[2]) = *((u32*)(b+ 8)) ^ *((u32*)rk[ROUNDS-1][2]);
*((u32*)u.temp[3]) = *((u32*)(b+12)) ^ *((u32*)rk[ROUNDS-1][3]);
b[ 0] = T1[u.temp[0][0]][1];
b[ 1] = T1[u.temp[1][1]][1];
b[ 2] = T1[u.temp[2][2]][1];
b[ 3] = T1[u.temp[3][3]][1];
b[ 4] = T1[u.temp[1][0]][1];
b[ 5] = T1[u.temp[2][1]][1];
b[ 6] = T1[u.temp[3][2]][1];
b[ 7] = T1[u.temp[0][3]][1];
b[ 8] = T1[u.temp[2][0]][1];
b[ 9] = T1[u.temp[3][1]][1];
b[10] = T1[u.temp[0][2]][1];
b[11] = T1[u.temp[1][3]][1];
b[12] = T1[u.temp[3][0]][1];
b[13] = T1[u.temp[0][1]][1];
b[14] = T1[u.temp[1][2]][1];
b[15] = T1[u.temp[2][3]][1];
*((u32*)(b )) ^= *((u32*)rk[ROUNDS][0]);
*((u32*)(b+ 4)) ^= *((u32*)rk[ROUNDS][1]);
*((u32*)(b+ 8)) ^= *((u32*)rk[ROUNDS][2]);
*((u32*)(b+12)) ^= *((u32*)rk[ROUNDS][3]);
#undef rk
}
static void
do_encrypt (const RIJNDAEL_context *ctx,
unsigned char *bx, const unsigned char *ax)
{
/* BX and AX are not necessarily correctly aligned. Thus we need to
copy them here. */
union
{
u32 dummy[4];
byte a[16];
} a;
union
{
u32 dummy[4];
byte b[16];
} b;
memcpy (a.a, ax, 16);
do_encrypt_aligned (ctx, b.b, a.a);
memcpy (bx, b.b, 16);
}
static void
rijndael_encrypt (void *ctx, byte *b, const byte *a)
{
@ -1953,56 +1986,6 @@ rijndael_encrypt (void *ctx, byte *b, const byte *a)
burn_stack (16 + 2*sizeof(int));
}
#if 0
/* Experimental code. Needs to be generalized and we might want to
have variants for all possible sizes of the largest scalar type.
Also need to make sure that INBUF and OUTBUF are properly
aligned. */
void
rijndael_cfb_encrypt (void *ctx, byte *iv,
byte *outbuf, const byte *inbuf, size_t nbytes)
{
/* if ( ((unsigned long)inbuf & 3) || ((unsigned long)outbuf & 3) ) */
/* { */
/* Not properly aligned, use the slow version. Actually the
compiler might even optimize this pretty well if the
target CPU has relaxed alignment requirements. Thus it is
questionable whether we should go to the hassle of doing
alignment-wise optimizations ourselves. A quick test
with gcc 4.0 on ia32 did not show any advantages. */
byte *ivp;
int i;
while (nbytes >= 16)
{
do_encrypt (ctx, iv, iv);
for (i=0, ivp = iv; i < 16; i++)
*outbuf++ = (*ivp++ ^= *inbuf++);
nbytes -= 16;
}
/* } */
/* else */
/* { */
/* u32 *ivp; */
/* u32 *ob = (u32*)outbuf; */
/* const u32 *ib = (const u32*)inbuf; */
/* while (nbytes >= 16) */
/* { */
/* do_encrypt (ctx, iv, iv); */
/* ivp = iv; */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp++ ^= *ib++); */
/* *ob++ = (*ivp ^= *ib++); */
/* nbytes -= 16; */
/* } */
/* } */
burn_stack (16 + 2*sizeof(int));
}
#endif
/* Decrypt one block. a and b may be the same. */
@ -2097,6 +2080,67 @@ rijndael_decrypt (void *ctx, byte *b, const byte *a)
do_decrypt (ctx, b, a);
burn_stack (16+2*sizeof(int));
}
/* Bulk encryption of complete blocks in CFB mode. Caller needs to
make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk encryption feature of
cipher.c. */
void
rijndael_cfb_enc (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned char *ivp;
int i;
for ( ;nblocks; nblocks-- )
{
/* Encrypt the IV. */
do_encrypt_aligned (ctx, iv, iv);
/* XOR the input with the IV and store input into IV. */
for (ivp=iv,i=0; i < 16; i++ )
*outbuf++ = (*ivp++ ^= *inbuf++);
}
burn_stack (16 + 2*sizeof(int));
}
/* Bulk decryption of complete blocks in CFB mode. Caller needs to
make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk decryption feature of
cipher.c. */
void
rijndael_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned char *ivp;
unsigned char temp;
int i;
for ( ;nblocks; nblocks-- )
{
do_encrypt_aligned (ctx, iv, iv);
for (ivp=iv,i=0; i < 16; i++ )
{
temp = *inbuf++;
*outbuf++ = *ivp ^ temp;
*ivp++ = temp;
}
}
burn_stack (16 + 2*sizeof(int));
}
/* Test a single encryption and decryption with each key size. */
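
One contributor to the speedup announced in NEWS is that the bulk path above avoids a function-pointer call and IV bookkeeping for every 16-byte block. A rough, self-contained harness for measuring that kind of difference follows; toy_encrypt stands in for AES (which exaggerates the relative call overhead), all names are invented, and real numbers depend on machine and compiler:

#include <stdio.h>
#include <time.h>

#define BLOCKSIZE 16
#define NBLOCKS   (1024 * 1024)

static unsigned char iv[BLOCKSIZE];

static void
toy_encrypt (void *ctx, unsigned char *out, const unsigned char *in)
{
  int i;
  (void)ctx;
  for (i = 0; i < BLOCKSIZE; i++)
    out[i] = in[i] ^ 0x5a;
}

/* CFB via one indirect call per block, as the old do_cfb_encrypt
   did it.  The compiler cannot inline through the pointer.  */
static void
cfb_per_block (void (*enc)(void *, unsigned char *, const unsigned char *),
               unsigned char *buf, unsigned int nblocks)
{
  unsigned int n;
  int i;
  for (n = 0; n < nblocks; n++, buf += BLOCKSIZE)
    {
      enc (NULL, iv, iv);
      for (i = 0; i < BLOCKSIZE; i++)
        buf[i] = (iv[i] ^= buf[i]);
    }
}

/* The same work with the loop pushed into one direct bulk call, the
   way rijndael_cfb_enc now does it for cipher.c.  */
static void
cfb_bulk (unsigned char *buf, unsigned int nblocks)
{
  unsigned int n;
  int i;
  for (n = 0; n < nblocks; n++, buf += BLOCKSIZE)
    {
      toy_encrypt (NULL, iv, iv);
      for (i = 0; i < BLOCKSIZE; i++)
        buf[i] = (iv[i] ^= buf[i]);
    }
}

int
main (void)
{
  static unsigned char buf[NBLOCKS * BLOCKSIZE];   /* 16 MiB of zeros. */
  clock_t t;

  t = clock ();
  cfb_per_block (toy_encrypt, buf, NBLOCKS);
  printf ("per-block: %.2fs\n", (double)(clock () - t) / CLOCKS_PER_SEC);

  t = clock ();
  cfb_bulk (buf, NBLOCKS);
  printf ("bulk:      %.2fs\n", (double)(clock () - t) / CLOCKS_PER_SEC);
  return 0;
}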

View File

@ -25,7 +25,7 @@ min_automake_version="1.9.3"
# Remember to change the version number immediately *after* a release.
# Set my_issvn to "yes" for non-released code. Remember to run an
# "svn up" and "autogen.sh --force" right before creating a distribution.
m4_define([my_version], [1.4.9rc1])
m4_define([my_version], [1.4.9])
m4_define([my_issvn], [yes])
m4_define([svn_revision], m4_esyscmd([echo $((svn info 2>/dev/null \

View File

@ -1,3 +1,9 @@
2007-12-21 Werner Koch <wk@g10code.com>
* README.W32: Note that Vista is supported and that at least NT-4
is required. It might still work on older systems, but I don't
know for sure.
2007-12-12 Werner Koch <wk@g10code.com>
* gpg.texi, specify-user-id.texi: Update from gnupg-2.

View File

@ -1,7 +1,8 @@
README.W32 -*- text -*-
This is a binary package with GnuPG for MS-Windows 95, 98, WNT, W2000
and XP. See the file README for generic instructions and usage hints.
This is a binary package with GnuPG for MS-Windows NT-4, W2000, XP and
Vista. A native version for 64 bit is not available. See the file
README for generic instructions and usage hints.
A FAQ comes with this package and a probably more recent one can be
found online at http://www.gnupg.org/faq.html. See

View File

@ -1,3 +1,7 @@
2008-01-30 Werner Koch <wk@g10code.com>
* w32installer.nsi: Set the OutPath back.
2007-12-12 Werner Koch <wk@g10code.com>
* config.sub, config.guess: Update to version 2007-11-19.

View File

@ -351,6 +351,8 @@ Section "-Finish"
WriteRegStr HKCU "Software\GNU\GnuPG" "Lang" $R3
;;
# Set the OutPath back so that the README file can be displayed.
SetOutPath "$INSTDIR"
SectionEnd ; "-Finish"