See ChangeLog: Mon Jul 17 16:35:47 CEST 2000 Werner Koch

2025-07-02 22:46:30 +02:00 · 2000-07-17 14:32:21 +00:00 · 2000-07-17 14:32:21 +00:00 · 0bf44b072c
commit 0bf44b072c
parent 92cd255508
45 changed files with 31954 additions and 7194 deletions
--- a/mpi/ChangeLog
+++ b/mpi/ChangeLog
@ -1,3 +1,30 @@
+Mon Jul 17 16:35:47 CEST 2000  Werner Koch  <wk@>
+
+  * power/: Add all files from GMP for this CPU. Converted comments to
+  CPP comments because some ASes complain about ' in comments.
+
+  * config.links:  Support for BSDI 4.x; by Wayne Chapeskie. Add support
+  for FreeBSD 5 and made the case statement look nicer; by Jun Kuriyama.
+  Add support for NetBSD.
+  (sparc8): Made the search path the same as sparc9
+  (sparc64-unknown-linux-gnu): use udiv module; by Adam Mitchell.
+
+  * Makefile.am: c/SFLAGS/ASFLAGS/. This has only been used by the
+  powerpc and actually never passed the -Wa,foo to the cc.
+
+  * mpih-div.c (mpihelp_divrem): The MPN_COPY_DECR copied one element
+  too many.  This is a gmp2.0.2p9.txt patch.
+
+  * longlong.h (umul_ppmm): Fixes for ARM-4. By Sean MacLennan.
+
+  * mpi-internal.h (karatsuba_ctx): New.
+  * mpih-mul.c (mpihelp_release_karatsuba_ctx): New.
+  (mpihelp_mul_karatsuba_case): New.
+  (mpihelp_mul): Split to make use of the new functions.
+  * mpi-pow.c (mpi_powm): Make use of the new split function to avoid
+  multiple allocation of temporary memory during the karatsuba operations.
+  * mpi_mpow.c: Removed the unused Barrett code.
+
 2000-03-21 16:17:30  Werner Koch  (wk@habibti.openit.de)

 	* config.links: Add support for FreeBSD 5.
--- a/mpi/Makefile.am
+++ b/mpi/Makefile.am
@ -3,12 +3,13 @@

 INCLUDES =  -I$(top_srcdir)/gcrypt
 CFLAGS = @CFLAGS@ @MPI_OPT_FLAGS@
-SFLAGS = @MPI_SFLAGS@
+ASFLAGS = @MPI_SFLAGS@

 EXTRA_DIST = config.links
 DISTCLEANFILES = mpih-add1.S mpih-mul1.S mpih-mul2.S mpih-mul3.S  \
 		 mpih-lshift.S mpih-rshift.S mpih-sub1.S asm-syntax.h sysdep.h
 # Note: we only use .S files so we should delete all left over .s
+# CLEANFILES = _*.s
 CLEANFILES = *.s

 noinst_LTLIBRARIES = libmpi.la
@ -56,4 +57,9 @@ libmpi_la_LIBADD = $(common_asm_objects) @MPI_EXTRA_ASM_OBJS@
 .S.s:
 	 $(CPP) $(INCLUDES) $(DEFS) $< | grep -v '^#' >$*.s

+# Hmmm, we should use this, so that OSes which do not distinguish
+# filename case still work.  We have to see how libtool can handle this
+#   $(CPP) $(INCLUDES) $(DEFS) $< | grep -v '^#' > _$*.s
+#   $(COMPILE) -c _$*.s
+#   mv -f _$*.o $*.o

--- a/mpi/config.links
+++ b/mpi/config.links
@ -1,4 +1,4 @@
-# sourced my ../configure to get the list of files to link
+# sourced by ../configure to get the list of files to link
 # this should set $mpi_ln_src and mpi_ln_dst.
 # Note: this is called from the above directory.

@ -12,23 +12,40 @@ echo '/* created by config.links - do not edit */' >./mpi/asm-syntax.h

 if test "$try_asm_modules" = "yes" ; then
 case "${target}" in
-    i[34]86*-*-freebsd*-elf | i[34]86*-*-freebsd[3-9]* | i[34]86*-*-freebsdelf*)
+    i[34]86*-*-freebsd*-elf  | \
+    i[34]86*-*-freebsd[3-9]* | \
+    i[34]86*-*-freebsdelf*   | \
+    i[34]86*-*-netbsd* )
       echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
       cat  $srcdir/mpi/i386/syntax.h	   >>./mpi/asm-syntax.h
       path="i386"
       ;;
-    i[56]86*-*-freebsd*-elf | i[56]86*-*-freebsd[3-9]* | i[56]86*-*-freebsdelf*)
+    i[56]86*-*-freebsd*-elf  | \
+    i[56]86*-*-freebsd[3-9]* | \
+    i[56]86*-*-freebsdelf*   | \
+    i[56]86*-*-netbsd*	     | \
+    pentium-*-netbsd*	     | \
+    pentiumpro-*-netbsd*)
       echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
       cat  $srcdir/mpi/i386/syntax.h	   >>./mpi/asm-syntax.h
       path="i586 i386"
       ;;
-    i[34]86*-*-linuxaout* | i[34]86*-*-linuxoldld* | i[34]86*-*-*bsd*)
+    i[34]86*-*-bsdi4*)
+       echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
+       cat  $srcdir/mpi/i386/syntax.h	 >>./mpi/asm-syntax.h
+       path="i386"
+       ;;
+    i[34]86*-*-linuxaout*  | \
+    i[34]86*-*-linuxoldld* | \
+    i[34]86*-*-*bsd*)
 	echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h
 	echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h
 	cat  $srcdir/mpi/i386/syntax.h	    >>./mpi/asm-syntax.h
 	path="i386"
 	;;
-    i[56]86*-*-linuxaout* | i[56]86*-*-linuxoldld* | i[56]86*-*-*bsd*)
+    i[56]86*-*-linuxaout*  | \
+    i[56]86*-*-linuxoldld* | \
+    i[56]86*-*-*bsd*)
 	echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h
 	echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h
 	cat  $srcdir/mpi/i386/syntax.h	    >>./mpi/asm-syntax.h
@ -49,7 +66,9 @@ case "${target}" in
 	cat  $srcdir/mpi/i386/syntax.h	    >>./mpi/asm-syntax.h
 	path="i386"
 	;;
-    i[56]86*-*-* | pentium-*-* | pentiumpro-*-*)
+    i[56]86*-*-*  | \
+    pentium-*-*   | \
+    pentiumpro-*-*)
 	echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h
 	cat  $srcdir/mpi/i386/syntax.h	    >>./mpi/asm-syntax.h
 	path="i586 i386"
@ -74,13 +93,23 @@ case "${target}" in
 	path="pa7100 hppa1.1 hppa"
 	mpi_extra_modules="udiv-qrnnd"
 	;;
-    sparc9*-*-* | sparc64*-*-* | ultrasparc*-*-*)
+    sparc64-*-linux-gnu)
+	# An extra rule because we have a report for this one only.
+	# Should be compared against the next GMP version
+	echo '/* configured for sparc64-*-linux-gnu */' >>./mpi/asm-syntax.h
+	path="sparc32v8 sparc32"
+	mpi_extra_modules="udiv"
+	;;
+    sparc9*-*-*     | \
+    sparc64*-*-*    | \
+    ultrasparc*-*-* )
 	echo '/* configured for sparc9 or higher */' >>./mpi/asm-syntax.h
 	path="sparc32v8 sparc32"
 	;;
-    sparc8*-*-* | microsparc*-*-*)
+    sparc8*-*-*     | \
+    microsparc*-*-*)
 	echo '/* configured for sparc8 */' >>./mpi/asm-syntax.h
-	path="sparc32v8"
+	path="sparc32v8 sparc32"
 	;;
    supersparc*-*-*)
 	echo '/* configured for supersparc */' >>./mpi/asm-syntax.h
@ -92,7 +121,8 @@ case "${target}" in
 	path="sparc32"
 	mpi_extra_modules="udiv"
 	;;
-    mips[34]*-*-* | mips*-*-irix6*)
+    mips[34]*-*-* | \
+    mips*-*-irix6*)
       echo '/* configured for MIPS3 */' >>./mpi/asm-syntax.h
       path="mips3"
       ;;
@ -103,7 +133,8 @@ case "${target}" in

    # Motorola 68k configurations.  Let m68k mean 68020-68040.
    # mc68000 or mc68060 configurations need to be specified explicitly
-    m680[234]0*-*-linuxaout* | m68k*-*-linuxaout*)
+    m680[234]0*-*-linuxaout* | \
+    m68k*-*-linuxaout*)
 	echo '#define MIT_SYNTAX'           >>./mpi/asm-syntax.h
 	cat  $srcdir/mpi/m68k/syntax.h	    >>./mpi/asm-syntax.h
 	path="m68k/mc68020 m68k"
@ -113,7 +144,8 @@ case "${target}" in
 	cat  $srcdir/mpi/m68k/syntax.h	    >>./mpi/asm-syntax.h
 	path="m68k"
 	;;
-    m680[234]0*-*-linux* | m68k*-*-linux*)
+    m680[234]0*-*-linux* | \
+    m68k*-*-linux*)
 	echo '#define ELF_SYNTAX'           >>./mpi/asm-syntax.h
 	cat  $srcdir/mpi/m68k/syntax.h	    >>./mpi/asm-syntax.h
 	;;
@ -127,12 +159,14 @@ case "${target}" in
 	cat  $srcdir/mpi/m68k/syntax.h	    >>./mpi/asm-syntax.h
 	path="m68k/mc68020 m68k"
 	;;
-    m68000*-*-* | m68060*-*-*)
+    m68000*-*-* | \
+    m68060*-*-*)
 	echo '#define MIT_SYNTAX'           >>./mpi/asm-syntax.h
 	cat  $srcdir/mpi/m68k/syntax.h	    >>./mpi/asm-syntax.h
 	path="m68k/mc68000"
 	;;
-    m680[234]0*-*-* | m68k*-*-*)
+    m680[234]0*-*-* | \
+    m68k*-*-*)
 	echo '#define MIT_SYNTAX'           >>./mpi/asm-syntax.h
 	cat  $srcdir/mpi/m68k/syntax.h	    >>./mpi/asm-syntax.h
 	path="m68k/mc68020 m68k"
@ -144,25 +178,37 @@ case "${target}" in
 	cat   $srcdir/mpi/powerpc32/syntax.h	>>./mpi/asm-syntax.h
 	path="powerpc32"
 	;;
-    rs6000-*-aix[456789]* | rs6000-*-aix3.2.[456789])
+    rs6000-*-aix[456789]*    | \
+    rs6000-*-aix3.2.[456789])
 	mpi_sflags="-Wa,-mpwr"
 	path="power"
 	mpi_extra_modules="udiv-w-sdiv"
 	;;
-    rs6000-*-* | power-*-* | power2-*-*)
+    rs6000-*-* | \
+    power-*-*  | \
+    power2-*-*)
 	mpi_sflags="-Wa,-mppc"
 	path="power"
 	mpi_extra_modules="udiv-w-sdiv"
 	;;
+    powerpc-ibm-aix4.2.* )
+	# I am not sure about this one but a machine identified by
+	# powerpc-ibm-aix4.2.1.0 cannot use the powerpc32 code.
+	mpi_sflags="-Wa,-mpwr"
+	path="power"
+	mpi_extra_modules="udiv-w-sdiv"
+	;;
    ppc601-*-*)
 	mpi_sflags="-Wa,-mppc"
 	path="power powerpc32"
 	;;
-    ppc60[234]*-*-* | powerpc*-*-*)
+    ppc60[234]*-*-* | \
+    powerpc*-*-*)
 	mpi_sflags="-Wa,-mppc"
 	path="powerpc32"
 	;;
-    ppc620-*-* | powerpc64*-*-*)
+    ppc620-*-*	    | \
+    powerpc64*-*-*)
 	mpi_sflags="-Wa,-mppc"
 	path="powerpc64"
 	;;
--- a/mpi/longlong.h
+++ b/mpi/longlong.h
@ -199,6 +199,8 @@ extern UDItype __udiv_qrnnd ();
 	     "rI" ((USItype)(bh)),                                      \
 	     "r" ((USItype)(al)),                                       \
 	     "rI" ((USItype)(bl)))
+#ifdef __ARM_ARCH_3__
+/* SAM This does not work on arm4 */
 #define umul_ppmm(xh, xl, a, b) \
  __asm__ ("%@ Inlined umul_ppmm
 	mov	%|r0, %2, lsr #16
@ -218,6 +220,18 @@ extern UDItype __udiv_qrnnd ();
 	   : "r" ((USItype)(a)),                                        \
 	     "r" ((USItype)(b))                                         \
 	   : "r0", "r1", "r2")
+#elif __ARM_ARCH_4__ /* ARMv4 variant: a single umull computes the product.
+		      * Operand 0 (xh) is passed as the high destination and
+		      * operand 1 (xl) as the low destination of a * b.
+		      * NOTE(review): the asm template is a string literal that
+		      * spans two source lines without a continuation; confirm
+		      * that the compilers in use accept this. */
+#define umul_ppmm(xh, xl, a, b) \
+  __asm__ ("%@ Inlined umul_ppmm
+	umull	%r1, %r0, %r2, %r3" \
+		   : "=&r" ((USItype)(xh)), \
+		     "=r" ((USItype)(xl)) \
+		   : "r" ((USItype)(a)), \
+		     "r" ((USItype)(b)) \
+		   : "r0", "r1")
+#else
+#error Untested architecture
+#endif
 #define UMUL_TIME 20
 #define UDIV_TIME 100
 #endif /* __arm__ */
--- a/mpi/mpi-internal.h
+++ b/mpi/mpi-internal.h
@ -1,6 +1,6 @@
 /* mpi-internal.h  -  Internal to the Multi Precision Integers
 *	Copyright (C) 1998 Free Software Foundation, Inc.
- *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *	Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
 *
 * This file is part of GnuPG.
 *
@ -186,6 +186,17 @@ mpi_limb_t mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size,
 int mpihelp_cmp( mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size );

 /*-- mpihelp-mul.c --*/
+/* Scratch buffers that mpihelp_mul_karatsuba_case() keeps between calls
+ * so that repeated multiplications do not have to allocate them again.
+ * The code tests the pointers against NULL, so a context has to be
+ * zero-initialised before its first use; it is released with
+ * mpihelp_release_karatsuba_ctx().
+ */
+struct karatsuba_ctx {
+    struct karatsuba_ctx *next; /* context handed to the nested call for leftover limbs */
+    mpi_ptr_t tspace;		/* scratch area, allocated with 2 * tspace_size limbs */
+    mpi_size_t tspace_size;	/* the vsize that tspace was allocated for */
+    mpi_ptr_t tp;		/* buffer for one chunk product, 2 * tp_size limbs */
+    mpi_size_t tp_size; 	/* the vsize that tp was allocated for */
+};
+
+void mpihelp_release_karatsuba_ctx( struct karatsuba_ctx *ctx );
+
 mpi_limb_t mpihelp_addmul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
 			     mpi_size_t s1_size, mpi_limb_t s2_limb);
 mpi_limb_t mpihelp_submul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
@ -198,6 +209,12 @@ void mpih_sqr_n_basecase( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size );
 void mpih_sqr_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size,
 						mpi_ptr_t tspace);

+void mpihelp_mul_karatsuba_case( mpi_ptr_t prodp,
+				 mpi_ptr_t up, mpi_size_t usize,
+				 mpi_ptr_t vp, mpi_size_t vsize,
+				 struct karatsuba_ctx *ctx );
+
+
 /*-- mpihelp-mul_1.c (or xxx/cpu/ *.S) --*/
 mpi_limb_t mpihelp_mul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr,
 			  mpi_size_t s1_size, mpi_limb_t s2_limb);
--- a/mpi/mpi-pow.c
+++ b/mpi/mpi-pow.c
@ -1,6 +1,6 @@
 /* mpi-pow.c  -  MPI functions
 *	Copyright (C) 1998 Free Software Foundation, Inc.
- *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *	Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
 *
 * This file is part of GnuPG.
 *
@ -30,9 +30,10 @@
 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <assert.h>
+#include <string.h>
 #include "mpi-internal.h"
 #include "longlong.h"
+#include <assert.h>


 /****************
@ -159,7 +160,9 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
 	int c;
 	mpi_limb_t e;
 	mpi_limb_t carry_limb;
+	struct karatsuba_ctx karactx;

+	memset( &karactx, 0, sizeof karactx );
 	negative_result = (ep[0] & 1) && base->sign;

 	i = esize - 1;
@ -177,6 +180,7 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
 	 * by RP (==RES->d), and with 50% probability in the area originally
 	 * pointed to by XP.
 	 */
+
 	for(;;) {
 	    while( c ) {
 		mpi_ptr_t tp;
@ -194,7 +198,6 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
 			mpi_free_limb_space( tspace );
 			tsize = 2 * rsize;
 			tspace = mpi_alloc_limb_space( tsize, 0 );
-
 		    }
 		    mpih_sqr_n( xp, rp, rsize, tspace );
 		}
@ -209,7 +212,15 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
 		rsize = xsize;

 		if( (mpi_limb_signed_t)e < 0 ) {
-		    mpihelp_mul( xp, rp, rsize, bp, bsize );
+		    /*mpihelp_mul( xp, rp, rsize, bp, bsize );*/
+		    if( bsize < KARATSUBA_THRESHOLD ) {
+			mpihelp_mul( xp, rp, rsize, bp, bsize );
+		    }
+		    else {
+			mpihelp_mul_karatsuba_case(
+				     xp, rp, rsize, bp, bsize, &karactx );
+		    }
+
 		    xsize = rsize + bsize;
 		    if( xsize > msize ) {
 			mpihelp_divrem(xp + msize, 0, xp, xsize, mp, msize);
@ -258,6 +269,8 @@ gcry_mpi_powm( MPI res, MPI base, MPI exp, MPI mod)
 	if( mod_shift_cnt )
 	    mpihelp_rshift( rp, rp, rsize, mod_shift_cnt);
 	MPN_NORMALIZE (rp, rsize);
+
+	mpihelp_release_karatsuba_ctx( &karactx );
    }

    if( negative_result && rsize ) {
--- a/mpi/mpih-div.c
+++ b/mpi/mpih-div.c
@ -1,6 +1,6 @@
 /* mpihelp-div.c  -  MPI helper functions
 *	Copyright (C) 1998 Free Software Foundation, Inc.
- *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *	Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
 *
 * This file is part of GnuPG.
 *
@ -338,7 +338,7 @@ mpihelp_divrem( mpi_ptr_t qp, mpi_size_t qextra_limbs,
 		}
 		else {
 		    n2 = np[dsize - 1];
-		    MPN_COPY_DECR (np + 1, np, dsize);
+		    MPN_COPY_DECR (np + 1, np, dsize - 1);
 		    np[0] = 0;
 		}

--- a/mpi/mpih-mul.c
+++ b/mpi/mpih-mul.c
@ -1,5 +1,5 @@
 /* mpihelp-mul.c  -  MPI helper functions
- * Copyright (C) 1994, 1996, 1998, 1999 Free Software Foundation, Inc.
+ * Copyright (C) 1994, 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
 *
 * This file is part of GnuPG.
 *
@ -29,10 +29,10 @@
 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include "mpi-internal.h"
 #include "longlong.h"
-#include "g10lib.h" /* for g10_is_secure() */
-
+#include "g10lib.h" /* g10_is_secure() */


 #define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \
@ -373,6 +373,86 @@ mpihelp_mul_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size)
 }


+/* Multiply the USIZE limbs at UP by the VSIZE limbs at VP and store the
+ * product at PRODP.  U is consumed in chunks of VSIZE limbs and the first
+ * chunk is multiplied unconditionally, so USIZE must be at least VSIZE.
+ * The scratch buffers live in CTX and are only (re)allocated when they
+ * are missing or were sized for a smaller VSIZE.  The caller passes a
+ * zero-initialised CTX on the first call and frees it afterwards with
+ * mpihelp_release_karatsuba_ctx().
+ */
+void
+mpihelp_mul_karatsuba_case( mpi_ptr_t prodp,
+			    mpi_ptr_t up, mpi_size_t usize,
+			    mpi_ptr_t vp, mpi_size_t vsize,
+			    struct karatsuba_ctx *ctx )
+{
+    mpi_limb_t cy;
+
+    if( !ctx->tspace || ctx->tspace_size < vsize ) { /* no scratch yet, or too small */
+	if( ctx->tspace )
+	    mpi_free_limb_space( ctx->tspace );
+	ctx->tspace = mpi_alloc_limb_space( 2 * vsize,
+				       g10_is_secure( up ) || g10_is_secure( vp ) );
+	ctx->tspace_size = vsize; /* records vsize, not the 2 * vsize limbs allocated */
+    }
+
+    MPN_MUL_N_RECURSE( prodp, up, vp, vsize, ctx->tspace ); /* lowest chunk of U times V */
+
+    prodp += vsize;
+    up += vsize;
+    usize -= vsize;
+    if( usize >= vsize ) { /* at least one more full chunk of U */
+	if( !ctx->tp || ctx->tp_size < vsize ) { /* same caching scheme as for tspace */
+	    if( ctx->tp )
+		mpi_free_limb_space( ctx->tp );
+	    ctx->tp = mpi_alloc_limb_space( 2 * vsize, g10_is_secure( up )
+						      || g10_is_secure( vp ) );
+	    ctx->tp_size = vsize;
+	}
+
+	do {
+	    MPN_MUL_N_RECURSE( ctx->tp, up, vp, vsize, ctx->tspace ); /* chunk product into tp */
+	    cy = mpihelp_add_n( prodp, prodp, ctx->tp, vsize ); /* add its low half to the result */
+	    mpihelp_add_1( prodp + vsize, ctx->tp + vsize, vsize, cy ); /* high half and carry go above */
+	    prodp += vsize;
+	    up += vsize;
+	    usize -= vsize;
+	} while( usize >= vsize );
+    }
+
+    if( usize ) { /* fewer than vsize limbs of U are left */
+	if( usize < KARATSUBA_THRESHOLD ) {
+	    mpihelp_mul( ctx->tspace, vp, vsize, up, usize );
+	}
+	else {
+	    if( !ctx->next ) { /* nested context is created on first need */
+		ctx->next = g10_xcalloc( 1, sizeof *ctx );
+	    }
+	    mpihelp_mul_karatsuba_case( ctx->tspace, /* operands swapped: V is the longer one now */
+					vp, vsize,
+					up, usize,
+					ctx->next );
+	}
+
+	cy = mpihelp_add_n( prodp, prodp, ctx->tspace, vsize);
+	mpihelp_add_1( prodp + vsize, ctx->tspace + vsize, usize, cy );
+    }
+}
+
+/* Free the limb buffers cached in CTX and in every context chained to it
+ * through the next pointer.  The chained contexts themselves are freed
+ * too; CTX itself is not freed.  The pointers stored in CTX are left as
+ * they are, so CTX has to be cleared again before it is reused.
+ */
+void
+mpihelp_release_karatsuba_ctx( struct karatsuba_ctx *ctx )
+{
+    struct karatsuba_ctx *ctx2;
+
+    if( ctx->tp )
+	mpi_free_limb_space( ctx->tp );
+    if( ctx->tspace )
+	mpi_free_limb_space( ctx->tspace );
+    for( ctx=ctx->next; ctx; ctx = ctx2 ) { /* walk the chained contexts */
+	ctx2 = ctx->next; /* fetch the successor before ctx is freed */
+	if( ctx->tp )
+	    mpi_free_limb_space( ctx->tp );
+	if( ctx->tspace )
+	    mpi_free_limb_space( ctx->tspace );
+	g10_free( ctx );
+    }
+}
+
 /* Multiply the natural numbers u (pointed to by UP, with USIZE limbs)
 * and v (pointed to by VP, with VSIZE limbs), and store the result at
 * PRODP.  USIZE + VSIZE limbs are always stored, but if the input
@ -394,7 +474,7 @@ mpihelp_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize,
 {
    mpi_ptr_t prod_endp = prodp + usize + vsize - 1;
    mpi_limb_t cy;
-    mpi_ptr_t tspace;
+    struct karatsuba_ctx ctx;

    if( vsize < KARATSUBA_THRESHOLD ) {
 	mpi_size_t i;
@ -438,34 +518,9 @@ mpihelp_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize,
 	return cy;
    }

-    tspace = mpi_alloc_limb_space( 2 * vsize,
-				   g10_is_secure( up ) || g10_is_secure( vp ) );
-    MPN_MUL_N_RECURSE( prodp, up, vp, vsize, tspace );
-
-    prodp += vsize;
-    up += vsize;
-    usize -= vsize;
-    if( usize >= vsize ) {
-	mpi_ptr_t tp = mpi_alloc_limb_space( 2 * vsize, g10_is_secure( up )
-							|| g10_is_secure( vp ) );
-	do {
-	    MPN_MUL_N_RECURSE( tp, up, vp, vsize, tspace );
-	    cy = mpihelp_add_n( prodp, prodp, tp, vsize );
-	    mpihelp_add_1( prodp + vsize, tp + vsize, vsize, cy );
-	    prodp += vsize;
-	    up += vsize;
-	    usize -= vsize;
-	} while( usize >= vsize );
-	mpi_free_limb_space( tp );
-    }
-
-    if( usize ) {
-	mpihelp_mul( tspace, vp, vsize, up, usize );
-	cy = mpihelp_add_n( prodp, prodp, tspace, vsize);
-	mpihelp_add_1( prodp + vsize, tspace + vsize, usize, cy );
-    }
-
-    mpi_free_limb_space( tspace );
+    memset( &ctx, 0, sizeof ctx );
+    mpihelp_mul_karatsuba_case( prodp, up, usize, vp, vsize, &ctx );
+    mpihelp_release_karatsuba_ctx( &ctx );
    return *prod_endp;
 }

--- a/mpi/power/distfiles
+++ b/mpi/power/distfiles
@ -0,0 +1,7 @@
+mpih-add1.S
+mpih-lshift.S
+mpih-mul1.S
+mpih-mul2.S
+mpih-mul3.S
+mpih-rshift.S
+mpih-sub1.S
--- a/mpi/power/mpih-add1.S
+++ b/mpi/power/mpih-add1.S
@ -0,0 +1,86 @@
+/* IBM POWER add_n -- Add two limb vectors of equal, non-zero length.
+ *
+ * Copyright (C) 1992, 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# mpihelp_add_n: add the limb vectors at s1_ptr and s2_ptr (size limbs
+# each, size non-zero), store the sum at res_ptr and return the carry
+# out of the most significant limb in r3.  The first limbs are added
+# before the loop; the loop then handles two limbs per iteration.
+#
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# s2_ptr	r5
+# size		r6
+ */
+
+	.toc
+	.extern mpihelp_add_n[DS]
+	.extern .mpihelp_add_n
+.csect [PR]
+	.align 2
+	.globl mpihelp_add_n
+	.globl .mpihelp_add_n
+	.csect mpihelp_add_n[DS]
+mpihelp_add_n:
+	.long .mpihelp_add_n, TOC[tc0], 0
+	.csect [PR]
+.mpihelp_add_n:
+	andil.	10,6,1		# odd or even number of limbs?
+	l	8,0(4)		# load least significant s1 limb
+	l	0,0(5)		# load least significant s2 limb
+	cal	3,-4(3) 	# bias res_ptr by -4; the stores below use offsets 4 and 8
+	sri	10,6,1		# count for unrolled loop
+	a	7,0,8		# add least significant limbs, set cy
+	mtctr	10		# copy count into CTR
+	beq	0,Leven 	# branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs.  Add the first limbs separately.
+	cmpi	1,10,0		# is count for unrolled loop zero?
+	bne	1,L1		# branch if not
+	st	7,4(3)		# size is 1: store the only result limb
+	aze	3,10		# use the fact that r10 is zero...
+	br			# return
+
+# We added least significant limbs.  Now reload the next limbs to enter loop.
+L1:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	stu	7,4(3)		# store first result limb and update res_ptr
+	ae	7,0,8		# add limbs, set cy
+Leven:	lu	9,4(4)		# load s1 limb and update s1_ptr
+	lu	10,4(5) 	# load s2 limb and update s2_ptr
+	bdz	Lend		# If done, skip loop
+
+Loop:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	ae	11,9,10 	# add previous limbs with cy, set cy
+	stu	7,4(3)		# store the sum computed earlier in r7
+	lu	9,4(4)		# load s1 limb and update s1_ptr
+	lu	10,4(5) 	# load s2 limb and update s2_ptr
+	ae	7,0,8		# add previous limbs with cy, set cy
+	stu	11,4(3) 	# store the sum computed above in r11
+	bdn	Loop		# decrement CTR and loop back
+
+Lend:	ae	11,9,10 	# add limbs with cy, set cy
+	st	7,4(3)		# store second most significant result limb
+	st	11,8(3) 	# store most significant result limb
+	lil	3,0		# load cy into ...
+	aze	3,3		# ... return value register
+	br
+
--- a/mpi/power/mpih-lshift.S
+++ b/mpi/power/mpih-lshift.S
@ -0,0 +1,64 @@
+/* IBM POWER lshift
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# mpihelp_lshift: shift the size limbs at s_ptr left by cnt bits and
+# store the result at res_ptr.  The limbs are processed from the most
+# significant one downwards; the carry out limb computed from the most
+# significant limb is returned in r3.
+#
+# INPUT PARAMETERS
+# res_ptr	r3
+# s_ptr 	r4
+# size		r5
+# cnt		r6
+ */
+
+	.toc
+	.extern mpihelp_lshift[DS]
+	.extern .mpihelp_lshift
+.csect [PR]
+	.align 2
+	.globl mpihelp_lshift
+	.globl .mpihelp_lshift
+	.csect mpihelp_lshift[DS]
+mpihelp_lshift:
+	.long .mpihelp_lshift, TOC[tc0], 0
+	.csect [PR]
+.mpihelp_lshift:
+	sli	0,5,2		# r0 = size * 4, the byte length of the vector
+	cax	9,3,0		# r9 = res_ptr + byte length (just past the result)
+	cax	4,4,0		# r4 = s_ptr + byte length (just past the source)
+	sfi	8,6,32		# r8 = 32 - cnt
+	mtctr	5		# put limb count in CTR loop register
+	lu	0,-4(4) 	# read most significant limb
+	sre	3,0,8		# compute carry out limb, and init MQ register
+	bdz	Lend2		# if just one limb, skip loop
+	lu	0,-4(4) 	# read 2:nd most significant limb
+	sreq	7,0,8		# compute most significant limb of result
+	bdz	Lend		# if just two limbs, skip loop
+Loop:	lu	0,-4(4) 	# load next lower limb
+	stu	7,-4(9) 	# store previous result during read latency
+	sreq	7,0,8		# compute result limb
+	bdn	Loop		# loop back until CTR is zero
+Lend:	stu	7,-4(9) 	# store 2:nd least significant limb
+Lend2:	sle	7,0,6		# compute least significant limb
+	st	7,-4(9) 	# store it
+	br			# return, carry out limb is in r3
+
--- a/mpi/power/mpih-mul1.S
+++ b/mpi/power/mpih-mul1.S
@ -0,0 +1,115 @@
+/* IBM POWER  mul_1 -- Multiply a limb vector with a limb and store
+ * the result in a second limb vector.
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# mpihelp_mul_1: multiply the size limbs at s1_ptr by s2_limb, store the
+# low size limbs of the product at res_ptr and return the most
+# significant limb in r3.  There is one loop for a non-negative s2_limb
+# (Lploop) and one for a negative s2_limb (Lnloop); both are unrolled
+# twice and alternate between r9 and r10 as the high limb.
+#
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.	We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+ */
+
+	.toc
+	.csect .mpihelp_mul_1[PR]
+	.align 2
+	.globl mpihelp_mul_1
+	.globl .mpihelp_mul_1
+	.csect mpihelp_mul_1[DS]
+mpihelp_mul_1:
+	.long .mpihelp_mul_1[PR], TOC[tc0], 0
+	.csect .mpihelp_mul_1[PR]
+.mpihelp_mul_1:
+
+	cal	3,-4(3) 	# bias res_ptr by -4; the stores below use offset 4
+	l	0,0(4)		# r0 = least significant s1 limb
+	cmpi	0,6,0		# sign of s2_limb, tested by the blt below
+	mtctr	5		# put limb count in CTR
+	mul	9,0,6		# signed multiply: high limb in r9, low limb in MQ
+	srai	7,0,31		# r7 = sign mask of the s1 limb
+	and	7,7,6		# r7 = s2_limb if the s1 limb is negative, else 0
+	mfmq	8		# r8 = low limb of the product
+	ai	0,0,0		# reset carry
+	cax	9,9,7		# adjust high limb for negative limb from s1
+	blt	Lneg		# s2_limb is negative: use the second loop
+Lpos:	bdz	Lend
+Lploop: lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb and update res_ptr
+	cmpi	0,0,0		# sign of the s1 limb, tested by the bge below
+	mul	10,0,6		# high limb in r10, low limb in MQ
+	mfmq	0		# r0 = low limb (the s1 limb is no longer needed)
+	ae	8,0,9		# low limb + old high limb + old cy
+	bge	Lp0
+	cax	10,10,6 	# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)		# second half: same steps with r9 and r10 swapped
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+	b	Lend
+
+Lneg:	cax	9,9,0		# adjust first high limb for negative s2_limb
+	bdz	Lend
+Lnloop: lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb and update res_ptr
+	cmpi	0,0,0		# sign of the s1 limb, tested by the bge below
+	mul	10,0,6		# high limb in r10, low limb in MQ
+	cax	10,10,0 	# adjust high limb for negative s2_limb
+	mfmq	0		# r0 = low limb
+	ae	8,0,9		# low limb + old high limb + old cy
+	bge	Ln0
+	cax	10,10,6 	# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)		# second half: same steps with r9 and r10 swapped
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	cax	9,9,0		# adjust high limb for negative s2_limb
+	mfmq	0
+	ae	8,0,10
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10) 	# the last high limb is in r10: copy it to r9
+Lend:	st	8,4(3)		# store the last result limb
+	aze	3,9		# return value = last high limb + cy
+	br
+
--- a/mpi/power/mpih-mul2.S
+++ b/mpi/power/mpih-mul2.S
@ -0,0 +1,130 @@
+/* IBM POWER addmul_1 -- Multiply a limb vector with a limb and add
+ *			 the result to a second limb vector.
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+
+
+/*
+# mpihelp_addmul_1: multiply the size limbs at s1_ptr by s2_limb, add the
+# product to the size limbs at res_ptr in place and return the resulting
+# carry limb in r3.  There is one loop for a non-negative s2_limb
+# (Lploop) and one for a negative s2_limb (Lnloop); both are unrolled
+# twice and alternate between r9 and r10 as the carry limb.
+#
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.	We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+ */
+
+	.toc
+	.csect .mpihelp_addmul_1[PR]
+	.align 2
+	.globl mpihelp_addmul_1
+	.globl .mpihelp_addmul_1
+	.csect mpihelp_addmul_1[DS]
+mpihelp_addmul_1:
+	.long .mpihelp_addmul_1[PR], TOC[tc0], 0
+	.csect .mpihelp_addmul_1[PR]
+.mpihelp_addmul_1:
+
+	cal	3,-4(3) 	# bias res_ptr by -4; loads and stores below use offset 4
+	l	0,0(4)		# r0 = least significant s1 limb
+	cmpi	0,6,0		# sign of s2_limb, tested by the blt below
+	mtctr	5		# put limb count in CTR
+	mul	9,0,6		# signed multiply: high limb in r9, low limb in MQ
+	srai	7,0,31		# r7 = sign mask of the s1 limb
+	and	7,7,6		# r7 = s2_limb if the s1 limb is negative, else 0
+	mfmq	8		# r8 = low limb of the product
+	cax	9,9,7		# adjust high limb for negative limb from s1
+	l	7,4(3)		# r7 = least significant res limb
+	a	8,8,7		# add res_limb
+	blt	Lneg		# s2_limb is negative: use the second loop
+Lpos:	bdz	Lend
+
+Lploop: lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb and update res_ptr
+	cmpi	0,0,0		# sign of the s1 limb, tested by the bge below
+	mul	10,0,6		# high limb in r10, low limb in MQ
+	mfmq	0		# r0 = low limb (the s1 limb is no longer needed)
+	ae	8,0,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)		# r7 = res limb at the current position
+	aze	10,10		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Lp0
+	cax	10,10,6 	# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)		# second half: same steps with r9 and r10 swapped
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10
+	l	7,4(3)
+	aze	9,9
+	a	8,8,7
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+
+	b	Lend
+
+Lneg:	cax	9,9,0		# adjust first high limb for negative s2_limb
+	bdz	Lend
+Lnloop: lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb and update res_ptr
+	cmpi	0,0,0		# sign of the s1 limb, tested by the bge below
+	mul	10,0,6		# high limb in r10, low limb in MQ
+	mfmq	7		# r7 = low limb; r0 keeps the s1 limb
+	ae	8,7,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)		# r7 = res limb at the current position
+	ae	10,10,0 	# propagate cy to new cy_limb and add the s1 limb (negative s2_limb)
+	a	8,8,7		# add res_limb
+	bge	Ln0
+	cax	10,10,6 	# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)		# second half: same steps with r9 and r10 swapped
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	8,7,10
+	l	7,4(3)
+	ae	9,9,0		# propagate cy to new cy_limb and add the s1 limb (negative s2_limb)
+	a	8,8,7		# add res_limb
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10) 	# the last cy_limb is in r10: copy it to r9
+Lend:	st	8,4(3)		# store the last result limb
+	aze	3,9		# return value = last cy_limb + cy
+	br
+
--- a/mpi/power/mpih-mul3.S
+++ b/mpi/power/mpih-mul3.S
@ -0,0 +1,135 @@
+/* IBM POWER submul_1 -- Multiply a limb vector with a limb and subtract
+ *			 the result from a second limb vector.
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+
+/*
+# mpihelp_submul_1: multiply the size limbs at s1_ptr by s2_limb,
+# subtract the product from the size limbs at res_ptr in place and
+# return the resulting borrow limb in r3.  There is one loop for a
+# non-negative s2_limb (Lploop) and one for a negative s2_limb (Lnloop);
+# both are unrolled twice and alternate between r9 and r10 as the
+# carry limb.
+
+# INPUT PARAMETERS
+# res_ptr	r3
+# s1_ptr	r4
+# size		r5
+# s2_limb	r6
+
+# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction.  To
+# obtain that operation, we have to use the 32x32->64 signed multiplication
+# instruction, and add the appropriate compensation to the high limb of the
+# result.  We add the multiplicand if the multiplier has its most significant
+# bit set, and we add the multiplier if the multiplicand has its most
+# significant bit set.	We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work).  Since the POWER architecture has a branch unit
+# we can branch in zero cycles, so that's how we perform the additions.
+ */
+
+	.toc
+	.csect .mpihelp_submul_1[PR]
+	.align 2
+	.globl mpihelp_submul_1
+	.globl .mpihelp_submul_1
+	.csect mpihelp_submul_1[DS]
+mpihelp_submul_1:
+	.long .mpihelp_submul_1[PR], TOC[tc0], 0
+	.csect .mpihelp_submul_1[PR]
+.mpihelp_submul_1:
+
+	cal	3,-4(3) 	# bias res_ptr by -4; loads and stores below use offset 4
+	l	0,0(4)		# r0 = least significant s1 limb
+	cmpi	0,6,0		# sign of s2_limb, tested by the blt below
+	mtctr	5		# put limb count in CTR
+	mul	9,0,6		# signed multiply: high limb in r9, low limb in MQ
+	srai	7,0,31		# r7 = sign mask of the s1 limb
+	and	7,7,6		# r7 = s2_limb if the s1 limb is negative, else 0
+	mfmq	11		# r11 = low limb of the product
+	cax	9,9,7		# adjust high limb for negative limb from s1
+	l	7,4(3)		# r7 = least significant res limb
+	sf	8,11,7		# r8 = res_limb - low limb
+	a	11,8,11 	# invert cy (r11 is junk)
+	blt	Lneg		# s2_limb is negative: use the second loop
+Lpos:	bdz	Lend
+
+Lploop: lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb and update res_ptr
+	cmpi	0,0,0		# sign of the s1 limb, tested by the bge below
+	mul	10,0,6		# high limb in r10, low limb in MQ
+	mfmq	0		# r0 = low limb (the s1 limb is no longer needed)
+	ae	11,0,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)		# r7 = res limb at the current position
+	aze	10,10		# propagate cy to new cy_limb
+	sf	8,11,7		# r8 = res_limb - r11
+	a	11,8,11 	# invert cy (r11 is junk)
+	bge	Lp0
+	cax	10,10,6 	# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)		# second half: same steps with r9 and r10 swapped
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	11,0,10
+	l	7,4(3)
+	aze	9,9
+	sf	8,11,7
+	a	11,8,11 	# invert cy (r11 is junk)
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+
+	b	Lend
+
+Lneg:	cax	9,9,0		# adjust first high limb for negative s2_limb
+	bdz	Lend
+Lnloop: lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb and update res_ptr
+	cmpi	0,0,0		# sign of the s1 limb, tested by the bge below
+	mul	10,0,6		# high limb in r10, low limb in MQ
+	mfmq	7		# r7 = low limb; r0 keeps the s1 limb
+	ae	11,7,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)		# r7 = res limb at the current position
+	ae	10,10,0 	# propagate cy to new cy_limb and add the s1 limb (negative s2_limb)
+	sf	8,11,7		# r8 = res_limb - r11
+	a	11,8,11 	# invert cy (r11 is junk)
+	bge	Ln0
+	cax	10,10,6 	# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)		# second half: same steps with r9 and r10 swapped
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	11,7,10
+	l	7,4(3)
+	ae	9,9,0		# propagate cy to new cy_limb and add the s1 limb (negative s2_limb)
+	sf	8,11,7		# r8 = res_limb - r11
+	a	11,8,11 	# invert cy (r11 is junk)
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10) 	# the last cy_limb is in r10: copy it to r9
+Lend:	st	8,4(3)		# store the last result limb
+	aze	3,9		# return value = last cy_limb + cy
+	br
+
--- a/mpi/power/mpih-rshift.S
+++ b/mpi/power/mpih-rshift.S
@ -0,0 +1,64 @@
+/* IBM POWER rshift
+ *
+ * Copyright (C) 1992, 1994, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+
+/*
+# INPUT PARAMETERS  (shift a limb vector right by cnt bits; the carry limb is returned in r3)
+# res_ptr	r3	where the result limbs are stored
+# s_ptr 	r4	source limbs, least significant limb first
+# size		r5	number of limbs, used as the loop count
+# cnt		r6	shift count in bits
+*/
+
+	.toc
+	.extern mpihelp_rshift[DS]
+	.extern .mpihelp_rshift
+.csect [PR]
+	.align 2
+	.globl mpihelp_rshift
+	.globl .mpihelp_rshift
+	.csect mpihelp_rshift[DS]
+mpihelp_rshift:
+	.long .mpihelp_rshift, TOC[tc0], 0	# descriptor words: code address, TOC anchor, 0
+	.csect [PR]
+.mpihelp_rshift:
+	sfi	8,6,32		# r8 = 32 - cnt
+	mtctr	5		# put limb count in CTR loop register
+	l	0,0(4)		# read least significant limb
+	ai	9,3,-4		# r9 = res_ptr - 4, because each stu stores at 4(9)
+	sle	3,0,8		# compute carry limb (return value, limb << r8), and init MQ register
+	bdz	Lend2		# if just one limb, skip loop
+	lu	0,4(4)		# read 2nd least significant limb
+	sleq	7,0,8		# compute least significant limb of result
+	bdz	Lend		# if just two limbs, skip loop
+Loop:	lu	0,4(4)		# load next higher limb
+	stu	7,4(9)		# store previous result during read latency
+	sleq	7,0,8		# compute result limb: (this limb << r8) merged with previous limb >> cnt via MQ
+	bdn	Loop		# loop back until CTR is zero
+Lend:	stu	7,4(9)		# store 2nd most significant limb
+Lend2:	sre	7,0,6		# compute most significant limb (last source limb >> cnt)
+	st	7,4(9)		# store it
+	br			# return; r3 still holds the carry limb
+
+
--- a/mpi/power/mpih-sub1.S
+++ b/mpi/power/mpih-sub1.S
@ -0,0 +1,87 @@
+/* IBM POWER sub_n -- Subtract two limb vectors of equal, non-zero length.
+ *
+ * Copyright (C) 1992, 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+/*
+# INPUT PARAMETERS  (res = s1 - s2 limb by limb; r3 returns !cy, which is 0 or 1)
+# res_ptr	r3	where the result limbs are stored
+# s1_ptr	r4	limbs to subtract from, least significant first
+# s2_ptr	r5	limbs to subtract, least significant first
+# size		r6	number of limbs in each vector, non-zero
+ */
+
+	.toc
+	.extern mpihelp_sub_n[DS]
+	.extern .mpihelp_sub_n
+.csect [PR]
+	.align 2
+	.globl mpihelp_sub_n
+	.globl .mpihelp_sub_n
+	.csect mpihelp_sub_n[DS]
+mpihelp_sub_n:
+	.long .mpihelp_sub_n, TOC[tc0], 0	# descriptor words: code address, TOC anchor, 0
+	.csect [PR]
+.mpihelp_sub_n:
+	andil.	10,6,1		# odd or even number of limbs?
+	l	8,0(4)		# load least significant s1 limb
+	l	0,0(5)		# load least significant s2 limb
+	cal	3,-4(3) 	# offset res_ptr by -4; every store addresses 4(3) or 8(3)
+	sri	10,6,1		# count for unrolled loop
+	sf	7,0,8		# subtract least significant limbs, set cy
+	mtctr	10		# copy count into CTR
+	beq	0,Leven 	# branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs.  Subtract the first limbs separately.
+	cmpi	1,10,0		# is count for unrolled loop zero?
+	bne	1,L1		# branch if not
+	st	7,4(3)		# size is 1: store the only result limb
+	sfe	3,0,0		# load !cy into ...
+	sfi	3,3,0		# ... return value register
+	br			# return
+
+# We subtracted least significant limbs.  Now reload the next limbs to enter loop.
+L1:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	stu	7,4(3)		# store first result limb and update res_ptr
+	sfe	7,0,8		# subtract limbs, set cy
+Leven:	lu	9,4(4)		# load s1 limb and update s1_ptr
+	lu	10,4(5) 	# load s2 limb and update s2_ptr
+	bdz	Lend		# If done, skip loop
+
+Loop:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	sfe	11,10,9 	# subtract previous limbs with cy, set cy
+	stu	7,4(3)		# store older pending result limb and update res_ptr
+	lu	9,4(4)		# load s1 limb and update s1_ptr
+	lu	10,4(5) 	# load s2 limb and update s2_ptr
+	sfe	7,0,8		# subtract previous limbs with cy, set cy
+	stu	11,4(3) 	# store result limb and update res_ptr
+	bdn	Loop		# decrement CTR and loop back
+
+Lend:	sfe	11,10,9 	# subtract limbs with cy, set cy
+	st	7,4(3)		# store second most significant result limb
+	st	11,8(3) 	# store most significant result limb
+	sfe	3,0,0		# load !cy into ...
+	sfi	3,3,0		# ... return value register
+	br			# return
+