51 files changed, 10923 insertions, 1785 deletions
diff --git a/crypto/bn/.cvsignore b/crypto/bn/.cvsignore
index c6d03a9dbc..ebe4b61bb3 100644
--- a/crypto/bn/.cvsignore
+++ b/crypto/bn/.cvsignore
@@ -1,2 +1,7 @@
 lib
 Makefile.save
+*.flc
+semantic.cache
+co-*.s
+bn-*.s
+*-mont.s
diff --git a/crypto/bn/Makefile.ssl b/crypto/bn/Makefile.ssl
deleted file mode 100644
index 817bfa54cd..0000000000
--- a/crypto/bn/Makefile.ssl
+++ /dev/null
@@ -1,338 +0,0 @@
-#
-# SSLeay/crypto/bn/Makefile
-#
-
-DIR=	bn
-TOP=	../..
-CC=	cc
-CPP=    $(CC) -E
-INCLUDES= -I.. -I$(TOP) -I../../include
-CFLAG=-g
-INSTALL_PREFIX=
-OPENSSLDIR=     /usr/local/ssl
-INSTALLTOP=/usr/local/ssl
-MAKE=		make -f Makefile.ssl
-MAKEDEPPROG=	makedepend
-MAKEDEPEND=	$(TOP)/util/domd $(TOP) -MD $(MAKEDEPPROG)
-MAKEFILE=	Makefile.ssl
-AR=		ar r
-
-BN_ASM=		bn_asm.o
-# or use
-#BN_ASM=	bn86-elf.o
-
-CFLAGS= $(INCLUDES) $(CFLAG)
-ASFLAGS= $(INCLUDES) $(ASFLAG)
-AFLAGS= $(ASFLAGS)
-
-GENERAL=Makefile
-TEST=bntest.c exptest.c
-APPS=
-
-LIB=$(TOP)/libcrypto.a
-LIBSRC=	bn_add.c bn_div.c bn_exp.c bn_lib.c bn_ctx.c bn_mul.c bn_mod.c \
-	bn_print.c bn_rand.c bn_shift.c bn_word.c bn_blind.c \
-	bn_kron.c bn_sqrt.c bn_gcd.c bn_prime.c bn_err.c bn_sqr.c bn_asm.c \
-	bn_recp.c bn_mont.c bn_mpi.c bn_exp2.c bn_gf2m.c bn_nist.c \
-	bn_depr.c
-
-LIBOBJ=	bn_add.o bn_div.o bn_exp.o bn_lib.o bn_ctx.o bn_mul.o bn_mod.o \
-	bn_print.o bn_rand.o bn_shift.o bn_word.o bn_blind.o \
-	bn_kron.o bn_sqrt.o bn_gcd.o bn_prime.o bn_err.o bn_sqr.o $(BN_ASM) \
-	bn_recp.o bn_mont.o bn_mpi.o bn_exp2.o bn_gf2m.o bn_nist.o \
-	bn_depr.o
-
-SRC= $(LIBSRC)
-
-EXHEADER= bn.h
-HEADER=	bn_lcl.h bn_prime.h $(EXHEADER)
-
-ALL=    $(GENERAL) $(SRC) $(HEADER)
-
-top:
-	(cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
-
-all:	lib
-
-bn_prime.h: bn_prime.pl
-	$(PERL) bn_prime.pl >bn_prime.h
-
-divtest: divtest.c ../../libcrypto.a
-	cc -I../../include divtest.c -o divtest ../../libcrypto.a
-
-bnbug: bnbug.c ../../libcrypto.a top
-	cc -g -I../../include bnbug.c -o bnbug ../../libcrypto.a
-
-lib:	$(LIBOBJ)
-	$(AR) $(LIB) $(LIBOBJ)
-	$(RANLIB) $(LIB) || echo Never mind.
-	@touch lib
-
-# ELF
-bn86-elf.s:	asm/bn-586.pl ../perlasm/x86asm.pl
-	(cd asm; $(PERL) bn-586.pl elf $(CFLAGS) > ../$@)
-co86-elf.s:	asm/co-586.pl ../perlasm/x86asm.pl
-	(cd asm; $(PERL) co-586.pl elf $(CFLAGS) > ../$@)
-# COFF
-bn86-cof.s: asm/bn-586.pl ../perlasm/x86asm.pl
-	(cd asm; $(PERL) bn-586.pl coff $(CFLAGS) > ../$@)
-co86-cof.s: asm/co-586.pl ../perlasm/x86asm.pl
-	(cd asm; $(PERL) co-586.pl coff $(CFLAGS) > ../$@)
-# a.out
-bn86-out.s: asm/bn-586.pl ../perlasm/x86asm.pl
-	(cd asm; $(PERL) bn-586.pl a.out $(CFLAGS) > ../$@)
-co86-out.s: asm/co-586.pl ../perlasm/x86asm.pl
-	(cd asm; $(PERL) co-586.pl a.out $(CFLAGS) > ../$@)
-
-sparcv8.o:	asm/sparcv8.S
-sparcv8plus.o:	asm/sparcv8plus.S
-mips3.o:	asm/mips3.s
-
-x86_64-gcc.o:	asm/x86_64-gcc.c
-
-bn-ia64.s:	asm/ia64.S
-	$(CC) $(CFLAGS) -E asm/ia64.S > $@
-
-# GNU assembler fails to compile PA-RISC2 modules, insist on calling
-# vendor assembler...
-pa-risc2W.o: asm/pa-risc2W.s
-	/usr/ccs/bin/as -o pa-risc2W.o asm/pa-risc2W.s
-pa-risc2.o: asm/pa-risc2.s
-	/usr/ccs/bin/as -o pa-risc2.o asm/pa-risc2.s
-
-# ppc - AIX, Linux, MacOS X...
-linux_ppc32.s: asm/ppc.pl;	$(PERL) $< $@
-linux_ppc64.s: asm/ppc.pl;	$(PERL) $< $@
-aix_ppc32.s: asm/ppc.pl;	$(PERL) asm/ppc.pl $@
-aix_ppc64.s: asm/ppc.pl;	$(PERL) asm/ppc.pl $@
-osx_ppc32.s: asm/ppc.pl;	$(PERL) $< $@
-
-files:
-	$(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO
-
-links:
-	@sh $(TOP)/util/point.sh Makefile.ssl Makefile
-	@$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
-	@$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
-	@$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
-
-install:
-	@headerlist="$(EXHEADER)"; for i in $$headerlist ; \
-	do  \
-	(cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
-	chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
-	done;
-
-exptest:
-	rm -f exptest
-	gcc -I../../include -g2 -ggdb -o exptest exptest.c ../../libcrypto.a
-
-div:
-	rm -f a.out
-	gcc -I.. -g div.c ../../libcrypto.a
-
-tags:
-	ctags $(SRC)
-
-tests:
-
-lint:
-	lint -DLINT $(INCLUDES) $(SRC)>fluff
-
-depend:
-	$(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
-
-dclean:
-	$(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
-	mv -f Makefile.new $(MAKEFILE)
-
-clean:
-	rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
-
-# DO NOT DELETE THIS LINE -- make depend depends on it.
-
-bn_add.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_add.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_add.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_add.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_add.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_add.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_add.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_add.c bn_lcl.h
-bn_asm.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_asm.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_asm.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_asm.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_asm.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_asm.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_asm.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_asm.c bn_lcl.h
-bn_blind.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_blind.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_blind.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_blind.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_blind.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_blind.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_blind.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_blind.c bn_lcl.h
-bn_ctx.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_ctx.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_ctx.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_ctx.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_ctx.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_ctx.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_ctx.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_ctx.c bn_lcl.h
-bn_depr.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_depr.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_depr.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_depr.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_depr.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_depr.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
-bn_depr.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-bn_depr.o: ../cryptlib.h bn_depr.c bn_lcl.h
-bn_div.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_div.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_div.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_div.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_div.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_div.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_div.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_div.c bn_lcl.h
-bn_err.o: ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_err.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-bn_err.o: ../../include/openssl/err.h ../../include/openssl/lhash.h
-bn_err.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
-bn_err.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
-bn_err.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-bn_err.o: bn_err.c
-bn_exp.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_exp.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_exp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h
-bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_exp2.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_exp2.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_exp2.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_exp2.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp2.c bn_lcl.h
-bn_gcd.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_gcd.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_gcd.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_gcd.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_gcd.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_gcd.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_gcd.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_gcd.c bn_lcl.h
-bn_gf2m.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_gf2m.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_gf2m.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_gf2m.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_gf2m.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_gf2m.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_gf2m.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_gf2m.c bn_lcl.h
-bn_kron.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_kron.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_kron.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_kron.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_kron.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_kron.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_kron.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_kron.c bn_lcl.h
-bn_lib.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_lib.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_lib.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_lib.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_lib.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_lib.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_lib.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_lib.c
-bn_mod.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_mod.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_mod.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_mod.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_mod.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_mod.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_mod.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mod.c
-bn_mont.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_mont.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_mont.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_mont.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_mont.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_mont.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_mont.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mont.c
-bn_mpi.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_mpi.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_mpi.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_mpi.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_mpi.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_mpi.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_mpi.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mpi.c
-bn_mul.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_mul.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_mul.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_mul.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_mul.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_mul.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_mul.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mul.c
-bn_nist.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_nist.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_nist.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_nist.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_nist.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_nist.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_nist.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_nist.c
-bn_prime.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_prime.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_prime.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_prime.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_prime.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_prime.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
-bn_prime.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-bn_prime.o: ../cryptlib.h bn_lcl.h bn_prime.c bn_prime.h
-bn_print.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_print.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_print.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_print.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_print.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_print.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_print.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_print.c
-bn_rand.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_rand.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_rand.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_rand.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_rand.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_rand.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
-bn_rand.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-bn_rand.o: ../cryptlib.h bn_lcl.h bn_rand.c
-bn_recp.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_recp.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_recp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_recp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_recp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_recp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_recp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_recp.c
-bn_shift.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_shift.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_shift.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_shift.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_shift.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_shift.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_shift.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_shift.c
-bn_sqr.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_sqr.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_sqr.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_sqr.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_sqr.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_sqr.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_sqr.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_sqr.c
-bn_sqrt.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_sqrt.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_sqrt.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_sqrt.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_sqrt.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_sqrt.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_sqrt.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_sqrt.c
-bn_word.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_word.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_word.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_word.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_word.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_word.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_word.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_word.c
diff --git a/crypto/bn/asm/.cvsignore b/crypto/bn/asm/.cvsignore
index 671eb02019..26475028f5 100644
--- a/crypto/bn/asm/.cvsignore
+++ b/crypto/bn/asm/.cvsignore
@@ -2,3 +2,5 @@ bn86unix.cpp
 co86unix.cpp
 bn86-elf.s
 co86-elf.s
+*.flc
+semantic.cache
diff --git a/crypto/bn/asm/alpha-mont.pl b/crypto/bn/asm/alpha-mont.pl
new file mode 100644
index 0000000000..7a2cc3173b
--- /dev/null
+++ b/crypto/bn/asm/alpha-mont.pl
@@ -0,0 +1,317 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# On 21264 RSA sign performance improves by 70/35/20/15 percent for
+# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
+# instructed to '-tune host' code with in-line assembler. Other
+# benchmarks improve by 15-20%. To anchor it to something else, the
+# code provides approximately the same performance per GHz as AMD64.
+# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
+# difference.
+
+# int bn_mul_mont(
+$rp="a0";	# BN_ULONG *rp,
+$ap="a1";	# const BN_ULONG *ap,
+$bp="a2";	# const BN_ULONG *bp,
+$np="a3";	# const BN_ULONG *np,
+$n0="a4";	# const BN_ULONG *n0,
+$num="a5";	# int num);
+
+$lo0="t0";
+$hi0="t1";
+$lo1="t2";
+$hi1="t3";
+$aj="t4";
+$bi="t5";
+$nj="t6";
+$tp="t7";
+$alo="t8";
+$ahi="t9";
+$nlo="t10";
+$nhi="t11";
+$tj="t12";
+$i="s3";
+$j="s4";
+$m1="s5";
+
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+
+.text
+
+.set	noat
+.set	noreorder
+
+.globl	bn_mul_mont
+.align	5
+.ent	bn_mul_mont
+bn_mul_mont:
+	lda	sp,-40(sp)
+	stq	ra,0(sp)
+	stq	s3,8(sp)
+	stq	s4,16(sp)
+	stq	s5,24(sp)
+	stq	fp,32(sp)
+	mov	sp,fp
+	.mask	0x0400f000,-40
+	.frame	fp,40,ra
+	.prologue 0
+
+	.align	4
+	.set	reorder
+	sextl	$num,$num
+	mov	0,v0
+	cmplt	$num,4,AT
+	bne	AT,.Lexit
+
+	ldq	$hi0,0($ap)	# ap[0]
+	s8addq	$num,16,AT
+	ldq	$aj,8($ap)
+	subq	sp,AT,sp
+	ldq	$bi,0($bp)	# bp[0]
+	mov	-4096,AT
+	ldq	$n0,0($n0)
+	and	sp,AT,sp
+
+	mulq	$hi0,$bi,$lo0
+	ldq	$hi1,0($np)	# np[0]
+	umulh	$hi0,$bi,$hi0
+	ldq	$nj,8($np)
+
+	mulq	$lo0,$n0,$m1
+
+	mulq	$hi1,$m1,$lo1
+	umulh	$hi1,$m1,$hi1
+
+	addq	$lo1,$lo0,$lo1
+	cmpult	$lo1,$lo0,AT
+	addq	$hi1,AT,$hi1
+
+	mulq	$aj,$bi,$alo
+	mov	2,$j
+	umulh	$aj,$bi,$ahi
+	mov	sp,$tp
+
+	mulq	$nj,$m1,$nlo
+	s8addq	$j,$ap,$aj
+	umulh	$nj,$m1,$nhi
+	s8addq	$j,$np,$nj
+.align	4
+.L1st:
+	.set	noreorder
+	ldq	$aj,($aj)
+	addl	$j,1,$j
+	ldq	$nj,($nj)
+	lda	$tp,8($tp)
+
+	addq	$alo,$hi0,$lo0
+	mulq	$aj,$bi,$alo
+	cmpult	$lo0,$hi0,AT
+	addq	$nlo,$hi1,$lo1
+
+	mulq	$nj,$m1,$nlo
+	addq	$ahi,AT,$hi0
+	cmpult	$lo1,$hi1,v0
+	cmplt	$j,$num,$tj
+
+	umulh	$aj,$bi,$ahi
+	addq	$nhi,v0,$hi1
+	addq	$lo1,$lo0,$lo1
+	s8addq	$j,$ap,$aj
+
+	umulh	$nj,$m1,$nhi
+	cmpult	$lo1,$lo0,v0
+	addq	$hi1,v0,$hi1
+	s8addq	$j,$np,$nj
+
+	stq	$lo1,-8($tp)
+	nop
+	unop
+	bne	$tj,.L1st
+	.set	reorder
+
+	addq	$alo,$hi0,$lo0
+	addq	$nlo,$hi1,$lo1
+	cmpult	$lo0,$hi0,AT
+	cmpult	$lo1,$hi1,v0
+	addq	$ahi,AT,$hi0
+	addq	$nhi,v0,$hi1
+
+	addq	$lo1,$lo0,$lo1
+	cmpult	$lo1,$lo0,v0
+	addq	$hi1,v0,$hi1
+
+	stq	$lo1,0($tp)
+
+	addq	$hi1,$hi0,$hi1
+	cmpult	$hi1,$hi0,AT
+	stq	$hi1,8($tp)
+	stq	AT,16($tp)
+
+	mov	1,$i
+.align	4
+.Louter:
+	s8addq	$i,$bp,$bi
+	ldq	$hi0,($ap)
+	ldq	$aj,8($ap)
+	ldq	$bi,($bi)
+	ldq	$hi1,($np)
+	ldq	$nj,8($np)
+	ldq	$tj,(sp)
+
+	mulq	$hi0,$bi,$lo0
+	umulh	$hi0,$bi,$hi0
+
+	addq	$lo0,$tj,$lo0
+	cmpult	$lo0,$tj,AT
+	addq	$hi0,AT,$hi0
+
+	mulq	$lo0,$n0,$m1
+
+	mulq	$hi1,$m1,$lo1
+	umulh	$hi1,$m1,$hi1
+
+	addq	$lo1,$lo0,$lo1
+	cmpult	$lo1,$lo0,AT
+	mov	2,$j
+	addq	$hi1,AT,$hi1
+
+	mulq	$aj,$bi,$alo
+	mov	sp,$tp
+	umulh	$aj,$bi,$ahi
+
+	mulq	$nj,$m1,$nlo
+	s8addq	$j,$ap,$aj
+	umulh	$nj,$m1,$nhi
+.align	4
+.Linner:
+	.set	noreorder
+	ldq	$tj,8($tp)	#L0
+	nop			#U1
+	ldq	$aj,($aj)	#L1
+	s8addq	$j,$np,$nj	#U0
+
+	ldq	$nj,($nj)	#L0
+	nop			#U1
+	addq	$alo,$hi0,$lo0	#L1
+	lda	$tp,8($tp)
+
+	mulq	$aj,$bi,$alo	#U1
+	cmpult	$lo0,$hi0,AT	#L0
+	addq	$nlo,$hi1,$lo1	#L1
+	addl	$j,1,$j
+
+	mulq	$nj,$m1,$nlo	#U1
+	addq	$ahi,AT,$hi0	#L0
+	addq	$lo0,$tj,$lo0	#L1
+	cmpult	$lo1,$hi1,v0	#U0
+
+	umulh	$aj,$bi,$ahi	#U1
+	cmpult	$lo0,$tj,AT	#L0
+	addq	$lo1,$lo0,$lo1	#L1
+	addq	$nhi,v0,$hi1	#U0
+
+	umulh	$nj,$m1,$nhi	#U1
+	s8addq	$j,$ap,$aj	#L0
+	cmpult	$lo1,$lo0,v0	#L1
+	cmplt	$j,$num,$tj	#U0	# borrow $tj
+
+	addq	$hi0,AT,$hi0	#L0
+	addq	$hi1,v0,$hi1	#U1
+	stq	$lo1,-8($tp)	#L1
+	bne	$tj,.Linner	#U0
+	.set	reorder
+
+	ldq	$tj,8($tp)
+	addq	$alo,$hi0,$lo0
+	addq	$nlo,$hi1,$lo1
+	cmpult	$lo0,$hi0,AT
+	cmpult	$lo1,$hi1,v0
+	addq	$ahi,AT,$hi0
+	addq	$nhi,v0,$hi1
+
+	addq	$lo0,$tj,$lo0
+	cmpult	$lo0,$tj,AT
+	addq	$hi0,AT,$hi0
+
+	ldq	$tj,16($tp)
+	addq	$lo1,$lo0,$j
+	cmpult	$j,$lo0,v0
+	addq	$hi1,v0,$hi1
+
+	addq	$hi1,$hi0,$lo1
+	stq	$j,($tp)
+	cmpult	$lo1,$hi0,$hi1
+	addq	$lo1,$tj,$lo1
+	cmpult	$lo1,$tj,AT
+	addl	$i,1,$i
+	addq	$hi1,AT,$hi1
+	stq	$lo1,8($tp)
+	cmplt	$i,$num,$tj	# borrow $tj
+	stq	$hi1,16($tp)
+	bne	$tj,.Louter
+
+	s8addq	$num,sp,$tj	# &tp[num]
+	mov	$rp,$bp		# put rp aside
+	mov	sp,$tp
+	mov	sp,$ap
+	mov	0,$hi0		# clear borrow bit
+
+.align	4
+.Lsub:	ldq	$lo0,($tp)
+	ldq	$lo1,($np)
+	lda	$tp,8($tp)
+	lda	$np,8($np)
+	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
+	cmpult	$lo0,$lo1,AT
+	subq	$lo1,$hi0,$lo0
+	cmpult	$lo1,$lo0,$hi0
+	or	$hi0,AT,$hi0
+	stq	$lo0,($rp)
+	cmpult	$tp,$tj,v0
+	lda	$rp,8($rp)
+	bne	v0,.Lsub
+
+	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
+	mov	sp,$tp
+	mov	$bp,$rp		# restore rp
+
+	and	sp,$hi0,$ap
+	bic	$bp,$hi0,$bp
+	bis	$bp,$ap,$ap	# ap=borrow?tp:rp
+
+.align	4
+.Lcopy:	ldq	$aj,($ap)	# copy or in-place refresh
+	lda	$tp,8($tp)
+	lda	$rp,8($rp)
+	lda	$ap,8($ap)
+	stq	zero,-8($tp)	# zap tp
+	cmpult	$tp,$tj,AT
+	stq	$aj,-8($rp)
+	bne	AT,.Lcopy
+	mov	1,v0
+
+.Lexit:
+	.set	noreorder
+	mov	fp,sp
+	/*ldq	ra,0(sp)*/
+	ldq	s3,8(sp)
+	ldq	s4,16(sp)
+	ldq	s5,24(sp)
+	ldq	fp,32(sp)
+	lda	sp,40(sp)
+	ret	(ra)
+.end	bn_mul_mont
+.rdata
+.asciiz	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
new file mode 100644
index 0000000000..05d5dc1a48
--- /dev/null
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -0,0 +1,200 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2007.
+
+# Montgomery multiplication for ARMv4.
+#
+# Performance improvement naturally varies among CPU implementations
+# and compilers. The code was observed to provide +65-35% improvement
+# [depending on key length, less for longer keys] on ARM920T, and
+# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
+# base and compiler generated code with in-lined umull and even umlal
+# instructions. The latter means that this code didn't really have an 
+# "advantage" of utilizing some "secret" instruction.
+#
+# The code is interoperable with Thumb ISA and is rather compact, less
+# than 1/2KB. Windows CE port would be trivial, as it's exclusively
+# about decorations, ABI and instruction syntax are identical.
+
+$num="r0";	# starts as num argument, but holds &tp[num-1]
+$ap="r1";
+$bp="r2"; $bi="r2"; $rp="r2";
+$np="r3";
+$tp="r4";
+$aj="r5";
+$nj="r6";
+$tj="r7";
+$n0="r8";
+###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
+$alo="r10";	# sl, gcc uses it to keep @GOT
+$ahi="r11";	# fp
+$nlo="r12";	# ip
+###########	# r13 is stack pointer
+$nhi="r14";	# lr
+###########	# r15 is program counter
+
+#### argument block layout relative to &tp[num-1], a.k.a. $num
+$_rp="$num,#12*4";
+# ap permanently resides in r1
+$_bp="$num,#13*4";
+# np permanently resides in r3
+$_n0="$num,#14*4";
+$_num="$num,#15*4";	$_bpend=$_num;
+
+$code=<<___;
+.text
+
+.global	bn_mul_mont
+.type	bn_mul_mont,%function
+
+.align	2
+bn_mul_mont:
+	stmdb	sp!,{r0,r2}		@ sp points at argument block
+	ldr	$num,[sp,#3*4]		@ load num
+	cmp	$num,#2
+	movlt	r0,#0
+	addlt	sp,sp,#2*4
+	blt	.Labrt
+
+	stmdb	sp!,{r4-r12,lr}		@ save 10 registers
+
+	mov	$num,$num,lsl#2		@ rescale $num for byte count
+	sub	sp,sp,$num		@ alloca(4*num)
+	sub	sp,sp,#4		@ +extra dword
+	sub	$num,$num,#4		@ "num=num-1"
+	add	$tp,$bp,$num		@ &bp[num-1]
+
+	add	$num,sp,$num		@ $num to point at &tp[num-1]
+	ldr	$n0,[$_n0]		@ &n0
+	ldr	$bi,[$bp]		@ bp[0]
+	ldr	$aj,[$ap],#4		@ ap[0],ap++
+	ldr	$nj,[$np],#4		@ np[0],np++
+	ldr	$n0,[$n0]		@ *n0
+	str	$tp,[$_bpend]		@ save &bp[num]
+
+	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
+	str	$n0,[$_n0]		@ save n0 value
+	mul	$n0,$alo,$n0		@ "tp[0]"*n0
+	mov	$nlo,#0
+	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
+	mov	$tp,sp
+
+.L1st:
+	ldr	$aj,[$ap],#4		@ ap[j],ap++
+	mov	$alo,$ahi
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
+	ldr	$nj,[$np],#4		@ np[j],np++
+	mov	$nhi,#0
+	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
+	adds	$nlo,$nlo,$alo
+	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
+	adc	$nlo,$nhi,#0
+	cmp	$tp,$num
+	bne	.L1st
+
+	adds	$nlo,$nlo,$ahi
+	mov	$nhi,#0
+	adc	$nhi,$nhi,#0
+	ldr	$tp,[$_bp]		@ restore bp
+	str	$nlo,[$num]		@ tp[num-1]=
+	ldr	$n0,[$_n0]		@ restore n0
+	str	$nhi,[$num,#4]		@ tp[num]=
+
+.Louter:
+	sub	$tj,$num,sp		@ "original" $num-1 value
+	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
+	sub	$np,$np,$tj		@ "rewind" np to &np[1]
+	ldr	$bi,[$tp,#4]!		@ *(++bp)
+	ldr	$aj,[$ap,#-4]		@ ap[0]
+	ldr	$nj,[$np,#-4]		@ np[0]
+	ldr	$alo,[sp]		@ tp[0]
+	ldr	$tj,[sp,#4]		@ tp[1]
+
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
+	str	$tp,[$_bp]		@ save bp
+	mul	$n0,$alo,$n0
+	mov	$nlo,#0
+	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
+	mov	$tp,sp
+
+.Linner:
+	ldr	$aj,[$ap],#4		@ ap[j],ap++
+	adds	$alo,$ahi,$tj		@ +=tp[j]
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
+	ldr	$nj,[$np],#4		@ np[j],np++
+	mov	$nhi,#0
+	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
+	ldr	$tj,[$tp,#8]		@ tp[j+1]
+	adc	$ahi,$ahi,#0
+	adds	$nlo,$nlo,$alo
+	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
+	adc	$nlo,$nhi,#0
+	cmp	$tp,$num
+	bne	.Linner
+
+	adds	$nlo,$nlo,$ahi
+	mov	$nhi,#0
+	adc	$nhi,$nhi,#0
+	adds	$nlo,$nlo,$tj
+	adc	$nhi,$nhi,#0
+	ldr	$tp,[$_bp]		@ restore bp
+	ldr	$tj,[$_bpend]		@ restore &bp[num]
+	str	$nlo,[$num]		@ tp[num-1]=
+	ldr	$n0,[$_n0]		@ restore n0
+	str	$nhi,[$num,#4]		@ tp[num]=
+
+	cmp	$tp,$tj
+	bne	.Louter
+
+	ldr	$rp,[$_rp]		@ pull rp
+	add	$num,$num,#4		@ $num to point at &tp[num]
+	sub	$aj,$num,sp		@ "original" num value
+	mov	$tp,sp			@ "rewind" $tp
+	mov	$ap,$tp			@ "borrow" $ap
+	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
+
+	subs	$tj,$tj,$tj		@ "clear" carry flag
+.Lsub:	ldr	$tj,[$tp],#4
+	ldr	$nj,[$np],#4
+	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
+	str	$tj,[$rp],#4		@ rp[j]=
+	teq	$tp,$num		@ preserve carry
+	bne	.Lsub
+	sbcs	$nhi,$nhi,#0		@ upmost carry
+	mov	$tp,sp			@ "rewind" $tp
+	sub	$rp,$rp,$aj		@ "rewind" $rp
+
+	and	$ap,$tp,$nhi
+	bic	$np,$rp,$nhi
+	orr	$ap,$ap,$np		@ ap=borrow?tp:rp
+
+.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
+	str	sp,[$tp],#4		@ zap tp
+	str	$tj,[$rp],#4
+	cmp	$tp,$num
+	bne	.Lcopy
+
+	add	sp,$num,#4		@ skip over tp[num+1]
+	ldmia	sp!,{r4-r12,lr}		@ restore registers
+	add	sp,sp,#2*4		@ skip over {r0,r2}
+	mov	r0,#1
+.Labrt:	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+.size	bn_mul_mont,.-bn_mul_mont
+.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/bn-586.pl b/crypto/bn/asm/bn-586.pl
index 26c2685a72..332ef3e91d 100644
--- a/crypto/bn/asm/bn-586.pl
+++ b/crypto/bn/asm/bn-586.pl
@@ -1,6 +1,7 @@
 #!/usr/local/bin/perl
 
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
 &asm_init($ARGV[0],$0);
@@ -24,38 +25,25 @@ sub bn_mul_add_words
 	{
 	local($name)=@_;
 
-	&function_begin($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
 
-	&comment("");
-	$Low="eax";
-	$High="edx";
-	$a="ebx";
-	$w="ebp";
-	$r="edi";
-	$c="esi";
-
-	&xor($c,$c);		# clear carry
-	&mov($r,&wparam(0));	#
-
-	&mov("ecx",&wparam(2));	#
-	&mov($a,&wparam(1));	#
-
-	&and("ecx",0xfffffff8);	# num / 8
-	&mov($w,&wparam(3));	#
-
-	&push("ecx");		# Up the stack for a tmp variable
-
-	&jz(&label("maw_finish"));
+	$r="eax";
+	$a="edx";
+	$c="ecx";
 
 	if ($sse2) {
 		&picmeup("eax","OPENSSL_ia32cap_P");
 		&bt(&DWP(0,"eax"),26);
-		&jnc(&label("maw_loop"));
+		&jnc(&label("maw_non_sse2"));
 
-		&movd("mm0",$w);		# mm0 = w
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+		&movd("mm0",&wparam(3));	# mm0 = w
 		&pxor("mm1","mm1");		# mm1 = carry_in
-
-		&set_label("maw_sse2_loop",0);
+		&jmp(&label("maw_sse2_entry"));
+		
+	&set_label("maw_sse2_unrolled",16);
 		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
 		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
 		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
@@ -112,42 +100,82 @@ sub bn_mul_add_words
 		&psrlq("mm1",32);		# mm1 = carry6
 		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
 		&movd(&DWP(28,$r,"",0),"mm1");
-		&add($r,32);
+		&lea($r,&DWP(32,$r));
 		&psrlq("mm1",32);		# mm1 = carry_out
 
-		&sub("ecx",8);
+		&sub($c,8);
+		&jz(&label("maw_sse2_exit"));
+	&set_label("maw_sse2_entry");
+		&test($c,0xfffffff8);
+		&jnz(&label("maw_sse2_unrolled"));
+
+	&set_label("maw_sse2_loop",4);
+		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
+		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
+		&pmuludq("mm2","mm0");		# a[i] *= w
+		&lea($a,&DWP(4,$a));
+		&paddq("mm1","mm3");		# carry += r[i]
+		&paddq("mm1","mm2");		# carry += a[i]*w
+		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
+		&sub($c,1);
+		&psrlq("mm1",32);		# carry = carry_high
+		&lea($r,&DWP(4,$r));
 		&jnz(&label("maw_sse2_loop"));
-
-		&movd($c,"mm1");		# c = carry_out
+	&set_label("maw_sse2_exit");
+		&movd("eax","mm1");		# c = carry_out
 		&emms();
+		&ret();
 
-		&jmp(&label("maw_finish"));
+	&set_label("maw_non_sse2",16);
 	}
 
-	&set_label("maw_loop",0);
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ebp";
+	$r="edi";
+	$c="esi";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	#
+
+	&mov("ecx",&wparam(2));	#
+	&mov($a,&wparam(1));	#
+
+	&and("ecx",0xfffffff8);	# num / 8
+	&mov($w,&wparam(3));	#
 
-	&mov(&swtmp(0),"ecx");	#
+	&push("ecx");		# Up the stack for a tmp variable
+
+	&jz(&label("maw_finish"));
+
+	&set_label("maw_loop",16);
 
 	for ($i=0; $i<32; $i+=4)
 		{
 		&comment("Round $i");
 
-		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
+		 &mov("eax",&DWP($i,$a)); 	# *a
 		&mul($w);			# *a * w
-		&add("eax",$c);		# L(t)+= *r
-		 &mov($c,&DWP($i,$r,"",0));	# L(t)+= *r
+		&add("eax",$c);			# L(t)+= c
 		&adc("edx",0);			# H(t)+=carry
-		 &add("eax",$c);		# L(t)+=c
+		 &add("eax",&DWP($i,$r));	# L(t)+= *r
 		&adc("edx",0);			# H(t)+=carry
-		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
+		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
 		&mov($c,"edx");			# c=  H(t);
 		}
 
 	&comment("");
-	&mov("ecx",&swtmp(0));	#
-	&add($a,32);
-	&add($r,32);
 	&sub("ecx",8);
+	&lea($a,&DWP(32,$a));
+	&lea($r,&DWP(32,$r));
 	&jnz(&label("maw_loop"));
 
 	&set_label("maw_finish",0);
@@ -160,16 +188,15 @@ sub bn_mul_add_words
 	for ($i=0; $i<7; $i++)
 		{
 		&comment("Tail Round $i");
-		 &mov("eax",&DWP($i*4,$a,"",0));# *a
+		 &mov("eax",&DWP($i*4,$a));	# *a
 		&mul($w);			# *a * w
 		&add("eax",$c);			# L(t)+=c
-		 &mov($c,&DWP($i*4,$r,"",0));	# L(t)+= *r
 		&adc("edx",0);			# H(t)+=carry
-		 &add("eax",$c);
+		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
 		&adc("edx",0);			# H(t)+=carry
 		 &dec("ecx") if ($i != 7-1);
-		&mov(&DWP($i*4,$r,"",0),"eax");	# *r= L(t);
-		 &mov($c,"edx");			# c=  H(t);
+		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
+		 &mov($c,"edx");		# c=  H(t);
 		&jz(&label("maw_end")) if ($i != 7-1);
 		}
 	&set_label("maw_end",0);
@@ -184,7 +211,45 @@ sub bn_mul_words
 	{
 	local($name)=@_;
 
-	&function_begin($name,"");
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+	$r="eax";
+	$a="edx";
+	$c="ecx";
+
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");
+		&bt(&DWP(0,"eax"),26);
+		&jnc(&label("mw_non_sse2"));
+
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+		&movd("mm0",&wparam(3));	# mm0 = w
+		&pxor("mm1","mm1");		# mm1 = carry = 0
+
+	&set_label("mw_sse2_loop",16);
+		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
+		&pmuludq("mm2","mm0");		# a[i] *= w
+		&lea($a,&DWP(4,$a));
+		&paddq("mm1","mm2");		# carry += a[i]*w
+		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
+		&sub($c,1);
+		&psrlq("mm1",32);		# carry = carry_high
+		&lea($r,&DWP(4,$r));
+		&jnz(&label("mw_sse2_loop"));
+
+		&movd("eax","mm1");		# return carry
+		&emms();
+		&ret();
+	&set_label("mw_non_sse2",16);
+	}
+
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
 
 	&comment("");
 	$Low="eax";
@@ -257,7 +322,40 @@ sub bn_sqr_words
 	{
 	local($name)=@_;
 
-	&function_begin($name,"");
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+	$r="eax";
+	$a="edx";
+	$c="ecx";
+
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");
+		&bt(&DWP(0,"eax"),26);
+		&jnc(&label("sqr_non_sse2"));
+
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+
+	&set_label("sqr_sse2_loop",16);
+		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
+		&pmuludq("mm0","mm0");		# a[i] *= a[i]
+		&lea($a,&DWP(4,$a));		# a++
+		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
+		&sub($c,1);
+		&lea($r,&DWP(8,$r));		# r += 2
+		&jnz(&label("sqr_sse2_loop"));
+
+		&emms();
+		&ret();
+	&set_label("sqr_non_sse2",16);
+	}
+
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
 
 	&comment("");
 	$r="esi";
@@ -313,12 +411,13 @@ sub bn_div_words
 	{
 	local($name)=@_;
 
-	&function_begin($name,"");
+	&function_begin_B($name,"");
 	&mov("edx",&wparam(0));	#
 	&mov("eax",&wparam(1));	#
-	&mov("ebx",&wparam(2));	#
-	&div("ebx");
-	&function_end($name);
+	&mov("ecx",&wparam(2));	#
+	&div("ecx");
+	&ret();
+	&function_end_B($name);
 	}
 
 sub bn_add_words
diff --git a/crypto/bn/asm/co-586.pl b/crypto/bn/asm/co-586.pl
index 5d962cb957..57101a6bd7 100644
--- a/crypto/bn/asm/co-586.pl
+++ b/crypto/bn/asm/co-586.pl
@@ -1,6 +1,7 @@
 #!/usr/local/bin/perl
 
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
 &asm_init($ARGV[0],$0);
diff --git a/crypto/bn/asm/ia64-mont.pl b/crypto/bn/asm/ia64-mont.pl
new file mode 100644
index 0000000000..af903a2bf8
--- /dev/null
+++ b/crypto/bn/asm/ia64-mont.pl
@@ -0,0 +1,764 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+#   stalls after ldf8, xma and getf.sig outside inner loop and
+#   improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+#   once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+#   acute interest, because upcoming Tukwila's individual cores are
+#   reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# January 2010
+#
+# Shorter vector support is implemented by zero-padding ap and np
+# vectors up to 8 elements, or 512 bits. This means that 256-bit
+# inputs will be processed only 2 times faster than 512-bit inputs,
+# not 4 [as one would expect, because algorithm complexity is n^2].
+# The reason for padding is that inputs shorter than 512 bits won't
+# be processed faster anyway, because minimal critical path of the
+# core loop happens to match 512-bit timing. Either way, it resulted
+# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
+# 1024-bit one [in comparison to original version of *this* module].
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+#                   sign    verify    sign/s verify/s
+# rsa  512 bits 0.000302s 0.000024s   3312.3  41332.2
+# rsa 1024 bits 0.000816s 0.000058s   1225.2  17172.0
+# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
+# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
+# dsa  512 bits 0.000254s 0.000206s   3944.6   4865.1
+# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
+# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
+#
+# ... and *without*:
+#
+# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
+# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
+# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
+# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
+# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
+# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
+# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
+#
+# As it can be seen, RSA sign performance improves by 120-30%,
+# hereafter less for longer keys, while verify - by 72-13%.
+# DSA performance improves by 100-30%.
+
+if ($^O eq "hpux") {
+    $ADDP="addp4";
+    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+//		    const BN_ULONG *bp,const BN_ULONG *np,
+//		    const BN_ULONG *n0p,int num);			
+.global	bn_mul_mont#
+.proc	bn_mul_mont#
+.align	64;;
+bn_mul_mont:
+	.prologue
+	.body
+{ .mmi;	cmp4.le		p6,p7=2,r37;;
+(p6)	cmp4.lt.unc	p8,p9=8,r37
+	mov		ret0=r0		};;
+{ .bbb;
+(p9)	br.cond.dptk.many	bn_mul_mont_8
+(p8)	br.cond.dpnt.many	bn_mul_mont_general
+(p7)	br.ret.spnt.many	b0	};;
+.endp	bn_mul_mont#
+
+prevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;
+
+rptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
+tptr=r16;	// &tp[0]
+tp_1=r17;	// &tp[-1]
+num=r18;	len=r19;	lc=r20;
+topbit=r21;	// carry bit from tmp[num]
+
+n0=f6;
+m0=f7;
+bi=f8;
+
+.local	bn_mul_mont_general#
+.proc	bn_mul_mont_general#
+.align	64;;
+bn_mul_mont_general:
+	.prologue
+{ .mmi;	.save	ar.pfs,prevfs
+	alloc	prevfs=ar.pfs,6,2,0,8
+	$ADDP	aptr=0,in1
+	.save	ar.lc,prevlc
+	mov	prevlc=ar.lc		}
+{ .mmi;	.vframe	prevsp
+	mov	prevsp=sp
+	$ADDP	bptr=0,in2
+	.save	pr,prevpr
+	mov	prevpr=pr		};;
+
+	.body
+	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
+	.rotr		a[3],n[3],t[2]
+
+{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
+	ldf8		alo[4]=[aptr],16	// ap[0]
+	$ADDP		r30=8,in1	};;
+{ .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
+	ldf8		alo[2]=[aptr],16	// ap[2]
+	$ADDP		in4=0,in4	};;
+{ .mmi;	ldf8		alo[1]=[r30]		// ap[3]
+	ldf8		n0=[in4]		// n0
+	$ADDP		rptr=0,in0		}
+{ .mmi;	$ADDP		nptr=0,in3
+	mov		r31=16
+	zxt4		num=in5		};;
+{ .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
+	shladd		len=num,3,r0
+	shladd		r31=num,3,r31	};;
+{ .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
+	add		lc=-5,num
+	sub		r31=sp,r31	};;
+{ .mfb;	and		sp=-16,r31		// alloca
+	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
+	nop.b		0		}
+{ .mfb;	nop.m		0
+	xmpy.lu		alo[4]=alo[4],bi
+	brp.loop.imp	.L1st_ctop,.L1st_cend-16
+					};;
+{ .mfi;	nop.m		0
+	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
+	add		tp_1=8,sp	}
+{ .mfi;	nop.m		0
+	xma.lu		alo[3]=alo[3],bi,ahi[2]
+	mov		pr.rot=0x20001f<<16
+			// ------^----- (p40) at first (p23)
+			// ----------^^ p[16:20]=1
+					};;
+{ .mfi;	nop.m		0
+	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
+	mov		ar.lc=lc	}
+{ .mfi;	nop.m		0
+	fcvt.fxu.s1	nhi[1]=f0
+	mov		ar.ec=8		};;
+
+.align	32
+.L1st_ctop:
+.pred.rel	"mutex",p40,p42
+{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
+	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
+	(p40)	add		n[2]=n[2],a[2]		}   // (p23)					}
+{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
+	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
+	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
+{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
+	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
+	(p42)	cmp.leu		p41,p39=n[2],a[2]   	}   // (p23)
+{ .mfi;	(p23)	st8		[tp_1]=n[2],8
+	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
+	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
+{ .mmb;	(p21)	getf.sig	n[0]=nlo[3]
+	(p16)	nop.m		0
+	br.ctop.sptk	.L1st_ctop			};;
+.L1st_cend:
+
+{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
+	getf.sig	n[0]=nhi[4]
+	add		num=-1,num	};;	// num--
+{ .mmi;	.pred.rel	"mutex",p40,p42
+(p40)	add		n[0]=n[0],a[0]
+(p42)	add		n[0]=n[0],a[0],1
+	sub		aptr=aptr,len	};;	// rewind
+{ .mmi;	.pred.rel	"mutex",p40,p42
+(p40)	cmp.ltu		p41,p39=n[0],a[0]
+(p42)	cmp.leu		p41,p39=n[0],a[0]
+	sub		nptr=nptr,len	};;
+{ .mmi;	.pred.rel	"mutex",p39,p41
+(p39)	add		topbit=r0,r0
+(p41)	add		topbit=r0,r0,1
+	nop.i		0		}	
+{ .mmi;	st8		[tp_1]=n[0]
+	add		tptr=16,sp
+	add		tp_1=8,sp	};;
+
+.Louter:
+{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
+	ldf8		ahi[3]=[tptr]		// tp[0]
+	add		r30=8,aptr	};;
+{ .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
+	ldf8		alo[3]=[r30],16		// ap[1]
+	add		r31=8,nptr	};;
+{ .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
+	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
+	brp.loop.imp	.Linner_ctop,.Linner_cend-16
+					}
+{ .mfb;	ldf8		alo[1]=[r30]		// ap[3]
+	xma.lu		alo[4]=alo[4],bi,ahi[3]
+	clrrrb.pr			};;
+{ .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
+	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
+	nop.i		0		}
+{ .mfi;	ldf8		nlo[1]=[r31]		// np[1]
+	xma.lu		alo[3]=alo[3],bi,ahi[2]
+	mov		pr.rot=0x20101f<<16
+			// ------^----- (p40) at first (p23)
+			// --------^--- (p30) at first (p22)
+			// ----------^^ p[16:20]=1
+					};;
+{ .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
+	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
+	mov		ar.lc=lc	}
+{ .mfi;
+	fcvt.fxu.s1	nhi[1]=f0
+	mov		ar.ec=8		};;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
+// in latter case accounts for two-tick pipeline stall, which means
+// that its performance would be ~20% lower than optimal one. No
+// attempt was made to address this, because original Itanium is
+// hardly represented out in the wild...
+.align	32
+.Linner_ctop:
+.pred.rel	"mutex",p40,p42
+.pred.rel	"mutex",p30,p32
+{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
+	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
+	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
+{ .mfi;	(p16)	nop.m		0
+	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
+	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
+{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
+	(p16)	nop.f		0
+	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
+{ .mfi;	(p21)	ld8		t[0]=[tptr],8
+	(p16)	nop.f		0
+	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
+{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
+	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
+	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
+{ .mfi;	(p16)	nop.m		0
+	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
+	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
+{ .mmi;	(p21)	getf.sig	n[0]=nlo[3]
+	(p16)	nop.m		0
+	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
+{ .mmb;	(p23)	st8		[tp_1]=n[2],8
+	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
+	br.ctop.sptk	.Linner_ctop			};;
+.Linner_cend:
+
+{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
+	getf.sig	n[0]=nhi[4]
+	nop.i		0		};;
+
+{ .mmi;	.pred.rel	"mutex",p31,p33
+(p31)	add		a[0]=a[0],topbit
+(p33)	add		a[0]=a[0],topbit,1
+	mov		topbit=r0	};;
+{ .mfi; .pred.rel	"mutex",p31,p33
+(p31)	cmp.ltu		p32,p30=a[0],topbit
+(p33)	cmp.leu		p32,p30=a[0],topbit
+					}
+{ .mfi;	.pred.rel	"mutex",p40,p42
+(p40)	add		n[0]=n[0],a[0]
+(p42)	add		n[0]=n[0],a[0],1
+					};;
+{ .mmi;	.pred.rel	"mutex",p44,p46
+(p40)	cmp.ltu		p41,p39=n[0],a[0]
+(p42)	cmp.leu		p41,p39=n[0],a[0]
+(p32)	add		topbit=r0,r0,1	}
+
+{ .mmi;	st8		[tp_1]=n[0],8
+	cmp4.ne		p6,p0=1,num
+	sub		aptr=aptr,len	};;	// rewind
+{ .mmi;	sub		nptr=nptr,len
+(p41)	add		topbit=r0,r0,1
+	add		tptr=16,sp	}
+{ .mmb;	add		tp_1=8,sp
+	add		num=-1,num		// num--
+(p6)	br.cond.sptk.many	.Louter	};;
+
+{ .mbb;	add		lc=4,lc
+	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
+	clrrrb.pr			};;
+{ .mii;	nop.m		0
+	mov		pr.rot=0x10001<<16
+			// ------^---- (p33) at first (p17)
+	mov		ar.lc=lc	}
+{ .mii;	nop.m		0
+	mov		ar.ec=3
+	nop.i		0		};;
+
+.Lsub_ctop:
+.pred.rel	"mutex",p33,p35
+{ .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
+	(p16)	nop.f		0
+	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
+{ .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
+	(p16)	nop.f		0
+	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
+{ .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
+	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
+	(p18)	nop.b		0			}
+{ .mib;	(p18)	nop.m		0
+	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
+	br.ctop.sptk	.Lsub_ctop			};;
+.Lsub_cend:
+
+{ .mmb;	.pred.rel	"mutex",p34,p36
+(p34)	sub	topbit=topbit,r0	// (p19)
+(p36)	sub	topbit=topbit,r0,1
+	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
+					}
+{ .mmb;	sub	rptr=rptr,len		// rewind
+	sub	tptr=tptr,len
+	clrrrb.pr			};;
+{ .mmi;	and	aptr=tptr,topbit
+	andcm	bptr=rptr,topbit
+	mov	pr.rot=1<<16		};;
+{ .mii;	or	nptr=aptr,bptr
+	mov	ar.lc=lc
+	mov	ar.ec=3			};;
+
+.Lcopy_ctop:
+{ .mmb;	(p16)	ld8	n[0]=[nptr],8
+	(p18)	st8	[tptr]=r0,8
+	(p16)	nop.b	0		}
+{ .mmb;	(p16)	nop.m	0
+	(p18)	st8	[rptr]=n[2],8
+	br.ctop.sptk	.Lcopy_ctop	};;
+.Lcopy_cend:
+
+{ .mmi;	mov		ret0=1			// signal "handled"
+	rum		1<<5			// clear um.mfh
+	mov		ar.lc=prevlc	}
+{ .mib;	.restore	sp
+	mov		sp=prevsp
+	mov		pr=prevpr,-2
+	br.ret.sptk.many	b0	};;
+.endp	bn_mul_mont_general#
+
+a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
+n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
+t0=r15;
+
+ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
+ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
+
+.local	bn_mul_mont_8#
+.proc	bn_mul_mont_8#
+.align	64
+.skip	48;;		// aligns loop body
+bn_mul_mont_8:
+	.prologue
+{ .mmi;	.save		ar.pfs,prevfs
+	alloc		prevfs=ar.pfs,6,2,0,8
+	.vframe		prevsp
+	mov		prevsp=sp
+	.save		ar.lc,prevlc
+	mov		prevlc=ar.lc	}
+{ .mmi;	add		r17=-6*16,sp
+	add		sp=-7*16,sp
+	.save		pr,prevpr
+	mov		prevpr=pr	};;
+
+{ .mmi;	.save.gf	0,0x10
+	stf.spill	[sp]=f16,-16
+	.save.gf	0,0x20
+	stf.spill	[r17]=f17,32
+	add		r16=-5*16,prevsp};;
+{ .mmi;	.save.gf	0,0x40
+	stf.spill	[r16]=f18,32
+	.save.gf	0,0x80
+	stf.spill	[r17]=f19,32
+	$ADDP		aptr=0,in1	};;
+{ .mmi;	.save.gf	0,0x100
+	stf.spill	[r16]=f20,32
+	.save.gf	0,0x200
+	stf.spill	[r17]=f21,32
+	$ADDP		r29=8,in1	};;
+{ .mmi;	.save.gf	0,0x400
+	stf.spill	[r16]=f22
+	.save.gf	0,0x800
+	stf.spill	[r17]=f23
+	$ADDP		rptr=0,in0	};;
+
+	.body
+	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
+	.rotr		t[8]
+
+// load input vectors padding them to 8 elements
+{ .mmi;	ldf8		ai0=[aptr],16		// ap[0]
+	ldf8		ai1=[r29],16		// ap[1]
+	$ADDP		bptr=0,in2	}
+{ .mmi;	$ADDP		r30=8,in2
+	$ADDP		nptr=0,in3
+	$ADDP		r31=8,in3	};;
+{ .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
+	ldf8		bj[6]=[r30],16		// bp[1]
+	cmp4.le		p4,p5=3,in5	}
+{ .mmi;	ldf8		ni0=[nptr],16		// np[0]
+	ldf8		ni1=[r31],16		// np[1]
+	cmp4.le		p6,p7=4,in5	};;
+
+{ .mfi;	(p4)ldf8	ai2=[aptr],16		// ap[2]
+	(p5)fcvt.fxu	ai2=f0
+	cmp4.le		p8,p9=5,in5	}
+{ .mfi;	(p6)ldf8	ai3=[r29],16		// ap[3]
+	(p7)fcvt.fxu	ai3=f0
+	cmp4.le		p10,p11=6,in5	}
+{ .mfi;	(p4)ldf8	bj[5]=[bptr],16		// bp[2]
+	(p5)fcvt.fxu	bj[5]=f0
+	cmp4.le		p12,p13=7,in5	}
+{ .mfi;	(p6)ldf8	bj[4]=[r30],16		// bp[3]
+	(p7)fcvt.fxu	bj[4]=f0
+	cmp4.le		p14,p15=8,in5	}
+{ .mfi;	(p4)ldf8	ni2=[nptr],16		// np[2]
+	(p5)fcvt.fxu	ni2=f0
+	addp4		r28=-1,in5	}
+{ .mfi;	(p6)ldf8	ni3=[r31],16		// np[3]
+	(p7)fcvt.fxu	ni3=f0
+	$ADDP		in4=0,in4	};;
+
+{ .mfi;	ldf8		n0=[in4]
+	fcvt.fxu	tf[1]=f0
+	nop.i		0		}
+
+{ .mfi;	(p8)ldf8	ai4=[aptr],16		// ap[4]
+	(p9)fcvt.fxu	ai4=f0
+	mov		t[0]=r0		}
+{ .mfi;	(p10)ldf8	ai5=[r29],16		// ap[5]
+	(p11)fcvt.fxu	ai5=f0
+	mov		t[1]=r0		}
+{ .mfi;	(p8)ldf8	bj[3]=[bptr],16		// bp[4]
+	(p9)fcvt.fxu	bj[3]=f0
+	mov		t[2]=r0		}
+{ .mfi;	(p10)ldf8	bj[2]=[r30],16		// bp[5]
+	(p11)fcvt.fxu	bj[2]=f0
+	mov		t[3]=r0		}
+{ .mfi;	(p8)ldf8	ni4=[nptr],16		// np[4]
+	(p9)fcvt.fxu	ni4=f0
+	mov		t[4]=r0		}
+{ .mfi;	(p10)ldf8	ni5=[r31],16		// np[5]
+	(p11)fcvt.fxu	ni5=f0
+	mov		t[5]=r0		};;
+
+{ .mfi;	(p12)ldf8	ai6=[aptr],16		// ap[6]
+	(p13)fcvt.fxu	ai6=f0
+	mov		t[6]=r0		}
+{ .mfi;	(p14)ldf8	ai7=[r29],16		// ap[7]
+	(p15)fcvt.fxu	ai7=f0
+	mov		t[7]=r0		}
+{ .mfi;	(p12)ldf8	bj[1]=[bptr],16		// bp[6]
+	(p13)fcvt.fxu	bj[1]=f0
+	mov		ar.lc=r28	}
+{ .mfi;	(p14)ldf8	bj[0]=[r30],16		// bp[7]
+	(p15)fcvt.fxu	bj[0]=f0
+	mov		ar.ec=2		}
+{ .mfi;	(p12)ldf8	ni6=[nptr],16		// np[6]
+	(p13)fcvt.fxu	ni6=f0
+	mov		pr.rot=1<<16	}
+{ .mfb;	(p14)ldf8	ni7=[r31],16		// np[7]
+	(p15)fcvt.fxu	ni7=f0
+	brp.loop.imp	.Louter_8_ctop,.Louter_8_cend-16
+					};;
+
+// The loop is scheduled for 32*(n+1) ticks on Itanium 2. Actual
+// measurement with help of Interval Time Counter indicate that the
+// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
+// addressing the issue is problematic, because I don't have access
+// to platform-specific instruction-level profiler. On Itanium it
+// should run in 56*(n+1) ticks, because of higher xma latency...
+.Louter_8_ctop:
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 0:
+	(p16)	xma.hu		ahi[0]=ai0,bj[7],tf[1]	//	ap[0]*b[i]+t[0]
+	(p40)	add		a3=a3,n3	}	//	(p17) a3+=n3
+{ .mfi;	(p42)	add		a3=a3,n3,1
+	(p16)	xma.lu		alo[0]=ai0,bj[7],tf[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
+	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
+	(p50)	add		t[6]=t[6],a3,1	};;
+{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
+	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
+	(p40)	cmp.ltu		p43,p41=a3,n3	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
+	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
+	(p48)	cmp.ltu		p51,p49=t[6],a3
+	(p50)	cmp.leu		p51,p49=t[6],a3	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p16)	nop.m		0			// 4:
+	(p16)	xma.hu		ahi[1]=ai1,bj[7],ahi[0]	//	ap[1]*b[i]
+	(p41)	add		a4=a4,n4	}	//	(p17) a4+=n4
+{ .mfi;	(p43)	add		a4=a4,n4,1
+	(p16)	xma.lu		alo[1]=ai1,bj[7],ahi[0]
+	(p16)	nop.i		0		};;
+{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
+	(p16)	xmpy.lu		mj[0]=alo[0],n0		//	(ap[0]*b[i]+t[0])*n0
+	(p51)	add		t[5]=t[5],a4,1	};;
+{ .mfi;	(p16)	nop.m		0			// 6:
+	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
+	(p41)	cmp.ltu		p42,p40=a4,n4	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
+	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
+	(p49)	cmp.ltu		p50,p48=t[5],a4
+	(p51)	cmp.leu		p50,p48=t[5],a4	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 8:
+	(p16)	xma.hu		ahi[2]=ai2,bj[7],ahi[1]	//	ap[2]*b[i]
+	(p40)	add		a5=a5,n5	}	//	(p17) a5+=n5
+{ .mfi;	(p42)	add		a5=a5,n5,1
+	(p16)	xma.lu		alo[2]=ai2,bj[7],ahi[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a1=alo[1]		// 9:
+	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
+	(p50)	add		t[4]=t[4],a5,1	};;
+{ .mfi;	(p16)	nop.m		0			// 10:
+	(p16)	xma.hu		nhi[0]=ni0,mj[0],alo[0]	//	np[0]*m0
+	(p40)	cmp.ltu		p43,p41=a5,n5	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a5,n5
+	(p16)	xma.lu		nlo[0]=ni0,mj[0],alo[0]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
+	(p48)	cmp.ltu		p51,p49=t[4],a5
+	(p50)	cmp.leu		p51,p49=t[4],a5	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p17)	getf.sig	n8=nhi[8]		// 12:
+	(p16)	xma.hu		ahi[3]=ai3,bj[7],ahi[2]	//	ap[3]*b[i]
+	(p41)	add		a6=a6,n6	}	//	(p17) a6+=n6
+{ .mfi;	(p43)	add		a6=a6,n6,1
+	(p16)	xma.lu		alo[3]=ai3,bj[7],ahi[2]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a2=alo[2]		// 13:
+	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
+	(p51)	add		t[3]=t[3],a6,1	};;
+{ .mfi;	(p16)	nop.m		0			// 14:
+	(p16)	xma.hu		nhi[1]=ni1,mj[0],nhi[0]	//	np[1]*m0
+	(p41)	cmp.ltu		p42,p40=a6,n6	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a6,n6
+	(p16)	xma.lu		nlo[1]=ni1,mj[0],nhi[0]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	nop.m		0			// 15:
+	(p49)	cmp.ltu		p50,p48=t[3],a6
+	(p51)	cmp.leu		p50,p48=t[3],a6	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 16:
+	(p16)	xma.hu		ahi[4]=ai4,bj[7],ahi[3]	//	ap[4]*b[i]
+	(p40)	add		a7=a7,n7	}	//	(p17) a7+=n7
+{ .mfi;	(p42)	add		a7=a7,n7,1
+	(p16)	xma.lu		alo[4]=ai4,bj[7],ahi[3]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a3=alo[3]		// 17:
+	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
+	(p50)	add		t[2]=t[2],a7,1	};;
+{ .mfi;	(p16)	nop.m		0			// 18:
+	(p16)	xma.hu		nhi[2]=ni2,mj[0],nhi[1]	//	np[2]*m0
+	(p40)	cmp.ltu		p43,p41=a7,n7	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a7,n7
+	(p16)	xma.lu		nlo[2]=ni2,mj[0],nhi[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n1=nlo[1]		// 19:
+	(p48)	cmp.ltu		p51,p49=t[2],a7
+	(p50)	cmp.leu		p51,p49=t[2],a7	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p16)	nop.m		0			// 20:
+	(p16)	xma.hu		ahi[5]=ai5,bj[7],ahi[4]	//	ap[5]*b[i]
+	(p41)	add		a8=a8,n8	}	//	(p17) a8+=n8
+{ .mfi;	(p43)	add		a8=a8,n8,1
+	(p16)	xma.lu		alo[5]=ai5,bj[7],ahi[4]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a4=alo[4]		// 21:
+	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
+	(p51)	add		t[1]=t[1],a8,1	};;
+{ .mfi;	(p16)	nop.m		0			// 22:
+	(p16)	xma.hu		nhi[3]=ni3,mj[0],nhi[2]	//	np[3]*m0
+	(p41)	cmp.ltu		p42,p40=a8,n8	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a8,n8
+	(p16)	xma.lu		nlo[3]=ni3,mj[0],nhi[2]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n2=nlo[2]		// 23:
+	(p49)	cmp.ltu		p50,p48=t[1],a8
+	(p51)	cmp.leu		p50,p48=t[1],a8	};;
+{ .mfi;	(p16)	nop.m		0			// 24:
+	(p16)	xma.hu		ahi[6]=ai6,bj[7],ahi[5]	//	ap[6]*b[i]
+	(p16)	add		a1=a1,n1	}	//	(p16) a1+=n1
+{ .mfi;	(p16)	nop.m		0
+	(p16)	xma.lu		alo[6]=ai6,bj[7],ahi[5]
+	(p17)	mov		t[0]=r0		};;
+{ .mii;	(p16)	getf.sig	a5=alo[5]		// 25:
+	(p16)	add		t0=t[7],a1		//	(p16) t[7]+=a1
+	(p42)	add		t[0]=t[0],r0,1	};;
+{ .mfi;	(p16)	setf.sig	tf[0]=t0		// 26:
+	(p16)	xma.hu		nhi[4]=ni4,mj[0],nhi[3]	//	np[4]*m0
+	(p50)	add		t[0]=t[0],r0,1	}
+{ .mfi;	(p16)	cmp.ltu.unc	p42,p40=a1,n1
+	(p16)	xma.lu		nlo[4]=ni4,mj[0],nhi[3]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n3=nlo[3]		// 27:
+	(p16)	cmp.ltu.unc	p50,p48=t0,a1
+	(p16)	nop.i		0		};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 28:
+	(p16)	xma.hu		ahi[7]=ai7,bj[7],ahi[6]	//	ap[7]*b[i]
+	(p40)	add		a2=a2,n2	}	//	(p16) a2+=n2
+{ .mfi;	(p42)	add		a2=a2,n2,1
+	(p16)	xma.lu		alo[7]=ai7,bj[7],ahi[6]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a6=alo[6]		// 29:
+	(p48)	add		t[6]=t[6],a2		//	(p16) t[6]+=a2
+	(p50)	add		t[6]=t[6],a2,1	};;
+{ .mfi;	(p16)	nop.m		0			// 30:
+	(p16)	xma.hu		nhi[5]=ni5,mj[0],nhi[4]	//	np[5]*m0
+	(p40)	cmp.ltu		p41,p39=a2,n2	}
+{ .mfi;	(p42)	cmp.leu		p41,p39=a2,n2
+	(p16)	xma.lu		nlo[5]=ni5,mj[0],nhi[4]
+	(p16)	nop.i		0		};;
+{ .mfi;	(p16)	getf.sig	n4=nlo[4]		// 31:
+	(p16)	nop.f		0
+	(p48)	cmp.ltu		p49,p47=t[6],a2	}
+{ .mfb;	(p50)	cmp.leu		p49,p47=t[6],a2
+	(p16)	nop.f		0
+	br.ctop.sptk.many	.Louter_8_ctop	};;
+.Louter_8_cend:
+
+// move np[8] to GPR bank and subtract it from carrybit|tmp[8]
+// carrybit|tmp[8] layout upon exit from above loop is:
+//	t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t[0]|t0 (least significant)
+{ .mmi;	getf.sig	n1=ni0
+	getf.sig	n2=ni1
+	add		r16=-7*16,prevsp}
+{ .mmi;	getf.sig	n3=ni2
+	getf.sig	n4=ni3
+	add		r17=-6*16,prevsp};;
+{ .mmi;	getf.sig	n5=ni4
+	getf.sig	n6=ni5
+	add		r18=-5*16,prevsp}
+{ .mmi;	getf.sig	n7=ni6
+	getf.sig	n8=ni7
+	sub		n1=t0,n1	};;
+{ .mmi;	cmp.gtu		p34,p32=n1,t0;;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n2=t[0],n2
+	(p34)sub	n2=t[0],n2,1	};;
+{ .mii;	(p32)cmp.gtu	p35,p33=n2,t[0]
+	(p34)cmp.geu	p35,p33=n2,t[0];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	n3=t[7],n3	}
+{ .mmi;	(p35)sub	n3=t[7],n3,1;;
+	(p33)cmp.gtu	p34,p32=n3,t[7]
+	(p35)cmp.geu	p34,p32=n3,t[7]	};;
+	.pred.rel	"mutex",p32,p34
+{ .mii;	(p32)sub	n4=t[6],n4
+	(p34)sub	n4=t[6],n4,1;;
+	(p32)cmp.gtu	p35,p33=n4,t[6]	}
+{ .mmi;	(p34)cmp.geu	p35,p33=n4,t[6];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	n5=t[5],n5
+	(p35)sub	n5=t[5],n5,1	};;
+{ .mii;	(p33)cmp.gtu	p34,p32=n5,t[5]
+	(p35)cmp.geu	p34,p32=n5,t[5];;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n6=t[4],n6	}
+{ .mmi;	(p34)sub	n6=t[4],n6,1;;
+	(p32)cmp.gtu	p35,p33=n6,t[4]
+	(p34)cmp.geu	p35,p33=n6,t[4]	};;
+	.pred.rel	"mutex",p33,p35
+{ .mii;	(p33)sub	n7=t[3],n7
+	(p35)sub	n7=t[3],n7,1;;
+	(p33)cmp.gtu	p34,p32=n7,t[3]	}
+{ .mmi;	(p35)cmp.geu	p34,p32=n7,t[3];;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n8=t[2],n8
+	(p34)sub	n8=t[2],n8,1	};;
+{ .mii;	(p32)cmp.gtu	p35,p33=n8,t[2]
+	(p34)cmp.geu	p35,p33=n8,t[2];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	a8=t[1],r0	}
+{ .mmi;	(p35)sub	a8=t[1],r0,1;;
+	(p33)cmp.gtu	p34,p32=a8,t[1]
+	(p35)cmp.geu	p34,p32=a8,t[1]	};;
+
+// save the result, either tmp[num] or tmp[num]-np[num]
+	.pred.rel	"mutex",p32,p34
+{ .mmi;	(p32)st8	[rptr]=n1,8
+	(p34)st8	[rptr]=t0,8
+	add		r19=-4*16,prevsp};;
+{ .mmb;	(p32)st8	[rptr]=n2,8
+	(p34)st8	[rptr]=t[0],8
+	(p5)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n3,8
+	(p34)st8	[rptr]=t[7],8
+	(p7)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n4,8
+	(p34)st8	[rptr]=t[6],8
+	(p9)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n5,8
+	(p34)st8	[rptr]=t[5],8
+	(p11)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n6,8
+	(p34)st8	[rptr]=t[4],8
+	(p13)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n7,8
+	(p34)st8	[rptr]=t[3],8
+	(p15)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n8,8
+	(p34)st8	[rptr]=t[2],8
+	nop.b		0		};;
+.Ldone:						// epilogue
+{ .mmi;	ldf.fill	f16=[r16],64
+	ldf.fill	f17=[r17],64
+	nop.i		0		}
+{ .mmi;	ldf.fill	f18=[r18],64
+	ldf.fill	f19=[r19],64
+	mov		pr=prevpr,-2	};;
+{ .mmi;	ldf.fill	f20=[r16]
+	ldf.fill	f21=[r17]
+	mov		ar.lc=prevlc	}
+{ .mmi;	ldf.fill	f22=[r18]
+	ldf.fill	f23=[r19]
+	mov		ret0=1		}	// signal "handled"
+{ .mib;	rum		1<<5
+	.restore	sp
+	mov		sp=prevsp
+	br.ret.sptk.many	b0	};;
+.endp	bn_mul_mont_8#
+
+.type	copyright#,\@object
+copyright:
+stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S
index 7b82b820e6..951abc53ea 100644
--- a/crypto/bn/asm/ia64.S
+++ b/crypto/bn/asm/ia64.S
@@ -171,21 +171,21 @@
 .skip	32	// makes the loop body aligned at 64-byte boundary
 bn_add_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 { .mii;	alloc		r2=ar.pfs,4,12,0,16
 	cmp4.le		p6,p0=r35,r0	};;
 { .mfb;	mov		r8=r0			// return value
 (p6)	br.ret.spnt.many	b0	};;
 
-	.save	ar.lc,r3
 { .mib;	sub		r10=r35,r0,1
+	.save	ar.lc,r3
 	mov		r3=ar.lc
 	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
 					}
-	.body
 { .mib;	ADDP		r14=0,r32		// rp
+	.save	pr,r9
 	mov		r9=pr		};;
+	.body
 { .mii;	ADDP		r15=0,r33		// ap
 	mov		ar.lc=r10
 	mov		ar.ec=6		}
@@ -224,21 +224,21 @@ bn_add_words:
 .skip	32	// makes the loop body aligned at 64-byte boundary
 bn_sub_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 { .mii;	alloc		r2=ar.pfs,4,12,0,16
 	cmp4.le		p6,p0=r35,r0	};;
 { .mfb;	mov		r8=r0			// return value
 (p6)	br.ret.spnt.many	b0	};;
 
-	.save	ar.lc,r3
 { .mib;	sub		r10=r35,r0,1
+	.save	ar.lc,r3
 	mov		r3=ar.lc
 	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
 					}
-	.body
 { .mib;	ADDP		r14=0,r32		// rp
+	.save	pr,r9
 	mov		r9=pr		};;
+	.body
 { .mii;	ADDP		r15=0,r33		// ap
 	mov		ar.lc=r10
 	mov		ar.ec=6		}
@@ -283,7 +283,6 @@ bn_sub_words:
 .skip	32	// makes the loop body aligned at 64-byte boundary
 bn_mul_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #ifdef XMA_TEMPTATION
 { .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
@@ -294,9 +293,10 @@ bn_mul_words:
 	cmp4.le		p6,p0=r34,r0
 (p6)	br.ret.spnt.many	b0		};;
 
-	.save	ar.lc,r3
 { .mii;	sub	r10=r34,r0,1
+	.save	ar.lc,r3
 	mov	r3=ar.lc
+	.save	pr,r9
 	mov	r9=pr			};;
 
 	.body
@@ -397,22 +397,21 @@ bn_mul_words:
 .skip	48	// makes the loop body aligned at 64-byte boundary
 bn_mul_add_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
-	.save	ar.lc,r3
-	.save	pr,r9
 { .mmi;	alloc		r2=ar.pfs,4,4,0,8
 	cmp4.le		p6,p0=r34,r0
+	.save	ar.lc,r3
 	mov		r3=ar.lc	};;
 { .mib;	mov		r8=r0		// return value
 	sub		r10=r34,r0,1
 (p6)	br.ret.spnt.many	b0	};;
 
-	.body
 { .mib;	setf.sig	f8=r35		// w
+	.save	pr,r9
 	mov		r9=pr
 	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
 					}
+	.body
 { .mmi;	ADDP		r14=0,r32	// rp
 	ADDP		r15=0,r33	// ap
 	mov		ar.lc=r10	}
@@ -466,7 +465,6 @@ bn_mul_add_words:
 .skip	32	// makes the loop body aligned at 64-byte boundary 
 bn_sqr_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 { .mii;	alloc		r2=ar.pfs,3,0,0,0
 	sxt4		r34=r34		};;
@@ -476,9 +474,10 @@ bn_sqr_words:
 	nop.f		0x0
 (p6)	br.ret.spnt.many	b0	};;
 
-	.save	ar.lc,r3
 { .mii;	sub	r10=r34,r0,1
+	.save	ar.lc,r3
 	mov	r3=ar.lc
+	.save	pr,r9
 	mov	r9=pr			};;
 
 	.body
@@ -545,7 +544,6 @@ bn_sqr_words:
 .align	64
 bn_sqr_comba8:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,2,1,0,0
@@ -617,7 +615,6 @@ bn_sqr_comba8:
 .align	64
 bn_mul_comba8:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,3,0,0,0
@@ -1175,7 +1172,6 @@ bn_mul_comba8:
 .align	64
 bn_sqr_comba4:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc   r2=ar.pfs,2,1,0,0
@@ -1208,7 +1204,6 @@ bn_sqr_comba4:
 .align	64
 bn_mul_comba4:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc   r2=ar.pfs,3,0,0,0
@@ -1411,11 +1406,11 @@ equ=p24
 .align	64
 bn_div_words:
 	.prologue
-	.fframe	0
 	.save	ar.pfs,r2
-	.save	b0,r3
 { .mii;	alloc		r2=ar.pfs,3,5,0,8
+	.save	b0,r3
 	mov		r3=b0
+	.save	pr,r10
 	mov		r10=pr		};;
 { .mmb;	cmp.eq		p6,p0=r34,r0
 	mov		r8=-1
diff --git a/crypto/bn/asm/mips3-mont.pl b/crypto/bn/asm/mips3-mont.pl
new file mode 100644
index 0000000000..8f9156e02a
--- /dev/null
+++ b/crypto/bn/asm/mips3-mont.pl
@@ -0,0 +1,327 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This module doesn't present direct interest for OpenSSL, because it
+# doesn't provide better performance for longer keys. While 512-bit
+# RSA private key operations are 40% faster, 1024-bit ones are hardly
+# faster at all, while longer key operations are slower by up to 20%.
+# It might be of interest to embedded system developers though, as
+# it's smaller than 1KB, yet offers ~3x improvement over compiler
+# generated code.
+#
+# The module targets N32 and N64 MIPS ABIs and currently is a bit
+# IRIX-centric, i.e. is likely to require adaptation for other OSes.
+
+# int bn_mul_mont(
+$rp="a0";	# BN_ULONG *rp,
+$ap="a1";	# const BN_ULONG *ap,
+$bp="a2";	# const BN_ULONG *bp,
+$np="a3";	# const BN_ULONG *np,
+$n0="a4";	# const BN_ULONG *n0,
+$num="a5";	# int num);
+
+$lo0="a6";
+$hi0="a7";
+$lo1="v0";
+$hi1="v1";
+$aj="t0";
+$bi="t1";
+$nj="t2";
+$tp="t3";
+$alo="s0";
+$ahi="s1";
+$nlo="s2";
+$nhi="s3";
+$tj="s4";
+$i="s5";
+$j="s6";
+$fp="t8";
+$m1="t9";
+
+$FRAME=8*(2+8);
+
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+
+.text
+
+.set	noat
+.set	reorder
+
+.align	5
+.globl	bn_mul_mont
+.ent	bn_mul_mont
+bn_mul_mont:
+	.set	noreorder
+	PTR_SUB	sp,64
+	move	$fp,sp
+	.frame	$fp,64,ra
+	slt	AT,$num,4
+	li	v0,0
+	beqzl	AT,.Lproceed
+	nop
+	jr	ra
+	PTR_ADD	sp,$fp,64
+	.set	reorder
+.align	5
+.Lproceed:
+	ld	$n0,0($n0)
+	ld	$bi,0($bp)	# bp[0]
+	ld	$aj,0($ap)	# ap[0]
+	ld	$nj,0($np)	# np[0]
+	PTR_SUB	sp,16		# place for two extra words
+	sll	$num,3
+	li	AT,-4096
+	PTR_SUB	sp,$num
+	and	sp,AT
+
+	sd	s0,0($fp)
+	sd	s1,8($fp)
+	sd	s2,16($fp)
+	sd	s3,24($fp)
+	sd	s4,32($fp)
+	sd	s5,40($fp)
+	sd	s6,48($fp)
+	sd	s7,56($fp)
+
+	dmultu	$aj,$bi
+	ld	$alo,8($ap)
+	ld	$nlo,8($np)
+	mflo	$lo0
+	mfhi	$hi0
+	dmultu	$lo0,$n0
+	mflo	$m1
+
+	dmultu	$alo,$bi
+	mflo	$alo
+	mfhi	$ahi
+
+	dmultu	$nj,$m1
+	mflo	$lo1
+	mfhi	$hi1
+	dmultu	$nlo,$m1
+	daddu	$lo1,$lo0
+	sltu	AT,$lo1,$lo0
+	daddu	$hi1,AT
+	mflo	$nlo
+	mfhi	$nhi
+
+	move	$tp,sp
+	li	$j,16
+.align	4
+.L1st:
+	.set	noreorder
+	PTR_ADD	$aj,$ap,$j
+	ld	$aj,($aj)
+	PTR_ADD	$nj,$np,$j
+	ld	$nj,($nj)
+
+	dmultu	$aj,$bi
+	daddu	$lo0,$alo,$hi0
+	daddu	$lo1,$nlo,$hi1
+	sltu	AT,$lo0,$hi0
+	sltu	s7,$lo1,$hi1
+	daddu	$hi0,$ahi,AT
+	daddu	$hi1,$nhi,s7
+	mflo	$alo
+	mfhi	$ahi
+
+	daddu	$lo1,$lo0
+	sltu	AT,$lo1,$lo0
+	dmultu	$nj,$m1
+	daddu	$hi1,AT
+	addu	$j,8
+	sd	$lo1,($tp)
+	sltu	s7,$j,$num
+	mflo	$nlo
+	mfhi	$nhi
+
+	bnez	s7,.L1st
+	PTR_ADD	$tp,8
+	.set	reorder
+
+	daddu	$lo0,$alo,$hi0
+	sltu	AT,$lo0,$hi0
+	daddu	$hi0,$ahi,AT
+
+	daddu	$lo1,$nlo,$hi1
+	sltu	s7,$lo1,$hi1
+	daddu	$hi1,$nhi,s7
+	daddu	$lo1,$lo0
+	sltu	AT,$lo1,$lo0
+	daddu	$hi1,AT
+
+	sd	$lo1,($tp)
+
+	daddu	$hi1,$hi0
+	sltu	AT,$hi1,$hi0
+	sd	$hi1,8($tp)
+	sd	AT,16($tp)
+
+	li	$i,8
+.align	4
+.Louter:
+	PTR_ADD	$bi,$bp,$i
+	ld	$bi,($bi)
+	ld	$aj,($ap)
+	ld	$alo,8($ap)
+	ld	$tj,(sp)
+
+	dmultu	$aj,$bi
+	ld	$nj,($np)
+	ld	$nlo,8($np)
+	mflo	$lo0
+	mfhi	$hi0
+	daddu	$lo0,$tj
+	dmultu	$lo0,$n0
+	sltu	AT,$lo0,$tj
+	daddu	$hi0,AT
+	mflo	$m1
+
+	dmultu	$alo,$bi
+	mflo	$alo
+	mfhi	$ahi
+
+	dmultu	$nj,$m1
+	mflo	$lo1
+	mfhi	$hi1
+
+	dmultu	$nlo,$m1
+	daddu	$lo1,$lo0
+	sltu	AT,$lo1,$lo0
+	daddu	$hi1,AT
+	mflo	$nlo
+	mfhi	$nhi
+
+	move	$tp,sp
+	li	$j,16
+	ld	$tj,8($tp)
+.align	4
+.Linner:
+	.set	noreorder
+	PTR_ADD	$aj,$ap,$j
+	ld	$aj,($aj)
+	PTR_ADD	$nj,$np,$j
+	ld	$nj,($nj)
+
+	dmultu	$aj,$bi
+	daddu	$lo0,$alo,$hi0
+	daddu	$lo1,$nlo,$hi1
+	sltu	AT,$lo0,$hi0
+	sltu	s7,$lo1,$hi1
+	daddu	$hi0,$ahi,AT
+	daddu	$hi1,$nhi,s7
+	mflo	$alo
+	mfhi	$ahi
+
+	daddu	$lo0,$tj
+	addu	$j,8
+	dmultu	$nj,$m1
+	sltu	AT,$lo0,$tj
+	daddu	$lo1,$lo0
+	daddu	$hi0,AT
+	sltu	s7,$lo1,$lo0
+	ld	$tj,16($tp)
+	daddu	$hi1,s7
+	sltu	AT,$j,$num
+	mflo	$nlo
+	mfhi	$nhi
+	sd	$lo1,($tp)
+	bnez	AT,.Linner
+	PTR_ADD	$tp,8
+	.set	reorder
+
+	daddu	$lo0,$alo,$hi0
+	sltu	AT,$lo0,$hi0
+	daddu	$hi0,$ahi,AT
+	daddu	$lo0,$tj
+	sltu	s7,$lo0,$tj
+	daddu	$hi0,s7
+
+	ld	$tj,16($tp)
+	daddu	$lo1,$nlo,$hi1
+	sltu	AT,$lo1,$hi1
+	daddu	$hi1,$nhi,AT
+	daddu	$lo1,$lo0
+	sltu	s7,$lo1,$lo0
+	daddu	$hi1,s7
+	sd	$lo1,($tp)
+
+	daddu	$lo1,$hi1,$hi0
+	sltu	$hi1,$lo1,$hi0
+	daddu	$lo1,$tj
+	sltu	AT,$lo1,$tj
+	daddu	$hi1,AT
+	sd	$lo1,8($tp)
+	sd	$hi1,16($tp)
+
+	addu	$i,8
+	sltu	s7,$i,$num
+	bnez	s7,.Louter
+
+	.set	noreorder
+	PTR_ADD	$tj,sp,$num	# &tp[num]
+	move	$tp,sp
+	move	$ap,sp
+	li	$hi0,0		# clear borrow bit
+
+.align	4
+.Lsub:	ld	$lo0,($tp)
+	ld	$lo1,($np)
+	PTR_ADD	$tp,8
+	PTR_ADD	$np,8
+	dsubu	$lo1,$lo0,$lo1	# tp[i]-np[i]
+	sgtu	AT,$lo1,$lo0
+	dsubu	$lo0,$lo1,$hi0
+	sgtu	$hi0,$lo0,$lo1
+	sd	$lo0,($rp)
+	or	$hi0,AT
+	sltu	AT,$tp,$tj
+	bnez	AT,.Lsub
+	PTR_ADD	$rp,8
+
+	dsubu	$hi0,$hi1,$hi0	# handle upmost overflow bit
+	move	$tp,sp
+	PTR_SUB	$rp,$num	# restore rp
+	not	$hi1,$hi0
+
+	and	$ap,$hi0,sp
+	and	$bp,$hi1,$rp
+	or	$ap,$ap,$bp	# ap=borrow?tp:rp
+
+.align	4
+.Lcopy:	ld	$aj,($ap)
+	PTR_ADD	$ap,8
+	PTR_ADD	$tp,8
+	sd	zero,-8($tp)
+	sltu	AT,$tp,$tj
+	sd	$aj,($rp)
+	bnez	AT,.Lcopy
+	PTR_ADD	$rp,8
+
+	ld	s0,0($fp)
+	ld	s1,8($fp)
+	ld	s2,16($fp)
+	ld	s3,24($fp)
+	ld	s4,32($fp)
+	ld	s5,40($fp)
+	ld	s6,48($fp)
+	ld	s7,56($fp)
+	li	v0,1
+	jr	ra
+	PTR_ADD	sp,$fp,64
+	.set	reorder
+END(bn_mul_mont)
+.rdata
+.asciiz	"Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/parisc-mont.pl b/crypto/bn/asm/parisc-mont.pl
new file mode 100644
index 0000000000..4a766a87fb
--- /dev/null
+++ b/crypto/bn/asm/parisc-mont.pl
@@ -0,0 +1,993 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# On PA-7100LC this module performs ~90-50% better, less for longer
+# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
+# that compiler utilized xmpyu instruction to perform 32x32=64-bit
+# multiplication, which in turn means that "baseline" performance was
+# optimal in respect to instruction set capabilities. Fair comparison
+# with vendor compiler is problematic, because OpenSSL doesn't define
+# BN_LLONG [presumably] for historical reasons, which drives compiler
+# toward 4 times 16x16=32-bit multiplicatons [plus complementary
+# shifts and additions] instead. This means that you should observe
+# several times improvement over code generated by vendor compiler
+# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
+# improvement coefficient was never collected on PA-7100LC, or any
+# other 1.1 CPU, because I don't have access to such machine with
+# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
+# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
+# of ~5x on PA-8600.
+#
+# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
+# reportedly ~2x faster than vendor compiler generated code [according
+# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
+# this implementation is actually 32-bit one, in the sense that it
+# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
+# 64-bit BN_LONGs... How do they interoperate then? No problem. This
+# module picks halves of 64-bit values in reverse order and pretends
+# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
+# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
+# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
+# i.e. there is no "wider" multiplication like on most other 64-bit
+# platforms. This means that even being effectively 32-bit, this
+# implementation performs "64-bit" computational task in same amount
+# of arithmetic operations, most notably multiplications. It requires
+# more memory references, most notably to tp[num], but this doesn't
+# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
+# 2.0 code path, provides virtually same performance as pa-risc2[W].s:
+# it's ~10% better for shortest key length and ~10% worse for longest
+# one.
+#
+# In case it wasn't clear. The module has two distinct code paths:
+# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
+# additions and 64-bit integer loads, not to mention specific
+# instruction scheduling. In 64-bit build naturally only 2.0 code path
+# is assembled. In 32-bit application context both code paths are
+# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
+# is taken automatically. Also, in 32-bit build the module imposes
+# couple of limitations: vector lengths has to be even and vector
+# addresses has to be 64-bit aligned. Normally neither is a problem:
+# most common key lengths are even and vectors are commonly malloc-ed,
+# which ensures alignment.
+#
+# Special thanks to polarhome.com for providing HP-UX account on
+# PA-RISC 1.1 machine, and to correspondent who chose to remain
+# anonymous for testing the code on PA-RISC 2.0 machine.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+	$BN_SZ		=$SIZE_T;
+} else {
+	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+	$BN_SZ		=$SIZE_T;
+	if (open CONF,"<${dir}../../opensslconf.h") {
+	    while(<CONF>) {
+		if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
+		    $BN_SZ=8;
+		    $LEVEL="2.0";
+		    last;
+		}
+	    }
+	    close CONF;
+	}
+}
+
+$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
+				#                [+ argument transfer]
+$LOCALS=$FRAME-$FRAME_MARKER;
+$FRAME+=32;			# local variables
+
+$tp="%r31";
+$ti1="%r29";
+$ti0="%r28";
+
+$rp="%r26";
+$ap="%r25";
+$bp="%r24";
+$np="%r23";
+$n0="%r22";	# passed through stack in 32-bit
+$num="%r21";	# passed through stack in 32-bit
+$idx="%r20";
+$arrsz="%r19";
+
+$nm1="%r7";
+$nm0="%r6";
+$ab1="%r5";
+$ab0="%r4";
+
+$fp="%r3";
+$hi1="%r2";
+$hi0="%r1";
+
+$xfer=$n0;	# accomodates [-16..15] offset in fld[dw]s
+
+$fm0="%fr4";	$fti=$fm0;
+$fbi="%fr5L";
+$fn0="%fr5R";
+$fai="%fr6";	$fab0="%fr7";	$fab1="%fr8";
+$fni="%fr9";	$fnm0="%fr10";	$fnm1="%fr11";
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+	.ALIGN	64
+bn_mul_mont
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	ldo	-$FRAME(%sp),$fp
+___
+$code.=<<___ if ($SIZE_T==4);
+	ldw	`-$FRAME_MARKER-4`($fp),$n0
+	ldw	`-$FRAME_MARKER-8`($fp),$num
+	nop
+	nop					; alignment
+___
+$code.=<<___ if ($BN_SZ==4);
+	comiclr,<=	6,$num,%r0		; are vectors long enough?
+	b		L\$abort
+	ldi		0,%r28			; signal "unhandled"
+	add,ev		%r0,$num,$num		; is $num even?
+	b		L\$abort
+	nop
+	or		$ap,$np,$ti1
+	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
+	b		L\$abort
+	nop
+	nop					; alignment
+	nop
+
+	fldws		0($n0),${fn0}
+	fldws,ma	4($bp),${fbi}		; bp[0]
+___
+$code.=<<___ if ($BN_SZ==8);
+	comib,>		3,$num,L\$abort		; are vectors long enough?
+	ldi		0,%r28			; signal "unhandled"
+	addl		$num,$num,$num		; I operate on 32-bit values
+
+	fldws		4($n0),${fn0}		; only low part of n0
+	fldws		4($bp),${fbi}		; bp[0] in flipped word order
+___
+$code.=<<___;
+	fldds		0($ap),${fai}		; ap[0,1]
+	fldds		0($np),${fni}		; np[0,1]
+
+	sh2addl		$num,%r0,$arrsz
+	ldi		31,$hi0
+	ldo		36($arrsz),$hi1		; space for tp[num+1]
+	andcm		$hi1,$hi0,$hi1		; align
+	addl		$hi1,%sp,%sp
+	$PUSH		$fp,-$SIZE_T(%sp)
+
+	ldo		`$LOCALS+16`($fp),$xfer
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
+	xmpyu		${fn0},${fab0}R,${fm0}
+
+	addl		$arrsz,$ap,$ap		; point at the end
+	addl		$arrsz,$np,$np
+	subi		0,$arrsz,$idx		; j=0
+	ldo		8($idx),$idx		; j++++
+
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+	fstds		${fab1},0($xfer)
+	fstds		${fnm1},8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2,3]
+	 flddx		$idx($np),${fni}	; np[2,3]
+___
+$code.=<<___ if ($BN_SZ==4);
+	mtctl		$hi0,%cr11		; $hi0 still holds 31
+	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
+	b		L\$parisc11
+	nop
+___
+$code.=<<___;					# PA-RISC 2.0 code-path
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+
+	extrd,u		$ab0,31,32,$hi0
+	extrd,u		$ab0,63,32,$ab0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 ldo		8($idx),$idx		; j++++
+	 addl		$ab0,$nm0,$nm0		; low part is discarded
+	 extrd,u	$nm0,31,32,$hi1
+
+L\$1st
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,63,32,$ab1
+	 addl		$hi1,$nm1,$nm1
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 addl		$ab1,$nm1,$nm1
+	 extrd,u	$nm1,31,32,$hi1
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+	 addl		$hi0,$ab0,$ab0
+	 extrd,u	$ab0,31,32,$hi0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	stw		$nm1,-4($tp)		; tp[j-1]
+	 addl		$ab0,$nm0,$nm0
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$1st		; j++++
+	 extrd,u	$nm0,31,32,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,63,32,$ab1
+	 addl		$hi1,$nm1,$nm1
+	ldd		-16($xfer),$ab0
+	 addl		$ab1,$nm1,$nm1
+	ldd		-8($xfer),$nm0
+	 extrd,u	$nm1,31,32,$hi1
+
+	 addl		$hi0,$ab0,$ab0
+	 extrd,u	$ab0,31,32,$hi0
+	stw		$nm1,-4($tp)		; tp[j-1]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	ldd		0($xfer),$ab1
+	 addl		$ab0,$nm0,$nm0
+	ldd,mb		8($xfer),$nm1
+	 extrd,u	$nm0,31,32,$hi1
+	stw,ma		$nm0,8($tp)		; tp[j-1]
+
+	ldo		-1($num),$num		; i--
+	subi		0,$arrsz,$idx		; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+	fldws,ma	4($bp),${fbi}		; bp[1]
+___
+$code.=<<___ if ($BN_SZ==8);
+	fldws		0($bp),${fbi}		; bp[1] in flipped word order
+___
+$code.=<<___;
+	 flddx		$idx($ap),${fai}	; ap[0,1]
+	 flddx		$idx($np),${fni}	; np[0,1]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
+	addl		$hi1,$nm1,$nm1
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	addl		$hi1,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	xmpyu		${fn0},${fab0}R,${fm0}
+	ldo		`$LOCALS+32+4`($fp),$tp
+L\$outer
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)	; 33-bit value
+	fstds		${fnm0},-8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2]
+	 flddx		$idx($np),${fni}	; np[2]
+	 ldo		8($idx),$idx		; j++++
+	ldd		-16($xfer),$ab0		; 33-bit value
+	ldd		-8($xfer),$nm0
+	ldw		0($xfer),$hi0		; high part
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	 extrd,u	$ab0,31,32,$ti0		; carry bit
+	 extrd,u	$ab0,63,32,$ab0
+	fstds		${fab1},0($xfer)
+	 addl		$ti0,$hi0,$hi0		; account carry bit
+	fstds		${fnm1},8($xfer)
+	 addl		$ab0,$nm0,$nm0		; low part is discarded
+	ldw		0($tp),$ti1		; tp[1]
+	 extrd,u	$nm0,31,32,$hi1
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+
+L\$inner
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ti1,$ti1
+	 addl		$ti1,$ab1,$ab1
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 addl		$hi1,$nm1,$nm1
+	 addl		$ab1,$nm1,$nm1
+	ldw		4($tp),$ti0		; tp[j]
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+	 addl		$hi0,$ti0,$ti0
+	 addl		$ti0,$ab0,$ab0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 extrd,u	$ab0,31,32,$hi0
+	 extrd,u	$nm1,31,32,$hi1
+	ldw		8($tp),$ti1		; tp[j]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	 addl		$ab0,$nm0,$nm0
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$inner		; j++++
+	 extrd,u	$nm0,31,32,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ti1,$ti1
+	 addl		$ti1,$ab1,$ab1
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	ldw		4($tp),$ti0		; tp[j]
+	 addl		$hi1,$nm1,$nm1
+	 addl		$ab1,$nm1,$nm1
+	ldd		-16($xfer),$ab0
+	ldd		-8($xfer),$nm0
+	 extrd,u	$nm1,31,32,$hi1
+
+	addl		$hi0,$ab0,$ab0
+	 addl		$ti0,$ab0,$ab0
+	 stw		$nm1,-4($tp)		; tp[j-1]
+	 extrd,u	$ab0,31,32,$hi0
+	ldw		8($tp),$ti1		; tp[j]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	ldd		0($xfer),$ab1
+	 addl		$ab0,$nm0,$nm0
+	ldd,mb		8($xfer),$nm1
+	 extrd,u	$nm0,31,32,$hi1
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+
+	addib,=		-1,$num,L\$outerdone	; i--
+	subi		0,$arrsz,$idx		; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+	fldws,ma	4($bp),${fbi}		; bp[i]
+___
+$code.=<<___ if ($BN_SZ==8);
+	ldi		12,$ti0			; bp[i] in flipped word order
+	addl,ev		%r0,$num,$num
+	ldi		-4,$ti0
+	addl		$ti0,$bp,$bp
+	fldws		0($bp),${fbi}
+___
+$code.=<<___;
+	 flddx		$idx($ap),${fai}	; ap[0]
+	addl		$hi0,$ab1,$ab1
+	 flddx		$idx($np),${fni}	; np[0]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	addl		$ti1,$ab1,$ab1
+	extrd,u		$ab1,31,32,$hi0
+	extrd,u		$ab1,63,32,$ab1
+
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
+	ldw		4($tp),$ti0		; tp[j]
+
+	addl		$hi1,$nm1,$nm1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	addl		$hi1,$hi0,$hi0
+	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	addl		$ti0,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+	 xmpyu		${fn0},${fab0}R,${fm0}
+
+	b		L\$outer
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+L\$outerdone
+	addl		$hi0,$ab1,$ab1
+	addl		$ti1,$ab1,$ab1
+	extrd,u		$ab1,31,32,$hi0
+	extrd,u		$ab1,63,32,$ab1
+
+	ldw		4($tp),$ti0		; tp[j]
+
+	addl		$hi1,$nm1,$nm1
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	addl		$hi1,$hi0,$hi0
+	addl		$ti0,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	ldo		`$LOCALS+32`($fp),$tp
+	sub		%r0,%r0,%r0		; clear borrow
+___
+$code.=<<___ if ($BN_SZ==4);
+	ldws,ma		4($tp),$ti0
+	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
+	b		L\$sub_pa11
+	addl		$tp,$arrsz,$tp
+L\$sub
+	ldwx		$idx($np),$hi0
+	subb		$ti0,$hi0,$hi1
+	ldwx		$idx($tp),$ti0
+	addib,<>	4,$idx,L\$sub
+	stws,ma		$hi1,4($rp)
+
+	subb		$ti0,%r0,$hi1
+	ldo		-4($tp),$tp
+___
+$code.=<<___ if ($BN_SZ==8);
+	ldd,ma		8($tp),$ti0
+L\$sub
+	ldd		$idx($np),$hi0
+	shrpd		$ti0,$ti0,32,$ti0	; flip word order
+	std		$ti0,-8($tp)		; save flipped value
+	sub,db		$ti0,$hi0,$hi1
+	ldd,ma		8($tp),$ti0
+	addib,<>	8,$idx,L\$sub
+	std,ma		$hi1,8($rp)
+
+	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
+	sub,db		$ti0,%r0,$hi1
+	ldo		-8($tp),$tp
+___
+$code.=<<___;
+	and		$tp,$hi1,$ap
+	andcm		$rp,$hi1,$bp
+	or		$ap,$bp,$np
+
+	sub		$rp,$arrsz,$rp		; rewind rp
+	subi		0,$arrsz,$idx
+	ldo		`$LOCALS+32`($fp),$tp
+L\$copy
+	ldd		$idx($np),$hi0
+	std,ma		%r0,8($tp)
+	addib,<>	8,$idx,.-8		; L\$copy
+	std,ma		$hi0,8($rp)	
+___
+
+if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
+$ablo=$ab0;
+$abhi=$ab1;
+$nmlo0=$nm0;
+$nmhi0=$nm1;
+$nmlo1="%r9";
+$nmhi1="%r8";
+
+$code.=<<___;
+	b		L\$done
+	nop
+
+	.ALIGN		8
+L\$parisc11
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-12($xfer),$ablo
+	ldw		-16($xfer),$hi0
+	ldw		-4($xfer),$nmlo0
+	ldw		-8($xfer),$nmhi0
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+
+	 ldo		8($idx),$idx		; j++++
+	 add		$ablo,$nmlo0,$nmlo0	; discarded
+	 addc		%r0,$nmhi0,$hi1
+	ldw		4($xfer),$ablo
+	ldw		0($xfer),$abhi
+	nop
+
+L\$1st_pa11
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 add		$hi0,$ablo,$ablo
+	ldw		12($xfer),$nmlo1
+	 addc		%r0,$abhi,$hi0
+	ldw		8($xfer),$nmhi1
+	 add		$ablo,$nmlo1,$nmlo1
+	fstds		${fab1},0($xfer)
+	 addc		%r0,$nmhi1,$nmhi1
+	fstds		${fnm1},8($xfer)
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-12($xfer),$ablo
+	 addc		%r0,$nmhi1,$hi1
+	ldw		-16($xfer),$abhi
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	ldw		-4($xfer),$nmlo0
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-8($xfer),$nmhi0
+	 add		$hi0,$ablo,$ablo
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 addc		%r0,$abhi,$hi0
+	fstds		${fab0},-16($xfer)
+	 add		$ablo,$nmlo0,$nmlo0
+	fstds		${fnm0},-8($xfer)
+	 addc		%r0,$nmhi0,$nmhi0
+	ldw		0($xfer),$abhi
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$1st_pa11	; j++++
+	 addc		%r0,$nmhi0,$hi1
+
+	 ldw		8($xfer),$nmhi1
+	 ldw		12($xfer),$nmlo1
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	 add		$hi0,$ablo,$ablo
+	fstds		${fab1},0($xfer)
+	 addc		%r0,$abhi,$hi0
+	fstds		${fnm1},8($xfer)
+	 add		$ablo,$nmlo1,$nmlo1
+	ldw		-16($xfer),$abhi
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-12($xfer),$ablo
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-8($xfer),$nmhi0
+	 addc		%r0,$nmhi1,$hi1
+	ldw		-4($xfer),$nmlo0
+
+	 add		$hi0,$ablo,$ablo
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 addc		%r0,$abhi,$hi0
+	ldw		0($xfer),$abhi
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldws,mb		8($xfer),$nmhi1
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$nmlo1
+	 addc		%r0,$nmhi0,$hi1
+	stws,ma		$nmlo0,8($tp)		; tp[j-1]
+
+	ldo		-1($num),$num		; i--
+	subi		0,$arrsz,$idx		; j=0
+
+	 fldws,ma	4($bp),${fbi}		; bp[1]
+	 flddx		$idx($ap),${fai}	; ap[0,1]
+	 flddx		$idx($np),${fni}	; np[0,1]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	xmpyu		${fn0},${fab0}R,${fm0}
+	ldo		`$LOCALS+32+4`($fp),$tp
+L\$outer_pa11
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)	; 33-bit value
+	fstds		${fnm0},-8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2,3]
+	 flddx		$idx($np),${fni}	; np[2,3]
+	ldw		-16($xfer),$abhi	; carry bit actually
+	 ldo		8($idx),$idx		; j++++
+	ldw		-12($xfer),$ablo
+	ldw		-8($xfer),$nmhi0
+	ldw		-4($xfer),$nmlo0
+	ldw		0($xfer),$hi0		; high part
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	fstds		${fab1},0($xfer)
+	 addl		$abhi,$hi0,$hi0		; account carry bit
+	fstds		${fnm1},8($xfer)
+	 add		$ablo,$nmlo0,$nmlo0	; discarded
+	ldw		0($tp),$ti1		; tp[1]
+	 addc		%r0,$nmhi0,$hi1
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+	ldw		4($xfer),$ablo
+	ldw		0($xfer),$abhi
+
+L\$inner_pa11
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 add		$hi0,$ablo,$ablo
+	ldw		4($tp),$ti0		; tp[j]
+	 addc		%r0,$abhi,$abhi
+	ldw		12($xfer),$nmlo1
+	 add		$ti1,$ablo,$ablo
+	ldw		8($xfer),$nmhi1
+	 addc		%r0,$abhi,$hi0
+	fstds		${fab1},0($xfer)
+	 add		$ablo,$nmlo1,$nmlo1
+	fstds		${fnm1},8($xfer)
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-12($xfer),$ablo
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-16($xfer),$abhi
+	 addc		%r0,$nmhi1,$hi1
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	ldw		8($tp),$ti1		; tp[j]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-4($xfer),$nmlo0
+	 add		$hi0,$ablo,$ablo
+	ldw		-8($xfer),$nmhi0
+	 addc		%r0,$abhi,$abhi
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 add		$ti0,$ablo,$ablo
+	fstds		${fab0},-16($xfer)
+	 addc		%r0,$abhi,$hi0
+	fstds		${fnm0},-8($xfer)
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldw		0($xfer),$abhi
+	 add		$hi1,$nmlo0,$nmlo0
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$inner_pa11	; j++++
+	 addc		%r0,$nmhi0,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
+	ldw		12($xfer),$nmlo1
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldw		8($xfer),$nmhi1
+	 add		$hi0,$ablo,$ablo
+	ldw		4($tp),$ti0		; tp[j]
+	 addc		%r0,$abhi,$abhi
+	fstds		${fab1},0($xfer)
+	 add		$ti1,$ablo,$ablo
+	fstds		${fnm1},8($xfer)
+	 addc		%r0,$abhi,$hi0
+	ldw		-16($xfer),$abhi
+	 add		$ablo,$nmlo1,$nmlo1
+	ldw		-12($xfer),$ablo
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-8($xfer),$nmhi0
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-4($xfer),$nmlo0
+	 addc		%r0,$nmhi1,$hi1
+
+	add		$hi0,$ablo,$ablo
+	 stw		$nmlo1,-4($tp)		; tp[j-1]
+	addc		%r0,$abhi,$abhi
+	 add		$ti0,$ablo,$ablo
+	ldw		8($tp),$ti1		; tp[j]
+	 addc		%r0,$abhi,$hi0
+	ldw		0($xfer),$abhi
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldws,mb		8($xfer),$nmhi1
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$nmlo1
+	 addc		%r0,$nmhi0,$hi1
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+
+	addib,=		-1,$num,L\$outerdone_pa11; i--
+	subi		0,$arrsz,$idx		; j=0
+
+	 fldws,ma	4($bp),${fbi}		; bp[i]
+	 flddx		$idx($ap),${fai}	; ap[0]
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$abhi
+	 flddx		$idx($np),${fni}	; np[0]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	add		$ti1,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
+	ldw		4($tp),$ti0		; tp[j]
+
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	add		$ti0,$hi0,$hi0
+	addc		%r0,$hi1,$hi1
+	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+	 xmpyu		${fn0},${fab0}R,${fm0}
+
+	b		L\$outer_pa11
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+L\$outerdone_pa11
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$abhi
+	add		$ti1,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+
+	ldw		4($tp),$ti0		; tp[j]
+
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	add		$ti0,$hi0,$hi0
+	addc		%r0,$hi1,$hi1
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	ldo		`$LOCALS+32+4`($fp),$tp
+	sub		%r0,%r0,%r0		; clear borrow
+	ldw		-4($tp),$ti0
+	addl		$tp,$arrsz,$tp
+L\$sub_pa11
+	ldwx		$idx($np),$hi0
+	subb		$ti0,$hi0,$hi1
+	ldwx		$idx($tp),$ti0
+	addib,<>	4,$idx,L\$sub_pa11
+	stws,ma		$hi1,4($rp)
+
+	subb		$ti0,%r0,$hi1
+	ldo		-4($tp),$tp
+	and		$tp,$hi1,$ap
+	andcm		$rp,$hi1,$bp
+	or		$ap,$bp,$np
+
+	sub		$rp,$arrsz,$rp		; rewind rp
+	subi		0,$arrsz,$idx
+	ldo		`$LOCALS+32`($fp),$tp
+L\$copy_pa11
+	ldwx		$idx($np),$hi0
+	stws,ma		%r0,4($tp)
+	addib,<>	4,$idx,L\$copy_pa11
+	stws,ma		$hi0,4($rp)	
+
+	nop					; alignment
+L\$done
+___
+}
+
+$code.=<<___;
+	ldi		1,%r28			; signal "handled"
+	ldo		$FRAME($fp),%sp		; destroy tp[num+1]
+
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+L\$abort
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "ldd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
+    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
+    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
+	$opcode|=(1<<5)  if ($mod =~ /^,m/);
+	$opcode|=(1<<13) if ($mod =~ /^,mb/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $std = sub {
+  my ($mod,$args) = @_;
+  my $orig = "std$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)	# format 6
+    {	my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
+	$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4);			# encode offset
+	$opcode|=(1<<5)  if ($mod =~ /^,m/);
+	$opcode|=(1<<13) if ($mod =~ /^,mb/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $extrd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "extrd$mod\t$args";
+
+    # I only have ",u" completer, it's implicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
+    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+	my $len=32-$3;
+	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
+	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
+    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+	my $len=32-$2;
+	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
+	$opcode |= (1<<13) if ($mod =~ /,\**=/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "shrpd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
+    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+	my $cpos=63-$3;
+	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $sub = sub {
+  my ($mod,$args) = @_;
+  my $orig = "sub$mod\t$args";
+
+    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
+	my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
+	$opcode|=(1<<10);	# e1
+	$opcode|=(1<<8);	# e2
+	$opcode|=(1<<5);	# d
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
+    }
+    else { "\t".$orig; }
+};
+
+sub assemble {
+  my ($mnemonic,$mod,$args)=@_;
+  my $opcode = eval("\$$mnemonic");
+
+    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+	# flip word order in 64-bit mode...
+	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
+	# assemble 2.0 instructions in 32-bit mode...
+	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
+
+	print $_,"\n";
+}
+close STDOUT;
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
new file mode 100644
index 0000000000..9257b2cd71
--- /dev/null
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -0,0 +1,328 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# April 2006
+
+# "Teaser" Montgomery multiplication module for PowerPC. It's possible
+# to gain a bit more by modulo-scheduling outer loop, then dedicated
+# squaring procedure should give further 20% and code can be adapted
+# for 32-bit application running on 64-bit CPU. As for the latter.
+# It won't be able to achieve "native" 64-bit performance, because in
+# 32-bit application context every addc instruction will have to be
+# expanded as addc, twice right shift by 32 and finally adde, etc.
+# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
+# for 64-bit application running on PPC970/G5 is:
+#
+# 512-bit	+65%	
+# 1024-bit	+35%
+# 2048-bit	+18%
+# 4096-bit	+4%
+
+$flavour = shift;
+
+if ($flavour =~ /32/) {
+	$BITS=	32;
+	$BNSZ=	$BITS/8;
+	$SIZE_T=4;
+	$RZONE=	224;
+	$FRAME=	$SIZE_T*16;
+
+	$LD=	"lwz";		# load
+	$LDU=	"lwzu";		# load and update
+	$LDX=	"lwzx";		# load indexed
+	$ST=	"stw";		# store
+	$STU=	"stwu";		# store and update
+	$STX=	"stwx";		# store indexed
+	$STUX=	"stwux";	# store indexed and update
+	$UMULL=	"mullw";	# unsigned multiply low
+	$UMULH=	"mulhwu";	# unsigned multiply high
+	$UCMP=	"cmplw";	# unsigned compare
+	$SHRI=	"srwi";		# unsigned shift right by immediate	
+	$PUSH=	$ST;
+	$POP=	$LD;
+} elsif ($flavour =~ /64/) {
+	$BITS=	64;
+	$BNSZ=	$BITS/8;
+	$SIZE_T=8;
+	$RZONE=	288;
+	$FRAME=	$SIZE_T*16;
+
+	# same as above, but 64-bit mnemonics...
+	$LD=	"ld";		# load
+	$LDU=	"ldu";		# load and update
+	$LDX=	"ldx";		# load indexed
+	$ST=	"std";		# store
+	$STU=	"stdu";		# store and update
+	$STX=	"stdx";		# store indexed
+	$STUX=	"stdux";	# store indexed and update
+	$UMULL=	"mulld";	# unsigned multiply low
+	$UMULH=	"mulhdu";	# unsigned multiply high
+	$UCMP=	"cmpld";	# unsigned compare
+	$SHRI=	"srdi";		# unsigned shift right by immediate	
+	$PUSH=	$ST;
+	$POP=	$LD;
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$sp="r1";
+$toc="r2";
+$rp="r3";	$ovf="r3";
+$ap="r4";
+$bp="r5";
+$np="r6";
+$n0="r7";
+$num="r8";
+$rp="r9";	# $rp is reassigned
+$aj="r10";
+$nj="r11";
+$tj="r12";
+# non-volatile registers
+$i="r14";
+$j="r15";
+$tp="r16";
+$m0="r17";
+$m1="r18";
+$lo0="r19";
+$hi0="r20";
+$lo1="r21";
+$hi1="r22";
+$alo="r23";
+$ahi="r24";
+$nlo="r25";
+#
+$nhi="r0";
+
+$code=<<___;
+.machine "any"
+.text
+
+.globl	.bn_mul_mont_int
+.align	4
+.bn_mul_mont_int:
+	cmpwi	$num,4
+	mr	$rp,r3		; $rp is reassigned
+	li	r3,0
+	bltlr
+___
+$code.=<<___ if ($BNSZ==4);
+	cmpwi	$num,32		; longer key performance is not better
+	bgelr
+___
+$code.=<<___;
+	slwi	$num,$num,`log($BNSZ)/log(2)`
+	li	$tj,-4096
+	addi	$ovf,$num,`$FRAME+$RZONE`
+	subf	$ovf,$ovf,$sp	; $sp-$ovf
+	and	$ovf,$ovf,$tj	; minimize TLB usage
+	subf	$ovf,$sp,$ovf	; $ovf-$sp
+	srwi	$num,$num,`log($BNSZ)/log(2)`
+	$STUX	$sp,$sp,$ovf
+
+	$PUSH	r14,`4*$SIZE_T`($sp)
+	$PUSH	r15,`5*$SIZE_T`($sp)
+	$PUSH	r16,`6*$SIZE_T`($sp)
+	$PUSH	r17,`7*$SIZE_T`($sp)
+	$PUSH	r18,`8*$SIZE_T`($sp)
+	$PUSH	r19,`9*$SIZE_T`($sp)
+	$PUSH	r20,`10*$SIZE_T`($sp)
+	$PUSH	r21,`11*$SIZE_T`($sp)
+	$PUSH	r22,`12*$SIZE_T`($sp)
+	$PUSH	r23,`13*$SIZE_T`($sp)
+	$PUSH	r24,`14*$SIZE_T`($sp)
+	$PUSH	r25,`15*$SIZE_T`($sp)
+
+	$LD	$n0,0($n0)	; pull n0[0] value
+	addi	$num,$num,-2	; adjust $num for counter register
+
+	$LD	$m0,0($bp)	; m0=bp[0]
+	$LD	$aj,0($ap)	; ap[0]
+	addi	$tp,$sp,$FRAME
+	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
+	$UMULH	$hi0,$aj,$m0
+
+	$LD	$aj,$BNSZ($ap)	; ap[1]
+	$LD	$nj,0($np)	; np[0]
+
+	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0
+
+	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
+	$UMULH	$ahi,$aj,$m0
+
+	$UMULL	$lo1,$nj,$m1	; np[0]*m1
+	$UMULH	$hi1,$nj,$m1
+	$LD	$nj,$BNSZ($np)	; np[1]
+	addc	$lo1,$lo1,$lo0
+	addze	$hi1,$hi1
+
+	$UMULL	$nlo,$nj,$m1	; np[1]*m1
+	$UMULH	$nhi,$nj,$m1
+
+	mtctr	$num
+	li	$j,`2*$BNSZ`
+.align	4
+L1st:
+	$LDX	$aj,$ap,$j	; ap[j]
+	addc	$lo0,$alo,$hi0
+	$LDX	$nj,$np,$j	; np[j]
+	addze	$hi0,$ahi
+	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
+	addc	$lo1,$nlo,$hi1
+	$UMULH	$ahi,$aj,$m0
+	addze	$hi1,$nhi
+	$UMULL	$nlo,$nj,$m1	; np[j]*m1
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
+	$UMULH	$nhi,$nj,$m1
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+
+	addi	$j,$j,$BNSZ	; j++
+	addi	$tp,$tp,$BNSZ	; tp++
+	bdnz-	L1st
+;L1st
+	addc	$lo0,$alo,$hi0
+	addze	$hi0,$ahi
+
+	addc	$lo1,$nlo,$hi1
+	addze	$hi1,$nhi
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+
+	li	$ovf,0
+	addc	$hi1,$hi1,$hi0
+	addze	$ovf,$ovf	; upmost overflow bit
+	$ST	$hi1,$BNSZ($tp)
+
+	li	$i,$BNSZ
+.align	4
+Louter:
+	$LDX	$m0,$bp,$i	; m0=bp[i]
+	$LD	$aj,0($ap)	; ap[0]
+	addi	$tp,$sp,$FRAME
+	$LD	$tj,$FRAME($sp)	; tp[0]
+	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
+	$UMULH	$hi0,$aj,$m0
+	$LD	$aj,$BNSZ($ap)	; ap[1]
+	$LD	$nj,0($np)	; np[0]
+	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
+	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
+	addze	$hi0,$hi0
+	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
+	$UMULH	$ahi,$aj,$m0
+	$UMULL	$lo1,$nj,$m1	; np[0]*m1
+	$UMULH	$hi1,$nj,$m1
+	$LD	$nj,$BNSZ($np)	; np[1]
+	addc	$lo1,$lo1,$lo0
+	$UMULL	$nlo,$nj,$m1	; np[1]*m1
+	addze	$hi1,$hi1
+	$UMULH	$nhi,$nj,$m1
+
+	mtctr	$num
+	li	$j,`2*$BNSZ`
+.align	4
+Linner:
+	$LDX	$aj,$ap,$j	; ap[j]
+	addc	$lo0,$alo,$hi0
+	$LD	$tj,$BNSZ($tp)	; tp[j]
+	addze	$hi0,$ahi
+	$LDX	$nj,$np,$j	; np[j]
+	addc	$lo1,$nlo,$hi1
+	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
+	addze	$hi1,$nhi
+	$UMULH	$ahi,$aj,$m0
+	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
+	$UMULL	$nlo,$nj,$m1	; np[j]*m1
+	addze	$hi0,$hi0
+	$UMULH	$nhi,$nj,$m1
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
+	addi	$j,$j,$BNSZ	; j++
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+	addi	$tp,$tp,$BNSZ	; tp++
+	bdnz-	Linner
+;Linner
+	$LD	$tj,$BNSZ($tp)	; tp[j]
+	addc	$lo0,$alo,$hi0
+	addze	$hi0,$ahi
+	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
+	addze	$hi0,$hi0
+
+	addc	$lo1,$nlo,$hi1
+	addze	$hi1,$nhi
+	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
+	addze	$hi1,$hi1
+	$ST	$lo1,0($tp)	; tp[j-1]
+
+	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
+	li	$ovf,0
+	adde	$hi1,$hi1,$hi0
+	addze	$ovf,$ovf
+	$ST	$hi1,$BNSZ($tp)
+;
+	slwi	$tj,$num,`log($BNSZ)/log(2)`
+	$UCMP	$i,$tj
+	addi	$i,$i,$BNSZ
+	ble-	Louter
+
+	addi	$num,$num,2	; restore $num
+	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
+	addi	$tp,$sp,$FRAME
+	mtctr	$num
+
+.align	4
+Lsub:	$LDX	$tj,$tp,$j
+	$LDX	$nj,$np,$j
+	subfe	$aj,$nj,$tj	; tp[j]-np[j]
+	$STX	$aj,$rp,$j
+	addi	$j,$j,$BNSZ
+	bdnz-	Lsub
+
+	li	$j,0
+	mtctr	$num
+	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
+	and	$ap,$tp,$ovf
+	andc	$np,$rp,$ovf
+	or	$ap,$ap,$np	; ap=borrow?tp:rp
+
+.align	4
+Lcopy:				; copy or in-place refresh
+	$LDX	$tj,$ap,$j
+	$STX	$tj,$rp,$j
+	$STX	$j,$tp,$j	; zap at once
+	addi	$j,$j,$BNSZ
+	bdnz-	Lcopy
+
+	$POP	r14,`4*$SIZE_T`($sp)
+	$POP	r15,`5*$SIZE_T`($sp)
+	$POP	r16,`6*$SIZE_T`($sp)
+	$POP	r17,`7*$SIZE_T`($sp)
+	$POP	r18,`8*$SIZE_T`($sp)
+	$POP	r19,`9*$SIZE_T`($sp)
+	$POP	r20,`10*$SIZE_T`($sp)
+	$POP	r21,`11*$SIZE_T`($sp)
+	$POP	r22,`12*$SIZE_T`($sp)
+	$POP	r23,`13*$SIZE_T`($sp)
+	$POP	r24,`14*$SIZE_T`($sp)
+	$POP	r25,`15*$SIZE_T`($sp)
+	$POP	$sp,0($sp)
+	li	r3,1
+	blr
+	.long	0
+.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl
index 307c7ccb35..37c65d3511 100644
--- a/crypto/bn/asm/ppc.pl
+++ b/crypto/bn/asm/ppc.pl
@@ -100,9 +100,9 @@
 #	me a note at schari@us.ibm.com
 #
 
-$opf = shift;
+$flavour = shift;
 
-if ($opf =~ /32\.s/) {
+if ($flavour =~ /32/) {
 	$BITS=	32;
 	$BNSZ=	$BITS/8;
 	$ISA=	"\"ppc\"";
@@ -116,7 +116,7 @@ if ($opf =~ /32\.s/) {
 	$UDIV=	"divwu";	# unsigned divide
 	$UCMPI=	"cmplwi";	# unsigned compare with immediate
 	$UCMP=	"cmplw";	# unsigned compare
-	$COUNTZ="cntlzw";	# count leading zeros
+	$CNTLZ=	"cntlzw";	# count leading zeros
 	$SHL=	"slw";		# shift left
 	$SHR=	"srw";		# unsigned shift right
 	$SHRI=	"srwi";		# unsigned shift right by immediate	
@@ -124,7 +124,8 @@ if ($opf =~ /32\.s/) {
 	$CLRU=	"clrlwi";	# clear upper bits
 	$INSR=	"insrwi";	# insert right
 	$ROTL=	"rotlwi";	# rotate left by immediate
-} elsif ($opf =~ /64\.s/) {
+	$TR=	"tw";		# conditional trap
+} elsif ($flavour =~ /64/) {
 	$BITS=	64;
 	$BNSZ=	$BITS/8;
 	$ISA=	"\"ppc64\"";
@@ -139,7 +140,7 @@ if ($opf =~ /32\.s/) {
 	$UDIV=	"divdu";	# unsigned divide
 	$UCMPI=	"cmpldi";	# unsigned compare with immediate
 	$UCMP=	"cmpld";	# unsigned compare
-	$COUNTZ="cntlzd";	# count leading zeros
+	$CNTLZ=	"cntlzd";	# count leading zeros
 	$SHL=	"sld";		# shift left
 	$SHR=	"srd";		# unsigned shift right
 	$SHRI=	"srdi";		# unsigned shift right by immediate	
@@ -147,93 +148,17 @@ if ($opf =~ /32\.s/) {
 	$CLRU=	"clrldi";	# clear upper bits
 	$INSR=	"insrdi";	# insert right 
 	$ROTL=	"rotldi";	# rotate left by immediate
-} else { die "nonsense $opf"; }
+	$TR=	"td";		# conditional trap
+} else { die "nonsense $flavour"; }
 
-( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
 
-# function entry points from the AIX code
-#
-# There are other, more elegant, ways to handle this. We (IBM) chose
-# this approach as it plays well with scripts we run to 'namespace'
-# OpenSSL .i.e. we add a prefix to all the public symbols so we can
-# co-exist in the same process with other implementations of OpenSSL.
-# 'cleverer' ways of doing these substitutions tend to hide data we
-# need to be obvious.
-#
-my @items = ("bn_sqr_comba4",
-	     "bn_sqr_comba8",
-	     "bn_mul_comba4",
-	     "bn_mul_comba8",
-	     "bn_sub_words",
-	     "bn_add_words",
-	     "bn_div_words",
-	     "bn_sqr_words",
-	     "bn_mul_words",
-	     "bn_mul_add_words");
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-if    ($opf =~ /linux/)	{  do_linux();	}
-elsif ($opf =~ /aix/)	{  do_aix();	}
-elsif ($opf =~ /osx/)	{  do_osx();	}
-else			{  do_bsd();	}
-
-sub do_linux {
-    $d=&data();
-
-    if ($BITS==64) {
-      foreach $t (@items) {
-        $d =~ s/\.$t:/\
-\t.section\t".opd","aw"\
-\t.align\t3\
-\t.globl\t$t\
-$t:\
-\t.quad\t.$t,.TOC.\@tocbase,0\
-\t.size\t$t,24\
-\t.previous\n\
-\t.type\t.$t,\@function\
-\t.globl\t.$t\
-.$t:/g;
-      }
-    }
-    else {
-      foreach $t (@items) {
-        $d=~s/\.$t/$t/g;
-      }
-    }
-    # hide internal labels to avoid pollution of name table...
-    $d=~s/Lppcasm_/.Lppcasm_/gm;
-    print $d;
-}
-
-sub do_aix {
-    # AIX assembler is smart enough to please the linker without
-    # making us do something special...
-    print &data();
-}
-
-# MacOSX 32 bit
-sub do_osx {
-    $d=&data();
-    # Change the bn symbol prefix from '.' to '_'
-    foreach $t (@items) {
-      $d=~s/\.$t/_$t/g;
-    }
-    # Change .machine to something OS X asm will accept
-    $d=~s/\.machine.*/.text/g;
-    $d=~s/\#/;/g; # change comment from '#' to ';'
-    print $d;
-}
-
-# BSD (Untested)
-sub do_bsd {
-    $d=&data();
-    foreach $t (@items) {
-      $d=~s/\.$t/_$t/g;
-    }
-    print $d;
-}
-
-sub data {
-	local($data)=<<EOF;
+$data=<<EOF;
 #--------------------------------------------------------------------
 #
 #
@@ -295,33 +220,20 @@ sub data {
 #
 #	Defines to be used in the assembly code.
 #	
-.set r0,0	# we use it as storage for value of 0
-.set SP,1	# preserved
-.set RTOC,2	# preserved 
-.set r3,3	# 1st argument/return value
-.set r4,4	# 2nd argument/volatile register
-.set r5,5	# 3rd argument/volatile register
-.set r6,6	# ...
-.set r7,7
-.set r8,8
-.set r9,9
-.set r10,10
-.set r11,11
-.set r12,12
-.set r13,13	# not used, nor any other "below" it...
-
-.set BO_IF_NOT,4
-.set BO_IF,12
-.set BO_dCTR_NZERO,16
-.set BO_dCTR_ZERO,18
-.set BO_ALWAYS,20
-.set CR0_LT,0;
-.set CR0_GT,1;
-.set CR0_EQ,2
-.set CR1_FX,4;
-.set CR1_FEX,5;
-.set CR1_VX,6
-.set LR,8
+#.set r0,0	# we use it as storage for value of 0
+#.set SP,1	# preserved
+#.set RTOC,2	# preserved 
+#.set r3,3	# 1st argument/return value
+#.set r4,4	# 2nd argument/volatile register
+#.set r5,5	# 3rd argument/volatile register
+#.set r6,6	# ...
+#.set r7,7
+#.set r8,8
+#.set r9,9
+#.set r10,10
+#.set r11,11
+#.set r12,12
+#.set r13,13	# not used, nor any other "below" it...
 
 #	Declare function names to be global
 #	NOTE:	For gcc these names MUST be changed to remove
@@ -342,7 +254,7 @@ sub data {
 	
 # .text section
 	
-	.machine	$ISA
+	.machine	"any"
 
 #
 #	NOTE:	The following label name should be changed to
@@ -476,7 +388,7 @@ sub data {
 
 	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
 	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -901,7 +813,7 @@ sub data {
 	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;
 
 
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 
 	.long	0x00000000
 
@@ -1053,7 +965,7 @@ sub data {
 
 	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
 	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -1589,7 +1501,7 @@ sub data {
 	adde	r10,r10,r9
 	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
 	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -1621,7 +1533,7 @@ sub data {
 	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
 				# if r6 > 0 then result !=0
 				# In either case carry bit is set.
-	bc	BO_IF,CR0_EQ,Lppcasm_sub_adios
+	beq	Lppcasm_sub_adios
 	addi	r4,r4,-$BNSZ
 	addi	r3,r3,-$BNSZ
 	addi	r5,r5,-$BNSZ
@@ -1633,11 +1545,11 @@ Lppcasm_sub_mainloop:
 				# if carry = 1 this is r7-r8. Else it
 				# is r7-r8 -1 as we need.
 	$STU	r6,$BNSZ(r3)
-	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop
+	bdnz-	Lppcasm_sub_mainloop
 Lppcasm_sub_adios:	
 	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
 	andi.	r3,r3,1         # keep only last bit.
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 
@@ -1668,7 +1580,7 @@ Lppcasm_sub_adios:
 #	check for r6 = 0. Is this needed?
 #
 	addic.	r6,r6,0		#test r6 and clear carry bit.
-	bc	BO_IF,CR0_EQ,Lppcasm_add_adios
+	beq	Lppcasm_add_adios
 	addi	r4,r4,-$BNSZ
 	addi	r3,r3,-$BNSZ
 	addi	r5,r5,-$BNSZ
@@ -1678,10 +1590,10 @@ Lppcasm_add_mainloop:
 	$LDU	r8,$BNSZ(r5)
 	adde	r8,r7,r8
 	$STU	r8,$BNSZ(r3)
-	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop
+	bdnz-	Lppcasm_add_mainloop
 Lppcasm_add_adios:	
 	addze	r3,r0			#return carry bit.
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -1705,29 +1617,24 @@ Lppcasm_add_adios:
 #	r5 = d
 	
 	$UCMPI	0,r5,0			# compare r5 and 0
-	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div1	# proceed if d!=0
+	bne	Lppcasm_div1		# proceed if d!=0
 	li	r3,-1			# d=0 return -1
-	bclr	BO_ALWAYS,CR0_LT	
+	blr
 Lppcasm_div1:
 	xor	r0,r0,r0		#r0=0
-	$COUNTZ	r7,r5			#r7 = num leading 0s in d.
-	subfic	r8,r7,$BITS		#r8 = BN_num_bits_word(d)
-	cmpi	0,0,r8,$BITS		#
-	bc	BO_IF,CR0_EQ,Lppcasm_div2	#proceed if (r8==$BITS)	
-	li	r9,1			# r9=1
-	$SHL	r10,r9,r8		# r9<<=r8
-	$UCMP	0,r3,r10		#	
-	bc	BO_IF,CR0_GT,Lppcasm_div2	#or if (h > (1<<r8))
-	$UDIV	r3,r3,r0		#if not assert(0) divide by 0!
-					#that's how we signal overflow
-	bclr	BO_ALWAYS,CR0_LT	#return. NEVER REACHED.
+	li	r8,$BITS
+	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
+	beq	Lppcasm_div2		#proceed if no leading zeros
+	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
+	$SHR.	r9,r3,r8		#are there any bits above r8'th?
+	$TR	16,r9,r0		#if there're, signal to dump core...
 Lppcasm_div2:
 	$UCMP	0,r3,r5			#h>=d?
-	bc	BO_IF,CR0_LT,Lppcasm_div3	#goto Lppcasm_div3 if not
+	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
 	subf	r3,r5,r3		#h-=d ; 
 Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
 	cmpi	0,0,r7,0		# is (i == 0)?
-	bc	BO_IF,CR0_EQ,Lppcasm_div4
+	beq	Lppcasm_div4
 	$SHL	r3,r3,r7		# h = (h<< i)
 	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
 	$SHL	r5,r5,r7		# d<<=i
@@ -1744,7 +1651,7 @@ Lppcasm_divouterloop:
 	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
 					# compute here for innerloop.
 	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
-	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div5	# goto Lppcasm_div5 if not
+	bne	Lppcasm_div5		# goto Lppcasm_div5 if not
 
 	li	r8,-1
 	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l 
@@ -1765,9 +1672,9 @@ Lppcasm_divinnerloop:
 					# the following 2 instructions do that
 	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
 	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
-	$UCMP	1,r6,r7			# compare (tl <= r7)
-	bc	BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit
-	bc	BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit
+	$UCMP	cr1,r6,r7		# compare (tl <= r7)
+	bne	Lppcasm_divinnerexit
+	ble	cr1,Lppcasm_divinnerexit
 	addi	r8,r8,-1		#q--
 	subf	r12,r9,r12		#th -=dh
 	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
@@ -1776,14 +1683,14 @@ Lppcasm_divinnerloop:
 Lppcasm_divinnerexit:
 	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
 	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
-	$UCMP	1,r4,r11		# compare l and tl
+	$UCMP	cr1,r4,r11		# compare l and tl
 	add	r12,r12,r10		# th+=t
-	bc	BO_IF_NOT,CR1_FX,Lppcasm_div7  # if (l>=tl) goto Lppcasm_div7
+	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
 	addi	r12,r12,1		# th++
 Lppcasm_div7:
 	subf	r11,r11,r4		#r11=l-tl
-	$UCMP	1,r3,r12		#compare h and th
-	bc	BO_IF_NOT,CR1_FX,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
+	$UCMP	cr1,r3,r12		#compare h and th
+	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
 	addi	r8,r8,-1		# q--
 	add	r3,r5,r3		# h+=d
 Lppcasm_div8:
@@ -1794,12 +1701,12 @@ Lppcasm_div8:
 					# the following 2 instructions will do this.
 	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
 	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
-	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ;
+	bdz	Lppcasm_div9		#if (count==0) break ;
 	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
 	b	Lppcasm_divouterloop
 Lppcasm_div9:
 	or	r3,r8,r0
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -1825,7 +1732,7 @@ Lppcasm_div9:
 #	No unrolling done here. Not performance critical.
 
 	addic.	r5,r5,0			#test r5.
-	bc	BO_IF,CR0_EQ,Lppcasm_sqr_adios
+	beq	Lppcasm_sqr_adios
 	addi	r4,r4,-$BNSZ
 	addi	r3,r3,-$BNSZ
 	mtctr	r5
@@ -1836,9 +1743,9 @@ Lppcasm_sqr_mainloop:
 	$UMULH  r8,r6,r6
 	$STU	r7,$BNSZ(r3)
 	$STU	r8,$BNSZ(r3)
-	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop
+	bdnz-	Lppcasm_sqr_mainloop
 Lppcasm_sqr_adios:	
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 
@@ -1861,7 +1768,7 @@ Lppcasm_sqr_adios:
 	xor	r0,r0,r0
 	xor	r12,r12,r12		# used for carry
 	rlwinm.	r7,r5,30,2,31		# num >> 2
-	bc	BO_IF,CR0_EQ,Lppcasm_mw_REM
+	beq	Lppcasm_mw_REM
 	mtctr	r7
 Lppcasm_mw_LOOP:	
 					#mul(rp[0],ap[0],w,c1);
@@ -1899,11 +1806,11 @@ Lppcasm_mw_LOOP:
 	
 	addi	r3,r3,`4*$BNSZ`
 	addi	r4,r4,`4*$BNSZ`
-	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP
+	bdnz-	Lppcasm_mw_LOOP
 
 Lppcasm_mw_REM:
 	andi.	r5,r5,0x3
-	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
+	beq	Lppcasm_mw_OVER
 					#mul(rp[0],ap[0],w,c1);
 	$LD	r8,`0*$BNSZ`(r4)
 	$UMULL	r9,r6,r8
@@ -1915,7 +1822,7 @@ Lppcasm_mw_REM:
 	
 	addi	r5,r5,-1
 	cmpli	0,0,r5,0
-	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
+	beq	Lppcasm_mw_OVER
 
 	
 					#mul(rp[1],ap[1],w,c1);
@@ -1929,7 +1836,7 @@ Lppcasm_mw_REM:
 	
 	addi	r5,r5,-1
 	cmpli	0,0,r5,0
-	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
+	beq	Lppcasm_mw_OVER
 	
 					#mul_add(rp[2],ap[2],w,c1);
 	$LD	r8,`2*$BNSZ`(r4)
@@ -1942,7 +1849,7 @@ Lppcasm_mw_REM:
 		
 Lppcasm_mw_OVER:	
 	addi	r3,r12,0
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 
 #
@@ -1967,7 +1874,7 @@ Lppcasm_mw_OVER:
 	xor	r0,r0,r0		#r0 = 0
 	xor	r12,r12,r12  		#r12 = 0 . used for carry		
 	rlwinm.	r7,r5,30,2,31		# num >> 2
-	bc	BO_IF,CR0_EQ,Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
+	beq	Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
 	mtctr	r7
 Lppcasm_maw_mainloop:	
 					#mul_add(rp[0],ap[0],w,c1);
@@ -2020,11 +1927,11 @@ Lppcasm_maw_mainloop:
 	$ST	r11,`3*$BNSZ`(r3)
 	addi	r3,r3,`4*$BNSZ`
 	addi	r4,r4,`4*$BNSZ`
-	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop
+	bdnz-	Lppcasm_maw_mainloop
 	
 Lppcasm_maw_leftover:
 	andi.	r5,r5,0x3
-	bc	BO_IF,CR0_EQ,Lppcasm_maw_adios
+	beq	Lppcasm_maw_adios
 	addi	r3,r3,-$BNSZ
 	addi	r4,r4,-$BNSZ
 					#mul_add(rp[0],ap[0],w,c1);
@@ -2039,7 +1946,7 @@ Lppcasm_maw_leftover:
 	addze	r12,r10
 	$ST	r9,0(r3)
 	
-	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+	bdz	Lppcasm_maw_adios
 					#mul_add(rp[1],ap[1],w,c1);
 	$LDU	r8,$BNSZ(r4)	
 	$UMULL	r9,r6,r8
@@ -2051,7 +1958,7 @@ Lppcasm_maw_leftover:
 	addze	r12,r10
 	$ST	r9,0(r3)
 	
-	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+	bdz	Lppcasm_maw_adios
 					#mul_add(rp[2],ap[2],w,c1);
 	$LDU	r8,$BNSZ(r4)
 	$UMULL	r9,r6,r8
@@ -2065,17 +1972,10 @@ Lppcasm_maw_leftover:
 		
 Lppcasm_maw_adios:	
 	addi	r3,r12,0
-	bclr	BO_ALWAYS,CR0_LT
+	blr
 	.long	0x00000000
 	.align	4
 EOF
-	$data =~ s/\`([^\`]*)\`/eval $1/gem;
-
-	# if some assembler chokes on some simplified mnemonic,
-	# this is the spot to fix it up, e.g.:
-	# GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
-	$data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
-	# assembler X doesn't accept li, load immediate value
-	#$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
-	return($data);
-}
+$data =~ s/\`([^\`]*)\`/eval $1/gem;
+print $data;
+close STDOUT;
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
new file mode 100644
index 0000000000..f040466f43
--- /dev/null
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -0,0 +1,1086 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# December 2007
+
+# The reason for undertaken effort is basically following. Even though
+# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
+# performance was observed to be less than impressive, essentially as
+# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
+# Well, it's not surprising that IBM had to make some sacrifices to
+# boost the clock frequency that much, but no overall improvement?
+# Having observed how much difference did switching to FPU make on
+# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
+# Unfortunately the resulting performance improvement is not as
+# impressive, ~30%, and in absolute terms is still very far from what
+# one would expect from 4.7GHz CPU. There is a chance that I'm doing
+# something wrong, but in the lack of assembler level micro-profiling
+# data or at least decent platform guide I can't tell... Or better
+# results might be achieved with VMX... Anyway, this module provides
+# *worse* performance on other PowerPC implementations, ~40-15% slower
+# on PPC970 depending on key length and ~40% slower on Power 5 for all
+# key lengths. As it's obviously inappropriate as "best all-round"
+# alternative, it has to be complemented with run-time CPU family
+# detection. Oh! It should also be noted that unlike other PowerPC
+# implementation IALU ppc-mont.pl module performs *suboptimaly* on
+# >=1024-bit key lengths on Power 6. It should also be noted that
+# *everything* said so far applies to 64-bit builds! As far as 32-bit
+# application executed on 64-bit CPU goes, this module is likely to
+# become preferred choice, because it's easy to adapt it for such
+# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
+
+# February 2008
+
+# Micro-profiling assisted optimization results in ~15% improvement
+# over original ppc64-mont.pl version, or overall ~50% improvement
+# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
+# Power 6 CPU, this module is 5-150% faster depending on key length,
+# [hereafter] more for longer keys. But if compared to ppc-mont.pl
+# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
+# in absolute terms, but it's apparently the way Power 6 is...
+
+# December 2009
+
+# Adapted for 32-bit build this module delivers 25-120%, yes, more
+# than *twice* for longer keys, performance improvement over 32-bit
+# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
+# even 64-bit integer operations and the trouble is that most PPC
+# operating systems don't preserve upper halves of general purpose
+# registers upon 32-bit signal delivery. They do preserve them upon
+# context switch, but not signalling:-( This means that asynchronous
+# signals have to be blocked upon entry to this subroutine. Signal
+# masking (and of course complementary unmasking) has quite an impact
+# on performance, naturally larger for shorter keys. It's so severe
+# that 512-bit key performance can be as low as 1/3 of expected one.
+# This is why this routine can be engaged for longer key operations
+# only on these OSes, see crypto/ppccap.c for further details. MacOS X
+# is an exception from this and doesn't require signal masking, and
+# that's where above improvement coefficients were collected. For
+# others alternative would be to break dependence on upper halves of
+# GPRs by sticking to 32-bit integer operations...
+
+$flavour = shift;
+
+if ($flavour =~ /32/) {
+	$SIZE_T=4;
+	$RZONE=	224;
+	$FRAME=	$SIZE_T*12+8*12;
+	$fname=	"bn_mul_mont_fpu64";
+
+	$STUX=	"stwux";	# store indexed and update
+	$PUSH=	"stw";
+	$POP=	"lwz";
+} elsif ($flavour =~ /64/) {
+	$SIZE_T=8;
+	$RZONE=	288;
+	$FRAME=	$SIZE_T*12+8*12;
+	$fname=	"bn_mul_mont_fpu64";
+
+	# same as above, but 64-bit mnemonics...
+	$STUX=	"stdux";	# store indexed and update
+	$PUSH=	"std";
+	$POP=	"ld";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=($FRAME+63)&~63;
+$TRANSFER=16*8;
+
+$carry="r0";
+$sp="r1";
+$toc="r2";
+$rp="r3";	$ovf="r3";
+$ap="r4";
+$bp="r5";
+$np="r6";
+$n0="r7";
+$num="r8";
+$rp="r9";	# $rp is reassigned
+$tp="r10";
+$j="r11";
+$i="r12";
+# non-volatile registers
+$nap_d="r14";	# interleaved ap and np in double format
+$a0="r15";	# ap[0]
+$t0="r16";	# temporary registers
+$t1="r17";
+$t2="r18";
+$t3="r19";
+$t4="r20";
+$t5="r21";
+$t6="r22";
+$t7="r23";
+
+# PPC offers enough register bank capacity to unroll inner loops twice
+#
+#     ..A3A2A1A0
+#           dcba
+#    -----------
+#            A0a
+#           A0b
+#          A0c
+#         A0d
+#          A1a
+#         A1b
+#        A1c
+#       A1d
+#        A2a
+#       A2b
+#      A2c
+#     A2d
+#      A3a
+#     A3b
+#    A3c
+#   A3d
+#    ..a
+#   ..b
+#
+$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
+$na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
+$dota="f8";	$dotb="f9";
+$A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
+$N0="f14";	$N1="f15";	$N2="f16";	$N3="f17";
+$T0a="f18";	$T0b="f19";
+$T1a="f20";	$T1b="f21";
+$T2a="f22";	$T2b="f23";
+$T3a="f24";	$T3b="f25";
+
+# sp----------->+-------------------------------+
+#		| saved sp			|
+#		+-------------------------------+
+#		|				|
+#		+-------------------------------+
+#		| 10 saved gpr, r14-r23		|
+#		.				.
+#		.				.
+#   +12*size_t	+-------------------------------+
+#		| 12 saved fpr, f14-f25		|
+#		.				.
+#		.				.
+#   +12*8	+-------------------------------+
+#		| padding to 64 byte boundary	|
+#		.				.
+#   +X		+-------------------------------+
+#		| 16 gpr<->fpr transfer zone	|
+#		.				.
+#		.				.
+#   +16*8	+-------------------------------+
+#		| __int64 tmp[-1]		|
+#		+-------------------------------+
+#		| __int64 tmp[num]		|
+#		.				.
+#		.				.
+#		.				.
+#   +(num+1)*8	+-------------------------------+
+#		| padding to 64 byte boundary	|
+#		.				.
+#   +X		+-------------------------------+
+#		| double nap_d[4*num]		|
+#		.				.
+#		.				.
+#		.				.
+#		+-------------------------------+
+
+$code=<<___;
+.machine "any"
+.text
+
+.globl	.$fname
+.align	5
+.$fname:
+	cmpwi	$num,`3*8/$SIZE_T`
+	mr	$rp,r3		; $rp is reassigned
+	li	r3,0		; possible "not handled" return code
+	bltlr-
+	andi.	r0,$num,`16/$SIZE_T-1`		; $num has to be "even"
+	bnelr-
+
+	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
+	li	$i,-4096
+	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
+	add	$tp,$tp,$num	; place for tp[num+1]
+	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
+	subf	$tp,$tp,$sp	; $sp-$tp
+	and	$tp,$tp,$i	; minimize TLB usage
+	subf	$tp,$sp,$tp	; $tp-$sp
+	$STUX	$sp,$sp,$tp	; alloca
+
+	$PUSH	r14,`2*$SIZE_T`($sp)
+	$PUSH	r15,`3*$SIZE_T`($sp)
+	$PUSH	r16,`4*$SIZE_T`($sp)
+	$PUSH	r17,`5*$SIZE_T`($sp)
+	$PUSH	r18,`6*$SIZE_T`($sp)
+	$PUSH	r19,`7*$SIZE_T`($sp)
+	$PUSH	r20,`8*$SIZE_T`($sp)
+	$PUSH	r21,`9*$SIZE_T`($sp)
+	$PUSH	r22,`10*$SIZE_T`($sp)
+	$PUSH	r23,`11*$SIZE_T`($sp)
+	stfd	f14,`12*$SIZE_T+0`($sp)
+	stfd	f15,`12*$SIZE_T+8`($sp)
+	stfd	f16,`12*$SIZE_T+16`($sp)
+	stfd	f17,`12*$SIZE_T+24`($sp)
+	stfd	f18,`12*$SIZE_T+32`($sp)
+	stfd	f19,`12*$SIZE_T+40`($sp)
+	stfd	f20,`12*$SIZE_T+48`($sp)
+	stfd	f21,`12*$SIZE_T+56`($sp)
+	stfd	f22,`12*$SIZE_T+64`($sp)
+	stfd	f23,`12*$SIZE_T+72`($sp)
+	stfd	f24,`12*$SIZE_T+80`($sp)
+	stfd	f25,`12*$SIZE_T+88`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
+	ld	$a0,0($ap)	; pull ap[0] value
+	ld	$n0,0($n0)	; pull n0[0] value
+	ld	$t3,0($bp)	; bp[0]
+___
+$code.=<<___ if ($SIZE_T==4);
+	mr	$t1,$n0
+	lwz	$a0,0($ap)	; pull ap[0,1] value
+	lwz	$t0,4($ap)
+	lwz	$n0,0($t1)	; pull n0[0,1] value
+	lwz	$t1,4($t1)
+	lwz	$t3,0($bp)	; bp[0,1]
+	lwz	$t2,4($bp)
+	insrdi	$a0,$t0,32,0
+	insrdi	$n0,$t1,32,0
+	insrdi	$t3,$t2,32,0
+___
+$code.=<<___;
+	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
+	li	$i,-64
+	add	$nap_d,$tp,$num
+	and	$nap_d,$nap_d,$i	; align to 64 bytes
+
+	mulld	$t7,$a0,$t3	; ap[0]*bp[0]
+	; nap_d is off by 1, because it's used with stfdu/lfdu
+	addi	$nap_d,$nap_d,-8
+	srwi	$j,$num,`3+1`	; counter register, num/2
+	mulld	$t7,$t7,$n0	; tp[0]*n0
+	addi	$j,$j,-1
+	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
+	li	$carry,0
+	mtctr	$j
+
+	; transfer bp[0] to FPU as 4x16-bit values
+	extrdi	$t0,$t3,16,48
+	extrdi	$t1,$t3,16,32
+	extrdi	$t2,$t3,16,16
+	extrdi	$t3,$t3,16,0
+	std	$t0,`$FRAME+0`($sp)
+	std	$t1,`$FRAME+8`($sp)
+	std	$t2,`$FRAME+16`($sp)
+	std	$t3,`$FRAME+24`($sp)
+	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
+	extrdi	$t4,$t7,16,48
+	extrdi	$t5,$t7,16,32
+	extrdi	$t6,$t7,16,16
+	extrdi	$t7,$t7,16,0
+	std	$t4,`$FRAME+32`($sp)
+	std	$t5,`$FRAME+40`($sp)
+	std	$t6,`$FRAME+48`($sp)
+	std	$t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
+	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
+	lwz	$t1,0($ap)
+	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
+	lwz	$t3,8($ap)
+	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
+	lwz	$t5,0($np)
+	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
+	lwz	$t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
+	lwz	$t1,4($ap)
+	lwz	$t2,8($ap)
+	lwz	$t3,12($ap)
+	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
+	lwz	$t5,4($np)
+	lwz	$t6,8($np)
+	lwz	$t7,12($np)
+___
+$code.=<<___;
+	lfd	$ba,`$FRAME+0`($sp)
+	lfd	$bb,`$FRAME+8`($sp)
+	lfd	$bc,`$FRAME+16`($sp)
+	lfd	$bd,`$FRAME+24`($sp)
+	lfd	$na,`$FRAME+32`($sp)
+	lfd	$nb,`$FRAME+40`($sp)
+	lfd	$nc,`$FRAME+48`($sp)
+	lfd	$nd,`$FRAME+56`($sp)
+	std	$t0,`$FRAME+64`($sp)
+	std	$t1,`$FRAME+72`($sp)
+	std	$t2,`$FRAME+80`($sp)
+	std	$t3,`$FRAME+88`($sp)
+	std	$t4,`$FRAME+96`($sp)
+	std	$t5,`$FRAME+104`($sp)
+	std	$t6,`$FRAME+112`($sp)
+	std	$t7,`$FRAME+120`($sp)
+	fcfid	$ba,$ba
+	fcfid	$bb,$bb
+	fcfid	$bc,$bc
+	fcfid	$bd,$bd
+	fcfid	$na,$na
+	fcfid	$nb,$nb
+	fcfid	$nc,$nc
+	fcfid	$nd,$nd
+
+	lfd	$A0,`$FRAME+64`($sp)
+	lfd	$A1,`$FRAME+72`($sp)
+	lfd	$A2,`$FRAME+80`($sp)
+	lfd	$A3,`$FRAME+88`($sp)
+	lfd	$N0,`$FRAME+96`($sp)
+	lfd	$N1,`$FRAME+104`($sp)
+	lfd	$N2,`$FRAME+112`($sp)
+	lfd	$N3,`$FRAME+120`($sp)
+	fcfid	$A0,$A0
+	fcfid	$A1,$A1
+	fcfid	$A2,$A2
+	fcfid	$A3,$A3
+	fcfid	$N0,$N0
+	fcfid	$N1,$N1
+	fcfid	$N2,$N2
+	fcfid	$N3,$N3
+	addi	$ap,$ap,16
+	addi	$np,$np,16
+
+	fmul	$T1a,$A1,$ba
+	fmul	$T1b,$A1,$bb
+	stfd	$A0,8($nap_d)		; save a[j] in double format
+	stfd	$A1,16($nap_d)
+	fmul	$T2a,$A2,$ba
+	fmul	$T2b,$A2,$bb
+	stfd	$A2,24($nap_d)		; save a[j+1] in double format
+	stfd	$A3,32($nap_d)
+	fmul	$T3a,$A3,$ba
+	fmul	$T3b,$A3,$bb
+	stfd	$N0,40($nap_d)		; save n[j] in double format
+	stfd	$N1,48($nap_d)
+	fmul	$T0a,$A0,$ba
+	fmul	$T0b,$A0,$bb
+	stfd	$N2,56($nap_d)		; save n[j+1] in double format
+	stfdu	$N3,64($nap_d)
+
+	fmadd	$T1a,$A0,$bc,$T1a
+	fmadd	$T1b,$A0,$bd,$T1b
+	fmadd	$T2a,$A1,$bc,$T2a
+	fmadd	$T2b,$A1,$bd,$T2b
+	fmadd	$T3a,$A2,$bc,$T3a
+	fmadd	$T3b,$A2,$bd,$T3b
+	fmul	$dota,$A3,$bc
+	fmul	$dotb,$A3,$bd
+
+	fmadd	$T1a,$N1,$na,$T1a
+	fmadd	$T1b,$N1,$nb,$T1b
+	fmadd	$T2a,$N2,$na,$T2a
+	fmadd	$T2b,$N2,$nb,$T2b
+	fmadd	$T3a,$N3,$na,$T3a
+	fmadd	$T3b,$N3,$nb,$T3b
+	fmadd	$T0a,$N0,$na,$T0a
+	fmadd	$T0b,$N0,$nb,$T0b
+
+	fmadd	$T1a,$N0,$nc,$T1a
+	fmadd	$T1b,$N0,$nd,$T1b
+	fmadd	$T2a,$N1,$nc,$T2a
+	fmadd	$T2b,$N1,$nd,$T2b
+	fmadd	$T3a,$N2,$nc,$T3a
+	fmadd	$T3b,$N2,$nd,$T3b
+	fmadd	$dota,$N3,$nc,$dota
+	fmadd	$dotb,$N3,$nd,$dotb
+
+	fctid	$T0a,$T0a
+	fctid	$T0b,$T0b
+	fctid	$T1a,$T1a
+	fctid	$T1b,$T1b
+	fctid	$T2a,$T2a
+	fctid	$T2b,$T2b
+	fctid	$T3a,$T3a
+	fctid	$T3b,$T3b
+
+	stfd	$T0a,`$FRAME+0`($sp)
+	stfd	$T0b,`$FRAME+8`($sp)
+	stfd	$T1a,`$FRAME+16`($sp)
+	stfd	$T1b,`$FRAME+24`($sp)
+	stfd	$T2a,`$FRAME+32`($sp)
+	stfd	$T2b,`$FRAME+40`($sp)
+	stfd	$T3a,`$FRAME+48`($sp)
+	stfd	$T3b,`$FRAME+56`($sp)
+
+.align	5
+L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
+	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
+	lwz	$t1,0($ap)
+	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
+	lwz	$t3,8($ap)
+	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
+	lwz	$t5,0($np)
+	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
+	lwz	$t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
+	lwz	$t1,4($ap)
+	lwz	$t2,8($ap)
+	lwz	$t3,12($ap)
+	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
+	lwz	$t5,4($np)
+	lwz	$t6,8($np)
+	lwz	$t7,12($np)
+___
+$code.=<<___;
+	std	$t0,`$FRAME+64`($sp)
+	std	$t1,`$FRAME+72`($sp)
+	std	$t2,`$FRAME+80`($sp)
+	std	$t3,`$FRAME+88`($sp)
+	std	$t4,`$FRAME+96`($sp)
+	std	$t5,`$FRAME+104`($sp)
+	std	$t6,`$FRAME+112`($sp)
+	std	$t7,`$FRAME+120`($sp)
+	ld	$t0,`$FRAME+0`($sp)
+	ld	$t1,`$FRAME+8`($sp)
+	ld	$t2,`$FRAME+16`($sp)
+	ld	$t3,`$FRAME+24`($sp)
+	ld	$t4,`$FRAME+32`($sp)
+	ld	$t5,`$FRAME+40`($sp)
+	ld	$t6,`$FRAME+48`($sp)
+	ld	$t7,`$FRAME+56`($sp)
+	lfd	$A0,`$FRAME+64`($sp)
+	lfd	$A1,`$FRAME+72`($sp)
+	lfd	$A2,`$FRAME+80`($sp)
+	lfd	$A3,`$FRAME+88`($sp)
+	lfd	$N0,`$FRAME+96`($sp)
+	lfd	$N1,`$FRAME+104`($sp)
+	lfd	$N2,`$FRAME+112`($sp)
+	lfd	$N3,`$FRAME+120`($sp)
+	fcfid	$A0,$A0
+	fcfid	$A1,$A1
+	fcfid	$A2,$A2
+	fcfid	$A3,$A3
+	fcfid	$N0,$N0
+	fcfid	$N1,$N1
+	fcfid	$N2,$N2
+	fcfid	$N3,$N3
+	addi	$ap,$ap,16
+	addi	$np,$np,16
+
+	fmul	$T1a,$A1,$ba
+	fmul	$T1b,$A1,$bb
+	fmul	$T2a,$A2,$ba
+	fmul	$T2b,$A2,$bb
+	stfd	$A0,8($nap_d)		; save a[j] in double format
+	stfd	$A1,16($nap_d)
+	fmul	$T3a,$A3,$ba
+	fmul	$T3b,$A3,$bb
+	fmadd	$T0a,$A0,$ba,$dota
+	fmadd	$T0b,$A0,$bb,$dotb
+	stfd	$A2,24($nap_d)		; save a[j+1] in double format
+	stfd	$A3,32($nap_d)
+
+	fmadd	$T1a,$A0,$bc,$T1a
+	fmadd	$T1b,$A0,$bd,$T1b
+	fmadd	$T2a,$A1,$bc,$T2a
+	fmadd	$T2b,$A1,$bd,$T2b
+	stfd	$N0,40($nap_d)		; save n[j] in double format
+	stfd	$N1,48($nap_d)
+	fmadd	$T3a,$A2,$bc,$T3a
+	fmadd	$T3b,$A2,$bd,$T3b
+	 add	$t0,$t0,$carry		; can not overflow
+	fmul	$dota,$A3,$bc
+	fmul	$dotb,$A3,$bd
+	stfd	$N2,56($nap_d)		; save n[j+1] in double format
+	stfdu	$N3,64($nap_d)
+	 srdi	$carry,$t0,16
+	 add	$t1,$t1,$carry
+	 srdi	$carry,$t1,16
+
+	fmadd	$T1a,$N1,$na,$T1a
+	fmadd	$T1b,$N1,$nb,$T1b
+	 insrdi	$t0,$t1,16,32
+	fmadd	$T2a,$N2,$na,$T2a
+	fmadd	$T2b,$N2,$nb,$T2b
+	 add	$t2,$t2,$carry
+	fmadd	$T3a,$N3,$na,$T3a
+	fmadd	$T3b,$N3,$nb,$T3b
+	 srdi	$carry,$t2,16
+	fmadd	$T0a,$N0,$na,$T0a
+	fmadd	$T0b,$N0,$nb,$T0b
+	 insrdi	$t0,$t2,16,16
+	 add	$t3,$t3,$carry
+	 srdi	$carry,$t3,16
+
+	fmadd	$T1a,$N0,$nc,$T1a
+	fmadd	$T1b,$N0,$nd,$T1b
+	 insrdi	$t0,$t3,16,0		; 0..63 bits
+	fmadd	$T2a,$N1,$nc,$T2a
+	fmadd	$T2b,$N1,$nd,$T2b
+	 add	$t4,$t4,$carry
+	fmadd	$T3a,$N2,$nc,$T3a
+	fmadd	$T3b,$N2,$nd,$T3b
+	 srdi	$carry,$t4,16
+	fmadd	$dota,$N3,$nc,$dota
+	fmadd	$dotb,$N3,$nd,$dotb
+	 add	$t5,$t5,$carry
+	 srdi	$carry,$t5,16
+	 insrdi	$t4,$t5,16,32
+
+	fctid	$T0a,$T0a
+	fctid	$T0b,$T0b
+	 add	$t6,$t6,$carry
+	fctid	$T1a,$T1a
+	fctid	$T1b,$T1b
+	 srdi	$carry,$t6,16
+	fctid	$T2a,$T2a
+	fctid	$T2b,$T2b
+	 insrdi	$t4,$t6,16,16
+	fctid	$T3a,$T3a
+	fctid	$T3b,$T3b
+	 add	$t7,$t7,$carry
+	 insrdi	$t4,$t7,16,0		; 64..127 bits
+	 srdi	$carry,$t7,16		; upper 33 bits
+
+	stfd	$T0a,`$FRAME+0`($sp)
+	stfd	$T0b,`$FRAME+8`($sp)
+	stfd	$T1a,`$FRAME+16`($sp)
+	stfd	$T1b,`$FRAME+24`($sp)
+	stfd	$T2a,`$FRAME+32`($sp)
+	stfd	$T2b,`$FRAME+40`($sp)
+	stfd	$T3a,`$FRAME+48`($sp)
+	stfd	$T3b,`$FRAME+56`($sp)
+	 std	$t0,8($tp)		; tp[j-1]
+	 stdu	$t4,16($tp)		; tp[j]
+	bdnz-	L1st
+
+	fctid	$dota,$dota
+	fctid	$dotb,$dotb
+
+	ld	$t0,`$FRAME+0`($sp)
+	ld	$t1,`$FRAME+8`($sp)
+	ld	$t2,`$FRAME+16`($sp)
+	ld	$t3,`$FRAME+24`($sp)
+	ld	$t4,`$FRAME+32`($sp)
+	ld	$t5,`$FRAME+40`($sp)
+	ld	$t6,`$FRAME+48`($sp)
+	ld	$t7,`$FRAME+56`($sp)
+	stfd	$dota,`$FRAME+64`($sp)
+	stfd	$dotb,`$FRAME+72`($sp)
+
+	add	$t0,$t0,$carry		; can not overflow
+	srdi	$carry,$t0,16
+	add	$t1,$t1,$carry
+	srdi	$carry,$t1,16
+	insrdi	$t0,$t1,16,32
+	add	$t2,$t2,$carry
+	srdi	$carry,$t2,16
+	insrdi	$t0,$t2,16,16
+	add	$t3,$t3,$carry
+	srdi	$carry,$t3,16
+	insrdi	$t0,$t3,16,0		; 0..63 bits
+	add	$t4,$t4,$carry
+	srdi	$carry,$t4,16
+	add	$t5,$t5,$carry
+	srdi	$carry,$t5,16
+	insrdi	$t4,$t5,16,32
+	add	$t6,$t6,$carry
+	srdi	$carry,$t6,16
+	insrdi	$t4,$t6,16,16
+	add	$t7,$t7,$carry
+	insrdi	$t4,$t7,16,0		; 64..127 bits
+	srdi	$carry,$t7,16		; upper 33 bits
+	ld	$t6,`$FRAME+64`($sp)
+	ld	$t7,`$FRAME+72`($sp)
+
+	std	$t0,8($tp)		; tp[j-1]
+	stdu	$t4,16($tp)		; tp[j]
+
+	add	$t6,$t6,$carry		; can not overflow
+	srdi	$carry,$t6,16
+	add	$t7,$t7,$carry
+	insrdi	$t6,$t7,48,0
+	srdi	$ovf,$t7,48
+	std	$t6,8($tp)		; tp[num-1]
+
+	slwi	$t7,$num,2
+	subf	$nap_d,$t7,$nap_d	; rewind pointer
+
+	li	$i,8			; i=1
+.align	5
+Louter:
+___
+$code.=<<___ if ($SIZE_T==8);
+	ldx	$t3,$bp,$i	; bp[i]
+___
+$code.=<<___ if ($SIZE_T==4);
+	add	$t0,$bp,$i
+	lwz	$t3,0($t0)		; bp[i,i+1]
+	lwz	$t0,4($t0)
+	insrdi	$t3,$t0,32,0
+___
+$code.=<<___;
+	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
+	mulld	$t7,$a0,$t3	; ap[0]*bp[i]
+
+	addi	$tp,$sp,`$FRAME+$TRANSFER`
+	add	$t7,$t7,$t6	; ap[0]*bp[i]+tp[0]
+	li	$carry,0
+	mulld	$t7,$t7,$n0	; tp[0]*n0
+	mtctr	$j
+
+	; transfer bp[i] to FPU as 4x16-bit values
+	extrdi	$t0,$t3,16,48
+	extrdi	$t1,$t3,16,32
+	extrdi	$t2,$t3,16,16
+	extrdi	$t3,$t3,16,0
+	std	$t0,`$FRAME+0`($sp)
+	std	$t1,`$FRAME+8`($sp)
+	std	$t2,`$FRAME+16`($sp)
+	std	$t3,`$FRAME+24`($sp)
+	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
+	extrdi	$t4,$t7,16,48
+	extrdi	$t5,$t7,16,32
+	extrdi	$t6,$t7,16,16
+	extrdi	$t7,$t7,16,0
+	std	$t4,`$FRAME+32`($sp)
+	std	$t5,`$FRAME+40`($sp)
+	std	$t6,`$FRAME+48`($sp)
+	std	$t7,`$FRAME+56`($sp)
+
+	lfd	$A0,8($nap_d)		; load a[j] in double format
+	lfd	$A1,16($nap_d)
+	lfd	$A2,24($nap_d)		; load a[j+1] in double format
+	lfd	$A3,32($nap_d)
+	lfd	$N0,40($nap_d)		; load n[j] in double format
+	lfd	$N1,48($nap_d)
+	lfd	$N2,56($nap_d)		; load n[j+1] in double format
+	lfdu	$N3,64($nap_d)
+
+	lfd	$ba,`$FRAME+0`($sp)
+	lfd	$bb,`$FRAME+8`($sp)
+	lfd	$bc,`$FRAME+16`($sp)
+	lfd	$bd,`$FRAME+24`($sp)
+	lfd	$na,`$FRAME+32`($sp)
+	lfd	$nb,`$FRAME+40`($sp)
+	lfd	$nc,`$FRAME+48`($sp)
+	lfd	$nd,`$FRAME+56`($sp)
+
+	fcfid	$ba,$ba
+	fcfid	$bb,$bb
+	fcfid	$bc,$bc
+	fcfid	$bd,$bd
+	fcfid	$na,$na
+	fcfid	$nb,$nb
+	fcfid	$nc,$nc
+	fcfid	$nd,$nd
+
+	fmul	$T1a,$A1,$ba
+	fmul	$T1b,$A1,$bb
+	fmul	$T2a,$A2,$ba
+	fmul	$T2b,$A2,$bb
+	fmul	$T3a,$A3,$ba
+	fmul	$T3b,$A3,$bb
+	fmul	$T0a,$A0,$ba
+	fmul	$T0b,$A0,$bb
+
+	fmadd	$T1a,$A0,$bc,$T1a
+	fmadd	$T1b,$A0,$bd,$T1b
+	fmadd	$T2a,$A1,$bc,$T2a
+	fmadd	$T2b,$A1,$bd,$T2b
+	fmadd	$T3a,$A2,$bc,$T3a
+	fmadd	$T3b,$A2,$bd,$T3b
+	fmul	$dota,$A3,$bc
+	fmul	$dotb,$A3,$bd
+
+	fmadd	$T1a,$N1,$na,$T1a
+	fmadd	$T1b,$N1,$nb,$T1b
+	 lfd	$A0,8($nap_d)		; load a[j] in double format
+	 lfd	$A1,16($nap_d)
+	fmadd	$T2a,$N2,$na,$T2a
+	fmadd	$T2b,$N2,$nb,$T2b
+	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
+	 lfd	$A3,32($nap_d)
+	fmadd	$T3a,$N3,$na,$T3a
+	fmadd	$T3b,$N3,$nb,$T3b
+	fmadd	$T0a,$N0,$na,$T0a
+	fmadd	$T0b,$N0,$nb,$T0b
+
+	fmadd	$T1a,$N0,$nc,$T1a
+	fmadd	$T1b,$N0,$nd,$T1b
+	fmadd	$T2a,$N1,$nc,$T2a
+	fmadd	$T2b,$N1,$nd,$T2b
+	fmadd	$T3a,$N2,$nc,$T3a
+	fmadd	$T3b,$N2,$nd,$T3b
+	fmadd	$dota,$N3,$nc,$dota
+	fmadd	$dotb,$N3,$nd,$dotb
+
+	fctid	$T0a,$T0a
+	fctid	$T0b,$T0b
+	fctid	$T1a,$T1a
+	fctid	$T1b,$T1b
+	fctid	$T2a,$T2a
+	fctid	$T2b,$T2b
+	fctid	$T3a,$T3a
+	fctid	$T3b,$T3b
+
+	stfd	$T0a,`$FRAME+0`($sp)
+	stfd	$T0b,`$FRAME+8`($sp)
+	stfd	$T1a,`$FRAME+16`($sp)
+	stfd	$T1b,`$FRAME+24`($sp)
+	stfd	$T2a,`$FRAME+32`($sp)
+	stfd	$T2b,`$FRAME+40`($sp)
+	stfd	$T3a,`$FRAME+48`($sp)
+	stfd	$T3b,`$FRAME+56`($sp)
+
+.align	5
+Linner:
+	fmul	$T1a,$A1,$ba
+	fmul	$T1b,$A1,$bb
+	fmul	$T2a,$A2,$ba
+	fmul	$T2b,$A2,$bb
+	lfd	$N0,40($nap_d)		; load n[j] in double format
+	lfd	$N1,48($nap_d)
+	fmul	$T3a,$A3,$ba
+	fmul	$T3b,$A3,$bb
+	fmadd	$T0a,$A0,$ba,$dota
+	fmadd	$T0b,$A0,$bb,$dotb
+	lfd	$N2,56($nap_d)		; load n[j+1] in double format
+	lfdu	$N3,64($nap_d)
+
+	fmadd	$T1a,$A0,$bc,$T1a
+	fmadd	$T1b,$A0,$bd,$T1b
+	fmadd	$T2a,$A1,$bc,$T2a
+	fmadd	$T2b,$A1,$bd,$T2b
+	 lfd	$A0,8($nap_d)		; load a[j] in double format
+	 lfd	$A1,16($nap_d)
+	fmadd	$T3a,$A2,$bc,$T3a
+	fmadd	$T3b,$A2,$bd,$T3b
+	fmul	$dota,$A3,$bc
+	fmul	$dotb,$A3,$bd
+	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
+	 lfd	$A3,32($nap_d)
+
+	fmadd	$T1a,$N1,$na,$T1a
+	fmadd	$T1b,$N1,$nb,$T1b
+	 ld	$t0,`$FRAME+0`($sp)
+	 ld	$t1,`$FRAME+8`($sp)
+	fmadd	$T2a,$N2,$na,$T2a
+	fmadd	$T2b,$N2,$nb,$T2b
+	 ld	$t2,`$FRAME+16`($sp)
+	 ld	$t3,`$FRAME+24`($sp)
+	fmadd	$T3a,$N3,$na,$T3a
+	fmadd	$T3b,$N3,$nb,$T3b
+	 add	$t0,$t0,$carry		; can not overflow
+	 ld	$t4,`$FRAME+32`($sp)
+	 ld	$t5,`$FRAME+40`($sp)
+	fmadd	$T0a,$N0,$na,$T0a
+	fmadd	$T0b,$N0,$nb,$T0b
+	 srdi	$carry,$t0,16
+	 add	$t1,$t1,$carry
+	 srdi	$carry,$t1,16
+	 ld	$t6,`$FRAME+48`($sp)
+	 ld	$t7,`$FRAME+56`($sp)
+
+	fmadd	$T1a,$N0,$nc,$T1a
+	fmadd	$T1b,$N0,$nd,$T1b
+	 insrdi	$t0,$t1,16,32
+	 ld	$t1,8($tp)		; tp[j]
+	fmadd	$T2a,$N1,$nc,$T2a
+	fmadd	$T2b,$N1,$nd,$T2b
+	 add	$t2,$t2,$carry
+	fmadd	$T3a,$N2,$nc,$T3a
+	fmadd	$T3b,$N2,$nd,$T3b
+	 srdi	$carry,$t2,16
+	 insrdi	$t0,$t2,16,16
+	fmadd	$dota,$N3,$nc,$dota
+	fmadd	$dotb,$N3,$nd,$dotb
+	 add	$t3,$t3,$carry
+	 ldu	$t2,16($tp)		; tp[j+1]
+	 srdi	$carry,$t3,16
+	 insrdi	$t0,$t3,16,0		; 0..63 bits
+	 add	$t4,$t4,$carry
+
+	fctid	$T0a,$T0a
+	fctid	$T0b,$T0b
+	 srdi	$carry,$t4,16
+	fctid	$T1a,$T1a
+	fctid	$T1b,$T1b
+	 add	$t5,$t5,$carry
+	fctid	$T2a,$T2a
+	fctid	$T2b,$T2b
+	 srdi	$carry,$t5,16
+	 insrdi	$t4,$t5,16,32
+	fctid	$T3a,$T3a
+	fctid	$T3b,$T3b
+	 add	$t6,$t6,$carry
+	 srdi	$carry,$t6,16
+	 insrdi	$t4,$t6,16,16
+
+	stfd	$T0a,`$FRAME+0`($sp)
+	stfd	$T0b,`$FRAME+8`($sp)
+	 add	$t7,$t7,$carry
+	 addc	$t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t0,$t0,32,0
+	extrdi	$t1,$t1,32,0
+	adde	$t0,$t0,$t1
+___
+$code.=<<___;
+	stfd	$T1a,`$FRAME+16`($sp)
+	stfd	$T1b,`$FRAME+24`($sp)
+	 insrdi	$t4,$t7,16,0		; 64..127 bits
+	 srdi	$carry,$t7,16		; upper 33 bits
+	stfd	$T2a,`$FRAME+32`($sp)
+	stfd	$T2b,`$FRAME+40`($sp)
+	 adde	$t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t4,$t4,32,0
+	extrdi	$t2,$t2,32,0
+	adde	$t4,$t4,$t2
+___
+$code.=<<___;
+	stfd	$T3a,`$FRAME+48`($sp)
+	stfd	$T3b,`$FRAME+56`($sp)
+	 addze	$carry,$carry
+	 std	$t3,-16($tp)		; tp[j-1]
+	 std	$t5,-8($tp)		; tp[j]
+	bdnz-	Linner
+
+	fctid	$dota,$dota
+	fctid	$dotb,$dotb
+	ld	$t0,`$FRAME+0`($sp)
+	ld	$t1,`$FRAME+8`($sp)
+	ld	$t2,`$FRAME+16`($sp)
+	ld	$t3,`$FRAME+24`($sp)
+	ld	$t4,`$FRAME+32`($sp)
+	ld	$t5,`$FRAME+40`($sp)
+	ld	$t6,`$FRAME+48`($sp)
+	ld	$t7,`$FRAME+56`($sp)
+	stfd	$dota,`$FRAME+64`($sp)
+	stfd	$dotb,`$FRAME+72`($sp)
+
+	add	$t0,$t0,$carry		; can not overflow
+	srdi	$carry,$t0,16
+	add	$t1,$t1,$carry
+	srdi	$carry,$t1,16
+	insrdi	$t0,$t1,16,32
+	add	$t2,$t2,$carry
+	ld	$t1,8($tp)		; tp[j]
+	srdi	$carry,$t2,16
+	insrdi	$t0,$t2,16,16
+	add	$t3,$t3,$carry
+	ldu	$t2,16($tp)		; tp[j+1]
+	srdi	$carry,$t3,16
+	insrdi	$t0,$t3,16,0		; 0..63 bits
+	add	$t4,$t4,$carry
+	srdi	$carry,$t4,16
+	add	$t5,$t5,$carry
+	srdi	$carry,$t5,16
+	insrdi	$t4,$t5,16,32
+	add	$t6,$t6,$carry
+	srdi	$carry,$t6,16
+	insrdi	$t4,$t6,16,16
+	add	$t7,$t7,$carry
+	insrdi	$t4,$t7,16,0		; 64..127 bits
+	srdi	$carry,$t7,16		; upper 33 bits
+	ld	$t6,`$FRAME+64`($sp)
+	ld	$t7,`$FRAME+72`($sp)
+
+	addc	$t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t0,$t0,32,0
+	extrdi	$t1,$t1,32,0
+	adde	$t0,$t0,$t1
+___
+$code.=<<___;
+	adde	$t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t4,$t4,32,0
+	extrdi	$t2,$t2,32,0
+	adde	$t4,$t4,$t2
+___
+$code.=<<___;
+	addze	$carry,$carry
+
+	std	$t3,-16($tp)		; tp[j-1]
+	std	$t5,-8($tp)		; tp[j]
+
+	add	$carry,$carry,$ovf	; comsume upmost overflow
+	add	$t6,$t6,$carry		; can not overflow
+	srdi	$carry,$t6,16
+	add	$t7,$t7,$carry
+	insrdi	$t6,$t7,48,0
+	srdi	$ovf,$t7,48
+	std	$t6,0($tp)		; tp[num-1]
+
+	slwi	$t7,$num,2
+	addi	$i,$i,8
+	subf	$nap_d,$t7,$nap_d	; rewind pointer
+	cmpw	$i,$num
+	blt-	Louter
+___
+
+$code.=<<___ if ($SIZE_T==8);
+	subf	$np,$num,$np	; rewind np
+	addi	$j,$j,1		; restore counter
+	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
+	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
+	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
+	addi	$t5,$np,8
+	addi	$t6,$rp,8
+	mtctr	$j
+
+.align	4
+Lsub:	ldx	$t0,$tp,$i
+	ldx	$t1,$np,$i
+	ldx	$t2,$t4,$i
+	ldx	$t3,$t5,$i
+	subfe	$t0,$t1,$t0	; tp[j]-np[j]
+	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
+	stdx	$t0,$rp,$i
+	stdx	$t2,$t6,$i
+	addi	$i,$i,16
+	bdnz-	Lsub
+
+	li	$i,0
+	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
+	and	$ap,$tp,$ovf
+	andc	$np,$rp,$ovf
+	or	$ap,$ap,$np	; ap=borrow?tp:rp
+	addi	$t7,$ap,8
+	mtctr	$j
+
+.align	4
+Lcopy:				; copy or in-place refresh
+	ldx	$t0,$ap,$i
+	ldx	$t1,$t7,$i
+	std	$i,8($nap_d)	; zap nap_d
+	std	$i,16($nap_d)
+	std	$i,24($nap_d)
+	std	$i,32($nap_d)
+	std	$i,40($nap_d)
+	std	$i,48($nap_d)
+	std	$i,56($nap_d)
+	stdu	$i,64($nap_d)
+	stdx	$t0,$rp,$i
+	stdx	$t1,$t6,$i
+	stdx	$i,$tp,$i	; zap tp at once
+	stdx	$i,$t4,$i
+	addi	$i,$i,16
+	bdnz-	Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+	subf	$np,$num,$np	; rewind np
+	addi	$j,$j,1		; restore counter
+	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
+	addi	$tp,$sp,`$FRAME+$TRANSFER`
+	addi	$np,$np,-4
+	addi	$rp,$rp,-4
+	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
+	mtctr	$j
+
+.align	4
+Lsub:	ld	$t0,8($tp)	; load tp[j..j+3] in 64-bit word order
+	ldu	$t2,16($tp)
+	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
+	lwz	$t5,8($np)
+	lwz	$t6,12($np)
+	lwzu	$t7,16($np)
+	extrdi	$t1,$t0,32,0
+	extrdi	$t3,$t2,32,0
+	subfe	$t4,$t4,$t0	; tp[j]-np[j]
+	 stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
+	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
+	 stw	$t1,8($ap)
+	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
+	 stw	$t2,12($ap)
+	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
+	 stwu	$t3,16($ap)
+	stw	$t4,4($rp)
+	stw	$t5,8($rp)
+	stw	$t6,12($rp)
+	stwu	$t7,16($rp)
+	bdnz-	Lsub
+
+	li	$i,0
+	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
+	addi	$tp,$sp,`$FRAME+$TRANSFER+4`
+	subf	$rp,$num,$rp	; rewind rp
+	and	$ap,$tp,$ovf
+	andc	$np,$rp,$ovf
+	or	$ap,$ap,$np	; ap=borrow?tp:rp
+	addi	$tp,$sp,`$FRAME+$TRANSFER`
+	mtctr	$j
+
+.align	4
+Lcopy:				; copy or in-place refresh
+	lwz	$t0,4($ap)
+	lwz	$t1,8($ap)
+	lwz	$t2,12($ap)
+	lwzu	$t3,16($ap)
+	std	$i,8($nap_d)	; zap nap_d
+	std	$i,16($nap_d)
+	std	$i,24($nap_d)
+	std	$i,32($nap_d)
+	std	$i,40($nap_d)
+	std	$i,48($nap_d)
+	std	$i,56($nap_d)
+	stdu	$i,64($nap_d)
+	stw	$t0,4($rp)
+	stw	$t1,8($rp)
+	stw	$t2,12($rp)
+	stwu	$t3,16($rp)
+	std	$i,8($tp)	; zap tp at once
+	stdu	$i,16($tp)
+	bdnz-	Lcopy
+___
+
+$code.=<<___;
+	$POP	r14,`2*$SIZE_T`($sp)
+	$POP	r15,`3*$SIZE_T`($sp)
+	$POP	r16,`4*$SIZE_T`($sp)
+	$POP	r17,`5*$SIZE_T`($sp)
+	$POP	r18,`6*$SIZE_T`($sp)
+	$POP	r19,`7*$SIZE_T`($sp)
+	$POP	r20,`8*$SIZE_T`($sp)
+	$POP	r21,`9*$SIZE_T`($sp)
+	$POP	r22,`10*$SIZE_T`($sp)
+	$POP	r23,`11*$SIZE_T`($sp)
+	lfd	f14,`12*$SIZE_T+0`($sp)
+	lfd	f15,`12*$SIZE_T+8`($sp)
+	lfd	f16,`12*$SIZE_T+16`($sp)
+	lfd	f17,`12*$SIZE_T+24`($sp)
+	lfd	f18,`12*$SIZE_T+32`($sp)
+	lfd	f19,`12*$SIZE_T+40`($sp)
+	lfd	f20,`12*$SIZE_T+48`($sp)
+	lfd	f21,`12*$SIZE_T+56`($sp)
+	lfd	f22,`12*$SIZE_T+64`($sp)
+	lfd	f23,`12*$SIZE_T+72`($sp)
+	lfd	f24,`12*$SIZE_T+80`($sp)
+	lfd	f25,`12*$SIZE_T+88`($sp)
+	$POP	$sp,0($sp)
+	li	r3,1	; signal "handled"
+	blr
+	.long	0
+.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl
new file mode 100644
index 0000000000..d23251033b
--- /dev/null
+++ b/crypto/bn/asm/s390x-mont.pl
@@ -0,0 +1,225 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# April 2007.
+#
+# Performance improvement over vanilla C code varies from 85% to 45%
+# depending on key length and benchmark. Unfortunately in this context
+# these are not very impressive results [for code that utilizes "wide"
+# 64x64=128-bit multiplication, which is not commonly available to C
+# programmers], at least hand-coded bn_asm.c replacement is known to
+# provide 30-40% better results for longest keys. Well, on a second
+# thought it's not very surprising, because z-CPUs are single-issue
+# and _strictly_ in-order execution, while bn_mul_mont is more or less
+# dependent on CPU ability to pipe-line instructions and have several
+# of them "in-flight" at the same time. I mean while other methods,
+# for example Karatsuba, aim to minimize amount of multiplications at
+# the cost of other operations increase, bn_mul_mont aim to neatly
+# "overlap" multiplications and the other operations [and on most
+# platforms even minimize the amount of the other operations, in
+# particular references to memory]. But it's possible to improve this
+# module performance by implementing dedicated squaring code-path and
+# possibly by unrolling loops...
+
+# January 2009.
+#
+# Reschedule to minimize/avoid Address Generation Interlock hazard,
+# make inner loops counter-based.
+
+$mn0="%r0";
+$num="%r1";
+
+# int bn_mul_mont(
+$rp="%r2";		# BN_ULONG *rp,
+$ap="%r3";		# const BN_ULONG *ap,
+$bp="%r4";		# const BN_ULONG *bp,
+$np="%r5";		# const BN_ULONG *np,
+$n0="%r6";		# const BN_ULONG *n0,
+#$num="160(%r15)"	# int num);
+
+$bi="%r2";	# zaps rp
+$j="%r7";
+
+$ahi="%r8";
+$alo="%r9";
+$nhi="%r10";
+$nlo="%r11";
+$AHI="%r12";
+$NHI="%r13";
+$count="%r14";
+$sp="%r15";
+
+$code.=<<___;
+.text
+.globl	bn_mul_mont
+.type	bn_mul_mont,\@function
+bn_mul_mont:
+	lgf	$num,164($sp)	# pull $num
+	sla	$num,3		# $num to enumerate bytes
+	la	$bp,0($num,$bp)
+
+	stg	%r2,16($sp)
+
+	cghi	$num,16		#
+	lghi	%r2,0		#
+	blr	%r14		# if($num<16) return 0;
+	cghi	$num,128	#
+	bhr	%r14		# if($num>128) return 0;
+
+	stmg	%r3,%r15,24($sp)
+
+	lghi	$rp,-160-8	# leave room for carry bit
+	lcgr	$j,$num		# -$num
+	lgr	%r0,$sp
+	la	$rp,0($rp,$sp)
+	la	$sp,0($j,$rp)	# alloca
+	stg	%r0,0($sp)	# back chain
+
+	sra	$num,3		# restore $num
+	la	$bp,0($j,$bp)	# restore $bp
+	ahi	$num,-1		# adjust $num for inner loop
+	lg	$n0,0($n0)	# pull n0
+
+	lg	$bi,0($bp)
+	lg	$alo,0($ap)
+	mlgr	$ahi,$bi	# ap[0]*bp[0]
+	lgr	$AHI,$ahi
+
+	lgr	$mn0,$alo	# "tp[0]"*n0
+	msgr	$mn0,$n0
+
+	lg	$nlo,0($np)	#
+	mlgr	$nhi,$mn0	# np[0]*m1
+	algr	$nlo,$alo	# +="tp[0]"
+	lghi	$NHI,0
+	alcgr	$NHI,$nhi
+
+	la	$j,8(%r0)	# j=1
+	lr	$count,$num
+
+.align	16
+.L1st:
+	lg	$alo,0($j,$ap)
+	mlgr	$ahi,$bi	# ap[j]*bp[0]
+	algr	$alo,$AHI
+	lghi	$AHI,0
+	alcgr	$AHI,$ahi
+
+	lg	$nlo,0($j,$np)
+	mlgr	$nhi,$mn0	# np[j]*m1
+	algr	$nlo,$NHI
+	lghi	$NHI,0
+	alcgr	$nhi,$NHI	# +="tp[j]"
+	algr	$nlo,$alo
+	alcgr	$NHI,$nhi
+
+	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
+	la	$j,8($j)	# j++
+	brct	$count,.L1st
+
+	algr	$NHI,$AHI
+	lghi	$AHI,0
+	alcgr	$AHI,$AHI	# upmost overflow bit
+	stg	$NHI,160-8($j,$sp)
+	stg	$AHI,160($j,$sp)
+	la	$bp,8($bp)	# bp++
+
+.Louter:
+	lg	$bi,0($bp)	# bp[i]
+	lg	$alo,0($ap)
+	mlgr	$ahi,$bi	# ap[0]*bp[i]
+	alg	$alo,160($sp)	# +=tp[0]
+	lghi	$AHI,0
+	alcgr	$AHI,$ahi
+
+	lgr	$mn0,$alo
+	msgr	$mn0,$n0	# tp[0]*n0
+
+	lg	$nlo,0($np)	# np[0]
+	mlgr	$nhi,$mn0	# np[0]*m1
+	algr	$nlo,$alo	# +="tp[0]"
+	lghi	$NHI,0
+	alcgr	$NHI,$nhi
+
+	la	$j,8(%r0)	# j=1
+	lr	$count,$num
+
+.align	16
+.Linner:
+	lg	$alo,0($j,$ap)
+	mlgr	$ahi,$bi	# ap[j]*bp[i]
+	algr	$alo,$AHI
+	lghi	$AHI,0
+	alcgr	$ahi,$AHI
+	alg	$alo,160($j,$sp)# +=tp[j]
+	alcgr	$AHI,$ahi
+
+	lg	$nlo,0($j,$np)
+	mlgr	$nhi,$mn0	# np[j]*m1
+	algr	$nlo,$NHI
+	lghi	$NHI,0
+	alcgr	$nhi,$NHI
+	algr	$nlo,$alo	# +="tp[j]"
+	alcgr	$NHI,$nhi
+
+	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
+	la	$j,8($j)	# j++
+	brct	$count,.Linner
+
+	algr	$NHI,$AHI
+	lghi	$AHI,0
+	alcgr	$AHI,$AHI
+	alg	$NHI,160($j,$sp)# accumulate previous upmost overflow bit
+	lghi	$ahi,0
+	alcgr	$AHI,$ahi	# new upmost overflow bit
+	stg	$NHI,160-8($j,$sp)
+	stg	$AHI,160($j,$sp)
+
+	la	$bp,8($bp)	# bp++
+	clg	$bp,160+8+32($j,$sp)	# compare to &bp[num]
+	jne	.Louter
+
+	lg	$rp,160+8+16($j,$sp)	# reincarnate rp
+	la	$ap,160($sp)
+	ahi	$num,1		# restore $num, incidentally clears "borrow"
+
+	la	$j,0(%r0)
+	lr	$count,$num
+.Lsub:	lg	$alo,0($j,$ap)
+	slbg	$alo,0($j,$np)
+	stg	$alo,0($j,$rp)
+	la	$j,8($j)
+	brct	$count,.Lsub
+	lghi	$ahi,0
+	slbgr	$AHI,$ahi	# handle upmost carry
+
+	ngr	$ap,$AHI
+	lghi	$np,-1
+	xgr	$np,$AHI
+	ngr	$np,$rp
+	ogr	$ap,$np		# ap=borrow?tp:rp
+
+	la	$j,0(%r0)
+	lgr	$count,$num
+.Lcopy:	lg	$alo,0($j,$ap)	# copy or in-place refresh
+	stg	$j,160($j,$sp)	# zap tp
+	stg	$alo,0($j,$rp)
+	la	$j,8($j)
+	brct	$count,.Lcopy
+
+	la	%r1,160+8+48($j,$sp)
+	lmg	%r6,%r15,0(%r1)
+	lghi	%r2,1		# signal "processed"
+	br	%r14
+.size	bn_mul_mont,.-bn_mul_mont
+.string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S
new file mode 100755
index 0000000000..8f45f5d513
--- /dev/null
+++ b/crypto/bn/asm/s390x.S
@@ -0,0 +1,678 @@
+.ident "s390x.S, version 1.0"
+// ====================================================================
+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// project.
+//
+// Rights for redistribution and usage in source and binary forms are
+// granted according to the OpenSSL license. Warranty of any kind is
+// disclaimed.
+// ====================================================================
+
+.text
+
+#define zero	%r0
+
+// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+.globl	bn_mul_add_words
+.type	bn_mul_add_words,@function
+.align	4
+bn_mul_add_words:
+	lghi	zero,0		// zero = 0
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0;
+	ltgfr	%r4,%r4
+	bler	%r14		// if (len<=0) return 0;
+
+	stmg	%r6,%r10,48(%r15)
+	lghi	%r8,0		// carry = 0
+	srag	%r10,%r4,2	// cnt=len/4
+	jz	.Loop1_madd
+
+.Loop4_madd:
+	lg	%r7,0(%r2,%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	algr	%r7,%r8		// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r2,%r1)	// +=rp[i]
+	alcgr	%r6,zero
+	stg	%r7,0(%r2,%r1)	// rp[i]=
+
+	lg	%r9,8(%r2,%r3)
+	mlgr	%r8,%r5
+	algr	%r9,%r6
+	alcgr	%r8,zero
+	alg	%r9,8(%r2,%r1)
+	alcgr	%r8,zero
+	stg	%r9,8(%r2,%r1)
+
+	lg	%r7,16(%r2,%r3)
+	mlgr	%r6,%r5
+	algr	%r7,%r8
+	alcgr	%r6,zero
+	alg	%r7,16(%r2,%r1)
+	alcgr	%r6,zero
+	stg	%r7,16(%r2,%r1)
+
+	lg	%r9,24(%r2,%r3)
+	mlgr	%r8,%r5
+	algr	%r9,%r6
+	alcgr	%r8,zero
+	alg	%r9,24(%r2,%r1)
+	alcgr	%r8,zero
+	stg	%r9,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r10,.Loop4_madd
+
+	lghi	%r10,3
+	nr	%r4,%r10	// cnt=len%4
+	jz	.Lend_madd
+
+.Loop1_madd:
+	lg	%r7,0(%r2,%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	algr	%r7,%r8		// +=carry
+	alcgr	%r6,zero
+	alg	%r7,0(%r2,%r1)	// +=rp[i]
+	alcgr	%r6,zero
+	stg	%r7,0(%r2,%r1)	// rp[i]=
+
+	lgr	%r8,%r6
+	la	%r2,8(%r2)	// i++
+	brct	%r4,.Loop1_madd
+
+.Lend_madd:
+	lgr	%r2,%r8
+	lmg	%r6,%r10,48(%r15)
+	br	%r14
+.size	bn_mul_add_words,.-bn_mul_add_words
+
+// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+.globl	bn_mul_words
+.type	bn_mul_words,@function
+.align	4
+bn_mul_words:
+	lghi	zero,0		// zero = 0
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0;
+	ltgfr	%r4,%r4
+	bler	%r14		// if (len<=0) return 0;
+
+	stmg	%r6,%r10,48(%r15)
+	lghi	%r8,0		// carry = 0
+	srag	%r10,%r4,2	// cnt=len/4
+	jz	.Loop1_mul
+
+.Loop4_mul:
+	lg	%r7,0(%r2,%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	algr	%r7,%r8		// +=carry
+	alcgr	%r6,zero
+	stg	%r7,0(%r2,%r1)	// rp[i]=
+
+	lg	%r9,8(%r2,%r3)
+	mlgr	%r8,%r5
+	algr	%r9,%r6
+	alcgr	%r8,zero
+	stg	%r9,8(%r2,%r1)
+
+	lg	%r7,16(%r2,%r3)
+	mlgr	%r6,%r5
+	algr	%r7,%r8
+	alcgr	%r6,zero
+	stg	%r7,16(%r2,%r1)
+
+	lg	%r9,24(%r2,%r3)
+	mlgr	%r8,%r5
+	algr	%r9,%r6
+	alcgr	%r8,zero
+	stg	%r9,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r10,.Loop4_mul
+
+	lghi	%r10,3
+	nr	%r4,%r10	// cnt=len%4
+	jz	.Lend_mul
+
+.Loop1_mul:
+	lg	%r7,0(%r2,%r3)	// ap[i]
+	mlgr	%r6,%r5		// *=w
+	algr	%r7,%r8		// +=carry
+	alcgr	%r6,zero
+	stg	%r7,0(%r2,%r1)	// rp[i]=
+
+	lgr	%r8,%r6
+	la	%r2,8(%r2)	// i++
+	brct	%r4,.Loop1_mul
+
+.Lend_mul:
+	lgr	%r2,%r8
+	lmg	%r6,%r10,48(%r15)
+	br	%r14
+.size	bn_mul_words,.-bn_mul_words
+
+// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
+.globl	bn_sqr_words
+.type	bn_sqr_words,@function
+.align	4
+bn_sqr_words:
+	ltgfr	%r4,%r4
+	bler	%r14
+
+	stmg	%r6,%r7,48(%r15)
+	srag	%r1,%r4,2	// cnt=len/4
+	jz	.Loop1_sqr
+
+.Loop4_sqr:
+	lg	%r7,0(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,0(%r2)
+	stg	%r6,8(%r2)
+
+	lg	%r7,8(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,16(%r2)
+	stg	%r6,24(%r2)
+
+	lg	%r7,16(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,32(%r2)
+	stg	%r6,40(%r2)
+
+	lg	%r7,24(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,48(%r2)
+	stg	%r6,56(%r2)
+
+	la	%r3,32(%r3)
+	la	%r2,64(%r2)
+	brct	%r1,.Loop4_sqr
+
+	lghi	%r1,3
+	nr	%r4,%r1		// cnt=len%4
+	jz	.Lend_sqr
+
+.Loop1_sqr:
+	lg	%r7,0(%r3)
+	mlgr	%r6,%r7
+	stg	%r7,0(%r2)
+	stg	%r6,8(%r2)
+
+	la	%r3,8(%r3)
+	la	%r2,16(%r2)
+	brct	%r4,.Loop1_sqr
+
+.Lend_sqr:
+	lmg	%r6,%r7,48(%r15)
+	br	%r14
+.size	bn_sqr_words,.-bn_sqr_words
+
+// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
+.globl	bn_div_words
+.type	bn_div_words,@function
+.align	4
+bn_div_words:
+	dlgr	%r2,%r4
+	lgr	%r2,%r3
+	br	%r14
+.size	bn_div_words,.-bn_div_words
+
+// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+.globl	bn_add_words
+.type	bn_add_words,@function
+.align	4
+bn_add_words:
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0
+	ltgfr	%r5,%r5
+	bler	%r14		// if (len<=0) return 0;
+
+	stg	%r6,48(%r15)
+	lghi	%r6,3
+	nr	%r6,%r5		// len%4
+	sra	%r5,2		// len/4, use sra because it sets condition code
+	jz	.Loop1_add	// carry is incidentally cleared if branch taken
+	algr	%r2,%r2		// clear carry
+
+.Loop4_add:
+	lg	%r0,0(%r2,%r3)
+	alcg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+	lg	%r0,8(%r2,%r3)
+	alcg	%r0,8(%r2,%r4)
+	stg	%r0,8(%r2,%r1)
+	lg	%r0,16(%r2,%r3)
+	alcg	%r0,16(%r2,%r4)
+	stg	%r0,16(%r2,%r1)
+	lg	%r0,24(%r2,%r3)
+	alcg	%r0,24(%r2,%r4)
+	stg	%r0,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r5,.Loop4_add
+
+	la	%r6,1(%r6)	// see if len%4 is zero ...
+	brct	%r6,.Loop1_add	// without touching condition code:-)
+
+.Lexit_add:
+	lghi	%r2,0
+	alcgr	%r2,%r2
+	lg	%r6,48(%r15)
+	br	%r14
+
+.Loop1_add:
+	lg	%r0,0(%r2,%r3)
+	alcg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+
+	la	%r2,8(%r2)	// i++
+	brct	%r6,.Loop1_add
+
+	j	.Lexit_add
+.size	bn_add_words,.-bn_add_words
+
+// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+.globl	bn_sub_words
+.type	bn_sub_words,@function
+.align	4
+bn_sub_words:
+	la	%r1,0(%r2)	// put rp aside
+	lghi	%r2,0		// i=0
+	ltgfr	%r5,%r5
+	bler	%r14		// if (len<=0) return 0;
+
+	stg	%r6,48(%r15)
+	lghi	%r6,3
+	nr	%r6,%r5		// len%4
+	sra	%r5,2		// len/4, use sra because it sets condition code
+	jnz	.Loop4_sub	// borrow is incidentally cleared if branch taken
+	slgr	%r2,%r2		// clear borrow
+
+.Loop1_sub:
+	lg	%r0,0(%r2,%r3)
+	slbg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+
+	la	%r2,8(%r2)	// i++
+	brct	%r6,.Loop1_sub
+	j	.Lexit_sub
+
+.Loop4_sub:
+	lg	%r0,0(%r2,%r3)
+	slbg	%r0,0(%r2,%r4)
+	stg	%r0,0(%r2,%r1)
+	lg	%r0,8(%r2,%r3)
+	slbg	%r0,8(%r2,%r4)
+	stg	%r0,8(%r2,%r1)
+	lg	%r0,16(%r2,%r3)
+	slbg	%r0,16(%r2,%r4)
+	stg	%r0,16(%r2,%r1)
+	lg	%r0,24(%r2,%r3)
+	slbg	%r0,24(%r2,%r4)
+	stg	%r0,24(%r2,%r1)
+
+	la	%r2,32(%r2)	// i+=4
+	brct	%r5,.Loop4_sub
+
+	la	%r6,1(%r6)	// see if len%4 is zero ...
+	brct	%r6,.Loop1_sub	// without touching condition code:-)
+
+.Lexit_sub:
+	lghi	%r2,0
+	slbgr	%r2,%r2
+	lcgr	%r2,%r2
+	lg	%r6,48(%r15)
+	br	%r14
+.size	bn_sub_words,.-bn_sub_words
+
+#define c1	%r1
+#define c2	%r5
+#define c3	%r8
+
+#define mul_add_c(ai,bi,c1,c2,c3)	\
+	lg	%r7,ai*8(%r3);		\
+	mlg	%r6,bi*8(%r4);		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero
+
+// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+.globl	bn_mul_comba8
+.type	bn_mul_comba8,@function
+.align	4
+bn_mul_comba8:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	mul_add_c(0,0,c1,c2,c3);
+	stg	c1,0*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(0,1,c2,c3,c1);
+	mul_add_c(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(2,0,c3,c1,c2);
+	mul_add_c(1,1,c3,c1,c2);
+	mul_add_c(0,2,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(0,3,c1,c2,c3);
+	mul_add_c(1,2,c1,c2,c3);
+	mul_add_c(2,1,c1,c2,c3);
+	mul_add_c(3,0,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(4,0,c2,c3,c1);
+	mul_add_c(3,1,c2,c3,c1);
+	mul_add_c(2,2,c2,c3,c1);
+	mul_add_c(1,3,c2,c3,c1);
+	mul_add_c(0,4,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(0,5,c3,c1,c2);
+	mul_add_c(1,4,c3,c1,c2);
+	mul_add_c(2,3,c3,c1,c2);
+	mul_add_c(3,2,c3,c1,c2);
+	mul_add_c(4,1,c3,c1,c2);
+	mul_add_c(5,0,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(6,0,c1,c2,c3);
+	mul_add_c(5,1,c1,c2,c3);
+	mul_add_c(4,2,c1,c2,c3);
+	mul_add_c(3,3,c1,c2,c3);
+	mul_add_c(2,4,c1,c2,c3);
+	mul_add_c(1,5,c1,c2,c3);
+	mul_add_c(0,6,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(0,7,c2,c3,c1);
+	mul_add_c(1,6,c2,c3,c1);
+	mul_add_c(2,5,c2,c3,c1);
+	mul_add_c(3,4,c2,c3,c1);
+	mul_add_c(4,3,c2,c3,c1);
+	mul_add_c(5,2,c2,c3,c1);
+	mul_add_c(6,1,c2,c3,c1);
+	mul_add_c(7,0,c2,c3,c1);
+	stg	c2,7*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(7,1,c3,c1,c2);
+	mul_add_c(6,2,c3,c1,c2);
+	mul_add_c(5,3,c3,c1,c2);
+	mul_add_c(4,4,c3,c1,c2);
+	mul_add_c(3,5,c3,c1,c2);
+	mul_add_c(2,6,c3,c1,c2);
+	mul_add_c(1,7,c3,c1,c2);
+	stg	c3,8*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(2,7,c1,c2,c3);
+	mul_add_c(3,6,c1,c2,c3);
+	mul_add_c(4,5,c1,c2,c3);
+	mul_add_c(5,4,c1,c2,c3);
+	mul_add_c(6,3,c1,c2,c3);
+	mul_add_c(7,2,c1,c2,c3);
+	stg	c1,9*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(7,3,c2,c3,c1);
+	mul_add_c(6,4,c2,c3,c1);
+	mul_add_c(5,5,c2,c3,c1);
+	mul_add_c(4,6,c2,c3,c1);
+	mul_add_c(3,7,c2,c3,c1);
+	stg	c2,10*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(4,7,c3,c1,c2);
+	mul_add_c(5,6,c3,c1,c2);
+	mul_add_c(6,5,c3,c1,c2);
+	mul_add_c(7,4,c3,c1,c2);
+	stg	c3,11*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(7,5,c1,c2,c3);
+	mul_add_c(6,6,c1,c2,c3);
+	mul_add_c(5,7,c1,c2,c3);
+	stg	c1,12*8(%r2)
+	lghi	c1,0
+
+
+	mul_add_c(6,7,c2,c3,c1);
+	mul_add_c(7,6,c2,c3,c1);
+	stg	c2,13*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(7,7,c3,c1,c2);
+	stg	c3,14*8(%r2)
+	stg	c1,15*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_mul_comba8,.-bn_mul_comba8
+
+// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+.globl	bn_mul_comba4
+.type	bn_mul_comba4,@function
+.align	4
+bn_mul_comba4:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	mul_add_c(0,0,c1,c2,c3);
+	stg	c1,0*8(%r3)
+	lghi	c1,0
+
+	mul_add_c(0,1,c2,c3,c1);
+	mul_add_c(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(2,0,c3,c1,c2);
+	mul_add_c(1,1,c3,c1,c2);
+	mul_add_c(0,2,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(0,3,c1,c2,c3);
+	mul_add_c(1,2,c1,c2,c3);
+	mul_add_c(2,1,c1,c2,c3);
+	mul_add_c(3,0,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	mul_add_c(3,1,c2,c3,c1);
+	mul_add_c(2,2,c2,c3,c1);
+	mul_add_c(1,3,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	mul_add_c(2,3,c3,c1,c2);
+	mul_add_c(3,2,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	mul_add_c(3,3,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	stg	c2,7*8(%r2)
+
+	stmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_mul_comba4,.-bn_mul_comba4
+
+#define sqr_add_c(ai,c1,c2,c3)		\
+	lg	%r7,ai*8(%r3);		\
+	mlgr	%r6,%r7;		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero
+
+#define sqr_add_c2(ai,aj,c1,c2,c3)	\
+	lg	%r7,ai*8(%r3);		\
+	mlg	%r6,aj*8(%r3);		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero;		\
+	algr	c1,%r7;			\
+	alcgr	c2,%r6;			\
+	alcgr	c3,zero
+
+// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
+.globl	bn_sqr_comba8
+.type	bn_sqr_comba8,@function
+.align	4
+bn_sqr_comba8:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	sqr_add_c(0,c1,c2,c3);
+	stg	c1,0*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(1,c3,c1,c2);
+	sqr_add_c2(2,0,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c2(3,0,c1,c2,c3);
+	sqr_add_c2(2,1,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c(2,c2,c3,c1);
+	sqr_add_c2(3,1,c2,c3,c1);
+	sqr_add_c2(4,0,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c2(5,0,c3,c1,c2);
+	sqr_add_c2(4,1,c3,c1,c2);
+	sqr_add_c2(3,2,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c(3,c1,c2,c3);
+	sqr_add_c2(4,2,c1,c2,c3);
+	sqr_add_c2(5,1,c1,c2,c3);
+	sqr_add_c2(6,0,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(7,0,c2,c3,c1);
+	sqr_add_c2(6,1,c2,c3,c1);
+	sqr_add_c2(5,2,c2,c3,c1);
+	sqr_add_c2(4,3,c2,c3,c1);
+	stg	c2,7*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(4,c3,c1,c2);
+	sqr_add_c2(5,3,c3,c1,c2);
+	sqr_add_c2(6,2,c3,c1,c2);
+	sqr_add_c2(7,1,c3,c1,c2);
+	stg	c3,8*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c2(7,2,c1,c2,c3);
+	sqr_add_c2(6,3,c1,c2,c3);
+	sqr_add_c2(5,4,c1,c2,c3);
+	stg	c1,9*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c(5,c2,c3,c1);
+	sqr_add_c2(6,4,c2,c3,c1);
+	sqr_add_c2(7,3,c2,c3,c1);
+	stg	c2,10*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c2(7,4,c3,c1,c2);
+	sqr_add_c2(6,5,c3,c1,c2);
+	stg	c3,11*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c(6,c1,c2,c3);
+	sqr_add_c2(7,5,c1,c2,c3);
+	stg	c1,12*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(7,6,c2,c3,c1);
+	stg	c2,13*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(7,c3,c1,c2);
+	stg	c3,14*8(%r2)
+	stg	c1,15*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_sqr_comba8,.-bn_sqr_comba8
+
+// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
+.globl bn_sqr_comba4
+.type	bn_sqr_comba4,@function
+.align	4
+bn_sqr_comba4:
+	stmg	%r6,%r8,48(%r15)
+
+	lghi	c1,0
+	lghi	c2,0
+	lghi	c3,0
+	lghi	zero,0
+
+	sqr_add_c(0,c1,c2,c3);
+	stg	c1,0*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c2(1,0,c2,c3,c1);
+	stg	c2,1*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c(1,c3,c1,c2);
+	sqr_add_c2(2,0,c3,c1,c2);
+	stg	c3,2*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c2(3,0,c1,c2,c3);
+	sqr_add_c2(2,1,c1,c2,c3);
+	stg	c1,3*8(%r2)
+	lghi	c1,0
+
+	sqr_add_c(2,c2,c3,c1);
+	sqr_add_c2(3,1,c2,c3,c1);
+	stg	c2,4*8(%r2)
+	lghi	c2,0
+
+	sqr_add_c2(3,2,c3,c1,c2);
+	stg	c3,5*8(%r2)
+	lghi	c3,0
+
+	sqr_add_c(3,c1,c2,c3);
+	stg	c1,6*8(%r2)
+	stg	c2,7*8(%r2)
+
+	lmg	%r6,%r8,48(%r15)
+	br	%r14
+.size	bn_sqr_comba4,.-bn_sqr_comba4
diff --git a/crypto/bn/asm/sparcv8plus.S b/crypto/bn/asm/sparcv8plus.S
index 0074dfdb75..63de1860f2 100644
--- a/crypto/bn/asm/sparcv8plus.S
+++ b/crypto/bn/asm/sparcv8plus.S
@@ -144,6 +144,19 @@
  *	    }
  */
 
+#if defined(__SUNPRO_C) && defined(__sparcv9)
+  /* They've said -xarch=v9 at command line */
+  .register	%g2,#scratch
+  .register	%g3,#scratch
+# define	FRAME_SIZE	-192
+#elif defined(__GNUC__) && defined(__arch64__)
+  /* They've said -m64 at command line */
+  .register	%g2,#scratch
+  .register	%g3,#scratch
+# define	FRAME_SIZE	-192
+#else 
+# define	FRAME_SIZE	-96
+#endif 
 /*
  * GNU assembler can't stand stuw:-(
  */
@@ -162,10 +175,14 @@
  * BN_ULONG w;
  */
 bn_mul_add_words:
+	sra	%o2,%g0,%o2	! signx %o2
 	brgz,a	%o2,.L_bn_mul_add_words_proceed
 	lduw	[%o1],%g2
 	retl
 	clr	%o0
+	nop
+	nop
+	nop
 
 .L_bn_mul_add_words_proceed:
 	srl	%o3,%g0,%o3	! clruw	%o3
@@ -260,10 +277,14 @@ bn_mul_add_words:
  * BN_ULONG w;
  */
 bn_mul_words:
+	sra	%o2,%g0,%o2	! signx %o2
 	brgz,a	%o2,.L_bn_mul_words_proceeed
 	lduw	[%o1],%g2
 	retl
 	clr	%o0
+	nop
+	nop
+	nop
 
 .L_bn_mul_words_proceeed:
 	srl	%o3,%g0,%o3	! clruw	%o3
@@ -344,10 +365,14 @@ bn_mul_words:
  * int n;
  */
 bn_sqr_words:
+	sra	%o2,%g0,%o2	! signx %o2
 	brgz,a	%o2,.L_bn_sqr_words_proceeed
 	lduw	[%o1],%g2
 	retl
 	clr	%o0
+	nop
+	nop
+	nop
 
 .L_bn_sqr_words_proceeed:
 	andcc	%o2,-4,%g0
@@ -445,6 +470,7 @@ bn_div_words:
  * int n;
  */
 bn_add_words:
+	sra	%o3,%g0,%o3	! signx %o3
 	brgz,a	%o3,.L_bn_add_words_proceed
 	lduw	[%o1],%o4
 	retl
@@ -454,7 +480,6 @@ bn_add_words:
 	andcc	%o3,-4,%g0
 	bz,pn	%icc,.L_bn_add_words_tail
 	addcc	%g0,0,%g0	! clear carry flag
-	nop
 
 .L_bn_add_words_loop:		! wow! 32 aligned!
 	dec	4,%o3
@@ -523,6 +548,7 @@ bn_add_words:
  * int n;
  */
 bn_sub_words:
+	sra	%o3,%g0,%o3	! signx %o3
 	brgz,a	%o3,.L_bn_sub_words_proceed
 	lduw	[%o1],%o4
 	retl
@@ -532,7 +558,6 @@ bn_sub_words:
 	andcc	%o3,-4,%g0
 	bz,pn	%icc,.L_bn_sub_words_tail
 	addcc	%g0,0,%g0	! clear carry flag
-	nop
 
 .L_bn_sub_words_loop:		! wow! 32 aligned!
 	dec	4,%o3
@@ -607,8 +632,6 @@ bn_sub_words:
  *							Andy.
  */
 
-#define FRAME_SIZE	-96
-
 /*
  * Here is register usage map for *all* routines below.
  */
diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl
new file mode 100644
index 0000000000..b8fb1e8a25
--- /dev/null
+++ b/crypto/bn/asm/sparcv9-mont.pl
@@ -0,0 +1,606 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# December 2005
+#
+# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
+# for undertaken effort are multiple. First of all, UltraSPARC is not
+# the whole SPARCv9 universe and other VIS-free implementations deserve
+# optimized code as much. Secondly, newly introduced UltraSPARC T1,
+# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
+# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
+# several integrated RSA/DSA accelerator circuits accessible through
+# kernel driver [only(*)], but having decent user-land software
+# implementation is important too. Finally, reasons like desire to
+# experiment with dedicated squaring procedure. Yes, this module
+# implements one, because it was easiest to draft it in SPARCv9
+# instructions...
+
+# (*)	Engine accessing the driver in question is on my TODO list.
+#	For reference, acceleator is estimated to give 6 to 10 times
+#	improvement on single-threaded RSA sign. It should be noted
+#	that 6-10x improvement coefficient does not actually mean
+#	something extraordinary in terms of absolute [single-threaded]
+#	performance, as SPARCv9 instruction set is by all means least
+#	suitable for high performance crypto among other 64 bit
+#	platforms. 6-10x factor simply places T1 in same performance
+#	domain as say AMD64 and IA-64. Improvement of RSA verify don't
+#	appear impressive at all, but it's the sign operation which is
+#	far more critical/interesting.
+
+# You might notice that inner loops are modulo-scheduled:-) This has
+# essentially negligible impact on UltraSPARC performance, it's
+# Fujitsu SPARC64 V users who should notice and hopefully appreciate
+# the advantage... Currently this module surpasses sparcv9a-mont.pl
+# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
+# module still have hidden potential [see TODO list there], which is
+# estimated to be larger than 20%...
+
+# int bn_mul_mont(
+$rp="%i0";	# BN_ULONG *rp,
+$ap="%i1";	# const BN_ULONG *ap,
+$bp="%i2";	# const BN_ULONG *bp,
+$np="%i3";	# const BN_ULONG *np,
+$n0="%i4";	# const BN_ULONG *n0,
+$num="%i5";	# int num);
+
+$bits=32;
+for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64)	{ $bias=2047; $frame=192; }
+else		{ $bias=0;    $frame=128; }
+
+$car0="%o0";
+$car1="%o1";
+$car2="%o2";	# 1 bit
+$acc0="%o3";
+$acc1="%o4";
+$mask="%g1";	# 32 bits, what a waste...
+$tmp0="%g4";
+$tmp1="%g5";
+
+$i="%l0";
+$j="%l1";
+$mul0="%l2";
+$mul1="%l3";
+$tp="%l4";
+$apj="%l5";
+$npj="%l6";
+$tpj="%l7";
+
+$fname="bn_mul_mont_int";
+
+$code=<<___;
+.section	".text",#alloc,#execinstr
+
+.global	$fname
+.align	32
+$fname:
+	cmp	%o5,4			! 128 bits minimum
+	bge,pt	%icc,.Lenter
+	sethi	%hi(0xffffffff),$mask
+	retl
+	clr	%o0
+.align	32
+.Lenter:
+	save	%sp,-$frame,%sp
+	sll	$num,2,$num		! num*=4
+	or	$mask,%lo(0xffffffff),$mask
+	ld	[$n0],$n0
+	cmp	$ap,$bp
+	and	$num,$mask,$num
+	ld	[$bp],$mul0		! bp[0]
+	nop
+
+	add	%sp,$bias,%o7		! real top of stack
+	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
+	sub	%o7,$num,%o7
+	ld	[$ap+4],$apj		! ap[1]
+	and	%o7,-1024,%o7
+	ld	[$np],$car1		! np[0]
+	sub	%o7,$bias,%sp		! alloca
+	ld	[$np+4],$npj		! np[1]
+	be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
+	mov	12,$j
+
+	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
+	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
+	and	$car0,$mask,$acc0
+	add	%sp,$bias+$frame,$tp
+	ld	[$ap+8],$apj		!prologue!
+
+	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
+	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	ld	[$np+8],$npj		!prologue!
+	srlx	$car1,32,$car1
+	mov	$tmp0,$acc0		!prologue!
+
+.L1st:
+	mulx	$apj,$mul0,$tmp0
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0
+	ld	[$ap+$j],$apj		! ap[j]
+	and	$car0,$mask,$acc0
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj		! np[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	add	$j,4,$j			! j++
+	mov	$tmp0,$acc0
+	st	$car1,[$tp]
+	cmp	$j,$num
+	mov	$tmp1,$acc1
+	srlx	$car1,32,$car1
+	bl	%icc,.L1st
+	add	$tp,4,$tp		! tp++
+!.L1st
+
+	mulx	$apj,$mul0,$tmp0	!epilogue!
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0
+	and	$car0,$mask,$acc0
+	add	$acc1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$tmp0,$car0,$car0
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car1,$car1
+	st	$car1,[$tp+8]
+	srlx	$car1,32,$car2
+
+	mov	4,$i			! i++
+	ld	[$bp+4],$mul0		! bp[1]
+.Louter:
+	add	%sp,$bias+$frame,$tp
+	ld	[$ap],$car0		! ap[0]
+	ld	[$ap+4],$apj		! ap[1]
+	ld	[$np],$car1		! np[0]
+	ld	[$np+4],$npj		! np[1]
+	ld	[$tp],$tmp1		! tp[0]
+	ld	[$tp+4],$tpj		! tp[1]
+	mov	12,$j
+
+	mulx	$car0,$mul0,$car0
+	mulx	$apj,$mul0,$tmp0	!prologue!
+	add	$tmp1,$car0,$car0
+	ld	[$ap+8],$apj		!prologue!
+	and	$car0,$mask,$acc0
+
+	mulx	$n0,$acc0,$mul1
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1
+	mulx	$npj,$mul1,$acc1	!prologue!
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	ld	[$np+8],$npj		!prologue!
+	srlx	$car1,32,$car1
+	mov	$tmp0,$acc0		!prologue!
+
+.Linner:
+	mulx	$apj,$mul0,$tmp0
+	mulx	$npj,$mul1,$tmp1
+	add	$tpj,$car0,$car0
+	ld	[$ap+$j],$apj		! ap[j]
+	add	$acc0,$car0,$car0
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj		! np[j]
+	and	$car0,$mask,$acc0
+	ld	[$tp+8],$tpj		! tp[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	add	$j,4,$j			! j++
+	mov	$tmp0,$acc0
+	st	$car1,[$tp]		! tp[j-1]
+	srlx	$car1,32,$car1
+	mov	$tmp1,$acc1
+	cmp	$j,$num
+	bl	%icc,.Linner
+	add	$tp,4,$tp		! tp++
+!.Linner
+
+	mulx	$apj,$mul0,$tmp0	!epilogue!
+	mulx	$npj,$mul1,$tmp1
+	add	$tpj,$car0,$car0
+	add	$acc0,$car0,$car0
+	ld	[$tp+8],$tpj		! tp[j]
+	and	$car0,$mask,$acc0
+	add	$acc1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]		! tp[j-1]
+	srlx	$car1,32,$car1
+
+	add	$tpj,$car0,$car0
+	add	$tmp0,$car0,$car0
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp+4]		! tp[j-1]
+	srlx	$car0,32,$car0
+	add	$i,4,$i			! i++
+	srlx	$car1,32,$car1
+
+	add	$car0,$car1,$car1
+	cmp	$i,$num
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+8]
+
+	srlx	$car1,32,$car2
+	bl,a	%icc,.Louter
+	ld	[$bp+$i],$mul0		! bp[i]
+!.Louter
+
+	add	$tp,12,$tp
+
+.Ltail:
+	add	$np,$num,$np
+	add	$rp,$num,$rp
+	mov	$tp,$ap
+	sub	%g0,$num,%o7		! k=-num
+	ba	.Lsub
+	subcc	%g0,%g0,%g0		! clear %icc.c
+.align	16
+.Lsub:
+	ld	[$tp+%o7],%o0
+	ld	[$np+%o7],%o1
+	subccc	%o0,%o1,%o1		! tp[j]-np[j]
+	add	$rp,%o7,$i
+	add	%o7,4,%o7
+	brnz	%o7,.Lsub
+	st	%o1,[$i]
+	subc	$car2,0,$car2		! handle upmost overflow bit
+	and	$tp,$car2,$ap
+	andn	$rp,$car2,$np
+	or	$ap,$np,$ap
+	sub	%g0,$num,%o7
+
+.Lcopy:
+	ld	[$ap+%o7],%o0		! copy or in-place refresh
+	st	%g0,[$tp+%o7]		! zap tp
+	st	%o0,[$rp+%o7]
+	add	%o7,4,%o7
+	brnz	%o7,.Lcopy
+	nop
+	mov	1,%i0
+	ret
+	restore
+___
+
+########
+######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
+######## code without following dedicated squaring procedure.
+########
+$sbit="%i2";		# re-use $bp!
+
+$code.=<<___;
+.align	32
+.Lbn_sqr_mont:
+	mulx	$mul0,$mul0,$car0		! ap[0]*ap[0]
+	mulx	$apj,$mul0,$tmp0		!prologue!
+	and	$car0,$mask,$acc0
+	add	%sp,$bias+$frame,$tp
+	ld	[$ap+8],$apj			!prologue!
+
+	mulx	$n0,$acc0,$mul1			! "t[0]"*n0
+	srlx	$car0,32,$car0
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1		! np[0]*"t[0]"*n0
+	mulx	$npj,$mul1,$acc1		!prologue!
+	and	$car0,1,$sbit
+	ld	[$np+8],$npj			!prologue!
+	srlx	$car0,1,$car0
+	add	$acc0,$car1,$car1
+	srlx	$car1,32,$car1
+	mov	$tmp0,$acc0			!prologue!
+
+.Lsqr_1st:
+	mulx	$apj,$mul0,$tmp0
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0		! ap[j]*a0+c0
+	add	$acc1,$car1,$car1
+	ld	[$ap+$j],$apj			! ap[j]
+	and	$car0,$mask,$acc0
+	ld	[$np+$j],$npj			! np[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	mov	$tmp1,$acc1
+	srlx	$acc0,32,$sbit
+	add	$j,4,$j				! j++
+	and	$acc0,$mask,$acc0
+	cmp	$j,$num
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]
+	mov	$tmp0,$acc0
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_1st
+	add	$tp,4,$tp			! tp++
+!.Lsqr_1st
+
+	mulx	$apj,$mul0,$tmp0		! epilogue
+	mulx	$npj,$mul1,$tmp1
+	add	$acc0,$car0,$car0		! ap[j]*a0+c0
+	add	$acc1,$car1,$car1
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$tmp0,$car0,$car0		! ap[j]*a0+c0
+	add	$tmp1,$car1,$car1
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	st	$car1,[$tp+8]
+	srlx	$car1,32,$car2
+
+	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
+	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
+	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
+	ld	[$ap+4],$mul0			! ap[1]
+	ld	[$ap+8],$apj			! ap[2]
+	ld	[$np],$car1			! np[0]
+	ld	[$np+4],$npj			! np[1]
+	mulx	$n0,$tmp0,$mul1
+
+	mulx	$mul0,$mul0,$car0
+	and	$mul1,$mask,$mul1
+
+	mulx	$car1,$mul1,$car1
+	mulx	$npj,$mul1,$acc1
+	add	$tmp0,$car1,$car1
+	and	$car0,$mask,$acc0
+	ld	[$np+8],$npj			! np[2]
+	srlx	$car1,32,$car1
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	$acc0,$car1,$car1
+	and	$car0,1,$sbit
+	add	$acc1,$car1,$car1
+	srlx	$car0,1,$car0
+	mov	12,$j
+	st	$car1,[%sp+$bias+$frame]	! tp[0]=
+	srlx	$car1,32,$car1
+	add	%sp,$bias+$frame+4,$tp
+
+.Lsqr_2nd:
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$acc0,$car0,$car0
+	add	$tpj,$car1,$car1
+	ld	[$ap+$j],$apj			! ap[j]
+	and	$car0,$mask,$acc0
+	ld	[$np+$j],$npj			! np[j]
+	srlx	$car0,32,$car0
+	add	$acc1,$car1,$car1
+	ld	[$tp+8],$tpj			! tp[j]
+	add	$acc0,$acc0,$acc0
+	add	$j,4,$j				! j++
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	cmp	$j,$num
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_2nd
+	add	$tp,4,$tp			! tp++
+!.Lsqr_2nd
+
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$acc0,$car0,$car0
+	add	$tpj,$car1,$car1
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc1,$car1,$car1
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car2
+
+	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
+	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
+	ld	[$ap+8],$mul0			! ap[2]
+	ld	[$np],$car1			! np[0]
+	ld	[$np+4],$npj			! np[1]
+	mulx	$n0,$tmp1,$mul1
+	and	$mul1,$mask,$mul1
+	mov	8,$i
+
+	mulx	$mul0,$mul0,$car0
+	mulx	$car1,$mul1,$car1
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	%sp,$bias+$frame,$tp
+	srlx	$car1,32,$car1
+	and	$car0,1,$sbit
+	srlx	$car0,1,$car0
+	mov	4,$j
+
+.Lsqr_outer:
+.Lsqr_inner1:
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$j,4,$j
+	ld	[$tp+8],$tpj
+	cmp	$j,$i
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_inner1
+	add	$tp,4,$tp
+!.Lsqr_inner1
+
+	add	$j,4,$j
+	ld	[$ap+$j],$apj			! ap[j]
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	ld	[$np+$j],$npj			! np[j]
+	add	$acc0,$car1,$car1
+	ld	[$tp+8],$tpj			! tp[j]
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$j,4,$j
+	cmp	$j,$num
+	be,pn	%icc,.Lsqr_no_inner2
+	add	$tp,4,$tp
+
+.Lsqr_inner2:
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$acc0,$car0,$car0
+	ld	[$ap+$j],$apj			! ap[j]
+	and	$car0,$mask,$acc0
+	ld	[$np+$j],$npj			! np[j]
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	ld	[$tp+8],$tpj			! tp[j]
+	or	$sbit,$acc0,$acc0
+	add	$j,4,$j				! j++
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	cmp	$j,$num
+	add	$acc0,$car1,$car1
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_inner2
+	add	$tp,4,$tp			! tp++
+
+.Lsqr_no_inner2:
+	mulx	$apj,$mul0,$acc0
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$acc0,$car0,$car0
+	and	$car0,$mask,$acc0
+	srlx	$car0,32,$car0
+	add	$acc0,$acc0,$acc0
+	or	$sbit,$acc0,$acc0
+	srlx	$acc0,32,$sbit
+	and	$acc0,$mask,$acc0
+	add	$acc0,$car1,$car1
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]			! tp[j-1]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car2
+
+	add	$i,4,$i				! i++
+	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
+	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
+	ld	[$ap+$i],$mul0			! ap[j]
+	ld	[$np],$car1			! np[0]
+	ld	[$np+4],$npj			! np[1]
+	mulx	$n0,$tmp1,$mul1
+	and	$mul1,$mask,$mul1
+	add	$i,4,$tmp0
+
+	mulx	$mul0,$mul0,$car0
+	mulx	$car1,$mul1,$car1
+	and	$car0,$mask,$acc0
+	add	$tmp1,$car1,$car1
+	srlx	$car0,32,$car0
+	add	%sp,$bias+$frame,$tp
+	srlx	$car1,32,$car1
+	and	$car0,1,$sbit
+	srlx	$car0,1,$car0
+
+	cmp	$tmp0,$num			! i<num-1
+	bl	%icc,.Lsqr_outer
+	mov	4,$j
+
+.Lsqr_last:
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$j,4,$j
+	ld	[$tp+8],$tpj
+	cmp	$j,$i
+	add	$acc1,$car1,$car1
+	ld	[$np+$j],$npj
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+	bl	%icc,.Lsqr_last
+	add	$tp,4,$tp
+!.Lsqr_last
+
+	mulx	$npj,$mul1,$acc1
+	add	$tpj,$car1,$car1
+	add	$acc0,$car1,$car1
+	add	$acc1,$car1,$car1
+	st	$car1,[$tp]
+	srlx	$car1,32,$car1
+
+	add	$car0,$car0,$car0		! recover $car0
+	or	$sbit,$car0,$car0
+	add	$car0,$car1,$car1
+	add	$car2,$car1,$car1
+	st	$car1,[$tp+4]
+	srlx	$car1,32,$car2
+
+	ba	.Ltail
+	add	$tp,8,$tp
+.type	$fname,#function
+.size	$fname,(.-$fname)
+.asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align	32
+___
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/sparcv9a-mont.pl b/crypto/bn/asm/sparcv9a-mont.pl
new file mode 100755
index 0000000000..a14205f2f0
--- /dev/null
+++ b/crypto/bn/asm/sparcv9a-mont.pl
@@ -0,0 +1,882 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
+# Because unlike integer multiplier, which simply stalls whole CPU,
+# FPU is fully pipelined and can effectively emit 48 bit partial
+# product every cycle. Why not blended SPARC v9? One can argue that
+# making this module dependent on UltraSPARC VIS extension limits its
+# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
+# implementations from compatibility matrix. But the rest, whole Sun
+# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
+# VIS extension instructions used in this module. This is considered
+# good enough to not care about HAL SPARC64 users [if any] who have
+# integer-only pure SPARCv9 module to "fall down" to.
+
+# USI&II cores currently exhibit uniform 2x improvement [over pre-
+# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
+# performance improves few percents for shorter keys and worsens few
+# percents for longer keys. This is because USIII integer multiplier
+# is >3x faster than USI&II one, which is harder to match [but see
+# TODO list below]. It should also be noted that SPARC64 V features
+# out-of-order execution, which *might* mean that integer multiplier
+# is pipelined, which in turn *might* be impossible to match... On
+# additional note, SPARC64 V implements FP Multiply-Add instruction,
+# which is perfectly usable in this context... In other words, as far
+# as Fujitsu SPARC64 V goes, talk to the author:-)
+
+# The implementation implies following "non-natural" limitations on
+# input arguments:
+# - num may not be less than 4;
+# - num has to be even;
+# Failure to meet either condition has no fatal effects, simply
+# doesn't give any performance gain.
+
+# TODO:
+# - modulo-schedule inner loop for better performance (on in-order
+#   execution core such as UltraSPARC this shall result in further
+#   noticeable(!) improvement);
+# - dedicated squaring procedure[?];
+
+######################################################################
+# November 2006
+#
+# Modulo-scheduled inner loops allow to interleave floating point and
+# integer instructions and minimize Read-After-Write penalties. This
+# results in *further* 20-50% perfromance improvement [depending on
+# key length, more for longer keys] on USI&II cores and 30-80% - on
+# USIII&IV.
+
+$fname="bn_mul_mont_fpu";
+$bits=32;
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+
+if ($bits==64) {
+	$bias=2047;
+	$frame=192;
+} else {
+	$bias=0;
+	$frame=128;	# 96 rounded up to largest known cache-line
+}
+$locals=64;
+
+# In order to provide for 32-/64-bit ABI duality, I keep integers wider
+# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
+# exclusively for pointers, indexes and other small values...
+# int bn_mul_mont(
+$rp="%i0";	# BN_ULONG *rp,
+$ap="%i1";	# const BN_ULONG *ap,
+$bp="%i2";	# const BN_ULONG *bp,
+$np="%i3";	# const BN_ULONG *np,
+$n0="%i4";	# const BN_ULONG *n0,
+$num="%i5";	# int num);
+
+$tp="%l0";	# t[num]
+$ap_l="%l1";	# a[num],n[num] are smashed to 32-bit words and saved
+$ap_h="%l2";	# to these four vectors as double-precision FP values.
+$np_l="%l3";	# This way a bunch of fxtods are eliminated in second
+$np_h="%l4";	# loop and L1-cache aliasing is minimized...
+$i="%l5";
+$j="%l6";
+$mask="%l7";	# 16-bit mask, 0xffff
+
+$n0="%g4";	# reassigned(!) to "64-bit" register
+$carry="%i4";	# %i4 reused(!) for a carry bit
+
+# FP register naming chart
+#
+#     ..HILO
+#       dcba
+#   --------
+#        LOa
+#       LOb
+#      LOc
+#     LOd
+#      HIa
+#     HIb
+#    HIc
+#   HId
+#    ..a
+#   ..b
+$ba="%f0";    $bb="%f2";    $bc="%f4";    $bd="%f6";
+$na="%f8";    $nb="%f10";   $nc="%f12";   $nd="%f14";
+$alo="%f16";  $alo_="%f17"; $ahi="%f18";  $ahi_="%f19";
+$nlo="%f20";  $nlo_="%f21"; $nhi="%f22";  $nhi_="%f23";
+
+$dota="%f24"; $dotb="%f26";
+
+$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
+$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
+$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
+$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
+
+$ASI_FL16_P=0xD2;	# magic ASI value to engage 16-bit FP load
+
+$code=<<___;
+.section	".text",#alloc,#execinstr
+
+.global $fname
+.align  32
+$fname:
+	save	%sp,-$frame-$locals,%sp
+
+	cmp	$num,4
+	bl,a,pn %icc,.Lret
+	clr	%i0
+	andcc	$num,1,%g0		! $num has to be even...
+	bnz,a,pn %icc,.Lret
+	clr	%i0			! signal "unsupported input value"
+
+	srl	$num,1,$num
+	sethi	%hi(0xffff),$mask
+	ld	[%i4+0],$n0		! $n0 reassigned, remember?
+	or	$mask,%lo(0xffff),$mask
+	ld	[%i4+4],%o0
+	sllx	%o0,32,%o0
+	or	%o0,$n0,$n0		! $n0=n0[1].n0[0]
+
+	sll	$num,3,$num		! num*=8
+
+	add	%sp,$bias,%o0		! real top of stack
+	sll	$num,2,%o1
+	add	%o1,$num,%o1		! %o1=num*5
+	sub	%o0,%o1,%o0
+	and	%o0,-2048,%o0		! optimize TLB utilization
+	sub	%o0,$bias,%sp		! alloca(5*num*8)
+
+	rd	%asi,%o7		! save %asi
+	add	%sp,$bias+$frame+$locals,$tp
+	add	$tp,$num,$ap_l
+	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
+	add	$ap_l,$num,$ap_h
+	add	$ap_h,$num,$np_l
+	add	$np_l,$num,$np_h
+
+	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads
+
+	add	$rp,$num,$rp		! readjust input pointers to point
+	add	$ap,$num,$ap		! at the ends too...
+	add	$bp,$num,$bp
+	add	$np,$num,$np
+
+	stx	%o7,[%sp+$bias+$frame+48]	! save %asi
+
+	sub	%g0,$num,$i		! i=-num
+	sub	%g0,$num,$j		! j=-num
+
+	add	$ap,$j,%o3
+	add	$bp,$i,%o4
+
+	ld	[%o3+4],%g1		! bp[0]
+	ld	[%o3+0],%o0
+	ld	[%o4+4],%g5		! ap[0]
+	sllx	%g1,32,%g1
+	ld	[%o4+0],%o1
+	sllx	%g5,32,%g5
+	or	%g1,%o0,%o0
+	or	%g5,%o1,%o1
+
+	add	$np,$j,%o5
+
+	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
+	mulx	$n0,%o0,%o0		! ap[0]*bp[0]*n0
+	stx	%o0,[%sp+$bias+$frame+0]
+
+	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
+	fzeros	$alo
+	ld	[%o3+4],$ahi_
+	fzeros	$ahi
+	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
+	fzeros	$nlo
+	ld	[%o5+4],$nhi_
+	fzeros	$nhi
+
+	! transfer b[i] to FPU as 4x16-bit values
+	ldda	[%o4+2]%asi,$ba
+	fxtod	$alo,$alo
+	ldda	[%o4+0]%asi,$bb
+	fxtod	$ahi,$ahi
+	ldda	[%o4+6]%asi,$bc
+	fxtod	$nlo,$nlo
+	ldda	[%o4+4]%asi,$bd
+	fxtod	$nhi,$nhi
+
+	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
+	ldda	[%sp+$bias+$frame+6]%asi,$na
+	fxtod	$ba,$ba
+	ldda	[%sp+$bias+$frame+4]%asi,$nb
+	fxtod	$bb,$bb
+	ldda	[%sp+$bias+$frame+2]%asi,$nc
+	fxtod	$bc,$bc
+	ldda	[%sp+$bias+$frame+0]%asi,$nd
+	fxtod	$bd,$bd
+
+	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
+	fxtod	$na,$na
+	std	$ahi,[$ap_h+$j]
+	fxtod	$nb,$nb
+	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
+	fxtod	$nc,$nc
+	std	$nhi,[$np_h+$j]
+	fxtod	$nd,$nd
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+		fmuld	$alo,$bd,$alod
+	faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+		fmuld	$ahi,$ba,$ahia
+	faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+		fmuld	$ahi,$bb,$ahib
+	faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+		fmuld	$ahi,$bc,$ahic
+	faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+		fmuld	$ahi,$bd,$ahid
+	faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	add	$j,8,$j
+	std	$nlob,[%sp+$bias+$frame+8]
+	add	$ap,$j,%o4
+	std	$nloc,[%sp+$bias+$frame+16]
+	add	$np,$j,%o5
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
+	fzeros	$alo
+	ld	[%o4+4],$ahi_
+	fzeros	$ahi
+	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
+	fzeros	$nlo
+	ld	[%o5+4],$nhi_
+	fzeros	$nhi
+
+	fxtod	$alo,$alo
+	fxtod	$ahi,$ahi
+	fxtod	$nlo,$nlo
+	fxtod	$nhi,$nhi
+
+	ldx	[%sp+$bias+$frame+0],%o0
+		fmuld	$alo,$ba,$aloa
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$nlo,$na,$nloa
+	ldx	[%sp+$bias+$frame+16],%o2
+		fmuld	$alo,$bb,$alob
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$nlo,$nb,$nlob
+
+	srlx	%o0,16,%o7
+	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
+		fmuld	$alo,$bc,$aloc
+	add	%o7,%o1,%o1
+	std	$ahi,[$ap_h+$j]
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	srlx	%o1,16,%o7
+	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
+		fmuld	$alo,$bd,$alod
+	add	%o7,%o2,%o2
+	std	$nhi,[$np_h+$j]
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	srlx	%o2,16,%o7
+		fmuld	$ahi,$ba,$ahia
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	!and	%o0,$mask,%o0
+	!and	%o1,$mask,%o1
+	!and	%o2,$mask,%o2
+	!sllx	%o1,16,%o1
+	!sllx	%o2,32,%o2
+	!sllx	%o3,48,%o7
+	!or	%o1,%o0,%o0
+	!or	%o2,%o0,%o0
+	!or	%o7,%o0,%o0		! 64-bit result
+	srlx	%o3,16,%g1		! 34-bit carry
+		fmuld	$ahi,$bb,$ahib
+
+	faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+		fmuld	$ahi,$bc,$ahic
+	faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+		fmuld	$ahi,$bd,$ahid
+	faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+
+	faddd	$dota,$nloa,$nloa
+	faddd	$dotb,$nlob,$nlob
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	addcc	$j,8,$j
+	std	$nloc,[%sp+$bias+$frame+16]
+	bz,pn	%icc,.L1stskip
+	std	$nlod,[%sp+$bias+$frame+24]
+
+.align	32			! incidentally already aligned !
+.L1st:
+	add	$ap,$j,%o4
+	add	$np,$j,%o5
+	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
+	fzeros	$alo
+	ld	[%o4+4],$ahi_
+	fzeros	$ahi
+	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
+	fzeros	$nlo
+	ld	[%o5+4],$nhi_
+	fzeros	$nhi
+
+	fxtod	$alo,$alo
+	fxtod	$ahi,$ahi
+	fxtod	$nlo,$nlo
+	fxtod	$nhi,$nhi
+
+	ldx	[%sp+$bias+$frame+0],%o0
+		fmuld	$alo,$ba,$aloa
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$nlo,$na,$nloa
+	ldx	[%sp+$bias+$frame+16],%o2
+		fmuld	$alo,$bb,$alob
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$nlo,$nb,$nlob
+
+	srlx	%o0,16,%o7
+	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
+		fmuld	$alo,$bc,$aloc
+	add	%o7,%o1,%o1
+	std	$ahi,[$ap_h+$j]
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	srlx	%o1,16,%o7
+	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
+		fmuld	$alo,$bd,$alod
+	add	%o7,%o2,%o2
+	std	$nhi,[$np_h+$j]
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	srlx	%o2,16,%o7
+		fmuld	$ahi,$ba,$ahia
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+		fmuld	$ahi,$bb,$ahib
+	sllx	%o1,16,%o1
+		faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+	sllx	%o2,32,%o2
+		fmuld	$ahi,$bc,$ahic
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+		faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+	or	%o2,%o0,%o0
+		fmuld	$ahi,$bd,$ahid
+	or	%o7,%o0,%o0		! 64-bit result
+		faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+	addcc	%g1,%o0,%o0
+		faddd	$dota,$nloa,$nloa
+	srlx	%o3,16,%g1		! 34-bit carry
+		faddd	$dotb,$nlob,$nlob
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]=
+
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	std	$nloc,[%sp+$bias+$frame+16]
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	addcc	$j,8,$j
+	bnz,pt	%icc,.L1st
+	add	$tp,8,$tp
+
+.L1stskip:
+	fdtox	$dota,$dota
+	fdtox	$dotb,$dotb
+
+	ldx	[%sp+$bias+$frame+0],%o0
+	ldx	[%sp+$bias+$frame+8],%o1
+	ldx	[%sp+$bias+$frame+16],%o2
+	ldx	[%sp+$bias+$frame+24],%o3
+
+	srlx	%o0,16,%o7
+	std	$dota,[%sp+$bias+$frame+32]
+	add	%o7,%o1,%o1
+	std	$dotb,[%sp+$bias+$frame+40]
+	srlx	%o1,16,%o7
+	add	%o7,%o2,%o2
+	srlx	%o2,16,%o7
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+	sllx	%o1,16,%o1
+	sllx	%o2,32,%o2
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+	or	%o2,%o0,%o0
+	or	%o7,%o0,%o0		! 64-bit result
+	ldx	[%sp+$bias+$frame+32],%o4
+	addcc	%g1,%o0,%o0
+	ldx	[%sp+$bias+$frame+40],%o5
+	srlx	%o3,16,%g1		! 34-bit carry
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]=
+	add	$tp,8,$tp
+
+	srlx	%o4,16,%o7
+	add	%o7,%o5,%o5
+	and	%o4,$mask,%o4
+	sllx	%o5,16,%o7
+	or	%o7,%o4,%o4
+	addcc	%g1,%o4,%o4
+	srlx	%o5,48,%g1
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	mov	%g1,$carry
+	stx	%o4,[$tp]		! tp[num-1]=
+
+	ba	.Louter
+	add	$i,8,$i
+.align	32
+.Louter:
+	sub	%g0,$num,$j		! j=-num
+	add	%sp,$bias+$frame+$locals,$tp
+
+	add	$ap,$j,%o3
+	add	$bp,$i,%o4
+
+	ld	[%o3+4],%g1		! bp[i]
+	ld	[%o3+0],%o0
+	ld	[%o4+4],%g5		! ap[0]
+	sllx	%g1,32,%g1
+	ld	[%o4+0],%o1
+	sllx	%g5,32,%g5
+	or	%g1,%o0,%o0
+	or	%g5,%o1,%o1
+
+	ldx	[$tp],%o2		! tp[0]
+	mulx	%o1,%o0,%o0
+	addcc	%o2,%o0,%o0
+	mulx	$n0,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
+	stx	%o0,[%sp+$bias+$frame+0]
+
+	! transfer b[i] to FPU as 4x16-bit values
+	ldda	[%o4+2]%asi,$ba
+	ldda	[%o4+0]%asi,$bb
+	ldda	[%o4+6]%asi,$bc
+	ldda	[%o4+4]%asi,$bd
+
+	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
+	ldda	[%sp+$bias+$frame+6]%asi,$na
+	fxtod	$ba,$ba
+	ldda	[%sp+$bias+$frame+4]%asi,$nb
+	fxtod	$bb,$bb
+	ldda	[%sp+$bias+$frame+2]%asi,$nc
+	fxtod	$bc,$bc
+	ldda	[%sp+$bias+$frame+0]%asi,$nd
+	fxtod	$bd,$bd
+	ldd	[$ap_l+$j],$alo		! load a[j] in double format
+	fxtod	$na,$na
+	ldd	[$ap_h+$j],$ahi
+	fxtod	$nb,$nb
+	ldd	[$np_l+$j],$nlo		! load n[j] in double format
+	fxtod	$nc,$nc
+	ldd	[$np_h+$j],$nhi
+	fxtod	$nd,$nd
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+		fmuld	$alo,$bd,$alod
+	faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+		fmuld	$ahi,$ba,$ahia
+	faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+		fmuld	$ahi,$bb,$ahib
+	faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+		fmuld	$ahi,$bc,$ahic
+	faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+		fmuld	$ahi,$bd,$ahid
+	faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+
+	faddd	$ahic,$nhic,$dota	! $nhic
+	faddd	$ahid,$nhid,$dotb	! $nhid
+
+	faddd	$nloc,$nhia,$nloc
+	faddd	$nlod,$nhib,$nlod
+
+	fdtox	$nloa,$nloa
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	std	$nloc,[%sp+$bias+$frame+16]
+	add	$j,8,$j
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	ldd	[$ap_l+$j],$alo		! load a[j] in double format
+	ldd	[$ap_h+$j],$ahi
+	ldd	[$np_l+$j],$nlo		! load n[j] in double format
+	ldd	[$np_h+$j],$nhi
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	ldx	[%sp+$bias+$frame+0],%o0
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$alo,$bd,$alod
+	ldx	[%sp+$bias+$frame+16],%o2
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$ahi,$ba,$ahia
+
+	srlx	%o0,16,%o7
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	add	%o7,%o1,%o1
+		fmuld	$ahi,$bb,$ahib
+	srlx	%o1,16,%o7
+		faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+	add	%o7,%o2,%o2
+		fmuld	$ahi,$bc,$ahic
+	srlx	%o2,16,%o7
+		faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	! why?
+	and	%o0,$mask,%o0
+		fmuld	$ahi,$bd,$ahid
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+		faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+	sllx	%o1,16,%o1
+		faddd	$dota,$nloa,$nloa
+	sllx	%o2,32,%o2
+		faddd	$dotb,$nlob,$nlob
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+		faddd	$ahic,$nhic,$dota	! $nhic
+	or	%o2,%o0,%o0
+		faddd	$ahid,$nhid,$dotb	! $nhid
+	or	%o7,%o0,%o0		! 64-bit result
+	ldx	[$tp],%o7
+		faddd	$nloc,$nhia,$nloc
+	addcc	%o7,%o0,%o0
+	! end-of-why?
+		faddd	$nlod,$nhib,$nlod
+	srlx	%o3,16,%g1		! 34-bit carry
+		fdtox	$nloa,$nloa
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	fdtox	$nlob,$nlob
+	fdtox	$nloc,$nloc
+	fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	addcc	$j,8,$j
+	std	$nloc,[%sp+$bias+$frame+16]
+	bz,pn	%icc,.Linnerskip
+	std	$nlod,[%sp+$bias+$frame+24]
+
+	ba	.Linner
+	nop
+.align	32
+.Linner:
+	ldd	[$ap_l+$j],$alo		! load a[j] in double format
+	ldd	[$ap_h+$j],$ahi
+	ldd	[$np_l+$j],$nlo		! load n[j] in double format
+	ldd	[$np_h+$j],$nhi
+
+		fmuld	$alo,$ba,$aloa
+		fmuld	$nlo,$na,$nloa
+		fmuld	$alo,$bb,$alob
+		fmuld	$nlo,$nb,$nlob
+		fmuld	$alo,$bc,$aloc
+	ldx	[%sp+$bias+$frame+0],%o0
+		faddd	$aloa,$nloa,$nloa
+		fmuld	$nlo,$nc,$nloc
+	ldx	[%sp+$bias+$frame+8],%o1
+		fmuld	$alo,$bd,$alod
+	ldx	[%sp+$bias+$frame+16],%o2
+		faddd	$alob,$nlob,$nlob
+		fmuld	$nlo,$nd,$nlod
+	ldx	[%sp+$bias+$frame+24],%o3
+		fmuld	$ahi,$ba,$ahia
+
+	srlx	%o0,16,%o7
+		faddd	$aloc,$nloc,$nloc
+		fmuld	$nhi,$na,$nhia
+	add	%o7,%o1,%o1
+		fmuld	$ahi,$bb,$ahib
+	srlx	%o1,16,%o7
+		faddd	$alod,$nlod,$nlod
+		fmuld	$nhi,$nb,$nhib
+	add	%o7,%o2,%o2
+		fmuld	$ahi,$bc,$ahic
+	srlx	%o2,16,%o7
+		faddd	$ahia,$nhia,$nhia
+		fmuld	$nhi,$nc,$nhic
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+		fmuld	$ahi,$bd,$ahid
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+		faddd	$ahib,$nhib,$nhib
+		fmuld	$nhi,$nd,$nhid
+	sllx	%o1,16,%o1
+		faddd	$dota,$nloa,$nloa
+	sllx	%o2,32,%o2
+		faddd	$dotb,$nlob,$nlob
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+		faddd	$ahic,$nhic,$dota	! $nhic
+	or	%o2,%o0,%o0
+		faddd	$ahid,$nhid,$dotb	! $nhid
+	or	%o7,%o0,%o0		! 64-bit result
+		faddd	$nloc,$nhia,$nloc
+	addcc	%g1,%o0,%o0
+	ldx	[$tp+8],%o7		! tp[j]
+		faddd	$nlod,$nhib,$nlod
+	srlx	%o3,16,%g1		! 34-bit carry
+		fdtox	$nloa,$nloa
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+		fdtox	$nlob,$nlob
+	addcc	%o7,%o0,%o0
+		fdtox	$nloc,$nloc
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]
+		fdtox	$nlod,$nlod
+
+	std	$nloa,[%sp+$bias+$frame+0]
+	std	$nlob,[%sp+$bias+$frame+8]
+	std	$nloc,[%sp+$bias+$frame+16]
+	addcc	$j,8,$j
+	std	$nlod,[%sp+$bias+$frame+24]
+	bnz,pt	%icc,.Linner
+	add	$tp,8,$tp
+
+.Linnerskip:
+	fdtox	$dota,$dota
+	fdtox	$dotb,$dotb
+
+	ldx	[%sp+$bias+$frame+0],%o0
+	ldx	[%sp+$bias+$frame+8],%o1
+	ldx	[%sp+$bias+$frame+16],%o2
+	ldx	[%sp+$bias+$frame+24],%o3
+
+	srlx	%o0,16,%o7
+	std	$dota,[%sp+$bias+$frame+32]
+	add	%o7,%o1,%o1
+	std	$dotb,[%sp+$bias+$frame+40]
+	srlx	%o1,16,%o7
+	add	%o7,%o2,%o2
+	srlx	%o2,16,%o7
+	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+	and	%o0,$mask,%o0
+	and	%o1,$mask,%o1
+	and	%o2,$mask,%o2
+	sllx	%o1,16,%o1
+	sllx	%o2,32,%o2
+	sllx	%o3,48,%o7
+	or	%o1,%o0,%o0
+	or	%o2,%o0,%o0
+	ldx	[%sp+$bias+$frame+32],%o4
+	or	%o7,%o0,%o0		! 64-bit result
+	ldx	[%sp+$bias+$frame+40],%o5
+	addcc	%g1,%o0,%o0
+	ldx	[$tp+8],%o7		! tp[j]
+	srlx	%o3,16,%g1		! 34-bit carry
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	addcc	%o7,%o0,%o0
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	stx	%o0,[$tp]		! tp[j-1]
+	add	$tp,8,$tp
+
+	srlx	%o4,16,%o7
+	add	%o7,%o5,%o5
+	and	%o4,$mask,%o4
+	sllx	%o5,16,%o7
+	or	%o7,%o4,%o4
+	addcc	%g1,%o4,%o4
+	srlx	%o5,48,%g1
+	bcs,a	%xcc,.+8
+	add	%g1,1,%g1
+
+	addcc	$carry,%o4,%o4
+	stx	%o4,[$tp]		! tp[num-1]
+	mov	%g1,$carry
+	bcs,a	%xcc,.+8
+	add	$carry,1,$carry
+
+	addcc	$i,8,$i
+	bnz	%icc,.Louter
+	nop
+
+	add	$tp,8,$tp		! adjust tp to point at the end
+	orn	%g0,%g0,%g4
+	sub	%g0,$num,%o7		! n=-num
+	ba	.Lsub
+	subcc	%g0,%g0,%g0		! clear %icc.c
+
+.align	32
+.Lsub:
+	ldx	[$tp+%o7],%o0
+	add	$np,%o7,%g1
+	ld	[%g1+0],%o2
+	ld	[%g1+4],%o3
+	srlx	%o0,32,%o1
+	subccc	%o0,%o2,%o2
+	add	$rp,%o7,%g1
+	subccc	%o1,%o3,%o3
+	st	%o2,[%g1+0]
+	add	%o7,8,%o7
+	brnz,pt	%o7,.Lsub
+	st	%o3,[%g1+4]
+	subc	$carry,0,%g4
+	sub	%g0,$num,%o7		! n=-num
+	ba	.Lcopy
+	nop
+
+.align	32
+.Lcopy:
+	ldx	[$tp+%o7],%o0
+	add	$rp,%o7,%g1
+	ld	[%g1+0],%o2
+	ld	[%g1+4],%o3
+	stx	%g0,[$tp+%o7]
+	and	%o0,%g4,%o0
+	srlx	%o0,32,%o1
+	andn	%o2,%g4,%o2
+	andn	%o3,%g4,%o3
+	or	%o2,%o0,%o0
+	or	%o3,%o1,%o1
+	st	%o0,[%g1+0]
+	add	%o7,8,%o7
+	brnz,pt	%o7,.Lcopy
+	st	%o1,[%g1+4]
+	sub	%g0,$num,%o7		! n=-num
+
+.Lzap:
+	stx	%g0,[$ap_l+%o7]
+	stx	%g0,[$ap_h+%o7]
+	stx	%g0,[$np_l+%o7]
+	stx	%g0,[$np_h+%o7]
+	add	%o7,8,%o7
+	brnz,pt	%o7,.Lzap
+	nop
+
+	ldx	[%sp+$bias+$frame+48],%o7
+	wr	%g0,%o7,%asi		! restore %asi
+
+	mov	1,%i0
+.Lret:
+	ret
+	restore
+.type   $fname,#function
+.size	$fname,(.-$fname)
+.asciz	"Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
+.align	32
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+# Below substitution makes it possible to compile without demanding
+# VIS extentions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
+# dare to do this, because VIS capability is detected at run-time now
+# and this routine is not called on CPU not capable to execute it. Do
+# note that fzeros is not the only VIS dependency! Another dependency
+# is implicit and is just _a_ numerical value loaded to %asi register,
+# which assembler can't recognize as VIS specific...
+$code =~ s/fzeros\s+%f([0-9]+)/
+	   sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
+	  /gem;
+
+print $code;
+# flush
+close STDOUT;
diff --git a/crypto/bn/asm/via-mont.pl b/crypto/bn/asm/via-mont.pl
new file mode 100644
index 0000000000..c046a514c8
--- /dev/null
+++ b/crypto/bn/asm/via-mont.pl
@@ -0,0 +1,242 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Wrapper around 'rep montmul', VIA-specific instruction accessing
+# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
+# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
+#
+# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
+# different software configurations on 1.5GHz VIA Esther processor.
+# Lines marked with "software integer" denote performance of hand-
+# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
+# refers to hand-coded SSE2 Montgomery multiplication procedure found
+# OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
+# Padlock SDK 2.0.1 available for download from VIA, which naturally
+# utilizes the magic 'repz montmul' instruction. And finally "hardware
+# this" refers to *this* implementation which also uses 'repz montmul'
+#
+#                   sign    verify    sign/s verify/s
+# rsa  512 bits 0.001720s 0.000140s    581.4   7149.7	software integer
+# rsa  512 bits 0.000690s 0.000086s   1450.3  11606.0	software SSE2
+# rsa  512 bits 0.006136s 0.000201s    163.0   4974.5	hardware VIA SDK
+# rsa  512 bits 0.000712s 0.000050s   1404.9  19858.5	hardware this
+#
+# rsa 1024 bits 0.008518s 0.000413s    117.4   2420.8	software integer
+# rsa 1024 bits 0.004275s 0.000277s    233.9   3609.7	software SSE2
+# rsa 1024 bits 0.012136s 0.000260s     82.4   3844.5	hardware VIA SDK
+# rsa 1024 bits 0.002522s 0.000116s    396.5   8650.9	hardware this
+#
+# rsa 2048 bits 0.050101s 0.001371s     20.0    729.6	software integer
+# rsa 2048 bits 0.030273s 0.001008s     33.0    991.9	software SSE2
+# rsa 2048 bits 0.030833s 0.000976s     32.4   1025.1	hardware VIA SDK
+# rsa 2048 bits 0.011879s 0.000342s     84.2   2921.7	hardware this
+#
+# rsa 4096 bits 0.327097s 0.004859s      3.1    205.8	software integer
+# rsa 4096 bits 0.229318s 0.003859s      4.4    259.2	software SSE2
+# rsa 4096 bits 0.233953s 0.003274s      4.3    305.4	hardware VIA SDK
+# rsa 4096 bits 0.070493s 0.001166s     14.2    857.6	hardware this
+#
+# dsa  512 bits 0.001342s 0.001651s    745.2    605.7	software integer
+# dsa  512 bits 0.000844s 0.000987s   1185.3   1013.1	software SSE2
+# dsa  512 bits 0.001902s 0.002247s    525.6    444.9	hardware VIA SDK
+# dsa  512 bits 0.000458s 0.000524s   2182.2   1909.1	hardware this
+#
+# dsa 1024 bits 0.003964s 0.004926s    252.3    203.0	software integer
+# dsa 1024 bits 0.002686s 0.003166s    372.3    315.8	software SSE2
+# dsa 1024 bits 0.002397s 0.002823s    417.1    354.3	hardware VIA SDK
+# dsa 1024 bits 0.000978s 0.001170s   1022.2    855.0	hardware this
+#
+# dsa 2048 bits 0.013280s 0.016518s     75.3     60.5	software integer
+# dsa 2048 bits 0.009911s 0.011522s    100.9     86.8	software SSE2
+# dsa 2048 bits 0.009542s 0.011763s    104.8     85.0	hardware VIA SDK
+# dsa 2048 bits 0.002884s 0.003352s    346.8    298.3	hardware this
+#
+# To give you some other reference point here is output for 2.4GHz P4
+# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
+# SSE2" in above terms.
+#
+# rsa  512 bits 0.000407s 0.000047s   2454.2  21137.0
+# rsa 1024 bits 0.002426s 0.000141s    412.1   7100.0
+# rsa 2048 bits 0.015046s 0.000491s     66.5   2034.9
+# rsa 4096 bits 0.109770s 0.002379s      9.1    420.3
+# dsa  512 bits 0.000438s 0.000525s   2281.1   1904.1
+# dsa 1024 bits 0.001346s 0.001595s    742.7    627.0
+# dsa 2048 bits 0.004745s 0.005582s    210.7    179.1
+#
+# Conclusions: 
+# - VIA SDK leaves a *lot* of room for improvement (which this
+#   implementation successfully fills:-);
+# - 'rep montmul' gives up to >3x performance improvement depending on
+#   key length;
+# - in terms of absolute performance it delivers approximately as much
+#   as modern out-of-order 32-bit cores [again, for longer keys].
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"via-mont.pl");
+
+# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+$func="bn_mul_mont_padlock";
+
+$pad=16*1;	# amount of reserved bytes on top of every vector
+
+# stack layout
+$mZeroPrime=&DWP(0,"esp");		# these are specified by VIA
+$A=&DWP(4,"esp");
+$B=&DWP(8,"esp");
+$T=&DWP(12,"esp");
+$M=&DWP(16,"esp");
+$scratch=&DWP(20,"esp");
+$rp=&DWP(24,"esp");			# these are mine
+$sp=&DWP(28,"esp");
+# &DWP(32,"esp")			# 32 byte scratch area
+# &DWP(64+(4*$num+$pad)*0,"esp")	# padded tp[num]
+# &DWP(64+(4*$num+$pad)*1,"esp")	# padded copy of ap[num]
+# &DWP(64+(4*$num+$pad)*2,"esp")	# padded copy of bp[num]
+# &DWP(64+(4*$num+$pad)*3,"esp")	# padded copy of np[num]
+# Note that SDK suggests to unconditionally allocate 2K per vector. This
+# has quite an impact on performance. It naturally depends on key length,
+# but to give an example 1024 bit private RSA key operations suffer >30%
+# penalty. I allocate only as much as actually required...
+
+&function_begin($func);
+	&xor	("eax","eax");
+	&mov	("ecx",&wparam(5));	# num
+	# meet VIA's limitations for num [note that the specification
+	# expresses them in bits, while we work with amount of 32-bit words]
+	&test	("ecx",3);
+	&jnz	(&label("leave"));	# num % 4 != 0
+	&cmp	("ecx",8);
+	&jb	(&label("leave"));	# num < 8
+	&cmp	("ecx",1024);
+	&ja	(&label("leave"));	# num > 1024
+
+	&pushf	();
+	&cld	();
+
+	&mov	("edi",&wparam(0));	# rp
+	&mov	("eax",&wparam(1));	# ap
+	&mov	("ebx",&wparam(2));	# bp
+	&mov	("edx",&wparam(3));	# np
+	&mov	("esi",&wparam(4));	# n0
+	&mov	("esi",&DWP(0,"esi"));	# *n0
+
+	&lea	("ecx",&DWP($pad,"","ecx",4));	# ecx becomes vector size in bytes
+	&lea	("ebp",&DWP(64,"","ecx",4));	# allocate 4 vectors + 64 bytes
+	&neg	("ebp");
+	&add	("ebp","esp");
+	&and	("ebp",-64);		# align to cache-line
+	&xchg	("ebp","esp");		# alloca
+
+	&mov	($rp,"edi");		# save rp
+	&mov	($sp,"ebp");		# save esp
+
+	&mov	($mZeroPrime,"esi");
+	&lea	("esi",&DWP(64,"esp"));	# tp
+	&mov	($T,"esi");
+	&lea	("edi",&DWP(32,"esp"));	# scratch area
+	&mov	($scratch,"edi");
+	&mov	("esi","eax");
+
+	&lea	("ebp",&DWP(-$pad,"ecx"));
+	&shr	("ebp",2);		# restore original num value in ebp
+
+	&xor	("eax","eax");
+
+	&mov	("ecx","ebp");
+	&lea	("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
+	&data_byte(0xf3,0xab);		# rep stosl, bzero
+
+	&mov	("ecx","ebp");
+	&lea	("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
+	&mov	($A,"edi");
+	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
+	&mov	("ecx",$pad/4);
+	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
+	# edi points at the end of padded ap copy...
+
+	&mov	("ecx","ebp");
+	&mov	("esi","ebx");
+	&mov	($B,"edi");
+	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
+	&mov	("ecx",$pad/4);
+	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
+	# edi points at the end of padded bp copy...
+
+	&mov	("ecx","ebp");
+	&mov	("esi","edx");
+	&mov	($M,"edi");
+	&data_byte(0xf3,0xa5);		# rep movsl, memcpy
+	&mov	("ecx",$pad/4);
+	&data_byte(0xf3,0xab);		# rep stosl, bzero pad
+	# edi points at the end of padded np copy...
+
+	# let magic happen...
+	&mov	("ecx","ebp");
+	&mov	("esi","esp");
+	&shl	("ecx",5);		# convert word counter to bit counter
+	&align	(4);
+	&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
+
+	&mov	("ecx","ebp");
+	&lea	("esi",&DWP(64,"esp"));		# tp
+	# edi still points at the end of padded np copy...
+	&neg	("ebp");
+	&lea	("ebp",&DWP(-$pad,"edi","ebp",4));	# so just "rewind"
+	&mov	("edi",$rp);			# restore rp
+	&xor	("edx","edx");			# i=0 and clear CF
+
+&set_label("sub",8);
+	&mov	("eax",&DWP(0,"esi","edx",4));
+	&sbb	("eax",&DWP(0,"ebp","edx",4));
+	&mov	(&DWP(0,"edi","edx",4),"eax");	# rp[i]=tp[i]-np[i]
+	&lea	("edx",&DWP(1,"edx"));		# i++
+	&loop	(&label("sub"));		# doesn't affect CF!
+
+	&mov	("eax",&DWP(0,"esi","edx",4));	# upmost overflow bit
+	&sbb	("eax",0);
+	&and	("esi","eax");
+	&not	("eax");
+	&mov	("ebp","edi");
+	&and	("ebp","eax");
+	&or	("esi","ebp");			# tp=carry?tp:rp
+
+	&mov	("ecx","edx");			# num
+	&xor	("edx","edx");			# i=0
+
+&set_label("copy",8);
+	&mov	("eax",&DWP(0,"esi","edx",4));
+	&mov	(&DWP(64,"esp","edx",4),"ecx");	# zap tp
+	&mov	(&DWP(0,"edi","edx",4),"eax");
+	&lea	("edx",&DWP(1,"edx"));		# i++
+	&loop	(&label("copy"));
+
+	&mov	("ebp",$sp);
+	&xor	("eax","eax");
+
+	&mov	("ecx",64/4);
+	&mov	("edi","esp");		# zap frame including scratch area
+	&data_byte(0xf3,0xab);		# rep stosl, bzero
+
+	# zap copies of ap, bp and np
+	&lea	("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
+	&lea	("ecx",&DWP(3*$pad/4,"edx","edx",2));
+	&data_byte(0xf3,0xab);		# rep stosl, bzero
+
+	&mov	("esp","ebp");
+	&inc	("eax");		# signal "done"
+	&popf	();
+&set_label("leave");
+&function_end($func);
+
+&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
new file mode 100755
index 0000000000..5cd3cd2ed5
--- /dev/null
+++ b/crypto/bn/asm/x86-mont.pl
@@ -0,0 +1,591 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# This is a "teaser" code, as it can be improved in several ways...
+# First of all non-SSE2 path should be implemented (yes, for now it
+# performs Montgomery multiplication/convolution only on SSE2-capable
+# CPUs such as P4, others fall down to original code). Then inner loop
+# can be unrolled and modulo-scheduled to improve ILP and possibly
+# moved to 128-bit XMM register bank (though it would require input
+# rearrangement and/or increase bus bandwidth utilization). Dedicated
+# squaring procedure should give further performance improvement...
+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
+
+# December 2006
+#
+# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
+# Integer-only code [being equipped with dedicated squaring procedure]
+# gives ~40% on rsa512 sign benchmark...
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&function_begin("bn_mul_mont");
+
+$i="edx";
+$j="ecx";
+$ap="esi";	$tp="esi";		# overlapping variables!!!
+$rp="edi";	$bp="edi";		# overlapping variables!!!
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp");			# stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32;				# size of above frame rounded up to 16n
+
+	&xor	("eax","eax");
+	&mov	("edi",&wparam(5));	# int num
+	&cmp	("edi",4);
+	&jl	(&label("just_leave"));
+
+	&lea	("esi",&wparam(0));	# put aside pointer to argument block
+	&lea	("edx",&wparam(1));	# load ap
+	&mov	("ebp","esp");		# saved stack pointer!
+	&add	("edi",2);		# extra two words on top of tp
+	&neg	("edi");
+	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
+	&neg	("edi");
+
+	# minimize cache contention by arraning 2K window between stack
+	# pointer and ap argument [np is also position sensitive vector,
+	# but it's assumed to be near ap, as it's allocated at ~same
+	# time].
+	&mov	("eax","esp");
+	&sub	("eax","edx");
+	&and	("eax",2047);
+	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
+
+	&xor	("edx","esp");
+	&and	("edx",2048);
+	&xor	("edx",2048);
+	&sub	("esp","edx");		# this splits them apart modulo 4096
+
+	&and	("esp",-64);		# align to cache line
+
+	################################# load argument block...
+	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+	#&mov	("edi",&DWP(5*4,"esi"));# int num
+
+	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
+	&mov	($_rp,"eax");		# ... save a copy of argument block
+	&mov	($_ap,"ebx");
+	&mov	($_bp,"ecx");
+	&mov	($_np,"edx");
+	&mov	($_n0,"esi");
+	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
+	#&mov	($_num,$num);		# redundant as $num is not reused
+	&mov	($_sp,"ebp");		# saved stack pointer!
+
+if($sse2) {
+$acc0="mm0";	# mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+	&picmeup("eax","OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,"eax"),26);
+	&jnc	(&label("non_sse2"));
+
+	&mov	("eax",-1);
+	&movd	($mask,"eax");		# mask 32 lower bits
+
+	&mov	($ap,$_ap);		# load input pointers
+	&mov	($bp,$_bp);
+	&mov	($np,$_np);
+
+	&xor	($i,$i);		# i=0
+	&xor	($j,$j);		# j=0
+
+	&movd	($mul0,&DWP(0,$bp));		# bp[0]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
+	&movq	($car0,$mul1);
+	&movq	($acc0,$mul1);			# I wish movd worked for
+	&pand	($acc0,$mask);			# inter-register transfers
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
+	&paddq	($car1,$acc0);
+
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&inc	($j);				# j++
+&set_label("1st",16);
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
+	&psrlq	($car1,32);
+
+	&lea	($j,&DWP(1,$j));
+	&cmp	($j,$num);
+	&jl	(&label("1st"));
+
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&paddq	($car1,$car0);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&inc	($i);				# i++
+&set_label("outer");
+	&xor	($j,$j);			# j=0
+
+	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
+
+	&paddq	($mul1,$temp);			# +=tp[0]
+	&movq	($acc0,$mul1);
+	&movq	($car0,$mul1);
+	&pand	($acc0,$mask);
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);
+	&paddq	($car1,$acc0);
+
+	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[1]
+
+	&inc	($j);				# j++
+	&dec	($num);
+&set_label("inner");
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[j+1]
+
+	&dec	($num);
+	&lea	($j,&DWP(1,$j));		# j++
+	&jnz	(&label("inner"));
+
+	&mov	($num,$j);
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
+	&paddq	($car1,$car0);
+	&paddq	($car1,$temp);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&lea	($i,&DWP(1,$i));		# i++
+	&cmp	($i,$num);
+	&jle	(&label("outer"));
+
+	&emms	();				# done with mmx bank
+	&jmp	(&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+
+if (0) {
+	&mov	("esp",$_sp);
+	&xor	("eax","eax");	# signal "not fast enough [yet]"
+	&jmp	(&label("just_leave"));
+	# While the below code provides competitive performance for
+	# all key lengthes on modern Intel cores, it's still more
+	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
+	# means compared to the original integer-only assembler.
+	# 512-bit RSA sign is better by ~40%, but that's about all
+	# one can say about all CPUs...
+} else {
+$inp="esi";	# integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+	&mov	($inp,$_ap);
+	&lea	($carry,&DWP(1,$num));
+	&mov	($word,$_bp);
+	&xor	($j,$j);				# j=0
+	&mov	("edx",$inp);
+	&and	($carry,1);				# see if num is even
+	&sub	("edx",$word);				# see if ap==bp
+	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
+	&or	($carry,"edx");
+	&mov	($word,&DWP(0,$word));			# bp[0]
+	&jz	(&label("bn_sqr_mont"));
+	&mov	($_bpend,"eax");
+	&mov	("eax",&DWP(0,$inp));
+	&xor	("edx","edx");
+
+&set_label("mull",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[0]
+	&add	($carry,"eax");
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("mull"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[0]
+	 &mov	($word,$_n0);
+	&add	("eax",$carry);
+	 &mov	($inp,$_np);
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
+	&xor	($j,$j);
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&inc	($j);
+
+	&jmp	(&label("2ndmadd"));
+
+&set_label("1stmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[i]
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("1stmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[i]
+	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&xor	($j,$j);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
+	&adc	($j,0);
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&mov	($j,1);
+
+&set_label("2ndmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
+	&jl	(&label("2ndmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&xor	("eax","eax");
+	 &mov	($j,$_bp);				# &bp[i]
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	 &lea	($j,&DWP(4,$j));
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	 &cmp	($j,$_bpend);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(0,$j));			# bp[i+1]
+	&mov	($inp,$_ap);
+	&mov	($_bp,$j);				# &bp[++i]
+	&xor	($j,$j);
+	&xor	("edx","edx");
+	&mov	("eax",&DWP(0,$inp));
+	&jmp	(&label("1stmadd"));
+
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+	&mov	($_num,$num);
+	&mov	($_bp,$j);				# i=0
+
+	&mov	("eax",$word);				# ap[0]
+	&mul	($word);				# ap[0]*ap[0]
+	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
+	&mov	($sbit,"edx");
+	&shr	("edx",1);
+	&and	($sbit,1);
+	&inc	($j);
+&set_label("sqr",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[0]
+	&add	("eax",$carry);
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	&shr	("eax",31);
+	&cmp	($j,$_num);
+	&mov	($sbit,"eax");
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("sqr"));
+
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*ap[0]
+	&add	("eax",$carry);
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+	&shr	("eax",31);
+	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
+
+	&lea	($carry,&DWP(0,"eax","edx",2));
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&shr	("edx",31);
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	($num,$j);
+	&adc	("edx",0);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&mov	($j,1);
+
+&set_label("3rdmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j+1]*m
+	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
+	&lea	($j,&DWP(2,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("3rdmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&mov	($j,$_bp);				# i
+	&xor	("eax","eax");
+	&mov	($inp,$_ap);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	&cmp	($j,$num);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
+	&lea	($j,&DWP(1,$j));
+	&mov	("eax",$word);
+	&mov	($_bp,$j);				# ++i
+	&mul	($word);				# ap[i]*ap[i]
+	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
+	&adc	("edx",0);
+	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
+	&xor	($carry,$carry);
+	&cmp	($j,$num);
+	&lea	($j,&DWP(1,$j));
+	&je	(&label("sqrlast"));
+
+	&mov	($sbit,"edx");				# zaps $num
+	&shr	("edx",1);
+	&and	($sbit,1);
+&set_label("sqradd",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[i]
+	&add	("eax",$carry);
+	&lea	($carry,&DWP(0,"eax","eax"));
+	&adc	("edx",0);
+	&shr	("eax",31);
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("eax",0);
+	&add	($carry,$sbit);
+	&adc	("eax",0);
+	&cmp	($j,$_num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&mov	($sbit,"eax");
+	&jle	(&label("sqradd"));
+
+	&mov	($carry,"edx");
+	&lea	("edx",&DWP(0,$sbit,"edx",2));
+	&shr	($carry,31);
+&set_label("sqrlast");
+	&mov	($word,$_n0);
+	&mov	($inp,$_np);
+	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&adc	($carry,0);
+	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&lea	($num,&DWP(-1,$j));
+	&adc	("edx",0);
+	&mov	($j,1);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+
+	&jmp	(&label("3rdmadd"));
+}
+
+&set_label("common_tail",16);
+	&mov	($np,$_np);			# load modulus pointer
+	&mov	($rp,$_rp);			# load result pointer
+	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
+
+	&mov	("eax",&DWP(0,$tp));		# tp[0]
+	&mov	($j,$num);			# j=num-1
+	&xor	($i,$i);			# i=0 and clear CF!
+
+&set_label("sub",16);
+	&sbb	("eax",&DWP(0,$np,$i,4));
+	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
+	&dec	($j);				# doesn't affect CF!
+	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
+	&lea	($i,&DWP(1,$i));		# i++
+	&jge	(&label("sub"));
+
+	&sbb	("eax",0);			# handle upmost overflow bit
+	&and	($tp,"eax");
+	&not	("eax");
+	&mov	($np,$rp);
+	&and	($np,"eax");
+	&or	($tp,$np);			# tp=carry?tp:rp
+
+&set_label("copy",16);				# copy or in-place refresh
+	&mov	("eax",&DWP(0,$tp,$num,4));
+	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
+	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
+	&dec	($num);
+	&jge	(&label("copy"));
+
+	&mov	("esp",$_sp);		# pull saved stack pointer
+	&mov	("eax",1);
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index d13ec5a468..a0c49ee978 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -1,3 +1,7 @@
+#include "../bn_lcl.h"
+#if !(defined(__GNUC__) && __GNUC__>=2)
+# include "../bn_asm.c"	/* kind of dirty hack for Sun Studio */
+#else
 /*
  * x86_64 BIGNUM accelerator version 0.1, December 2002.
  *
@@ -51,7 +55,14 @@
  *    machine.
  */
 
+#ifdef _WIN64
+#define BN_ULONG unsigned long long
+#else
 #define BN_ULONG unsigned long
+#endif
+
+#undef mul
+#undef mul_add
 
 /*
  * "m"(a), "+m"(r)	is the way to favor DirectPath �-code;
@@ -94,7 +105,7 @@
 		: "a"(a)		\
 		: "cc");
 
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	{
 	BN_ULONG c1=0;
 
@@ -118,7 +129,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 	return(c1);
 	} 
 
-BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	{
 	BN_ULONG c1=0;
 
@@ -141,7 +152,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 	return(c1);
 	} 
 
-void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
         {
 	if (n <= 0) return;
 
@@ -172,21 +183,21 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 	return ret;
 }
 
-BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
+BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
 { BN_ULONG ret=0,i=0;
 
 	if (n <= 0) return 0;
 
 	asm (
 	"	subq	%2,%2		\n"
-	".align 16			\n"
+	".p2align 4			\n"
 	"1:	movq	(%4,%2,8),%0	\n"
 	"	adcq	(%5,%2,8),%0	\n"
 	"	movq	%0,(%3,%2,8)	\n"
 	"	leaq	1(%2),%2	\n"
 	"	loop	1b		\n"
 	"	sbbq	%0,%0		\n"
-		: "+a"(ret),"+c"(n),"+r"(i)
+		: "=&a"(ret),"+c"(n),"=&r"(i)
 		: "r"(rp),"r"(ap),"r"(bp)
 		: "cc"
 	);
@@ -195,21 +206,21 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
 }
 
 #ifndef SIMICS
-BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
+BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
 { BN_ULONG ret=0,i=0;
 
 	if (n <= 0) return 0;
 
 	asm (
 	"	subq	%2,%2		\n"
-	".align 16			\n"
+	".p2align 4			\n"
 	"1:	movq	(%4,%2,8),%0	\n"
 	"	sbbq	(%5,%2,8),%0	\n"
 	"	movq	%0,(%3,%2,8)	\n"
 	"	leaq	1(%2),%2	\n"
 	"	loop	1b		\n"
 	"	sbbq	%0,%0		\n"
-		: "+a"(ret),"+c"(n),"+r"(i)
+		: "=&a"(ret),"+c"(n),"=&r"(i)
 		: "r"(rp),"r"(ap),"r"(bp)
 		: "cc"
 	);
@@ -482,7 +493,7 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 	r[7]=c2;
 	}
 
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t1,t2;
 	BN_ULONG c1,c2,c3;
@@ -558,7 +569,7 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
 	r[15]=c1;
 	}
 
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t1,t2;
 	BN_ULONG c1,c2,c3;
@@ -591,3 +602,4 @@ void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
 	r[6]=c1;
 	r[7]=c2;
 	}
+#endif
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
new file mode 100755
index 0000000000..3b7a6f243f
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -0,0 +1,330 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005.
+#
+# Montgomery multiplication routine for x86_64. While it gives modest
+# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
+# than twice, >2x, as fast. Most common rsa1024 sign is improved by
+# respectful 50%. It remains to be seen if loop unrolling and
+# dedicated squaring routine can provide further improvement...
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# int bn_mul_mont(
+$rp="%rdi";	# BN_ULONG *rp,
+$ap="%rsi";	# const BN_ULONG *ap,
+$bp="%rdx";	# const BN_ULONG *bp,
+$np="%rcx";	# const BN_ULONG *np,
+$n0="%r8";	# const BN_ULONG *n0,
+$num="%r9";	# int num);
+$lo0="%r10";
+$hi0="%r11";
+$bp="%r12";	# reassign $bp
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl	bn_mul_mont
+.type	bn_mul_mont,\@function,6
+.align	16
+bn_mul_mont:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	${num}d,${num}d
+	lea	2($num),%r10
+	mov	%rsp,%r11
+	neg	%r10
+	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lprologue:
+	mov	%rdx,$bp		# $bp reassigned, remember?
+
+	mov	($n0),$n0		# pull n0[0] value
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	mov	($bp),$m0		# m0=bp[0]
+	mov	($ap),%rax
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$lo0
+	mov	%rdx,$hi0
+
+	imulq	$n0,%rax		# "tp[0]"*n0
+	mov	%rax,$m1
+
+	mulq	($np)			# np[0]*m1
+	add	$lo0,%rax		# discarded
+	adc	\$0,%rdx
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+.L1st:
+	mov	($ap,$j,8),%rax
+	mulq	$m0			# ap[j]*bp[0]
+	add	$hi0,%rax
+	adc	\$0,%rdx
+	mov	%rax,$lo0
+	mov	($np,$j,8),%rax
+	mov	%rdx,$hi0
+
+	mulq	$m1			# np[j]*m1
+	add	$hi1,%rax
+	lea	1($j),$j		# j++
+	adc	\$0,%rdx
+	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
+	cmp	$num,$j
+	mov	%rdx,$hi1
+	jl	.L1st
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+.align	4
+.Louter:
+	xor	$j,$j			# j=0
+
+	mov	($bp,$i,8),$m0		# m0=bp[i]
+	mov	($ap),%rax		# ap[0]
+	mulq	$m0			# ap[0]*bp[i]
+	add	(%rsp),%rax		# ap[0]*bp[i]+tp[0]
+	adc	\$0,%rdx
+	mov	%rax,$lo0
+	mov	%rdx,$hi0
+
+	imulq	$n0,%rax		# tp[0]*n0
+	mov	%rax,$m1
+
+	mulq	($np,$j,8)		# np[0]*m1
+	add	$lo0,%rax		# discarded
+	mov	8(%rsp),$lo0		# tp[1]
+	adc	\$0,%rdx
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+.align	4
+.Linner:
+	mov	($ap,$j,8),%rax
+	mulq	$m0			# ap[j]*bp[i]
+	add	$hi0,%rax
+	adc	\$0,%rdx
+	add	%rax,$lo0		# ap[j]*bp[i]+tp[j]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$hi0
+
+	mulq	$m1			# np[j]*m1
+	add	$hi1,%rax
+	lea	1($j),$j		# j++
+	adc	\$0,%rdx
+	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	(%rsp,$j,8),$lo0
+	cmp	$num,$j
+	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+	jl	.Linner
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# pull upmost overflow bit
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	cmp	$num,$i
+	jl	.Louter
+
+	lea	(%rsp),$ap		# borrow ap for tp
+	lea	-1($num),$j		# j=num-1
+
+	mov	($ap),%rax		# tp[0]
+	xor	$i,$i			# i=0 and clear CF!
+	jmp	.Lsub
+.align	16
+.Lsub:	sbb	($np,$i,8),%rax
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
+	dec	$j			# doesn't affect CF!
+	mov	8($ap,$i,8),%rax	# tp[i+1]
+	lea	1($i),$i		# i++
+	jge	.Lsub
+
+	sbb	\$0,%rax		# handle upmost overflow bit
+	and	%rax,$ap
+	not	%rax
+	mov	$rp,$np
+	and	%rax,$np
+	lea	-1($num),$j
+	or	$np,$ap			# ap=borrow?tp:rp
+.align	16
+.Lcopy:					# copy or in-place refresh
+	mov	($ap,$j,8),%rax
+	mov	%rax,($rp,$j,8)		# rp[i]=tp[i]
+	mov	$i,(%rsp,$j,8)		# zap temporary vector
+	dec	$j
+	jge	.Lcopy
+
+	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+	mov	\$1,%rax
+	mov	(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lepilogue:
+	ret
+.size	bn_mul_mont,.-bn_mul_mont
+.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lprologue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lprologue
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lin_prologue
+
+	mov	192($context),%r10	# pull $num
+	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
+	lea	48(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_bn_mul_mont
+	.rva	.LSEH_end_bn_mul_mont
+	.rva	.LSEH_info_bn_mul_mont
+
+.section	.xdata
+.align	8
+.LSEH_info_bn_mul_mont:
+	.byte	9,0,0,0
+	.rva	se_handler
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index acf48b9784..e484b7fc11 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -56,6 +56,59 @@
  * [including the GNU Public Licence.]
  */
 /* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
  * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
  *
  * Portions of the attached software ("Contribution") are developed by 
@@ -77,6 +130,7 @@
 #include <stdio.h> /* FILE */
 #endif
 #include <openssl/ossl_typ.h>
+#include <openssl/crypto.h>
 
 #ifdef  __cplusplus
 extern "C" {
@@ -90,21 +144,15 @@ extern "C" {
  * BN_DEBUG - turn on various debugging alterations to the bignum code
  * BN_DEBUG_RAND - uses random poisoning of unused words to trip up
  * mismanagement of bignum internals. You must also define BN_DEBUG.
- * BN_STRICT - disables anything (not already caught by BN_DEBUG) that uses the
- * old ambiguity over zero representation. At some point, this behaviour should
- * become standard.
  */
 /* #define BN_DEBUG */
 /* #define BN_DEBUG_RAND */
-/* #define BN_STRICT */
-
-#ifdef OPENSSL_SYS_VMS
-#undef BN_LLONG /* experimental, so far... */
-#endif
 
+#ifndef OPENSSL_SMALL_FOOTPRINT
 #define BN_MUL_COMBA
 #define BN_SQR_COMBA
 #define BN_RECURSION
+#endif
 
 /* This next option uses the C libraries (2 word)/(1 word) function.
  * If it is not defined, I use my C version (which is slower).
@@ -145,6 +193,8 @@ extern "C" {
 #define BN_DEC_FMT1	"%lu"
 #define BN_DEC_FMT2	"%019lu"
 #define BN_DEC_NUM	19
+#define BN_HEX_FMT1	"%lX"
+#define BN_HEX_FMT2	"%016lX"
 #endif
 
 /* This is where the long long data type is 64 bits, but long is 32.
@@ -170,93 +220,72 @@ extern "C" {
 #define BN_DEC_FMT1	"%llu"
 #define BN_DEC_FMT2	"%019llu"
 #define BN_DEC_NUM	19
+#define BN_HEX_FMT1	"%llX"
+#define BN_HEX_FMT2	"%016llX"
 #endif
 
 #ifdef THIRTY_TWO_BIT
-#if defined(OPENSSL_SYS_WIN32) && !defined(__GNUC__)
-#define BN_ULLONG	unsigned _int64
-#else
-#define BN_ULLONG	unsigned long long
+#ifdef BN_LLONG
+# if defined(_WIN32) && !defined(__GNUC__)
+#  define BN_ULLONG	unsigned __int64
+#  define BN_MASK	(0xffffffffffffffffI64)
+# else
+#  define BN_ULLONG	unsigned long long
+#  define BN_MASK	(0xffffffffffffffffLL)
+# endif
 #endif
-#define BN_ULONG	unsigned long
-#define BN_LONG		long
+#define BN_ULONG	unsigned int
+#define BN_LONG		int
 #define BN_BITS		64
 #define BN_BYTES	4
 #define BN_BITS2	32
 #define BN_BITS4	16
-#ifdef OPENSSL_SYS_WIN32
-/* VC++ doesn't like the LL suffix */
-#define BN_MASK		(0xffffffffffffffffL)
-#else
-#define BN_MASK		(0xffffffffffffffffLL)
-#endif
 #define BN_MASK2	(0xffffffffL)
 #define BN_MASK2l	(0xffff)
 #define BN_MASK2h1	(0xffff8000L)
 #define BN_MASK2h	(0xffff0000L)
 #define BN_TBIT		(0x80000000L)
 #define BN_DEC_CONV	(1000000000L)
-#define BN_DEC_FMT1	"%lu"
-#define BN_DEC_FMT2	"%09lu"
-#define BN_DEC_NUM	9
-#endif
-
-#ifdef SIXTEEN_BIT
-#ifndef BN_DIV2W
-#define BN_DIV2W
-#endif
-#define BN_ULLONG	unsigned long
-#define BN_ULONG	unsigned short
-#define BN_LONG		short
-#define BN_BITS		32
-#define BN_BYTES	2
-#define BN_BITS2	16
-#define BN_BITS4	8
-#define BN_MASK		(0xffffffff)
-#define BN_MASK2	(0xffff)
-#define BN_MASK2l	(0xff)
-#define BN_MASK2h1	(0xff80)
-#define BN_MASK2h	(0xff00)
-#define BN_TBIT		(0x8000)
-#define BN_DEC_CONV	(100000)
-#define BN_DEC_FMT1	"%u"
-#define BN_DEC_FMT2	"%05u"
-#define BN_DEC_NUM	5
-#endif
-
-#ifdef EIGHT_BIT
-#ifndef BN_DIV2W
-#define BN_DIV2W
-#endif
-#define BN_ULLONG	unsigned short
-#define BN_ULONG	unsigned char
-#define BN_LONG		char
-#define BN_BITS		16
-#define BN_BYTES	1
-#define BN_BITS2	8
-#define BN_BITS4	4
-#define BN_MASK		(0xffff)
-#define BN_MASK2	(0xff)
-#define BN_MASK2l	(0xf)
-#define BN_MASK2h1	(0xf8)
-#define BN_MASK2h	(0xf0)
-#define BN_TBIT		(0x80)
-#define BN_DEC_CONV	(100)
 #define BN_DEC_FMT1	"%u"
-#define BN_DEC_FMT2	"%02u"
-#define BN_DEC_NUM	2
+#define BN_DEC_FMT2	"%09u"
+#define BN_DEC_NUM	9
+#define BN_HEX_FMT1	"%X"
+#define BN_HEX_FMT2	"%08X"
 #endif
 
 #define BN_DEFAULT_BITS	1280
 
 #define BN_FLG_MALLOCED		0x01
 #define BN_FLG_STATIC_DATA	0x02
+#define BN_FLG_CONSTTIME	0x04 /* avoid leaking exponent information through timing,
+                                      * BN_mod_exp_mont() will call BN_mod_exp_mont_consttime,
+                                      * BN_div() will call BN_div_no_branch,
+                                      * BN_mod_inverse() will call BN_mod_inverse_no_branch.
+                                      */
+
+#ifndef OPENSSL_NO_DEPRECATED
+#define BN_FLG_EXP_CONSTTIME BN_FLG_CONSTTIME /* deprecated name for the flag */
+                                      /* avoid leaking exponent information through timings
+                                      * (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime) */
+#endif
+
 #ifndef OPENSSL_NO_DEPRECATED
 #define BN_FLG_FREE		0x8000	/* used for debuging */
 #endif
 #define BN_set_flags(b,n)	((b)->flags|=(n))
 #define BN_get_flags(b,n)	((b)->flags&(n))
 
+/* get a clone of a BIGNUM with changed flags, for *temporary* use only
+ * (the two BIGNUMs cannot not be used in parallel!) */
+#define BN_with_flags(dest,b,n)  ((dest)->d=(b)->d, \
+                                  (dest)->top=(b)->top, \
+                                  (dest)->dmax=(b)->dmax, \
+                                  (dest)->neg=(b)->neg, \
+                                  (dest)->flags=(((dest)->flags & BN_FLG_MALLOCED) \
+                                                 |  ((b)->flags & ~BN_FLG_MALLOCED) \
+                                                 |  BN_FLG_STATIC_DATA \
+                                                 |  (n)))
+
 /* Already declared in ossl_typ.h */
 #if 0
 typedef struct bignum_st BIGNUM;
@@ -278,16 +307,6 @@ struct bignum_st
 	int flags;
 	};
 
-struct bn_blinding_st
-	{
-	int init;
-	BIGNUM *A;
-	BIGNUM *Ai;
-	BIGNUM *mod; /* just a reference */
-	unsigned long thread_id; /* added in OpenSSL 0.9.6j and 0.9.7b;
-				  * used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
-	};
-
 /* Used for montgomery multiplication */
 struct bn_mont_ctx_st
 	{
@@ -296,7 +315,8 @@ struct bn_mont_ctx_st
 	BIGNUM N;      /* The modulus */
 	BIGNUM Ni;     /* R*(1/R mod N) - N*Ni = 1
 	                * (Ni is only stored for bignum algorithm) */
-	BN_ULONG n0;   /* least significant word of Ni */
+	BN_ULONG n0[2];/* least significant word(s) of Ni;
+	                  (type changed with 0.9.9, was "BN_ULONG n0;" before) */
 	int flags;
 	};
 
@@ -366,11 +386,7 @@ int BN_GENCB_call(BN_GENCB *cb, int a, int b);
 /* Note that BN_abs_is_word didn't work reliably for w == 0 until 0.9.8 */
 #define BN_abs_is_word(a,w) ((((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w))) || \
 				(((w) == 0) && ((a)->top == 0)))
-#ifdef BN_STRICT
 #define BN_is_zero(a)       ((a)->top == 0)
-#else
-#define BN_is_zero(a)       BN_abs_is_word(a,0)
-#endif
 #define BN_is_one(a)        (BN_abs_is_word((a),1) && !(a)->neg)
 #define BN_is_word(a,w)     (BN_abs_is_word((a),(w)) && (!(w) || !(a)->neg))
 #define BN_is_odd(a)	    (((a)->top > 0) && ((a)->d[0] & 1))
@@ -387,14 +403,6 @@ int BN_GENCB_call(BN_GENCB *cb, int a, int b);
 #else
 #define BN_zero(a)	(BN_set_word((a),0))
 #endif
-/* BN_set_sign(BIGNUM *, int) sets the sign of a BIGNUM
- * (0 for a non-negative value, 1 for negative) */
-#define BN_set_sign(a,b) ((a)->neg = (b))
-/* BN_get_sign(BIGNUM *) returns the sign of the BIGNUM */
-#define BN_get_sign(a)   ((a)->neg)
-
-/*#define BN_ascii2bn(a)	BN_hex2bn(a) */
-/*#define BN_bn2ascii(a)	BN_bn2hex(a) */
 
 const BIGNUM *BN_value_one(void);
 char *	BN_options(void);
@@ -408,16 +416,14 @@ BIGNUM *BN_CTX_get(BN_CTX *ctx);
 void	BN_CTX_end(BN_CTX *ctx);
 int     BN_rand(BIGNUM *rnd, int bits, int top,int bottom);
 int     BN_pseudo_rand(BIGNUM *rnd, int bits, int top,int bottom);
-int	BN_rand_range(BIGNUM *rnd, BIGNUM *range);
-int	BN_pseudo_rand_range(BIGNUM *rnd, BIGNUM *range);
+int	BN_rand_range(BIGNUM *rnd, const BIGNUM *range);
+int	BN_pseudo_rand_range(BIGNUM *rnd, const BIGNUM *range);
 int	BN_num_bits(const BIGNUM *a);
 int	BN_num_bits_word(BN_ULONG);
 BIGNUM *BN_new(void);
 void	BN_init(BIGNUM *);
 void	BN_clear_free(BIGNUM *a);
 BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b);
-/* BN_ncopy(): like BN_copy() but copies at most the first n BN_ULONGs */
-BIGNUM *BN_ncopy(BIGNUM *a, const BIGNUM *b, size_t n);
 void	BN_swap(BIGNUM *a, BIGNUM *b);
 BIGNUM *BN_bin2bn(const unsigned char *s,int len,BIGNUM *ret);
 int	BN_bn2bin(const BIGNUM *a, unsigned char *to);
@@ -429,6 +435,16 @@ int	BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
 int	BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
 int	BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
 int	BN_sqr(BIGNUM *r, const BIGNUM *a,BN_CTX *ctx);
+/** BN_set_negative sets sign of a BIGNUM
+ * \param  b  pointer to the BIGNUM object
+ * \param  n  0 if the BIGNUM b should be positive and a value != 0 otherwise 
+ */
+void	BN_set_negative(BIGNUM *b, int n);
+/** BN_is_negative returns 1 if the BIGNUM is negative
+ * \param  a  pointer to the BIGNUM object
+ * \return 1 if a < 0 and 0 otherwise
+ */
+#define BN_is_negative(a) ((a)->neg != 0)
 
 int	BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
 	BN_CTX *ctx);
@@ -465,6 +481,8 @@ int	BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 	const BIGNUM *m,BN_CTX *ctx);
 int	BN_mod_exp_mont(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 	const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+	const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont);
 int	BN_mod_exp_mont_word(BIGNUM *r, BN_ULONG a, const BIGNUM *p,
 	const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
 int	BN_mod_exp2_mont(BIGNUM *r, const BIGNUM *a1, const BIGNUM *p1,
@@ -494,6 +512,7 @@ char *	BN_bn2hex(const BIGNUM *a);
 char *	BN_bn2dec(const BIGNUM *a);
 int 	BN_hex2bn(BIGNUM **a, const char *str);
 int 	BN_dec2bn(BIGNUM **a, const char *str);
+int	BN_asc2bn(BIGNUM **a, const char *str);
 int	BN_gcd(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx);
 int	BN_kronecker(const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx); /* returns -2 for error */
 BIGNUM *BN_mod_inverse(BIGNUM *ret,
@@ -532,12 +551,32 @@ int BN_from_montgomery(BIGNUM *r,const BIGNUM *a,
 void BN_MONT_CTX_free(BN_MONT_CTX *mont);
 int BN_MONT_CTX_set(BN_MONT_CTX *mont,const BIGNUM *mod,BN_CTX *ctx);
 BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to,BN_MONT_CTX *from);
+BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
+					const BIGNUM *mod, BN_CTX *ctx);
 
-BN_BLINDING *BN_BLINDING_new(BIGNUM *A,BIGNUM *Ai,BIGNUM *mod);
+/* BN_BLINDING flags */
+#define	BN_BLINDING_NO_UPDATE	0x00000001
+#define	BN_BLINDING_NO_RECREATE	0x00000002
+
+BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod);
 void BN_BLINDING_free(BN_BLINDING *b);
 int BN_BLINDING_update(BN_BLINDING *b,BN_CTX *ctx);
-int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *r, BN_CTX *ctx);
+int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
 int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
+int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *);
+int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *);
+#ifndef OPENSSL_NO_DEPRECATED
+unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *);
+void BN_BLINDING_set_thread_id(BN_BLINDING *, unsigned long);
+#endif
+CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *);
+unsigned long BN_BLINDING_get_flags(const BN_BLINDING *);
+void BN_BLINDING_set_flags(BN_BLINDING *, unsigned long);
+BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
+	const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
+	int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+			  const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx),
+	BN_MONT_CTX *m_ctx);
 
 #ifndef OPENSSL_NO_DEPRECATED
 void BN_set_params(int mul,int high,int low,int mont);
@@ -587,24 +626,24 @@ int	BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
  *     t^p[0] + t^p[1] + ... + t^p[k]
  * where m = p[0] > p[1] > ... > p[k] = 0.
  */
-int	BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[]);
+int	BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]);
 	/* r = a mod p */
 int	BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
-	const unsigned int p[], BN_CTX *ctx); /* r = (a * b) mod p */
-int	BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[],
+	const int p[], BN_CTX *ctx); /* r = (a * b) mod p */
+int	BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[],
 	BN_CTX *ctx); /* r = (a * a) mod p */
-int	BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *b, const unsigned int p[],
+int	BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *b, const int p[],
 	BN_CTX *ctx); /* r = (1 / b) mod p */
 int	BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
-	const unsigned int p[], BN_CTX *ctx); /* r = (a / b) mod p */
+	const int p[], BN_CTX *ctx); /* r = (a / b) mod p */
 int	BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
-	const unsigned int p[], BN_CTX *ctx); /* r = (a ^ b) mod p */
+	const int p[], BN_CTX *ctx); /* r = (a ^ b) mod p */
 int	BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a,
-	const unsigned int p[], BN_CTX *ctx); /* r = sqrt(a) mod p */
+	const int p[], BN_CTX *ctx); /* r = sqrt(a) mod p */
 int	BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
-	const unsigned int p[], BN_CTX *ctx); /* r^2 + r = a mod p */
-int	BN_GF2m_poly2arr(const BIGNUM *a, unsigned int p[], int max);
-int	BN_GF2m_arr2poly(const unsigned int p[], BIGNUM *a);
+	const int p[], BN_CTX *ctx); /* r^2 + r = a mod p */
+int	BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
+int	BN_GF2m_arr2poly(const int p[], BIGNUM *a);
 
 /* faster mod functions for the 'NIST primes' 
  * 0 <= a < p^2 */
@@ -693,9 +732,11 @@ int RAND_pseudo_bytes(unsigned char *buf,int num);
 #define bn_check_top(a) \
 	do { \
 		const BIGNUM *_bnum2 = (a); \
-		assert((_bnum2->top == 0) || \
+		if (_bnum2 != NULL) { \
+			assert((_bnum2->top == 0) || \
 				(_bnum2->d[_bnum2->top - 1] != 0)); \
-		bn_pollute(_bnum2); \
+			bn_pollute(_bnum2); \
+		} \
 	} while(0)
 
 #define bn_fix_top(a)		bn_check_top(a)
@@ -711,10 +752,12 @@ int RAND_pseudo_bytes(unsigned char *buf,int num);
 #define bn_correct_top(a) \
         { \
         BN_ULONG *ftl; \
-	if ((a)->top > 0) \
+	int tmp_top = (a)->top; \
+	if (tmp_top > 0) \
 		{ \
-		for (ftl= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \
-		if (*(ftl--)) break; \
+		for (ftl= &((a)->d[tmp_top-1]); tmp_top > 0; tmp_top--) \
+			if (*(ftl--)) break; \
+		(a)->top = tmp_top; \
 		} \
 	bn_pollute(a); \
 	}
@@ -726,6 +769,18 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
 BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
 BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
 
+/* Primes from RFC 2409 */
+BIGNUM *get_rfc2409_prime_768(BIGNUM *bn);
+BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn);
+
+/* Primes from RFC 3526 */
+BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn);
+
 int BN_bntest_rand(BIGNUM *rnd, int bits, int top,int bottom);
 
 /* BEGIN ERROR CODES */
@@ -737,28 +792,38 @@ void ERR_load_BN_strings(void);
 /* Error codes for the BN functions. */
 
 /* Function codes. */
-#define BN_F_BN_BLINDING_CONVERT			 100
-#define BN_F_BN_BLINDING_INVERT				 101
+#define BN_F_BNRAND					 127
+#define BN_F_BN_BLINDING_CONVERT_EX			 100
+#define BN_F_BN_BLINDING_CREATE_PARAM			 128
+#define BN_F_BN_BLINDING_INVERT_EX			 101
 #define BN_F_BN_BLINDING_NEW				 102
 #define BN_F_BN_BLINDING_UPDATE				 103
 #define BN_F_BN_BN2DEC					 104
 #define BN_F_BN_BN2HEX					 105
 #define BN_F_BN_CTX_GET					 116
 #define BN_F_BN_CTX_NEW					 106
+#define BN_F_BN_CTX_START				 129
 #define BN_F_BN_DIV					 107
+#define BN_F_BN_DIV_NO_BRANCH				 138
+#define BN_F_BN_DIV_RECP				 130
+#define BN_F_BN_EXP					 123
 #define BN_F_BN_EXPAND2					 108
 #define BN_F_BN_EXPAND_INTERNAL				 120
-#define BN_F_BN_GF2M_MOD				 126
-#define BN_F_BN_GF2M_MOD_DIV				 123
-#define BN_F_BN_GF2M_MOD_EXP				 127
-#define BN_F_BN_GF2M_MOD_MUL				 124
-#define BN_F_BN_GF2M_MOD_SOLVE_QUAD			 128
-#define BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR			 129
-#define BN_F_BN_GF2M_MOD_SQR				 125
+#define BN_F_BN_GF2M_MOD				 131
+#define BN_F_BN_GF2M_MOD_EXP				 132
+#define BN_F_BN_GF2M_MOD_MUL				 133
+#define BN_F_BN_GF2M_MOD_SOLVE_QUAD			 134
+#define BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR			 135
+#define BN_F_BN_GF2M_MOD_SQR				 136
+#define BN_F_BN_GF2M_MOD_SQRT				 137
 #define BN_F_BN_MOD_EXP2_MONT				 118
 #define BN_F_BN_MOD_EXP_MONT				 109
+#define BN_F_BN_MOD_EXP_MONT_CONSTTIME			 124
 #define BN_F_BN_MOD_EXP_MONT_WORD			 117
+#define BN_F_BN_MOD_EXP_RECP				 125
+#define BN_F_BN_MOD_EXP_SIMPLE				 126
 #define BN_F_BN_MOD_INVERSE				 110
+#define BN_F_BN_MOD_INVERSE_NO_BRANCH			 139
 #define BN_F_BN_MOD_LSHIFT_QUICK			 119
 #define BN_F_BN_MOD_MUL_RECIPROCAL			 111
 #define BN_F_BN_MOD_SQRT				 121
@@ -780,10 +845,9 @@ void ERR_load_BN_strings(void);
 #define BN_R_INVALID_LENGTH				 106
 #define BN_R_INVALID_RANGE				 115
 #define BN_R_NOT_A_SQUARE				 111
-#define BN_R_NOT_IMPLEMENTED				 116
 #define BN_R_NOT_INITIALIZED				 107
 #define BN_R_NO_INVERSE					 108
-#define BN_R_NO_SOLUTION				 117
+#define BN_R_NO_SOLUTION				 116
 #define BN_R_P_IS_NOT_PRIME				 112
 #define BN_R_TOO_MANY_ITERATIONS			 113
 #define BN_R_TOO_MANY_TEMPORARY_VARIABLES		 109
diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c
index be8aa3ffc5..c43c91cc09 100644
--- a/crypto/bn/bn_asm.c
+++ b/crypto/bn/bn_asm.c
@@ -75,6 +75,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	assert(num >= 0);
 	if (num <= 0) return(c1);
 
+#ifndef OPENSSL_SMALL_FOOTPRINT
 	while (num&~3)
 		{
 		mul_add(rp[0],ap[0],w,c1);
@@ -83,11 +84,11 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 		mul_add(rp[3],ap[3],w,c1);
 		ap+=4; rp+=4; num-=4;
 		}
-	if (num)
+#endif
+	while (num)
 		{
-		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
-		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
-		mul_add(rp[2],ap[2],w,c1); return c1;
+		mul_add(rp[0],ap[0],w,c1);
+		ap++; rp++; num--;
 		}
 	
 	return(c1);
@@ -100,6 +101,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	assert(num >= 0);
 	if (num <= 0) return(c1);
 
+#ifndef OPENSSL_SMALL_FOOTPRINT
 	while (num&~3)
 		{
 		mul(rp[0],ap[0],w,c1);
@@ -108,11 +110,11 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 		mul(rp[3],ap[3],w,c1);
 		ap+=4; rp+=4; num-=4;
 		}
-	if (num)
+#endif
+	while (num)
 		{
-		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
-		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
-		mul(rp[2],ap[2],w,c1);
+		mul(rp[0],ap[0],w,c1);
+		ap++; rp++; num--;
 		}
 	return(c1);
 	} 
@@ -121,6 +123,8 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
         {
 	assert(n >= 0);
 	if (n <= 0) return;
+
+#ifndef OPENSSL_SMALL_FOOTPRINT
 	while (n&~3)
 		{
 		sqr(r[0],r[1],a[0]);
@@ -129,11 +133,11 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
 		sqr(r[6],r[7],a[3]);
 		a+=4; r+=8; n-=4;
 		}
-	if (n)
+#endif
+	while (n)
 		{
-		sqr(r[0],r[1],a[0]); if (--n == 0) return;
-		sqr(r[2],r[3],a[1]); if (--n == 0) return;
-		sqr(r[4],r[5],a[2]);
+		sqr(r[0],r[1],a[0]);
+		a++; r+=2; n--;
 		}
 	}
 
@@ -150,18 +154,20 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	bl=LBITS(w);
 	bh=HBITS(w);
 
-	for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+	while (num&~3)
 		{
 		mul_add(rp[0],ap[0],bl,bh,c);
-		if (--num == 0) break;
 		mul_add(rp[1],ap[1],bl,bh,c);
-		if (--num == 0) break;
 		mul_add(rp[2],ap[2],bl,bh,c);
-		if (--num == 0) break;
 		mul_add(rp[3],ap[3],bl,bh,c);
-		if (--num == 0) break;
-		ap+=4;
-		rp+=4;
+		ap+=4; rp+=4; num-=4;
+		}
+#endif
+	while (num)
+		{
+		mul_add(rp[0],ap[0],bl,bh,c);
+		ap++; rp++; num--;
 		}
 	return(c);
 	} 
@@ -177,18 +183,20 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	bl=LBITS(w);
 	bh=HBITS(w);
 
-	for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+	while (num&~3)
 		{
 		mul(rp[0],ap[0],bl,bh,carry);
-		if (--num == 0) break;
 		mul(rp[1],ap[1],bl,bh,carry);
-		if (--num == 0) break;
 		mul(rp[2],ap[2],bl,bh,carry);
-		if (--num == 0) break;
 		mul(rp[3],ap[3],bl,bh,carry);
-		if (--num == 0) break;
-		ap+=4;
-		rp+=4;
+		ap+=4; rp+=4; num-=4;
+		}
+#endif
+	while (num)
+		{
+		mul(rp[0],ap[0],bl,bh,carry);
+		ap++; rp++; num--;
 		}
 	return(carry);
 	} 
@@ -197,22 +205,21 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
         {
 	assert(n >= 0);
 	if (n <= 0) return;
-	for (;;)
+
+#ifndef OPENSSL_SMALL_FOOTPRINT
+	while (n&~3)
 		{
 		sqr64(r[0],r[1],a[0]);
-		if (--n == 0) break;
-
 		sqr64(r[2],r[3],a[1]);
-		if (--n == 0) break;
-
 		sqr64(r[4],r[5],a[2]);
-		if (--n == 0) break;
-
 		sqr64(r[6],r[7],a[3]);
-		if (--n == 0) break;
-
-		a+=4;
-		r+=8;
+		a+=4; r+=8; n-=4;
+		}
+#endif
+	while (n)
+		{
+		sqr64(r[0],r[1],a[0]);
+		a++; r+=2; n--;
 		}
 	}
 
@@ -237,7 +244,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 	if (d == 0) return(BN_MASK2);
 
 	i=BN_num_bits_word(d);
-	assert((i == BN_BITS2) || (h > (BN_ULONG)1<<i));
+	assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i));
 
 	i=BN_BITS2-i;
 	if (h >= d) h-=d;
@@ -303,31 +310,30 @@ BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 	assert(n >= 0);
 	if (n <= 0) return((BN_ULONG)0);
 
-	for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+	while (n&~3)
 		{
 		ll+=(BN_ULLONG)a[0]+b[0];
 		r[0]=(BN_ULONG)ll&BN_MASK2;
 		ll>>=BN_BITS2;
-		if (--n <= 0) break;
-
 		ll+=(BN_ULLONG)a[1]+b[1];
 		r[1]=(BN_ULONG)ll&BN_MASK2;
 		ll>>=BN_BITS2;
-		if (--n <= 0) break;
-
 		ll+=(BN_ULLONG)a[2]+b[2];
 		r[2]=(BN_ULONG)ll&BN_MASK2;
 		ll>>=BN_BITS2;
-		if (--n <= 0) break;
-
 		ll+=(BN_ULLONG)a[3]+b[3];
 		r[3]=(BN_ULONG)ll&BN_MASK2;
 		ll>>=BN_BITS2;
-		if (--n <= 0) break;
-
-		a+=4;
-		b+=4;
-		r+=4;
+		a+=4; b+=4; r+=4; n-=4;
+		}
+#endif
+	while (n)
+		{
+		ll+=(BN_ULLONG)a[0]+b[0];
+		r[0]=(BN_ULONG)ll&BN_MASK2;
+		ll>>=BN_BITS2;
+		a++; b++; r++; n--;
 		}
 	return((BN_ULONG)ll);
 	}
@@ -340,7 +346,8 @@ BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 	if (n <= 0) return((BN_ULONG)0);
 
 	c=0;
-	for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+	while (n&~3)
 		{
 		t=a[0];
 		t=(t+c)&BN_MASK2;
@@ -348,35 +355,36 @@ BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 		l=(t+b[0])&BN_MASK2;
 		c+=(l < t);
 		r[0]=l;
-		if (--n <= 0) break;
-
 		t=a[1];
 		t=(t+c)&BN_MASK2;
 		c=(t < c);
 		l=(t+b[1])&BN_MASK2;
 		c+=(l < t);
 		r[1]=l;
-		if (--n <= 0) break;
-
 		t=a[2];
 		t=(t+c)&BN_MASK2;
 		c=(t < c);
 		l=(t+b[2])&BN_MASK2;
 		c+=(l < t);
 		r[2]=l;
-		if (--n <= 0) break;
-
 		t=a[3];
 		t=(t+c)&BN_MASK2;
 		c=(t < c);
 		l=(t+b[3])&BN_MASK2;
 		c+=(l < t);
 		r[3]=l;
-		if (--n <= 0) break;
-
-		a+=4;
-		b+=4;
-		r+=4;
+		a+=4; b+=4; r+=4; n-=4;
+		}
+#endif
+	while(n)
+		{
+		t=a[0];
+		t=(t+c)&BN_MASK2;
+		c=(t < c);
+		l=(t+b[0])&BN_MASK2;
+		c+=(l < t);
+		r[0]=l;
+		a++; b++; r++; n--;
 		}
 	return((BN_ULONG)c);
 	}
@@ -390,36 +398,35 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 	assert(n >= 0);
 	if (n <= 0) return((BN_ULONG)0);
 
-	for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+	while (n&~3)
 		{
 		t1=a[0]; t2=b[0];
 		r[0]=(t1-t2-c)&BN_MASK2;
 		if (t1 != t2) c=(t1 < t2);
-		if (--n <= 0) break;
-
 		t1=a[1]; t2=b[1];
 		r[1]=(t1-t2-c)&BN_MASK2;
 		if (t1 != t2) c=(t1 < t2);
-		if (--n <= 0) break;
-
 		t1=a[2]; t2=b[2];
 		r[2]=(t1-t2-c)&BN_MASK2;
 		if (t1 != t2) c=(t1 < t2);
-		if (--n <= 0) break;
-
 		t1=a[3]; t2=b[3];
 		r[3]=(t1-t2-c)&BN_MASK2;
 		if (t1 != t2) c=(t1 < t2);
-		if (--n <= 0) break;
-
-		a+=4;
-		b+=4;
-		r+=4;
+		a+=4; b+=4; r+=4; n-=4;
+		}
+#endif
+	while (n)
+		{
+		t1=a[0]; t2=b[0];
+		r[0]=(t1-t2-c)&BN_MASK2;
+		if (t1 != t2) c=(t1 < t2);
+		a++; b++; r++; n--;
 		}
 	return(c);
 	}
 
-#ifdef BN_MUL_COMBA
+#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
 
 #undef bn_mul_comba8
 #undef bn_mul_comba4
@@ -459,6 +466,34 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 #define sqr_add_c2(a,i,j,c0,c1,c2) \
 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 
+#elif defined(BN_UMULT_LOHI)
+
+#define mul_add_c(a,b,c0,c1,c2)	{	\
+	BN_ULONG ta=(a),tb=(b);		\
+	BN_UMULT_LOHI(t1,t2,ta,tb);	\
+	c0 += t1; t2 += (c0<t1)?1:0;	\
+	c1 += t2; c2 += (c1<t2)?1:0;	\
+	}
+
+#define mul_add_c2(a,b,c0,c1,c2) {	\
+	BN_ULONG ta=(a),tb=(b),t0;	\
+	BN_UMULT_LOHI(t0,t1,ta,tb);	\
+	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
+	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
+	c0 += t1; t2 += (c0<t1)?1:0;	\
+	c1 += t2; c2 += (c1<t2)?1:0;	\
+	}
+
+#define sqr_add_c(a,i,c0,c1,c2)	{	\
+	BN_ULONG ta=(a)[i];		\
+	BN_UMULT_LOHI(t1,t2,ta,ta);	\
+	c0 += t1; t2 += (c0<t1)?1:0;	\
+	c1 += t2; c2 += (c1<t2)?1:0;	\
+	}
+
+#define sqr_add_c2(a,i,j,c0,c1,c2)	\
+	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
 #elif defined(BN_UMULT_HIGH)
 
 #define mul_add_c(a,b,c0,c1,c2)	{	\
@@ -792,18 +827,134 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	r[6]=c1;
 	r[7]=c2;
 	}
+
+#ifdef OPENSSL_NO_ASM
+#ifdef OPENSSL_BN_ASM_MONT
+#include <alloca.h>
+/*
+ * This is essentially reference implementation, which may or may not
+ * result in performance improvement. E.g. on IA-32 this routine was
+ * observed to give 40% faster rsa1024 private key operations and 10%
+ * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
+ * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
+ * reference implementation, one to be used as starting point for
+ * platform-specific assembler. Mentioned numbers apply to compiler
+ * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
+ * can vary not only from platform to platform, but even for compiler
+ * versions. Assembler vs. assembler improvement coefficients can
+ * [and are known to] differ and are to be documented elsewhere.
+ */
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
+	{
+	BN_ULONG c0,c1,ml,*tp,n0;
+#ifdef mul64
+	BN_ULONG mh;
+#endif
+	volatile BN_ULONG *vp;
+	int i=0,j;
+
+#if 0	/* template for platform-specific implementation */
+	if (ap==bp)	return bn_sqr_mont(rp,ap,np,n0p,num);
+#endif
+	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+	n0 = *n0p;
+
+	c0 = 0;
+	ml = bp[0];
+#ifdef mul64
+	mh = HBITS(ml);
+	ml = LBITS(ml);
+	for (j=0;j<num;++j)
+		mul(tp[j],ap[j],ml,mh,c0);
+#else
+	for (j=0;j<num;++j)
+		mul(tp[j],ap[j],ml,c0);
+#endif
+
+	tp[num]   = c0;
+	tp[num+1] = 0;
+	goto enter;
+
+	for(i=0;i<num;i++)
+		{
+		c0 = 0;
+		ml = bp[i];
+#ifdef mul64
+		mh = HBITS(ml);
+		ml = LBITS(ml);
+		for (j=0;j<num;++j)
+			mul_add(tp[j],ap[j],ml,mh,c0);
+#else
+		for (j=0;j<num;++j)
+			mul_add(tp[j],ap[j],ml,c0);
+#endif
+		c1 = (tp[num] + c0)&BN_MASK2;
+		tp[num]   = c1;
+		tp[num+1] = (c1<c0?1:0);
+	enter:
+		c1  = tp[0];
+		ml = (c1*n0)&BN_MASK2;
+		c0 = 0;
+#ifdef mul64
+		mh = HBITS(ml);
+		ml = LBITS(ml);
+		mul_add(c1,np[0],ml,mh,c0);
+#else
+		mul_add(c1,ml,np[0],c0);
+#endif
+		for(j=1;j<num;j++)
+			{
+			c1 = tp[j];
+#ifdef mul64
+			mul_add(c1,np[j],ml,mh,c0);
+#else
+			mul_add(c1,ml,np[j],c0);
+#endif
+			tp[j-1] = c1&BN_MASK2;
+			}
+		c1        = (tp[num] + c0)&BN_MASK2;
+		tp[num-1] = c1;
+		tp[num]   = tp[num+1] + (c1<c0?1:0);
+		}
+
+	if (tp[num]!=0 || tp[num-1]>=np[num-1])
+		{
+		c0 = bn_sub_words(rp,tp,np,num);
+		if (tp[num]!=0 || c0==0)
+			{
+			for(i=0;i<num+2;i++)	vp[i] = 0;
+			return 1;
+			}
+		}
+	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
+	vp[num]   = 0;
+	vp[num+1] = 0;
+	return 1;
+	}
+#else
+/*
+ * Return value of 0 indicates that multiplication/convolution was not
+ * performed to signal the caller to fall down to alternative/original
+ * code-path.
+ */
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
+{	return 0;	}
+#endif /* OPENSSL_BN_ASM_MONT */
+#endif
+
 #else /* !BN_MUL_COMBA */
 
 /* hmm... is it faster just to do a multiply? */
 #undef bn_sqr_comba4
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t[8];
 	bn_sqr_normal(r,a,4,t);
 	}
 
 #undef bn_sqr_comba8
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 	{
 	BN_ULONG t[16];
 	bn_sqr_normal(r,a,8,t);
@@ -829,4 +980,51 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
 	}
 
+#ifdef OPENSSL_NO_ASM
+#ifdef OPENSSL_BN_ASM_MONT
+#include <alloca.h>
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
+	{
+	BN_ULONG c0,c1,*tp,n0=*n0p;
+	volatile BN_ULONG *vp;
+	int i=0,j;
+
+	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+	for(i=0;i<=num;i++)	tp[i]=0;
+
+	for(i=0;i<num;i++)
+		{
+		c0         = bn_mul_add_words(tp,ap,num,bp[i]);
+		c1         = (tp[num] + c0)&BN_MASK2;
+		tp[num]    = c1;
+		tp[num+1]  = (c1<c0?1:0);
+
+		c0         = bn_mul_add_words(tp,np,num,tp[0]*n0);
+		c1         = (tp[num] + c0)&BN_MASK2;
+		tp[num]    = c1;
+		tp[num+1] += (c1<c0?1:0);
+		for(j=0;j<=num;j++)	tp[j]=tp[j+1];
+		}
+
+	if (tp[num]!=0 || tp[num-1]>=np[num-1])
+		{
+		c0 = bn_sub_words(rp,tp,np,num);
+		if (tp[num]!=0 || c0==0)
+			{
+			for(i=0;i<num+2;i++)	vp[i] = 0;
+			return 1;
+			}
+		}
+	for(i=0;i<num;i++)	rp[i] = tp[i],	vp[i] = 0;
+	vp[num]   = 0;
+	vp[num+1] = 0;
+	return 1;
+	}
+#else
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
+{	return 0;	}
+#endif /* OPENSSL_BN_ASM_MONT */
+#endif
+
 #endif /* !BN_MUL_COMBA */
diff --git a/crypto/bn/bn_blind.c b/crypto/bn/bn_blind.c
index 011d37f1ff..e060592fdc 100644
--- a/crypto/bn/bn_blind.c
+++ b/crypto/bn/bn_blind.c
@@ -1,4 +1,57 @@
 /* crypto/bn/bn_blind.c */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
  * All rights reserved.
  *
@@ -60,11 +113,31 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-BN_BLINDING *BN_BLINDING_new(BIGNUM *A, BIGNUM *Ai, BIGNUM *mod)
+#define BN_BLINDING_COUNTER	32
+
+struct bn_blinding_st
+	{
+	BIGNUM *A;
+	BIGNUM *Ai;
+	BIGNUM *e;
+	BIGNUM *mod; /* just a reference */
+#ifndef OPENSSL_NO_DEPRECATED
+	unsigned long thread_id; /* added in OpenSSL 0.9.6j and 0.9.7b;
+				  * used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
+#endif
+	CRYPTO_THREADID tid;
+	unsigned int  counter;
+	unsigned long flags;
+	BN_MONT_CTX *m_ctx;
+	int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+			  const BIGNUM *m, BN_CTX *ctx,
+			  BN_MONT_CTX *m_ctx);
+	};
+
+BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
 	{
 	BN_BLINDING *ret=NULL;
 
-	bn_check_top(Ai);
 	bn_check_top(mod);
 
 	if ((ret=(BN_BLINDING *)OPENSSL_malloc(sizeof(BN_BLINDING))) == NULL)
@@ -73,11 +146,22 @@ BN_BLINDING *BN_BLINDING_new(BIGNUM *A, BIGNUM *Ai, BIGNUM *mod)
 		return(NULL);
 		}
 	memset(ret,0,sizeof(BN_BLINDING));
-	if ((ret->A=BN_new()) == NULL) goto err;
-	if ((ret->Ai=BN_new()) == NULL) goto err;
-	if (!BN_copy(ret->A,A)) goto err;
-	if (!BN_copy(ret->Ai,Ai)) goto err;
-	ret->mod=mod;
+	if (A != NULL)
+		{
+		if ((ret->A  = BN_dup(A))  == NULL) goto err;
+		}
+	if (Ai != NULL)
+		{
+		if ((ret->Ai = BN_dup(Ai)) == NULL) goto err;
+		}
+
+	/* save a copy of mod in the BN_BLINDING structure */
+	if ((ret->mod = BN_dup(mod)) == NULL) goto err;
+	if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
+		BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
+
+	ret->counter = BN_BLINDING_COUNTER;
+	CRYPTO_THREADID_current(&ret->tid);
 	return(ret);
 err:
 	if (ret != NULL) BN_BLINDING_free(ret);
@@ -91,6 +175,8 @@ void BN_BLINDING_free(BN_BLINDING *r)
 
 	if (r->A  != NULL) BN_free(r->A );
 	if (r->Ai != NULL) BN_free(r->Ai);
+	if (r->e  != NULL) BN_free(r->e );
+	if (r->mod != NULL) BN_free(r->mod); 
 	OPENSSL_free(r);
 	}
 
@@ -103,38 +189,76 @@ int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
 		BNerr(BN_F_BN_BLINDING_UPDATE,BN_R_NOT_INITIALIZED);
 		goto err;
 		}
-		
-	if (!BN_mod_mul(b->A,b->A,b->A,b->mod,ctx)) goto err;
-	if (!BN_mod_mul(b->Ai,b->Ai,b->Ai,b->mod,ctx)) goto err;
+
+	if (--(b->counter) == 0 && b->e != NULL &&
+		!(b->flags & BN_BLINDING_NO_RECREATE))
+		{
+		/* re-create blinding parameters */
+		if (!BN_BLINDING_create_param(b, NULL, NULL, ctx, NULL, NULL))
+			goto err;
+		}
+	else if (!(b->flags & BN_BLINDING_NO_UPDATE))
+		{
+		if (!BN_mod_mul(b->A,b->A,b->A,b->mod,ctx)) goto err;
+		if (!BN_mod_mul(b->Ai,b->Ai,b->Ai,b->mod,ctx)) goto err;
+		}
 
 	ret=1;
 err:
+	if (b->counter == 0)
+		b->counter = BN_BLINDING_COUNTER;
 	return(ret);
 	}
 
 int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
 	{
+	return BN_BLINDING_convert_ex(n, NULL, b, ctx);
+	}
+
+int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
+	{
+	int ret = 1;
+
 	bn_check_top(n);
 
 	if ((b->A == NULL) || (b->Ai == NULL))
 		{
-		BNerr(BN_F_BN_BLINDING_CONVERT,BN_R_NOT_INITIALIZED);
+		BNerr(BN_F_BN_BLINDING_CONVERT_EX,BN_R_NOT_INITIALIZED);
 		return(0);
 		}
-	return(BN_mod_mul(n,n,b->A,b->mod,ctx));
+
+	if (r != NULL)
+		{
+		if (!BN_copy(r, b->Ai)) ret=0;
+		}
+
+	if (!BN_mod_mul(n,n,b->A,b->mod,ctx)) ret=0;
+	
+	return ret;
 	}
 
 int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
 	{
+	return BN_BLINDING_invert_ex(n, NULL, b, ctx);
+	}
+
+int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
+	{
 	int ret;
 
 	bn_check_top(n);
 	if ((b->A == NULL) || (b->Ai == NULL))
 		{
-		BNerr(BN_F_BN_BLINDING_INVERT,BN_R_NOT_INITIALIZED);
+		BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
 		return(0);
 		}
-	if ((ret=BN_mod_mul(n,n,b->Ai,b->mod,ctx)) >= 0)
+
+	if (r != NULL)
+		ret = BN_mod_mul(n, n, r, b->mod, ctx);
+	else
+		ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
+
+	if (ret >= 0)
 		{
 		if (!BN_BLINDING_update(b,ctx))
 			return(0);
@@ -143,3 +267,110 @@ int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
 	return(ret);
 	}
 
+#ifndef OPENSSL_NO_DEPRECATED
+unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *b)
+	{
+	return b->thread_id;
+	}
+
+void BN_BLINDING_set_thread_id(BN_BLINDING *b, unsigned long n)
+	{
+	b->thread_id = n;
+	}
+#endif
+
+CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *b)
+	{
+	return &b->tid;
+	}
+
+unsigned long BN_BLINDING_get_flags(const BN_BLINDING *b)
+	{
+	return b->flags;
+	}
+
+void BN_BLINDING_set_flags(BN_BLINDING *b, unsigned long flags)
+	{
+	b->flags = flags;
+	}
+
+BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
+	const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
+	int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+			  const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx),
+	BN_MONT_CTX *m_ctx)
+{
+	int    retry_counter = 32;
+	BN_BLINDING *ret = NULL;
+
+	if (b == NULL)
+		ret = BN_BLINDING_new(NULL, NULL, m);
+	else
+		ret = b;
+
+	if (ret == NULL)
+		goto err;
+
+	if (ret->A  == NULL && (ret->A  = BN_new()) == NULL)
+		goto err;
+	if (ret->Ai == NULL && (ret->Ai	= BN_new()) == NULL)
+		goto err;
+
+	if (e != NULL)
+		{
+		if (ret->e != NULL)
+			BN_free(ret->e);
+		ret->e = BN_dup(e);
+		}
+	if (ret->e == NULL)
+		goto err;
+
+	if (bn_mod_exp != NULL)
+		ret->bn_mod_exp = bn_mod_exp;
+	if (m_ctx != NULL)
+		ret->m_ctx = m_ctx;
+
+	do {
+		if (!BN_rand_range(ret->A, ret->mod)) goto err;
+		if (BN_mod_inverse(ret->Ai, ret->A, ret->mod, ctx) == NULL)
+			{
+			/* this should almost never happen for good RSA keys */
+			unsigned long error = ERR_peek_last_error();
+			if (ERR_GET_REASON(error) == BN_R_NO_INVERSE)
+				{
+				if (retry_counter-- == 0)
+				{
+					BNerr(BN_F_BN_BLINDING_CREATE_PARAM,
+						BN_R_TOO_MANY_ITERATIONS);
+					goto err;
+				}
+				ERR_clear_error();
+				}
+			else
+				goto err;
+			}
+		else
+			break;
+	} while (1);
+
+	if (ret->bn_mod_exp != NULL && ret->m_ctx != NULL)
+		{
+		if (!ret->bn_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx, ret->m_ctx))
+			goto err;
+		}
+	else
+		{
+		if (!BN_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx))
+			goto err;
+		}
+
+	return ret;
+err:
+	if (b == NULL && ret != NULL)
+		{
+		BN_BLINDING_free(ret);
+		ret = NULL;
+		}
+
+	return ret;
+}
diff --git a/crypto/bn/bn_const.c b/crypto/bn/bn_const.c
new file mode 100755
index 0000000000..eb60a25b3c
--- /dev/null
+++ b/crypto/bn/bn_const.c
@@ -0,0 +1,402 @@
+/* crypto/bn/knownprimes.c */
+/* Insert boilerplate */
+
+#include "bn.h"
+
+/* "First Oakley Default Group" from RFC2409, section 6.1.
+ *
+ * The prime is: 2^768 - 2 ^704 - 1 + 2^64 * { [2^638 pi] + 149686 }
+ *
+ * RFC2409 specifies a generator of 2.
+ * RFC2412 specifies a generator of of 22.
+ */
+
+BIGNUM *get_rfc2409_prime_768(BIGNUM *bn)
+	{
+	static const unsigned char RFC2409_PRIME_768[]={
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+		0xA6,0x3A,0x36,0x20,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+		};
+	return BN_bin2bn(RFC2409_PRIME_768,sizeof(RFC2409_PRIME_768),bn);
+	}
+
+/* "Second Oakley Default Group" from RFC2409, section 6.2.
+ *
+ * The prime is: 2^1024 - 2^960 - 1 + 2^64 * { [2^894 pi] + 129093 }.
+ *
+ * RFC2409 specifies a generator of 2.
+ * RFC2412 specifies a generator of 22.
+ */
+
+BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn)
+	{
+	static const unsigned char RFC2409_PRIME_1024[]={
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE6,0x53,0x81,
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+		};
+	return BN_bin2bn(RFC2409_PRIME_1024,sizeof(RFC2409_PRIME_1024),bn);
+	}
+
+/* "1536-bit MODP Group" from RFC3526, Section 2.
+ *
+ * The prime is: 2^1536 - 2^1472 - 1 + 2^64 * { [2^1406 pi] + 741804 }
+ *
+ * RFC3526 specifies a generator of 2.
+ * RFC2312 specifies a generator of 22.
+ */
+
+BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn)
+	{
+	static const unsigned char RFC3526_PRIME_1536[]={
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+		0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+		0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+		0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+		0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+		0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+		0xCA,0x23,0x73,0x27,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+		};
+	return BN_bin2bn(RFC3526_PRIME_1536,sizeof(RFC3526_PRIME_1536),bn);
+	}
+
+/* "2048-bit MODP Group" from RFC3526, Section 3.
+ *
+ * The prime is: 2^2048 - 2^1984 - 1 + 2^64 * { [2^1918 pi] + 124476 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn)
+	{
+	static const unsigned char RFC3526_PRIME_2048[]={
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+		0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+		0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+		0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+		0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+		0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+		0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
+		0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
+		0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
+		0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
+		0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
+		0x15,0x72,0x8E,0x5A,0x8A,0xAC,0xAA,0x68,0xFF,0xFF,0xFF,0xFF,
+		0xFF,0xFF,0xFF,0xFF,
+		};
+	return BN_bin2bn(RFC3526_PRIME_2048,sizeof(RFC3526_PRIME_2048),bn);
+	}
+
+/* "3072-bit MODP Group" from RFC3526, Section 4.
+ *
+ * The prime is: 2^3072 - 2^3008 - 1 + 2^64 * { [2^2942 pi] + 1690314 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn)
+	{
+	static const unsigned char RFC3526_PRIME_3072[]={
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+		0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+		0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+		0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+		0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+		0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+		0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
+		0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
+		0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
+		0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
+		0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
+		0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
+		0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
+		0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
+		0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
+		0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
+		0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
+		0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
+		0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
+		0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
+		0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
+		0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
+		0xA9,0x3A,0xD2,0xCA,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+		};
+	return BN_bin2bn(RFC3526_PRIME_3072,sizeof(RFC3526_PRIME_3072),bn);
+	}
+
+/* "4096-bit MODP Group" from RFC3526, Section 5.
+ *
+ * The prime is: 2^4096 - 2^4032 - 1 + 2^64 * { [2^3966 pi] + 240904 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn)
+	{
+	static const unsigned char RFC3526_PRIME_4096[]={
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+		0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+		0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+		0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+		0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+		0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+		0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
+		0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
+		0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
+		0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
+		0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
+		0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
+		0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
+		0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
+		0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
+		0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
+		0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
+		0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
+		0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
+		0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
+		0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
+		0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
+		0xA9,0x21,0x08,0x01,0x1A,0x72,0x3C,0x12,0xA7,0x87,0xE6,0xD7,
+		0x88,0x71,0x9A,0x10,0xBD,0xBA,0x5B,0x26,0x99,0xC3,0x27,0x18,
+		0x6A,0xF4,0xE2,0x3C,0x1A,0x94,0x68,0x34,0xB6,0x15,0x0B,0xDA,
+		0x25,0x83,0xE9,0xCA,0x2A,0xD4,0x4C,0xE8,0xDB,0xBB,0xC2,0xDB,
+		0x04,0xDE,0x8E,0xF9,0x2E,0x8E,0xFC,0x14,0x1F,0xBE,0xCA,0xA6,
+		0x28,0x7C,0x59,0x47,0x4E,0x6B,0xC0,0x5D,0x99,0xB2,0x96,0x4F,
+		0xA0,0x90,0xC3,0xA2,0x23,0x3B,0xA1,0x86,0x51,0x5B,0xE7,0xED,
+		0x1F,0x61,0x29,0x70,0xCE,0xE2,0xD7,0xAF,0xB8,0x1B,0xDD,0x76,
+		0x21,0x70,0x48,0x1C,0xD0,0x06,0x91,0x27,0xD5,0xB0,0x5A,0xA9,
+		0x93,0xB4,0xEA,0x98,0x8D,0x8F,0xDD,0xC1,0x86,0xFF,0xB7,0xDC,
+		0x90,0xA6,0xC0,0x8F,0x4D,0xF4,0x35,0xC9,0x34,0x06,0x31,0x99,
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+		};
+	return BN_bin2bn(RFC3526_PRIME_4096,sizeof(RFC3526_PRIME_4096),bn);
+	}
+
+/* "6144-bit MODP Group" from RFC3526, Section 6.
+ *
+ * The prime is: 2^6144 - 2^6080 - 1 + 2^64 * { [2^6014 pi] + 929484 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn)
+	{
+	static const unsigned char RFC3526_PRIME_6144[]={
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+		0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+		0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+		0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+		0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+		0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+		0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
+		0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
+		0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
+		0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
+		0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
+		0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
+		0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
+		0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
+		0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
+		0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
+		0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
+		0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
+		0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
+		0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
+		0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
+		0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
+		0xA9,0x21,0x08,0x01,0x1A,0x72,0x3C,0x12,0xA7,0x87,0xE6,0xD7,
+		0x88,0x71,0x9A,0x10,0xBD,0xBA,0x5B,0x26,0x99,0xC3,0x27,0x18,
+		0x6A,0xF4,0xE2,0x3C,0x1A,0x94,0x68,0x34,0xB6,0x15,0x0B,0xDA,
+		0x25,0x83,0xE9,0xCA,0x2A,0xD4,0x4C,0xE8,0xDB,0xBB,0xC2,0xDB,
+		0x04,0xDE,0x8E,0xF9,0x2E,0x8E,0xFC,0x14,0x1F,0xBE,0xCA,0xA6,
+		0x28,0x7C,0x59,0x47,0x4E,0x6B,0xC0,0x5D,0x99,0xB2,0x96,0x4F,
+		0xA0,0x90,0xC3,0xA2,0x23,0x3B,0xA1,0x86,0x51,0x5B,0xE7,0xED,
+		0x1F,0x61,0x29,0x70,0xCE,0xE2,0xD7,0xAF,0xB8,0x1B,0xDD,0x76,
+		0x21,0x70,0x48,0x1C,0xD0,0x06,0x91,0x27,0xD5,0xB0,0x5A,0xA9,
+		0x93,0xB4,0xEA,0x98,0x8D,0x8F,0xDD,0xC1,0x86,0xFF,0xB7,0xDC,
+		0x90,0xA6,0xC0,0x8F,0x4D,0xF4,0x35,0xC9,0x34,0x02,0x84,0x92,
+		0x36,0xC3,0xFA,0xB4,0xD2,0x7C,0x70,0x26,0xC1,0xD4,0xDC,0xB2,
+		0x60,0x26,0x46,0xDE,0xC9,0x75,0x1E,0x76,0x3D,0xBA,0x37,0xBD,
+		0xF8,0xFF,0x94,0x06,0xAD,0x9E,0x53,0x0E,0xE5,0xDB,0x38,0x2F,
+		0x41,0x30,0x01,0xAE,0xB0,0x6A,0x53,0xED,0x90,0x27,0xD8,0x31,
+		0x17,0x97,0x27,0xB0,0x86,0x5A,0x89,0x18,0xDA,0x3E,0xDB,0xEB,
+		0xCF,0x9B,0x14,0xED,0x44,0xCE,0x6C,0xBA,0xCE,0xD4,0xBB,0x1B,
+		0xDB,0x7F,0x14,0x47,0xE6,0xCC,0x25,0x4B,0x33,0x20,0x51,0x51,
+		0x2B,0xD7,0xAF,0x42,0x6F,0xB8,0xF4,0x01,0x37,0x8C,0xD2,0xBF,
+		0x59,0x83,0xCA,0x01,0xC6,0x4B,0x92,0xEC,0xF0,0x32,0xEA,0x15,
+		0xD1,0x72,0x1D,0x03,0xF4,0x82,0xD7,0xCE,0x6E,0x74,0xFE,0xF6,
+		0xD5,0x5E,0x70,0x2F,0x46,0x98,0x0C,0x82,0xB5,0xA8,0x40,0x31,
+		0x90,0x0B,0x1C,0x9E,0x59,0xE7,0xC9,0x7F,0xBE,0xC7,0xE8,0xF3,
+		0x23,0xA9,0x7A,0x7E,0x36,0xCC,0x88,0xBE,0x0F,0x1D,0x45,0xB7,
+		0xFF,0x58,0x5A,0xC5,0x4B,0xD4,0x07,0xB2,0x2B,0x41,0x54,0xAA,
+		0xCC,0x8F,0x6D,0x7E,0xBF,0x48,0xE1,0xD8,0x14,0xCC,0x5E,0xD2,
+		0x0F,0x80,0x37,0xE0,0xA7,0x97,0x15,0xEE,0xF2,0x9B,0xE3,0x28,
+		0x06,0xA1,0xD5,0x8B,0xB7,0xC5,0xDA,0x76,0xF5,0x50,0xAA,0x3D,
+		0x8A,0x1F,0xBF,0xF0,0xEB,0x19,0xCC,0xB1,0xA3,0x13,0xD5,0x5C,
+		0xDA,0x56,0xC9,0xEC,0x2E,0xF2,0x96,0x32,0x38,0x7F,0xE8,0xD7,
+		0x6E,0x3C,0x04,0x68,0x04,0x3E,0x8F,0x66,0x3F,0x48,0x60,0xEE,
+		0x12,0xBF,0x2D,0x5B,0x0B,0x74,0x74,0xD6,0xE6,0x94,0xF9,0x1E,
+		0x6D,0xCC,0x40,0x24,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+		};
+	return BN_bin2bn(RFC3526_PRIME_6144,sizeof(RFC3526_PRIME_6144),bn);
+	}
+
+/* "8192-bit MODP Group" from RFC3526, Section 7.
+ *
+ * The prime is: 2^8192 - 2^8128 - 1 + 2^64 * { [2^8062 pi] + 4743158 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn)
+	{
+	static const unsigned char RFC3526_PRIME_8192[]={
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+		0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+		0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+		0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+		0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+		0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+		0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+		0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+		0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+		0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+		0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+		0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+		0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+		0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+		0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+		0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
+		0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
+		0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
+		0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
+		0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
+		0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
+		0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
+		0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
+		0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
+		0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
+		0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
+		0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
+		0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
+		0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
+		0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
+		0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
+		0xA9,0x21,0x08,0x01,0x1A,0x72,0x3C,0x12,0xA7,0x87,0xE6,0xD7,
+		0x88,0x71,0x9A,0x10,0xBD,0xBA,0x5B,0x26,0x99,0xC3,0x27,0x18,
+		0x6A,0xF4,0xE2,0x3C,0x1A,0x94,0x68,0x34,0xB6,0x15,0x0B,0xDA,
+		0x25,0x83,0xE9,0xCA,0x2A,0xD4,0x4C,0xE8,0xDB,0xBB,0xC2,0xDB,
+		0x04,0xDE,0x8E,0xF9,0x2E,0x8E,0xFC,0x14,0x1F,0xBE,0xCA,0xA6,
+		0x28,0x7C,0x59,0x47,0x4E,0x6B,0xC0,0x5D,0x99,0xB2,0x96,0x4F,
+		0xA0,0x90,0xC3,0xA2,0x23,0x3B,0xA1,0x86,0x51,0x5B,0xE7,0xED,
+		0x1F,0x61,0x29,0x70,0xCE,0xE2,0xD7,0xAF,0xB8,0x1B,0xDD,0x76,
+		0x21,0x70,0x48,0x1C,0xD0,0x06,0x91,0x27,0xD5,0xB0,0x5A,0xA9,
+		0x93,0xB4,0xEA,0x98,0x8D,0x8F,0xDD,0xC1,0x86,0xFF,0xB7,0xDC,
+		0x90,0xA6,0xC0,0x8F,0x4D,0xF4,0x35,0xC9,0x34,0x02,0x84,0x92,
+		0x36,0xC3,0xFA,0xB4,0xD2,0x7C,0x70,0x26,0xC1,0xD4,0xDC,0xB2,
+		0x60,0x26,0x46,0xDE,0xC9,0x75,0x1E,0x76,0x3D,0xBA,0x37,0xBD,
+		0xF8,0xFF,0x94,0x06,0xAD,0x9E,0x53,0x0E,0xE5,0xDB,0x38,0x2F,
+		0x41,0x30,0x01,0xAE,0xB0,0x6A,0x53,0xED,0x90,0x27,0xD8,0x31,
+		0x17,0x97,0x27,0xB0,0x86,0x5A,0x89,0x18,0xDA,0x3E,0xDB,0xEB,
+		0xCF,0x9B,0x14,0xED,0x44,0xCE,0x6C,0xBA,0xCE,0xD4,0xBB,0x1B,
+		0xDB,0x7F,0x14,0x47,0xE6,0xCC,0x25,0x4B,0x33,0x20,0x51,0x51,
+		0x2B,0xD7,0xAF,0x42,0x6F,0xB8,0xF4,0x01,0x37,0x8C,0xD2,0xBF,
+		0x59,0x83,0xCA,0x01,0xC6,0x4B,0x92,0xEC,0xF0,0x32,0xEA,0x15,
+		0xD1,0x72,0x1D,0x03,0xF4,0x82,0xD7,0xCE,0x6E,0x74,0xFE,0xF6,
+		0xD5,0x5E,0x70,0x2F,0x46,0x98,0x0C,0x82,0xB5,0xA8,0x40,0x31,
+		0x90,0x0B,0x1C,0x9E,0x59,0xE7,0xC9,0x7F,0xBE,0xC7,0xE8,0xF3,
+		0x23,0xA9,0x7A,0x7E,0x36,0xCC,0x88,0xBE,0x0F,0x1D,0x45,0xB7,
+		0xFF,0x58,0x5A,0xC5,0x4B,0xD4,0x07,0xB2,0x2B,0x41,0x54,0xAA,
+		0xCC,0x8F,0x6D,0x7E,0xBF,0x48,0xE1,0xD8,0x14,0xCC,0x5E,0xD2,
+		0x0F,0x80,0x37,0xE0,0xA7,0x97,0x15,0xEE,0xF2,0x9B,0xE3,0x28,
+		0x06,0xA1,0xD5,0x8B,0xB7,0xC5,0xDA,0x76,0xF5,0x50,0xAA,0x3D,
+		0x8A,0x1F,0xBF,0xF0,0xEB,0x19,0xCC,0xB1,0xA3,0x13,0xD5,0x5C,
+		0xDA,0x56,0xC9,0xEC,0x2E,0xF2,0x96,0x32,0x38,0x7F,0xE8,0xD7,
+		0x6E,0x3C,0x04,0x68,0x04,0x3E,0x8F,0x66,0x3F,0x48,0x60,0xEE,
+		0x12,0xBF,0x2D,0x5B,0x0B,0x74,0x74,0xD6,0xE6,0x94,0xF9,0x1E,
+		0x6D,0xBE,0x11,0x59,0x74,0xA3,0x92,0x6F,0x12,0xFE,0xE5,0xE4,
+		0x38,0x77,0x7C,0xB6,0xA9,0x32,0xDF,0x8C,0xD8,0xBE,0xC4,0xD0,
+		0x73,0xB9,0x31,0xBA,0x3B,0xC8,0x32,0xB6,0x8D,0x9D,0xD3,0x00,
+		0x74,0x1F,0xA7,0xBF,0x8A,0xFC,0x47,0xED,0x25,0x76,0xF6,0x93,
+		0x6B,0xA4,0x24,0x66,0x3A,0xAB,0x63,0x9C,0x5A,0xE4,0xF5,0x68,
+		0x34,0x23,0xB4,0x74,0x2B,0xF1,0xC9,0x78,0x23,0x8F,0x16,0xCB,
+		0xE3,0x9D,0x65,0x2D,0xE3,0xFD,0xB8,0xBE,0xFC,0x84,0x8A,0xD9,
+		0x22,0x22,0x2E,0x04,0xA4,0x03,0x7C,0x07,0x13,0xEB,0x57,0xA8,
+		0x1A,0x23,0xF0,0xC7,0x34,0x73,0xFC,0x64,0x6C,0xEA,0x30,0x6B,
+		0x4B,0xCB,0xC8,0x86,0x2F,0x83,0x85,0xDD,0xFA,0x9D,0x4B,0x7F,
+		0xA2,0xC0,0x87,0xE8,0x79,0x68,0x33,0x03,0xED,0x5B,0xDD,0x3A,
+		0x06,0x2B,0x3C,0xF5,0xB3,0xA2,0x78,0xA6,0x6D,0x2A,0x13,0xF8,
+		0x3F,0x44,0xF8,0x2D,0xDF,0x31,0x0E,0xE0,0x74,0xAB,0x6A,0x36,
+		0x45,0x97,0xE8,0x99,0xA0,0x25,0x5D,0xC1,0x64,0xF3,0x1C,0xC5,
+		0x08,0x46,0x85,0x1D,0xF9,0xAB,0x48,0x19,0x5D,0xED,0x7E,0xA1,
+		0xB1,0xD5,0x10,0xBD,0x7E,0xE7,0x4D,0x73,0xFA,0xF3,0x6B,0xC3,
+		0x1E,0xCF,0xA2,0x68,0x35,0x90,0x46,0xF4,0xEB,0x87,0x9F,0x92,
+		0x40,0x09,0x43,0x8B,0x48,0x1C,0x6C,0xD7,0x88,0x9A,0x00,0x2E,
+		0xD5,0xEE,0x38,0x2B,0xC9,0x19,0x0D,0xA6,0xFC,0x02,0x6E,0x47,
+		0x95,0x58,0xE4,0x47,0x56,0x77,0xE9,0xAA,0x9E,0x30,0x50,0xE2,
+		0x76,0x56,0x94,0xDF,0xC8,0x1F,0x56,0xE8,0x80,0xB9,0x6E,0x71,
+		0x60,0xC9,0x80,0xDD,0x98,0xED,0xD3,0xDF,0xFF,0xFF,0xFF,0xFF,
+		0xFF,0xFF,0xFF,0xFF,
+		};
+	return BN_bin2bn(RFC3526_PRIME_8192,sizeof(RFC3526_PRIME_8192),bn);
+	}
+
diff --git a/crypto/bn/bn_ctx.c b/crypto/bn/bn_ctx.c
index 5bd742a97c..3f2256f675 100644
--- a/crypto/bn/bn_ctx.c
+++ b/crypto/bn/bn_ctx.c
@@ -161,7 +161,7 @@ static void ctxdbg(BN_CTX *ctx)
 	fprintf(stderr,"(%08x): ", (unsigned int)ctx);
 	while(bnidx < ctx->used)
 		{
-		fprintf(stderr,"%02x ", item->vals[bnidx++ % BN_CTX_POOL_SIZE].dmax);
+		fprintf(stderr,"%03x ", item->vals[bnidx++ % BN_CTX_POOL_SIZE].dmax);
 		if(!(bnidx % BN_CTX_POOL_SIZE))
 			item = item->next;
 		}
@@ -171,8 +171,8 @@ static void ctxdbg(BN_CTX *ctx)
 	while(fpidx < stack->depth)
 		{
 		while(bnidx++ < stack->indexes[fpidx])
-			fprintf(stderr,"   ");
-		fprintf(stderr,"^^ ");
+			fprintf(stderr,"    ");
+		fprintf(stderr,"^^^ ");
 		bnidx++;
 		fpidx++;
 		}
@@ -230,7 +230,10 @@ BN_CTX *BN_CTX_new(void)
 
 void BN_CTX_free(BN_CTX *ctx)
 	{
+	if (ctx == NULL)
+		return;
 #ifdef BN_CTX_DEBUG
+	{
 	BN_POOL_ITEM *pool = ctx->pool.head;
 	fprintf(stderr,"BN_CTX_free, stack-size=%d, pool-bignums=%d\n",
 		ctx->stack.size, ctx->pool.size);
@@ -242,6 +245,7 @@ void BN_CTX_free(BN_CTX *ctx)
 		pool = pool->next;
 	}
 	fprintf(stderr,"\n");
+	}
 #endif
 	BN_STACK_finish(&ctx->stack);
 	BN_POOL_finish(&ctx->pool);
@@ -257,8 +261,7 @@ void BN_CTX_start(BN_CTX *ctx)
 	/* (Try to) get a new frame pointer */
 	else if(!BN_STACK_push(&ctx->stack, ctx->used))
 		{
-		/* I know this isn't BN_CTX_get, but ... */
-		BNerr(BN_F_BN_CTX_GET,BN_R_TOO_MANY_TEMPORARY_VARIABLES);
+		BNerr(BN_F_BN_CTX_START,BN_R_TOO_MANY_TEMPORARY_VARIABLES);
 		ctx->err_stack++;
 		}
 	CTXDBG_EXIT(ctx);
diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c
index 3b4392955e..899d07ca24 100644
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c
@@ -169,13 +169,15 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
 #endif /* OPENSSL_NO_ASM */
 
 
-/* BN_div computes  dv := num / divisor,  rounding towards zero, and sets up
- * rm  such that  dv*divisor + rm = num  holds.
+/* BN_div[_no_branch] computes  dv := num / divisor,  rounding towards
+ * zero, and sets up rm  such that  dv*divisor + rm = num  holds.
  * Thus:
  *     dv->neg == num->neg ^ divisor->neg  (unless the result is zero)
  *     rm->neg == num->neg                 (unless the remainder is zero)
  * If 'dv' or 'rm' is NULL, the respective value is not returned.
  */
+static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
+        const BIGNUM *divisor, BN_CTX *ctx);
 int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 	   BN_CTX *ctx)
 	{
@@ -185,11 +187,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 	BN_ULONG d0,d1;
 	int num_n,div_n;
 
-	if (dv)
-		bn_check_top(dv);
-	if (rm)
-		bn_check_top(rm);
+	/* Invalid zero-padding would have particularly bad consequences
+	 * in the case of 'num', so don't just rely on bn_check_top() for this one
+	 * (bn_check_top() works only for BN_DEBUG builds) */
+	if (num->top > 0 && num->d[num->top - 1] == 0)
+		{
+		BNerr(BN_F_BN_DIV,BN_R_NOT_INITIALIZED);
+		return 0;
+		}
+
 	bn_check_top(num);
+
+	if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
+		{
+		return BN_div_no_branch(dv, rm, num, divisor, ctx);
+		}
+
+	bn_check_top(dv);
+	bn_check_top(rm);
+	/* bn_check_top(num); */ /* 'num' has been checked already */
 	bn_check_top(divisor);
 
 	if (BN_is_zero(divisor))
@@ -213,7 +229,8 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 	if (dv == NULL)
 		res=BN_CTX_get(ctx);
 	else	res=dv;
-	if (sdiv == NULL || res == NULL) goto err;
+	if (sdiv == NULL || res == NULL || tmp == NULL || snum == NULL)
+		goto err;
 
 	/* First we normalise the numbers */
 	norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
@@ -320,7 +337,7 @@ X) -> 0x%08X\n",
 				t2 -= d1;
 				}
 #else /* !BN_LLONG */
-			BN_ULONG t2l,t2h,ql,qh;
+			BN_ULONG t2l,t2h;
 
 			q=bn_div_words(n0,n1,d0);
 #ifdef BN_DEBUG_LEVITTE
@@ -338,9 +355,12 @@ X) -> 0x%08X\n",
 			t2l = d1 * q;
 			t2h = BN_UMULT_HIGH(d1,q);
 #else
+			{
+			BN_ULONG ql, qh;
 			t2l=LBITS(d1); t2h=HBITS(d1);
 			ql =LBITS(q);  qh =HBITS(q);
 			mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+			}
 #endif
 
 			for (;;)
@@ -394,8 +414,235 @@ X) -> 0x%08X\n",
 	BN_CTX_end(ctx);
 	return(1);
 err:
-	if (rm)
+	bn_check_top(rm);
+	BN_CTX_end(ctx);
+	return(0);
+	}
+
+
+/* BN_div_no_branch is a special version of BN_div. It does not contain
+ * branches that may leak sensitive information.
+ */
+static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, 
+	const BIGNUM *divisor, BN_CTX *ctx)
+	{
+	int norm_shift,i,loop;
+	BIGNUM *tmp,wnum,*snum,*sdiv,*res;
+	BN_ULONG *resp,*wnump;
+	BN_ULONG d0,d1;
+	int num_n,div_n;
+
+	bn_check_top(dv);
+	bn_check_top(rm);
+	/* bn_check_top(num); */ /* 'num' has been checked in BN_div() */
+	bn_check_top(divisor);
+
+	if (BN_is_zero(divisor))
+		{
+		BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO);
+		return(0);
+		}
+
+	BN_CTX_start(ctx);
+	tmp=BN_CTX_get(ctx);
+	snum=BN_CTX_get(ctx);
+	sdiv=BN_CTX_get(ctx);
+	if (dv == NULL)
+		res=BN_CTX_get(ctx);
+	else	res=dv;
+	if (sdiv == NULL || res == NULL) goto err;
+
+	/* First we normalise the numbers */
+	norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
+	if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
+	sdiv->neg=0;
+	norm_shift+=BN_BITS2;
+	if (!(BN_lshift(snum,num,norm_shift))) goto err;
+	snum->neg=0;
+
+	/* Since we don't know whether snum is larger than sdiv,
+	 * we pad snum with enough zeroes without changing its
+	 * value. 
+	 */
+	if (snum->top <= sdiv->top+1) 
+		{
+		if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
+		for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
+		snum->top = sdiv->top + 2;
+		}
+	else
+		{
+		if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
+		snum->d[snum->top] = 0;
+		snum->top ++;
+		}
+
+	div_n=sdiv->top;
+	num_n=snum->top;
+	loop=num_n-div_n;
+	/* Lets setup a 'window' into snum
+	 * This is the part that corresponds to the current
+	 * 'area' being divided */
+	wnum.neg   = 0;
+	wnum.d     = &(snum->d[loop]);
+	wnum.top   = div_n;
+	/* only needed when BN_ucmp messes up the values between top and max */
+	wnum.dmax  = snum->dmax - loop; /* so we don't step out of bounds */
+
+	/* Get the top 2 words of sdiv */
+	/* div_n=sdiv->top; */
+	d0=sdiv->d[div_n-1];
+	d1=(div_n == 1)?0:sdiv->d[div_n-2];
+
+	/* pointer to the 'top' of snum */
+	wnump= &(snum->d[num_n-1]);
+
+	/* Setup to 'res' */
+	res->neg= (num->neg^divisor->neg);
+	if (!bn_wexpand(res,(loop+1))) goto err;
+	res->top=loop-1;
+	resp= &(res->d[loop-1]);
+
+	/* space for temp */
+	if (!bn_wexpand(tmp,(div_n+1))) goto err;
+
+	/* if res->top == 0 then clear the neg value otherwise decrease
+	 * the resp pointer */
+	if (res->top == 0)
+		res->neg = 0;
+	else
+		resp--;
+
+	for (i=0; i<loop-1; i++, wnump--, resp--)
+		{
+		BN_ULONG q,l0;
+		/* the first part of the loop uses the top two words of
+		 * snum and sdiv to calculate a BN_ULONG q such that
+		 * | wnum - sdiv * q | < sdiv */
+#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
+		BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
+		q=bn_div_3_words(wnump,d1,d0);
+#else
+		BN_ULONG n0,n1,rem=0;
+
+		n0=wnump[0];
+		n1=wnump[-1];
+		if (n0 == d0)
+			q=BN_MASK2;
+		else 			/* n0 < d0 */
+			{
+#ifdef BN_LLONG
+			BN_ULLONG t2;
+
+#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
+			q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
+#else
+			q=bn_div_words(n0,n1,d0);
+#ifdef BN_DEBUG_LEVITTE
+			fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
+X) -> 0x%08X\n",
+				n0, n1, d0, q);
+#endif
+#endif
+
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+			/*
+			 * rem doesn't have to be BN_ULLONG. The least we
+			 * know it's less that d0, isn't it?
+			 */
+			rem=(n1-q*d0)&BN_MASK2;
+#endif
+			t2=(BN_ULLONG)d1*q;
+
+			for (;;)
+				{
+				if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
+					break;
+				q--;
+				rem += d0;
+				if (rem < d0) break; /* don't let rem overflow */
+				t2 -= d1;
+				}
+#else /* !BN_LLONG */
+			BN_ULONG t2l,t2h;
+
+			q=bn_div_words(n0,n1,d0);
+#ifdef BN_DEBUG_LEVITTE
+			fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
+X) -> 0x%08X\n",
+				n0, n1, d0, q);
+#endif
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+			rem=(n1-q*d0)&BN_MASK2;
+#endif
+
+#if defined(BN_UMULT_LOHI)
+			BN_UMULT_LOHI(t2l,t2h,d1,q);
+#elif defined(BN_UMULT_HIGH)
+			t2l = d1 * q;
+			t2h = BN_UMULT_HIGH(d1,q);
+#else
+			{
+			BN_ULONG ql, qh;
+			t2l=LBITS(d1); t2h=HBITS(d1);
+			ql =LBITS(q);  qh =HBITS(q);
+			mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+			}
+#endif
+
+			for (;;)
+				{
+				if ((t2h < rem) ||
+					((t2h == rem) && (t2l <= wnump[-2])))
+					break;
+				q--;
+				rem += d0;
+				if (rem < d0) break; /* don't let rem overflow */
+				if (t2l < d1) t2h--; t2l -= d1;
+				}
+#endif /* !BN_LLONG */
+			}
+#endif /* !BN_DIV3W */
+
+		l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
+		tmp->d[div_n]=l0;
+		wnum.d--;
+		/* ingore top values of the bignums just sub the two 
+		 * BN_ULONG arrays with bn_sub_words */
+		if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
+			{
+			/* Note: As we have considered only the leading
+			 * two BN_ULONGs in the calculation of q, sdiv * q
+			 * might be greater than wnum (but then (q-1) * sdiv
+			 * is less or equal than wnum)
+			 */
+			q--;
+			if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
+				/* we can't have an overflow here (assuming
+				 * that q != 0, but if q == 0 then tmp is
+				 * zero anyway) */
+				(*wnump)++;
+			}
+		/* store part of the result */
+		*resp = q;
+		}
+	bn_correct_top(snum);
+	if (rm != NULL)
+		{
+		/* Keep a copy of the neg flag in num because if rm==num
+		 * BN_rshift() will overwrite it.
+		 */
+		int neg = num->neg;
+		BN_rshift(rm,snum,norm_shift);
+		if (!BN_is_zero(rm))
+			rm->neg = neg;
 		bn_check_top(rm);
+		}
+	bn_correct_top(res);
+	BN_CTX_end(ctx);
+	return(1);
+err:
+	bn_check_top(rm);
 	BN_CTX_end(ctx);
 	return(0);
 	}
diff --git a/crypto/bn/bn_err.c b/crypto/bn/bn_err.c
index b42208ae0b..cfe2eb94a0 100644
--- a/crypto/bn/bn_err.c
+++ b/crypto/bn/bn_err.c
@@ -1,6 +1,6 @@
 /* crypto/bn/bn_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2002 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2007 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -61,65 +61,77 @@
 #include <stdio.h>
 #include <openssl/err.h>
 #include <openssl/bn.h>
-#include <openssl/opensslconf.h> /* To see if OPENSSL_NO_ERR is defined */
 
 /* BEGIN ERROR CODES */
 #ifndef OPENSSL_NO_ERR
+
+#define ERR_FUNC(func) ERR_PACK(ERR_LIB_BN,func,0)
+#define ERR_REASON(reason) ERR_PACK(ERR_LIB_BN,0,reason)
+
 static ERR_STRING_DATA BN_str_functs[]=
 	{
-{ERR_PACK(0,BN_F_BN_BLINDING_CONVERT,0),	"BN_BLINDING_convert"},
-{ERR_PACK(0,BN_F_BN_BLINDING_INVERT,0),	"BN_BLINDING_invert"},
-{ERR_PACK(0,BN_F_BN_BLINDING_NEW,0),	"BN_BLINDING_new"},
-{ERR_PACK(0,BN_F_BN_BLINDING_UPDATE,0),	"BN_BLINDING_update"},
-{ERR_PACK(0,BN_F_BN_BN2DEC,0),	"BN_bn2dec"},
-{ERR_PACK(0,BN_F_BN_BN2HEX,0),	"BN_bn2hex"},
-{ERR_PACK(0,BN_F_BN_CTX_GET,0),	"BN_CTX_get"},
-{ERR_PACK(0,BN_F_BN_CTX_NEW,0),	"BN_CTX_new"},
-{ERR_PACK(0,BN_F_BN_DIV,0),	"BN_div"},
-{ERR_PACK(0,BN_F_BN_EXPAND2,0),	"bn_expand2"},
-{ERR_PACK(0,BN_F_BN_EXPAND_INTERNAL,0),	"BN_EXPAND_INTERNAL"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD,0),	"BN_GF2m_mod"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_DIV,0),	"BN_GF2m_mod_div"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_EXP,0),	"BN_GF2m_mod_exp"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_MUL,0),	"BN_GF2m_mod_mul"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_SOLVE_QUAD,0),	"BN_GF2m_mod_solve_quad"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR,0),	"BN_GF2m_mod_solve_quad_arr"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_SQR,0),	"BN_GF2m_mod_sqr"},
-{ERR_PACK(0,BN_F_BN_MOD_EXP2_MONT,0),	"BN_mod_exp2_mont"},
-{ERR_PACK(0,BN_F_BN_MOD_EXP_MONT,0),	"BN_mod_exp_mont"},
-{ERR_PACK(0,BN_F_BN_MOD_EXP_MONT_WORD,0),	"BN_mod_exp_mont_word"},
-{ERR_PACK(0,BN_F_BN_MOD_INVERSE,0),	"BN_mod_inverse"},
-{ERR_PACK(0,BN_F_BN_MOD_LSHIFT_QUICK,0),	"BN_mod_lshift_quick"},
-{ERR_PACK(0,BN_F_BN_MOD_MUL_RECIPROCAL,0),	"BN_mod_mul_reciprocal"},
-{ERR_PACK(0,BN_F_BN_MOD_SQRT,0),	"BN_mod_sqrt"},
-{ERR_PACK(0,BN_F_BN_MPI2BN,0),	"BN_mpi2bn"},
-{ERR_PACK(0,BN_F_BN_NEW,0),	"BN_new"},
-{ERR_PACK(0,BN_F_BN_RAND,0),	"BN_rand"},
-{ERR_PACK(0,BN_F_BN_RAND_RANGE,0),	"BN_rand_range"},
-{ERR_PACK(0,BN_F_BN_USUB,0),	"BN_usub"},
+{ERR_FUNC(BN_F_BNRAND),	"BNRAND"},
+{ERR_FUNC(BN_F_BN_BLINDING_CONVERT_EX),	"BN_BLINDING_convert_ex"},
+{ERR_FUNC(BN_F_BN_BLINDING_CREATE_PARAM),	"BN_BLINDING_create_param"},
+{ERR_FUNC(BN_F_BN_BLINDING_INVERT_EX),	"BN_BLINDING_invert_ex"},
+{ERR_FUNC(BN_F_BN_BLINDING_NEW),	"BN_BLINDING_new"},
+{ERR_FUNC(BN_F_BN_BLINDING_UPDATE),	"BN_BLINDING_update"},
+{ERR_FUNC(BN_F_BN_BN2DEC),	"BN_bn2dec"},
+{ERR_FUNC(BN_F_BN_BN2HEX),	"BN_bn2hex"},
+{ERR_FUNC(BN_F_BN_CTX_GET),	"BN_CTX_get"},
+{ERR_FUNC(BN_F_BN_CTX_NEW),	"BN_CTX_new"},
+{ERR_FUNC(BN_F_BN_CTX_START),	"BN_CTX_start"},
+{ERR_FUNC(BN_F_BN_DIV),	"BN_div"},
+{ERR_FUNC(BN_F_BN_DIV_NO_BRANCH),	"BN_div_no_branch"},
+{ERR_FUNC(BN_F_BN_DIV_RECP),	"BN_div_recp"},
+{ERR_FUNC(BN_F_BN_EXP),	"BN_exp"},
+{ERR_FUNC(BN_F_BN_EXPAND2),	"bn_expand2"},
+{ERR_FUNC(BN_F_BN_EXPAND_INTERNAL),	"BN_EXPAND_INTERNAL"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD),	"BN_GF2m_mod"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_EXP),	"BN_GF2m_mod_exp"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_MUL),	"BN_GF2m_mod_mul"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD),	"BN_GF2m_mod_solve_quad"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR),	"BN_GF2m_mod_solve_quad_arr"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_SQR),	"BN_GF2m_mod_sqr"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_SQRT),	"BN_GF2m_mod_sqrt"},
+{ERR_FUNC(BN_F_BN_MOD_EXP2_MONT),	"BN_mod_exp2_mont"},
+{ERR_FUNC(BN_F_BN_MOD_EXP_MONT),	"BN_mod_exp_mont"},
+{ERR_FUNC(BN_F_BN_MOD_EXP_MONT_CONSTTIME),	"BN_mod_exp_mont_consttime"},
+{ERR_FUNC(BN_F_BN_MOD_EXP_MONT_WORD),	"BN_mod_exp_mont_word"},
+{ERR_FUNC(BN_F_BN_MOD_EXP_RECP),	"BN_mod_exp_recp"},
+{ERR_FUNC(BN_F_BN_MOD_EXP_SIMPLE),	"BN_mod_exp_simple"},
+{ERR_FUNC(BN_F_BN_MOD_INVERSE),	"BN_mod_inverse"},
+{ERR_FUNC(BN_F_BN_MOD_INVERSE_NO_BRANCH),	"BN_mod_inverse_no_branch"},
+{ERR_FUNC(BN_F_BN_MOD_LSHIFT_QUICK),	"BN_mod_lshift_quick"},
+{ERR_FUNC(BN_F_BN_MOD_MUL_RECIPROCAL),	"BN_mod_mul_reciprocal"},
+{ERR_FUNC(BN_F_BN_MOD_SQRT),	"BN_mod_sqrt"},
+{ERR_FUNC(BN_F_BN_MPI2BN),	"BN_mpi2bn"},
+{ERR_FUNC(BN_F_BN_NEW),	"BN_new"},
+{ERR_FUNC(BN_F_BN_RAND),	"BN_rand"},
+{ERR_FUNC(BN_F_BN_RAND_RANGE),	"BN_rand_range"},
+{ERR_FUNC(BN_F_BN_USUB),	"BN_usub"},
 {0,NULL}
 	};
 
 static ERR_STRING_DATA BN_str_reasons[]=
 	{
-{BN_R_ARG2_LT_ARG3                       ,"arg2 lt arg3"},
-{BN_R_BAD_RECIPROCAL                     ,"bad reciprocal"},
-{BN_R_BIGNUM_TOO_LONG                    ,"bignum too long"},
-{BN_R_CALLED_WITH_EVEN_MODULUS           ,"called with even modulus"},
-{BN_R_DIV_BY_ZERO                        ,"div by zero"},
-{BN_R_ENCODING_ERROR                     ,"encoding error"},
-{BN_R_EXPAND_ON_STATIC_BIGNUM_DATA       ,"expand on static bignum data"},
-{BN_R_INPUT_NOT_REDUCED                  ,"input not reduced"},
-{BN_R_INVALID_LENGTH                     ,"invalid length"},
-{BN_R_INVALID_RANGE                      ,"invalid range"},
-{BN_R_NOT_A_SQUARE                       ,"not a square"},
-{BN_R_NOT_IMPLEMENTED                    ,"not implemented"},
-{BN_R_NOT_INITIALIZED                    ,"not initialized"},
-{BN_R_NO_INVERSE                         ,"no inverse"},
-{BN_R_NO_SOLUTION                        ,"no solution"},
-{BN_R_P_IS_NOT_PRIME                     ,"p is not prime"},
-{BN_R_TOO_MANY_ITERATIONS                ,"too many iterations"},
-{BN_R_TOO_MANY_TEMPORARY_VARIABLES       ,"too many temporary variables"},
+{ERR_REASON(BN_R_ARG2_LT_ARG3)           ,"arg2 lt arg3"},
+{ERR_REASON(BN_R_BAD_RECIPROCAL)         ,"bad reciprocal"},
+{ERR_REASON(BN_R_BIGNUM_TOO_LONG)        ,"bignum too long"},
+{ERR_REASON(BN_R_CALLED_WITH_EVEN_MODULUS),"called with even modulus"},
+{ERR_REASON(BN_R_DIV_BY_ZERO)            ,"div by zero"},
+{ERR_REASON(BN_R_ENCODING_ERROR)         ,"encoding error"},
+{ERR_REASON(BN_R_EXPAND_ON_STATIC_BIGNUM_DATA),"expand on static bignum data"},
+{ERR_REASON(BN_R_INPUT_NOT_REDUCED)      ,"input not reduced"},
+{ERR_REASON(BN_R_INVALID_LENGTH)         ,"invalid length"},
+{ERR_REASON(BN_R_INVALID_RANGE)          ,"invalid range"},
+{ERR_REASON(BN_R_NOT_A_SQUARE)           ,"not a square"},
+{ERR_REASON(BN_R_NOT_INITIALIZED)        ,"not initialized"},
+{ERR_REASON(BN_R_NO_INVERSE)             ,"no inverse"},
+{ERR_REASON(BN_R_NO_SOLUTION)            ,"no solution"},
+{ERR_REASON(BN_R_P_IS_NOT_PRIME)         ,"p is not prime"},
+{ERR_REASON(BN_R_TOO_MANY_ITERATIONS)    ,"too many iterations"},
+{ERR_REASON(BN_R_TOO_MANY_TEMPORARY_VARIABLES),"too many temporary variables"},
 {0,NULL}
 	};
 
@@ -127,15 +139,12 @@ static ERR_STRING_DATA BN_str_reasons[]=
 
 void ERR_load_BN_strings(void)
 	{
-	static int init=1;
-
-	if (init)
-		{
-		init=0;
 #ifndef OPENSSL_NO_ERR
-		ERR_load_strings(ERR_LIB_BN,BN_str_functs);
-		ERR_load_strings(ERR_LIB_BN,BN_str_reasons);
-#endif
 
+	if (ERR_func_error_string(BN_str_functs[0].error) == NULL)
+		{
+		ERR_load_strings(0,BN_str_functs);
+		ERR_load_strings(0,BN_str_reasons);
 		}
+#endif
 	}
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index d6bb2b4397..d9b6c737fc 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -56,7 +56,7 @@
  * [including the GNU Public Licence.]
  */
 /* ====================================================================
- * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -113,6 +113,7 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
+/* maximum precomputation table size for *variable* sliding windows */
 #define TABLE_SIZE	32
 
 /* this one works - simple but works */
@@ -121,12 +122,20 @@ int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 	int i,bits,ret=0;
 	BIGNUM *v,*rr;
 
+	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
+		{
+		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+		BNerr(BN_F_BN_EXP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return -1;
+		}
+
 	BN_CTX_start(ctx);
 	if ((r == a) || (r == p))
 		rr = BN_CTX_get(ctx);
 	else
 		rr = r;
-	if ((v = BN_CTX_get(ctx)) == NULL) goto err;
+	v = BN_CTX_get(ctx);
+	if (rr == NULL || v == NULL) goto err;
 
 	if (BN_copy(v,a) == NULL) goto err;
 	bits=BN_num_bits(p);
@@ -205,7 +214,7 @@ int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
 	if (BN_is_odd(m))
 		{
 #  ifdef MONT_EXP_WORD
-		if (a->top == 1 && !a->neg)
+		if (a->top == 1 && !a->neg && (BN_get_flags(p, BN_FLG_CONSTTIME) == 0))
 			{
 			BN_ULONG A = a->d[0];
 			ret=BN_mod_exp_mont_word(r,A,p,m,ctx,NULL);
@@ -237,6 +246,13 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 	BIGNUM *val[TABLE_SIZE];
 	BN_RECP_CTX recp;
 
+	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
+		{
+		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+		BNerr(BN_F_BN_MOD_EXP_RECP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return -1;
+		}
+
 	bits=BN_num_bits(p);
 
 	if (bits == 0)
@@ -364,6 +380,11 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 	BIGNUM *val[TABLE_SIZE];
 	BN_MONT_CTX *mont=NULL;
 
+	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
+		{
+		return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
+		}
+
 	bn_check_top(a);
 	bn_check_top(p);
 	bn_check_top(m);
@@ -495,6 +516,212 @@ err:
 	return(ret);
 	}
 
+
+/* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
+ * so that accessing any of these table values shows the same access pattern as far
+ * as cache lines are concerned.  The following functions are used to transfer a BIGNUM
+ * from/to that table. */
+
+static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+	{
+	size_t i, j;
+
+	if (bn_wexpand(b, top) == NULL)
+		return 0;
+	while (b->top < top)
+		{
+		b->d[b->top++] = 0;
+		}
+	
+	for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
+		{
+		buf[j] = ((unsigned char*)b->d)[i];
+		}
+
+	bn_correct_top(b);
+	return 1;
+	}
+
+static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+	{
+	size_t i, j;
+
+	if (bn_wexpand(b, top) == NULL)
+		return 0;
+
+	for (i=0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
+		{
+		((unsigned char*)b->d)[i] = buf[j];
+		}
+
+	b->top = top;
+	bn_correct_top(b);
+	return 1;
+	}	
+
+/* Given a pointer value, compute the next address that is a cache line multiple. */
+#define MOD_EXP_CTIME_ALIGN(x_) \
+	((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
+
+/* This variant of BN_mod_exp_mont() uses fixed windows and the special
+ * precomputation memory layout to limit data-dependency to a minimum
+ * to protect secret exponents (cf. the hyper-threading timing attacks
+ * pointed out by Colin Percival,
+ * http://www.daemonology.net/hyperthreading-considered-harmful/)
+ */
+int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+		    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
+	{
+	int i,bits,ret=0,idx,window,wvalue;
+	int top;
+ 	BIGNUM *r;
+	const BIGNUM *aa;
+	BN_MONT_CTX *mont=NULL;
+
+	int numPowers;
+	unsigned char *powerbufFree=NULL;
+	int powerbufLen = 0;
+	unsigned char *powerbuf=NULL;
+	BIGNUM *computeTemp=NULL, *am=NULL;
+
+	bn_check_top(a);
+	bn_check_top(p);
+	bn_check_top(m);
+
+	top = m->top;
+
+	if (!(m->d[0] & 1))
+		{
+		BNerr(BN_F_BN_MOD_EXP_MONT_CONSTTIME,BN_R_CALLED_WITH_EVEN_MODULUS);
+		return(0);
+		}
+	bits=BN_num_bits(p);
+	if (bits == 0)
+		{
+		ret = BN_one(rr);
+		return ret;
+		}
+
+ 	/* Initialize BIGNUM context and allocate intermediate result */
+	BN_CTX_start(ctx);
+	r = BN_CTX_get(ctx);
+	if (r == NULL) goto err;
+
+	/* Allocate a montgomery context if it was not supplied by the caller.
+	 * If this is not done, things will break in the montgomery part.
+ 	 */
+	if (in_mont != NULL)
+		mont=in_mont;
+	else
+		{
+		if ((mont=BN_MONT_CTX_new()) == NULL) goto err;
+		if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
+		}
+
+	/* Get the window size to use with size of p. */
+	window = BN_window_bits_for_ctime_exponent_size(bits);
+
+	/* Allocate a buffer large enough to hold all of the pre-computed
+	 * powers of a.
+	 */
+	numPowers = 1 << window;
+	powerbufLen = sizeof(m->d[0])*top*numPowers;
+	if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
+		goto err;
+		
+	powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
+	memset(powerbuf, 0, powerbufLen);
+
+ 	/* Initialize the intermediate result. Do this early to save double conversion,
+	 * once each for a^0 and intermediate result.
+	 */
+ 	if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
+	if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
+
+	/* Initialize computeTemp as a^1 with montgomery precalcs */
+	computeTemp = BN_CTX_get(ctx);
+	am = BN_CTX_get(ctx);
+	if (computeTemp==NULL || am==NULL) goto err;
+
+	if (a->neg || BN_ucmp(a,m) >= 0)
+		{
+		if (!BN_mod(am,a,m,ctx))
+			goto err;
+		aa= am;
+		}
+	else
+		aa=a;
+	if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
+	if (!BN_copy(computeTemp, am)) goto err;
+	if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
+
+	/* If the window size is greater than 1, then calculate
+	 * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
+	 * (even powers could instead be computed as (a^(i/2))^2
+	 * to use the slight performance advantage of sqr over mul).
+	 */
+	if (window > 1)
+		{
+		for (i=2; i<numPowers; i++)
+			{
+			/* Calculate a^i = a^(i-1) * a */
+			if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
+				goto err;
+			if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
+			}
+		}
+
+ 	/* Adjust the number of bits up to a multiple of the window size.
+ 	 * If the exponent length is not a multiple of the window size, then
+ 	 * this pads the most significant bits with zeros to normalize the
+ 	 * scanning loop to there's no special cases.
+ 	 *
+ 	 * * NOTE: Making the window size a power of two less than the native
+	 * * word size ensures that the padded bits won't go past the last
+ 	 * * word in the internal BIGNUM structure. Going past the end will
+ 	 * * still produce the correct result, but causes a different branch
+ 	 * * to be taken in the BN_is_bit_set function.
+ 	 */
+ 	bits = ((bits+window-1)/window)*window;
+ 	idx=bits-1;	/* The top bit of the window */
+
+ 	/* Scan the exponent one window at a time starting from the most
+ 	 * significant bits.
+ 	 */
+ 	while (idx >= 0)
+  		{
+ 		wvalue=0; /* The 'value' of the window */
+ 		
+ 		/* Scan the window, squaring the result as we go */
+ 		for (i=0; i<window; i++,idx--)
+ 			{
+			if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))	goto err;
+			wvalue = (wvalue<<1)+BN_is_bit_set(p,idx);
+  			}
+ 		
+		/* Fetch the appropriate pre-computed value from the pre-buf */
+		if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
+
+ 		/* Multiply the result into the intermediate result */
+ 		if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err;
+  		}
+
+ 	/* Convert the final result from montgomery to standard format */
+	if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
+	ret=1;
+err:
+	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
+	if (powerbuf!=NULL)
+		{
+		OPENSSL_cleanse(powerbuf,powerbufLen);
+		OPENSSL_free(powerbufFree);
+		}
+ 	if (am!=NULL) BN_clear(am);
+ 	if (computeTemp!=NULL) BN_clear(computeTemp);
+	BN_CTX_end(ctx);
+	return(ret);
+	}
+
 int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
                          const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
 	{
@@ -519,6 +746,13 @@ int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
 #define BN_TO_MONTGOMERY_WORD(r, w, mont) \
 		(BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
 
+	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
+		{
+		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+		BNerr(BN_F_BN_MOD_EXP_MONT_WORD,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return -1;
+		}
+
 	bn_check_top(p);
 	bn_check_top(m);
 
@@ -648,6 +882,13 @@ int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 	/* Table of variables obtained from 'ctx' */
 	BIGNUM *val[TABLE_SIZE];
 
+	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
+		{
+		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+		BNerr(BN_F_BN_MOD_EXP_SIMPLE,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return -1;
+		}
+
 	bits=BN_num_bits(p);
 
 	if (bits == 0)
diff --git a/crypto/bn/bn_gcd.c b/crypto/bn/bn_gcd.c
index 0248753f6d..4a352119ba 100644
--- a/crypto/bn/bn_gcd.c
+++ b/crypto/bn/bn_gcd.c
@@ -203,6 +203,8 @@ err:
 
 
 /* solves ax == 1 (mod n) */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
+        const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
 BIGNUM *BN_mod_inverse(BIGNUM *in,
 	const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
 	{
@@ -210,6 +212,11 @@ BIGNUM *BN_mod_inverse(BIGNUM *in,
 	BIGNUM *ret=NULL;
 	int sign;
 
+	if ((BN_get_flags(a, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(n, BN_FLG_CONSTTIME) != 0))
+		{
+		return BN_mod_inverse_no_branch(in, a, n, ctx);
+		}
+
 	bn_check_top(a);
 	bn_check_top(n);
 
@@ -488,7 +495,160 @@ BIGNUM *BN_mod_inverse(BIGNUM *in,
 err:
 	if ((ret == NULL) && (in == NULL)) BN_free(R);
 	BN_CTX_end(ctx);
-	if (ret)
-		bn_check_top(ret);
+	bn_check_top(ret);
+	return(ret);
+	}
+
+
+/* BN_mod_inverse_no_branch is a special version of BN_mod_inverse. 
+ * It does not contain branches that may leak sensitive information.
+ */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
+	const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
+	{
+	BIGNUM *A,*B,*X,*Y,*M,*D,*T,*R=NULL;
+	BIGNUM local_A, local_B;
+	BIGNUM *pA, *pB;
+	BIGNUM *ret=NULL;
+	int sign;
+
+	bn_check_top(a);
+	bn_check_top(n);
+
+	BN_CTX_start(ctx);
+	A = BN_CTX_get(ctx);
+	B = BN_CTX_get(ctx);
+	X = BN_CTX_get(ctx);
+	D = BN_CTX_get(ctx);
+	M = BN_CTX_get(ctx);
+	Y = BN_CTX_get(ctx);
+	T = BN_CTX_get(ctx);
+	if (T == NULL) goto err;
+
+	if (in == NULL)
+		R=BN_new();
+	else
+		R=in;
+	if (R == NULL) goto err;
+
+	BN_one(X);
+	BN_zero(Y);
+	if (BN_copy(B,a) == NULL) goto err;
+	if (BN_copy(A,n) == NULL) goto err;
+	A->neg = 0;
+
+	if (B->neg || (BN_ucmp(B, A) >= 0))
+		{
+		/* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+	 	 * BN_div_no_branch will be called eventually.
+	 	 */
+		pB = &local_B;
+		BN_with_flags(pB, B, BN_FLG_CONSTTIME);	
+		if (!BN_nnmod(B, pB, A, ctx)) goto err;
+		}
+	sign = -1;
+	/* From  B = a mod |n|,  A = |n|  it follows that
+	 *
+	 *      0 <= B < A,
+	 *     -sign*X*a  ==  B   (mod |n|),
+	 *      sign*Y*a  ==  A   (mod |n|).
+	 */
+
+	while (!BN_is_zero(B))
+		{
+		BIGNUM *tmp;
+		
+		/*
+		 *      0 < B < A,
+		 * (*) -sign*X*a  ==  B   (mod |n|),
+		 *      sign*Y*a  ==  A   (mod |n|)
+		 */
+
+		/* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+	 	 * BN_div_no_branch will be called eventually.
+	 	 */
+		pA = &local_A;
+		BN_with_flags(pA, A, BN_FLG_CONSTTIME);	
+		
+		/* (D, M) := (A/B, A%B) ... */		
+		if (!BN_div(D,M,pA,B,ctx)) goto err;
+		
+		/* Now
+		 *      A = D*B + M;
+		 * thus we have
+		 * (**)  sign*Y*a  ==  D*B + M   (mod |n|).
+		 */
+		
+		tmp=A; /* keep the BIGNUM object, the value does not matter */
+		
+		/* (A, B) := (B, A mod B) ... */
+		A=B;
+		B=M;
+		/* ... so we have  0 <= B < A  again */
+		
+		/* Since the former  M  is now  B  and the former  B  is now  A,
+		 * (**) translates into
+		 *       sign*Y*a  ==  D*A + B    (mod |n|),
+		 * i.e.
+		 *       sign*Y*a - D*A  ==  B    (mod |n|).
+		 * Similarly, (*) translates into
+		 *      -sign*X*a  ==  A          (mod |n|).
+		 *
+		 * Thus,
+		 *   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
+		 * i.e.
+		 *        sign*(Y + D*X)*a  ==  B  (mod |n|).
+		 *
+		 * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
+		 *      -sign*X*a  ==  B   (mod |n|),
+		 *       sign*Y*a  ==  A   (mod |n|).
+		 * Note that  X  and  Y  stay non-negative all the time.
+		 */
+			
+		if (!BN_mul(tmp,D,X,ctx)) goto err;
+		if (!BN_add(tmp,tmp,Y)) goto err;
+
+		M=Y; /* keep the BIGNUM object, the value does not matter */
+		Y=X;
+		X=tmp;
+		sign = -sign;
+		}
+		
+	/*
+	 * The while loop (Euclid's algorithm) ends when
+	 *      A == gcd(a,n);
+	 * we have
+	 *       sign*Y*a  ==  A  (mod |n|),
+	 * where  Y  is non-negative.
+	 */
+
+	if (sign < 0)
+		{
+		if (!BN_sub(Y,n,Y)) goto err;
+		}
+	/* Now  Y*a  ==  A  (mod |n|).  */
+
+	if (BN_is_one(A))
+		{
+		/* Y*a == 1  (mod |n|) */
+		if (!Y->neg && BN_ucmp(Y,n) < 0)
+			{
+			if (!BN_copy(R,Y)) goto err;
+			}
+		else
+			{
+			if (!BN_nnmod(R,Y,n,ctx)) goto err;
+			}
+		}
+	else
+		{
+		BNerr(BN_F_BN_MOD_INVERSE_NO_BRANCH,BN_R_NO_INVERSE);
+		goto err;
+		}
+	ret=R;
+err:
+	if ((ret == NULL) && (in == NULL)) BN_free(R);
+	BN_CTX_end(ctx);
+	bn_check_top(ret);
 	return(ret);
 	}
diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c
index 8a945f043f..f7551dacd9 100644
--- a/crypto/bn/bn_gf2m.c
+++ b/crypto/bn/bn_gf2m.c
@@ -121,74 +121,12 @@ static const BN_ULONG SQR_tb[16] =
     SQR_tb[(w) >> 12 & 0xF] << 24 | SQR_tb[(w) >>  8 & 0xF] << 16 | \
     SQR_tb[(w) >>  4 & 0xF] <<  8 | SQR_tb[(w)       & 0xF]
 #endif
-#ifdef SIXTEEN_BIT
-#define SQR1(w) \
-    SQR_tb[(w) >> 12 & 0xF] <<  8 | SQR_tb[(w) >>  8 & 0xF]
-#define SQR0(w) \
-    SQR_tb[(w) >>  4 & 0xF] <<  8 | SQR_tb[(w)       & 0xF]
-#endif
-#ifdef EIGHT_BIT
-#define SQR1(w) \
-    SQR_tb[(w) >>  4 & 0xF]
-#define SQR0(w) \
-    SQR_tb[(w)       & 15]
-#endif
 
 /* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
  * result is a polynomial r with degree < 2 * BN_BITS - 1
  * The caller MUST ensure that the variables have the right amount
  * of space allocated.
  */
-#ifdef EIGHT_BIT
-static void bn_GF2m_mul_1x1(BN_ULONG *r1, BN_ULONG *r0, const BN_ULONG a, const BN_ULONG b)
-	{
-	register BN_ULONG h, l, s;
-	BN_ULONG tab[4], top1b = a >> 7;
-	register BN_ULONG a1, a2;
-
-	a1 = a & (0x7F); a2 = a1 << 1;
-
-	tab[0] = 0; tab[1] = a1; tab[2] = a2; tab[3] = a1^a2;
-
-	s = tab[b      & 0x3]; l  = s;
-	s = tab[b >> 2 & 0x3]; l ^= s << 2; h  = s >> 6;
-	s = tab[b >> 4 & 0x3]; l ^= s << 4; h ^= s >> 4;
-	s = tab[b >> 6      ]; l ^= s << 6; h ^= s >> 2;
-	
-	/* compensate for the top bit of a */
-
-	if (top1b & 01) { l ^= b << 7; h ^= b >> 1; } 
-
-	*r1 = h; *r0 = l;
-	} 
-#endif
-#ifdef SIXTEEN_BIT
-static void bn_GF2m_mul_1x1(BN_ULONG *r1, BN_ULONG *r0, const BN_ULONG a, const BN_ULONG b)
-	{
-	register BN_ULONG h, l, s;
-	BN_ULONG tab[4], top1b = a >> 15; 
-	register BN_ULONG a1, a2;
-
-	a1 = a & (0x7FFF); a2 = a1 << 1;
-
-	tab[0] = 0; tab[1] = a1; tab[2] = a2; tab[3] = a1^a2;
-
-	s = tab[b      & 0x3]; l  = s;
-	s = tab[b >> 2 & 0x3]; l ^= s <<  2; h  = s >> 14;
-	s = tab[b >> 4 & 0x3]; l ^= s <<  4; h ^= s >> 12;
-	s = tab[b >> 6 & 0x3]; l ^= s <<  6; h ^= s >> 10;
-	s = tab[b >> 8 & 0x3]; l ^= s <<  8; h ^= s >>  8;
-	s = tab[b >>10 & 0x3]; l ^= s << 10; h ^= s >>  6;
-	s = tab[b >>12 & 0x3]; l ^= s << 12; h ^= s >>  4;
-	s = tab[b >>14      ]; l ^= s << 14; h ^= s >>  2;
-
-	/* compensate for the top bit of a */
-
-	if (top1b & 01) { l ^= b << 15; h ^= b >> 1; } 
-
-	*r1 = h; *r0 = l;
-	} 
-#endif
 #ifdef THIRTY_TWO_BIT
 static void bn_GF2m_mul_1x1(BN_ULONG *r1, BN_ULONG *r0, const BN_ULONG a, const BN_ULONG b)
 	{
@@ -320,7 +258,7 @@ int	BN_GF2m_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
 
 
 /* Performs modular reduction of a and store result in r.  r could be a. */
-int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[])
+int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
 	{
 	int j, k;
 	int n, dN, d0, d1;
@@ -384,7 +322,11 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[])
 		if (zz == 0) break;
 		d1 = BN_BITS2 - d0;
 		
-		if (d0) z[dN] = (z[dN] << d1) >> d1; /* clear up the top d1 bits */
+		/* clear up the top d1 bits */
+		if (d0)
+			z[dN] = (z[dN] << d1) >> d1;
+		else
+			z[dN] = 0;
 		z[0] ^= zz; /* reduction t^0 component */
 
 		for (k = 1; p[k] != 0; k++)
@@ -417,11 +359,11 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[])
 int	BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
 	{
 	int ret = 0;
-	const int max = BN_num_bits(p);
-	unsigned int *arr=NULL;
+	const int max = BN_num_bits(p) + 1;
+	int *arr=NULL;
 	bn_check_top(a);
 	bn_check_top(p);
-	if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) * max)) == NULL) goto err;
+	if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
 	ret = BN_GF2m_poly2arr(p, arr, max);
 	if (!ret || ret > max)
 		{
@@ -439,7 +381,7 @@ err:
 /* Compute the product of two polynomials a and b, reduce modulo p, and store
  * the result in r.  r could be a or b; a could be b.
  */
-int	BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const unsigned int p[], BN_CTX *ctx)
+int	BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const int p[], BN_CTX *ctx)
 	{
 	int zlen, i, j, k, ret = 0;
 	BIGNUM *s;
@@ -495,12 +437,12 @@ err:
 int	BN_GF2m_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *p, BN_CTX *ctx)
 	{
 	int ret = 0;
-	const int max = BN_num_bits(p);
-	unsigned int *arr=NULL;
+	const int max = BN_num_bits(p) + 1;
+	int *arr=NULL;
 	bn_check_top(a);
 	bn_check_top(b);
 	bn_check_top(p);
-	if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) * max)) == NULL) goto err;
+	if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
 	ret = BN_GF2m_poly2arr(p, arr, max);
 	if (!ret || ret > max)
 		{
@@ -516,7 +458,7 @@ err:
 
 
 /* Square a, reduce the result mod p, and store it in a.  r could be a. */
-int	BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[], BN_CTX *ctx)
+int	BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[], BN_CTX *ctx)
 	{
 	int i, ret = 0;
 	BIGNUM *s;
@@ -551,12 +493,12 @@ err:
 int	BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 	{
 	int ret = 0;
-	const int max = BN_num_bits(p);
-	unsigned int *arr=NULL;
+	const int max = BN_num_bits(p) + 1;
+	int *arr=NULL;
 
 	bn_check_top(a);
 	bn_check_top(p);
-	if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) * max)) == NULL) goto err;
+	if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
 	ret = BN_GF2m_poly2arr(p, arr, max);
 	if (!ret || ret > max)
 		{
@@ -638,7 +580,7 @@ err:
  * function is only provided for convenience; for best performance, use the 
  * BN_GF2m_mod_inv function.
  */
-int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *xx, const unsigned int p[], BN_CTX *ctx)
+int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *xx, const int p[], BN_CTX *ctx)
 	{
 	BIGNUM *field;
 	int ret = 0;
@@ -764,7 +706,7 @@ err:
  * function is only provided for convenience; for best performance, use the 
  * BN_GF2m_mod_div function.
  */
-int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *yy, const BIGNUM *xx, const unsigned int p[], BN_CTX *ctx)
+int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *yy, const BIGNUM *xx, const int p[], BN_CTX *ctx)
 	{
 	BIGNUM *field;
 	int ret = 0;
@@ -789,7 +731,7 @@ err:
  * the result in r.  r could be a.
  * Uses simple square-and-multiply algorithm A.5.1 from IEEE P1363.
  */
-int	BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const unsigned int p[], BN_CTX *ctx)
+int	BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const int p[], BN_CTX *ctx)
 	{
 	int ret = 0, i, n;
 	BIGNUM *u;
@@ -835,12 +777,12 @@ err:
 int BN_GF2m_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *p, BN_CTX *ctx)
 	{
 	int ret = 0;
-	const int max = BN_num_bits(p);
-	unsigned int *arr=NULL;
+	const int max = BN_num_bits(p) + 1;
+	int *arr=NULL;
 	bn_check_top(a);
 	bn_check_top(b);
 	bn_check_top(p);
-	if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) * max)) == NULL) goto err;
+	if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
 	ret = BN_GF2m_poly2arr(p, arr, max);
 	if (!ret || ret > max)
 		{
@@ -858,7 +800,7 @@ err:
  * the result in r.  r could be a.
  * Uses exponentiation as in algorithm A.4.1 from IEEE P1363.
  */
-int	BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[], BN_CTX *ctx)
+int	BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a, const int p[], BN_CTX *ctx)
 	{
 	int ret = 0;
 	BIGNUM *u;
@@ -894,15 +836,15 @@ err:
 int BN_GF2m_mod_sqrt(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 	{
 	int ret = 0;
-	const int max = BN_num_bits(p);
-	unsigned int *arr=NULL;
+	const int max = BN_num_bits(p) + 1;
+	int *arr=NULL;
 	bn_check_top(a);
 	bn_check_top(p);
-	if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) * max)) == NULL) goto err;
+	if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
 	ret = BN_GF2m_poly2arr(p, arr, max);
 	if (!ret || ret > max)
 		{
-		BNerr(BN_F_BN_GF2M_MOD_EXP,BN_R_INVALID_LENGTH);
+		BNerr(BN_F_BN_GF2M_MOD_SQRT,BN_R_INVALID_LENGTH);
 		goto err;
 		}
 	ret = BN_GF2m_mod_sqrt_arr(r, a, arr, ctx);
@@ -915,10 +857,9 @@ err:
 /* Find r such that r^2 + r = a mod p.  r could be a. If no r exists returns 0.
  * Uses algorithms A.4.7 and A.4.6 from IEEE P1363.
  */
-int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a_, const unsigned int p[], BN_CTX *ctx)
+int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a_, const int p[], BN_CTX *ctx)
 	{
-	int ret = 0, count = 0;
-	unsigned int j;
+	int ret = 0, count = 0, j;
 	BIGNUM *a, *z, *rho, *w, *w2, *tmp;
 
 	bn_check_top(a_);
@@ -1013,11 +954,11 @@ err:
 int BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 	{
 	int ret = 0;
-	const int max = BN_num_bits(p);
-	unsigned int *arr=NULL;
+	const int max = BN_num_bits(p) + 1;
+	int *arr=NULL;
 	bn_check_top(a);
 	bn_check_top(p);
-	if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) *
+	if ((arr = (int *)OPENSSL_malloc(sizeof(int) *
 						max)) == NULL) goto err;
 	ret = BN_GF2m_poly2arr(p, arr, max);
 	if (!ret || ret > max)
@@ -1033,20 +974,17 @@ err:
 	}
 
 /* Convert the bit-string representation of a polynomial
- * ( \sum_{i=0}^n a_i * x^i , where a_0 is *not* zero) into an array
- * of integers corresponding to the bits with non-zero coefficient.
+ * ( \sum_{i=0}^n a_i * x^i) into an array of integers corresponding 
+ * to the bits with non-zero coefficient.  Array is terminated with -1.
  * Up to max elements of the array will be filled.  Return value is total
- * number of coefficients that would be extracted if array was large enough.
+ * number of array elements that would be filled if array was large enough.
  */
-int BN_GF2m_poly2arr(const BIGNUM *a, unsigned int p[], int max)
+int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max)
 	{
 	int i, j, k = 0;
 	BN_ULONG mask;
 
-	if (BN_is_zero(a) || !BN_is_bit_set(a, 0))
-		/* a_0 == 0 => return error (the unsigned int array
-		 * must be terminated by 0)
-		 */
+	if (BN_is_zero(a))
 		return 0;
 
 	for (i = a->top - 1; i >= 0; i--)
@@ -1066,23 +1004,28 @@ int BN_GF2m_poly2arr(const BIGNUM *a, unsigned int p[], int max)
 			}
 		}
 
+	if (k < max) {
+		p[k] = -1;
+		k++;
+	}
+
 	return k;
 	}
 
 /* Convert the coefficient array representation of a polynomial to a 
- * bit-string.  The array must be terminated by 0.
+ * bit-string.  The array must be terminated by -1.
  */
-int BN_GF2m_arr2poly(const unsigned int p[], BIGNUM *a)
+int BN_GF2m_arr2poly(const int p[], BIGNUM *a)
 	{
 	int i;
 
 	bn_check_top(a);
 	BN_zero(a);
-	for (i = 0; p[i] != 0; i++)
+	for (i = 0; p[i] != -1; i++)
 		{
-		BN_set_bit(a, p[i]);
+		if (BN_set_bit(a, p[i]) == 0)
+			return 0;
 		}
-	BN_set_bit(a, 0);
 	bn_check_top(a);
 
 	return 1;
diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h
index 45e19221aa..d7dff0d90c 100644
--- a/crypto/bn/bn_lcl.h
+++ b/crypto/bn/bn_lcl.h
@@ -163,6 +163,45 @@ extern "C" {
 
 
 
+/* BN_mod_exp_mont_conttime is based on the assumption that the
+ * L1 data cache line width of the target processor is at least
+ * the following value.
+ */
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH	( 64 )
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK	(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
+
+/* Window sizes optimized for fixed window size modular exponentiation
+ * algorithm (BN_mod_exp_mont_consttime).
+ *
+ * To achieve the security goals of BN_mode_exp_mont_consttime, the
+ * maximum size of the window must not exceed
+ * log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH). 
+ *
+ * Window size thresholds are defined for cache line sizes of 32 and 64,
+ * cache line sizes where log_2(32)=5 and log_2(64)=6 respectively. A
+ * window size of 7 should only be used on processors that have a 128
+ * byte or greater cache line size.
+ */
+#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
+
+#  define BN_window_bits_for_ctime_exponent_size(b) \
+		((b) > 937 ? 6 : \
+		 (b) > 306 ? 5 : \
+		 (b) >  89 ? 4 : \
+		 (b) >  22 ? 3 : 1)
+#  define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE	(6)
+
+#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
+
+#  define BN_window_bits_for_ctime_exponent_size(b) \
+		((b) > 306 ? 5 : \
+		 (b) >  89 ? 4 : \
+		 (b) >  22 ? 3 : 1)
+#  define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE	(5)
+
+#endif
+
+
 /* Pentium pro 16,16,16,32,64 */
 /* Alpha       16,16,16,16.64 */
 #define BN_MULL_SIZE_NORMAL			(16) /* 32 */
@@ -199,7 +238,7 @@ extern "C" {
 #  if defined(__DECC)
 #   include <c_asm.h>
 #   define BN_UMULT_HIGH(a,b)	(BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
-#  elif defined(__GNUC__)
+#  elif defined(__GNUC__) && __GNUC__>=2
 #   define BN_UMULT_HIGH(a,b)	({	\
 	register BN_ULONG ret;		\
 	asm ("umulh	%1,%2,%0"	\
@@ -208,7 +247,7 @@ extern "C" {
 	ret;			})
 #  endif	/* compiler */
 # elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
-#  if defined(__GNUC__)
+#  if defined(__GNUC__) && __GNUC__>=2
 #   define BN_UMULT_HIGH(a,b)	({	\
 	register BN_ULONG ret;		\
 	asm ("mulhdu	%0,%1,%2"	\
@@ -216,8 +255,9 @@ extern "C" {
 	     : "r"(a), "r"(b));		\
 	ret;			})
 #  endif	/* compiler */
-# elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
-#  if defined(__GNUC__)
+# elif (defined(__x86_64) || defined(__x86_64__)) && \
+       (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
+#  if defined(__GNUC__) && __GNUC__>=2
 #   define BN_UMULT_HIGH(a,b)	({	\
 	register BN_ULONG ret,discard;	\
 	asm ("mulq	%3"		\
@@ -231,6 +271,28 @@ extern "C" {
 		: "a"(a),"g"(b)		\
 		: "cc");
 #  endif
+# elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
+#  if defined(_MSC_VER) && _MSC_VER>=1400
+    unsigned __int64 __umulh	(unsigned __int64 a,unsigned __int64 b);
+    unsigned __int64 _umul128	(unsigned __int64 a,unsigned __int64 b,
+				 unsigned __int64 *h);
+#   pragma intrinsic(__umulh,_umul128)
+#   define BN_UMULT_HIGH(a,b)		__umulh((a),(b))
+#   define BN_UMULT_LOHI(low,high,a,b)	((low)=_umul128((a),(b),&(high)))
+#  endif
+# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+#  if defined(__GNUC__) && __GNUC__>=2
+#   define BN_UMULT_HIGH(a,b)	({	\
+	register BN_ULONG ret;		\
+	asm ("dmultu	%1,%2"		\
+	     : "=h"(ret)		\
+	     : "r"(a), "r"(b) : "l");	\
+	ret;			})
+#   define BN_UMULT_LOHI(low,high,a,b)	\
+	asm ("dmultu	%2,%3"		\
+	     : "=l"(low),"=h"(high)	\
+	     : "r"(a), "r"(b));
+#  endif
 # endif		/* cpu */
 #endif		/* OPENSSL_NO_ASM */
 
@@ -274,6 +336,33 @@ extern "C" {
 	(r1)=Hw(t); \
 	}
 
+#elif defined(BN_UMULT_LOHI)
+#define mul_add(r,a,w,c) {		\
+	BN_ULONG high,low,ret,tmp=(a);	\
+	ret =  (r);			\
+	BN_UMULT_LOHI(low,high,w,tmp);	\
+	ret += (c);			\
+	(c) =  (ret<(c))?1:0;		\
+	(c) += high;			\
+	ret += low;			\
+	(c) += (ret<low)?1:0;		\
+	(r) =  ret;			\
+	}
+
+#define mul(r,a,w,c)	{		\
+	BN_ULONG high,low,ret,ta=(a);	\
+	BN_UMULT_LOHI(low,high,w,ta);	\
+	ret =  low + (c);		\
+	(c) =  high;			\
+	(c) += (ret<low)?1:0;		\
+	(r) =  ret;			\
+	}
+
+#define sqr(r0,r1,a)	{		\
+	BN_ULONG tmp=(a);		\
+	BN_UMULT_LOHI(r0,r1,tmp,tmp);	\
+	}
+
 #elif defined(BN_UMULT_HIGH)
 #define mul_add(r,a,w,c) {		\
 	BN_ULONG high,low,ret,tmp=(a);	\
@@ -406,6 +495,7 @@ BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 	int cl, int dl);
 BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 	int cl, int dl);
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
 
 #ifdef  __cplusplus
 }
diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c
index bbefd80309..5470fbe6ef 100644
--- a/crypto/bn/bn_lib.c
+++ b/crypto/bn/bn_lib.c
@@ -67,7 +67,7 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-const char *BN_version="Big Number" OPENSSL_VERSION_PTEXT;
+const char BN_version[]="Big Number" OPENSSL_VERSION_PTEXT;
 
 /* This stuff appears to be completely unused, so is deprecated */
 #ifndef OPENSSL_NO_DEPRECATED
@@ -133,8 +133,8 @@ int BN_get_params(int which)
 
 const BIGNUM *BN_value_one(void)
 	{
-	static BN_ULONG data_one=1L;
-	static BIGNUM const_one={&data_one,1,1,0,BN_FLG_STATIC_DATA};
+	static const BN_ULONG data_one=1L;
+	static const BIGNUM const_one={(BN_ULONG *)&data_one,1,1,0,BN_FLG_STATIC_DATA};
 
 	return(&const_one);
 	}
@@ -160,7 +160,7 @@ char *BN_options(void)
 
 int BN_num_bits_word(BN_ULONG l)
 	{
-	static const char bits[256]={
+	static const unsigned char bits[256]={
 		0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,
 		5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
 		6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
@@ -235,7 +235,7 @@ int BN_num_bits_word(BN_ULONG l)
 		else
 #endif
 			{
-#if defined(SIXTEEN_BIT) || defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
+#if defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
 			if (l & 0xff00L)
 				return(bits[(int)(l>>8)]+8);
 			else	
@@ -531,46 +531,6 @@ BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b)
 	return(a);
 	}
 
-BIGNUM *BN_ncopy(BIGNUM *a, const BIGNUM *b, size_t n)
-	{
-	int i, min;
-	BN_ULONG *A;
-	const BN_ULONG *B;
-
-	bn_check_top(b);
-	if (a == b)
-		return a;
-
-	min = (b->top < (int)n)? b->top: (int)n;
-	if (!min)
-		{
-		BN_zero(a);
-		return a;
-		}
-	if (bn_wexpand(a, min) == NULL)
-		return NULL;
-
-	A=a->d;
-	B=b->d;
-	for (i=min>>2; i>0; i--, A+=4, B+=4)
-		{
-		BN_ULONG a0,a1,a2,a3;
-		a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
-		A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
-		}
-	switch (min&3)
-		{
-		case 3: A[2]=B[2];
-		case 2: A[1]=B[1];
-		case 1: A[0]=B[0];
-		case 0: ;
-		}
-	a->top = min;
-	a->neg = b->neg;
-	bn_correct_top(a);
-	return(a);
-	}
-
 void BN_swap(BIGNUM *a, BIGNUM *b)
 	{
 	int flags_old_a, flags_old_b;
@@ -803,7 +763,7 @@ int BN_is_bit_set(const BIGNUM *a, int n)
 	i=n/BN_BITS2;
 	j=n%BN_BITS2;
 	if (a->top <= i) return 0;
-	return((a->d[i]&(((BN_ULONG)1)<<j))?1:0);
+	return (int)(((a->d[i])>>j)&((BN_ULONG)1));
 	}
 
 int BN_mask_bits(BIGNUM *a, int n)
@@ -827,6 +787,14 @@ int BN_mask_bits(BIGNUM *a, int n)
 	return(1);
 	}
 
+void BN_set_negative(BIGNUM *a, int b)
+	{
+	if (b && !BN_is_zero(a))
+		a->neg = 1;
+	else
+		a->neg = 0;
+	}
+
 int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n)
 	{
 	int i;
diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c
index 61416483cb..7224637ab3 100644
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@@ -55,6 +55,59 @@
  * copied and put under another distribution licence
  * [including the GNU Public Licence.]
  */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
 
 /*
  * Details about Montgomery multiplication algorithms can be found at
@@ -69,11 +122,30 @@
 
 #define MONT_WORD /* use the faster word-based algorithm */
 
+#ifdef MONT_WORD
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
+#endif
+
 int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
 			  BN_MONT_CTX *mont, BN_CTX *ctx)
 	{
 	BIGNUM *tmp;
 	int ret=0;
+#if defined(OPENSSL_BN_ASM_MONT) && defined(MONT_WORD)
+	int num = mont->N.top;
+
+	if (num>1 && a->top==num && b->top==num)
+		{
+		if (bn_wexpand(r,num) == NULL) return(0);
+		if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num))
+			{
+			r->neg = a->neg^b->neg;
+			r->top = num;
+			bn_correct_top(r);
+			return(1);
+			}
+		}
+#endif
 
 	BN_CTX_start(ctx);
 	tmp = BN_CTX_get(ctx);
@@ -89,7 +161,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
 		if (!BN_mul(tmp,a,b,ctx)) goto err;
 		}
 	/* reduce from aRR to aR */
+#ifdef MONT_WORD
+	if (!BN_from_montgomery_word(r,tmp,mont)) goto err;
+#else
 	if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err;
+#endif
 	bn_check_top(r);
 	ret=1;
 err:
@@ -97,35 +173,25 @@ err:
 	return(ret);
 	}
 
-int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
-	     BN_CTX *ctx)
-	{
-	int retn=0;
-
 #ifdef MONT_WORD
-	BIGNUM *n,*r;
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
+	{
+	BIGNUM *n;
 	BN_ULONG *ap,*np,*rp,n0,v,*nrp;
 	int al,nl,max,i,x,ri;
 
-	BN_CTX_start(ctx);
-	if ((r = BN_CTX_get(ctx)) == NULL) goto err;
-
-	if (!BN_copy(r,a)) goto err;
 	n= &(mont->N);
-
-	ap=a->d;
 	/* mont->ri is the size of mont->N in bits (rounded up
 	   to the word size) */
 	al=ri=mont->ri/BN_BITS2;
-	
+
 	nl=n->top;
-	if ((al == 0) || (nl == 0)) { r->top=0; return(1); }
+	if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
 
 	max=(nl+al+1); /* allow for overflow (no?) XXX */
-	if (bn_wexpand(r,max) == NULL) goto err;
-	if (bn_wexpand(ret,max) == NULL) goto err;
+	if (bn_wexpand(r,max) == NULL) return(0);
 
-	r->neg=a->neg^n->neg;
+	r->neg^=n->neg;
 	np=n->d;
 	rp=r->d;
 	nrp= &(r->d[nl]);
@@ -139,10 +205,10 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 #endif
 
 	r->top=max;
-	n0=mont->n0;
+	n0=mont->n0[0];
 
 #ifdef BN_COUNT
-	fprintf(stderr,"word BN_from_montgomery %d * %d\n",nl,nl);
+	fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
 #endif
 	for (i=0; i<nl; i++)
 		{
@@ -174,20 +240,72 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 			}
 		}
 	bn_correct_top(r);
-	
-	/* mont->ri will be a multiple of the word size */
-#if 0
-	BN_rshift(ret,r,mont->ri);
-#else
-	ret->neg = r->neg;
-	x=ri;
+
+	/* mont->ri will be a multiple of the word size and below code
+	 * is kind of BN_rshift(ret,r,mont->ri) equivalent */
+	if (r->top <= ri)
+		{
+		ret->top=0;
+		return(1);
+		}
+	al=r->top-ri;
+
+#define BRANCH_FREE 1
+#if BRANCH_FREE
+	if (bn_wexpand(ret,ri) == NULL) return(0);
+	x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
+	ret->top=x=(ri&~x)|(al&x);	/* min(ri,al) */
+	ret->neg=r->neg;
+
 	rp=ret->d;
-	ap= &(r->d[x]);
-	if (r->top < x)
-		al=0;
-	else
-		al=r->top-x;
+	ap=&(r->d[ri]);
+
+	{
+	size_t m1,m2;
+
+	v=bn_sub_words(rp,ap,np,ri);
+	/* this ----------------^^ works even in al<ri case
+	 * thanks to zealous zeroing of top of the vector in the
+	 * beginning. */
+
+	/* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
+	/* in other words if subtraction result is real, then
+	 * trick unconditional memcpy below to perform in-place
+	 * "refresh" instead of actual copy. */
+	m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1);	/* al<ri */
+	m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1);	/* al>ri */
+	m1|=m2;			/* (al!=ri) */
+	m1|=(0-(size_t)v);	/* (al!=ri || v) */
+	m1&=~m2;		/* (al!=ri || v) && !al>ri */
+	nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
+	}
+
+	/* 'i<ri' is chosen to eliminate dependency on input data, even
+	 * though it results in redundant copy in al<ri case. */
+	for (i=0,ri-=4; i<ri; i+=4)
+		{
+		BN_ULONG t1,t2,t3,t4;
+		
+		t1=nrp[i+0];
+		t2=nrp[i+1];
+		t3=nrp[i+2];	ap[i+0]=0;
+		t4=nrp[i+3];	ap[i+1]=0;
+		rp[i+0]=t1;	ap[i+2]=0;
+		rp[i+1]=t2;	ap[i+3]=0;
+		rp[i+2]=t3;
+		rp[i+3]=t4;
+		}
+	for (ri+=4; i<ri; i++)
+		rp[i]=nrp[i], ap[i]=0;
+	bn_correct_top(r);
+	bn_correct_top(ret);
+#else
+	if (bn_wexpand(ret,al) == NULL) return(0);
 	ret->top=al;
+	ret->neg=r->neg;
+
+	rp=ret->d;
+	ap=&(r->d[ri]);
 	al-=4;
 	for (i=0; i<al; i+=4)
 		{
@@ -205,8 +323,30 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 	al+=4;
 	for (; i<al; i++)
 		rp[i]=ap[i];
+
+	if (BN_ucmp(ret, &(mont->N)) >= 0)
+		{
+		if (!BN_usub(ret,ret,&(mont->N))) return(0);
+		}
 #endif
-#else /* !MONT_WORD */ 
+	bn_check_top(ret);
+
+	return(1);
+	}
+#endif	/* MONT_WORD */
+
+int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
+	     BN_CTX *ctx)
+	{
+	int retn=0;
+#ifdef MONT_WORD
+	BIGNUM *t;
+
+	BN_CTX_start(ctx);
+	if ((t = BN_CTX_get(ctx)) && BN_copy(t,a))
+		retn = BN_from_montgomery_word(ret,t,mont);
+	BN_CTX_end(ctx);
+#else /* !MONT_WORD */
 	BIGNUM *t1,*t2;
 
 	BN_CTX_start(ctx);
@@ -223,7 +363,6 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 	if (!BN_mul(t1,t2,&mont->N,ctx)) goto err;
 	if (!BN_add(t2,a,t1)) goto err;
 	if (!BN_rshift(ret,t2,mont->ri)) goto err;
-#endif /* MONT_WORD */
 
 	if (BN_ucmp(ret, &(mont->N)) >= 0)
 		{
@@ -233,6 +372,7 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
 	bn_check_top(ret);
  err:
 	BN_CTX_end(ctx);
+#endif /* MONT_WORD */
 	return(retn);
 	}
 
@@ -254,6 +394,7 @@ void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
 	BN_init(&(ctx->RR));
 	BN_init(&(ctx->N));
 	BN_init(&(ctx->Ni));
+	ctx->n0[0] = ctx->n0[1] = 0;
 	ctx->flags=0;
 	}
 
@@ -285,16 +426,55 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
 		BIGNUM tmod;
 		BN_ULONG buf[2];
 
+		BN_init(&tmod);
+		tmod.d=buf;
+		tmod.dmax=2;
+		tmod.neg=0;
+
 		mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2;
+
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+		/* Only certain BN_BITS2<=32 platforms actually make use of
+		 * n0[1], and we could use the #else case (with a shorter R
+		 * value) for the others.  However, currently only the assembler
+		 * files do know which is which. */
+
+		BN_zero(R);
+		if (!(BN_set_bit(R,2*BN_BITS2))) goto err;
+
+								tmod.top=0;
+		if ((buf[0] = mod->d[0]))			tmod.top=1;
+		if ((buf[1] = mod->top>1 ? mod->d[1] : 0))	tmod.top=2;
+
+		if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
+			goto err;
+		if (!BN_lshift(Ri,Ri,2*BN_BITS2)) goto err; /* R*Ri */
+		if (!BN_is_zero(Ri))
+			{
+			if (!BN_sub_word(Ri,1)) goto err;
+			}
+		else /* if N mod word size == 1 */
+			{
+			if (bn_expand(Ri,(int)sizeof(BN_ULONG)*2) == NULL)
+				goto err;
+			/* Ri-- (mod double word size) */
+			Ri->neg=0;
+			Ri->d[0]=BN_MASK2;
+			Ri->d[1]=BN_MASK2;
+			Ri->top=2;
+			}
+		if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
+		/* Ni = (R*Ri-1)/N,
+		 * keep only couple of least significant words: */
+		mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+		mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
+#else
 		BN_zero(R);
 		if (!(BN_set_bit(R,BN_BITS2))) goto err;	/* R */
 
 		buf[0]=mod->d[0]; /* tmod = N mod word size */
 		buf[1]=0;
-		tmod.d=buf;
-		tmod.top=1;
-		tmod.dmax=2;
-		tmod.neg=0;
+		tmod.top = buf[0] != 0 ? 1 : 0;
 							/* Ri = R^-1 mod N*/
 		if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
 			goto err;
@@ -310,7 +490,9 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
 		if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
 		/* Ni = (R*Ri-1)/N,
 		 * keep only least significant word: */
-		mont->n0 = (Ri->top > 0) ? Ri->d[0] : 0;
+		mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+		mont->n0[1] = 0;
+#endif
 		}
 #else /* !MONT_WORD */
 		{ /* bignum version */
@@ -346,7 +528,40 @@ BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
 	if (!BN_copy(&(to->N),&(from->N))) return NULL;
 	if (!BN_copy(&(to->Ni),&(from->Ni))) return NULL;
 	to->ri=from->ri;
-	to->n0=from->n0;
+	to->n0[0]=from->n0[0];
+	to->n0[1]=from->n0[1];
 	return(to);
 	}
 
+BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
+					const BIGNUM *mod, BN_CTX *ctx)
+	{
+	int got_write_lock = 0;
+	BN_MONT_CTX *ret;
+
+	CRYPTO_r_lock(lock);
+	if (!*pmont)
+		{
+		CRYPTO_r_unlock(lock);
+		CRYPTO_w_lock(lock);
+		got_write_lock = 1;
+
+		if (!*pmont)
+			{
+			ret = BN_MONT_CTX_new();
+			if (ret && !BN_MONT_CTX_set(ret, mod, ctx))
+				BN_MONT_CTX_free(ret);
+			else
+				*pmont = ret;
+			}
+		}
+	
+	ret = *pmont;
+	
+	if (got_write_lock)
+		CRYPTO_w_unlock(lock);
+	else
+		CRYPTO_r_unlock(lock);
+		
+	return ret;
+	}
diff --git a/crypto/bn/bn_mul.c b/crypto/bn/bn_mul.c
index aec1eafc65..a0e9ec3b46 100644
--- a/crypto/bn/bn_mul.c
+++ b/crypto/bn/bn_mul.c
@@ -389,6 +389,7 @@ BN_ULONG bn_add_part_words(BN_ULONG *r,
  * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
  * a[1]*b[1]
  */
+/* dnX may not be positive, but n2/2+dnX has to be */
 void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 	int dna, int dnb, BN_ULONG *t)
 	{
@@ -398,7 +399,7 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 	BN_ULONG ln,lo,*p;
 
 # ifdef BN_COUNT
-	fprintf(stderr," bn_mul_recursive %d * %d\n",n2,n2);
+	fprintf(stderr," bn_mul_recursive %d%+d * %d%+d\n",n2,dna,n2,dnb);
 # endif
 # ifdef BN_MUL_COMBA
 #  if 0
@@ -545,6 +546,7 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 
 /* n+tn is the word length
  * t needs to be n*4 is size, as does r */
+/* tnX may not be negative but less than n */
 void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
 	     int tna, int tnb, BN_ULONG *t)
 	{
@@ -553,8 +555,8 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
 	BN_ULONG ln,lo,*p;
 
 # ifdef BN_COUNT
-	fprintf(stderr," bn_mul_part_recursive (%d+%d) * (%d+%d)\n",
-		tna, n, tnb, n);
+	fprintf(stderr," bn_mul_part_recursive (%d%+d) * (%d%+d)\n",
+		n, tna, n, tnb);
 # endif
 	if (n < 8)
 		{
@@ -655,14 +657,17 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
 				for (;;)
 					{
 					i/=2;
-					if (i < tna && i < tnb)
+					/* these simplified conditions work
+					 * exclusively because difference
+					 * between tna and tnb is 1 or 0 */
+					if (i < tna || i < tnb)
 						{
 						bn_mul_part_recursive(&(r[n2]),
 							&(a[n]),&(b[n]),
 							i,tna-i,tnb-i,p);
 						break;
 						}
-					else if (i <= tna && i <= tnb)
+					else if (i == tna || i == tnb)
 						{
 						bn_mul_recursive(&(r[n2]),
 							&(a[n]),&(b[n]),
@@ -1023,17 +1028,19 @@ int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
 			assert(j <= al || j <= bl);
 			k = j+j;
 			t = BN_CTX_get(ctx);
+			if (t == NULL)
+				goto err;
 			if (al > j || bl > j)
 				{
-				bn_wexpand(t,k*4);
-				bn_wexpand(rr,k*4);
+				if (bn_wexpand(t,k*4) == NULL) goto err;
+				if (bn_wexpand(rr,k*4) == NULL) goto err;
 				bn_mul_part_recursive(rr->d,a->d,b->d,
 					j,al-j,bl-j,t->d);
 				}
 			else	/* al <= j || bl <= j */
 				{
-				bn_wexpand(t,k*2);
-				bn_wexpand(rr,k*2);
+				if (bn_wexpand(t,k*2) == NULL) goto err;
+				if (bn_wexpand(rr,k*2) == NULL) goto err;
 				bn_mul_recursive(rr->d,a->d,b->d,
 					j,al-j,bl-j,t->d);
 				}
diff --git a/crypto/bn/bn_nist.c b/crypto/bn/bn_nist.c
index bbe2cbe749..2ca5b01391 100644
--- a/crypto/bn/bn_nist.c
+++ b/crypto/bn/bn_nist.c
@@ -1,6 +1,9 @@
 /* crypto/bn/bn_nist.c */
+/*
+ * Written by Nils Larsch for the OpenSSL project
+ */
 /* ====================================================================
- * Copyright (c) 1998-2002 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -56,795 +59,778 @@
 #include "bn_lcl.h"
 #include "cryptlib.h"
 
+
 #define BN_NIST_192_TOP	(192+BN_BITS2-1)/BN_BITS2
 #define BN_NIST_224_TOP	(224+BN_BITS2-1)/BN_BITS2
 #define BN_NIST_256_TOP	(256+BN_BITS2-1)/BN_BITS2
 #define BN_NIST_384_TOP	(384+BN_BITS2-1)/BN_BITS2
 #define BN_NIST_521_TOP	(521+BN_BITS2-1)/BN_BITS2
 
+/* pre-computed tables are "carry-less" values of modulus*(i+1) */
 #if BN_BITS2 == 64
-const static BN_ULONG _nist_p_192[] =
-	{0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFEULL,
-	0xFFFFFFFFFFFFFFFFULL};
-const static BN_ULONG _nist_p_224[] =
+static const BN_ULONG _nist_p_192[][BN_NIST_192_TOP] = {
+	{0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFFULL},
+	{0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFDULL,0xFFFFFFFFFFFFFFFFULL},
+	{0xFFFFFFFFFFFFFFFDULL,0xFFFFFFFFFFFFFFFCULL,0xFFFFFFFFFFFFFFFFULL}
+	};
+static const BN_ULONG _nist_p_192_sqr[] = {
+	0x0000000000000001ULL,0x0000000000000002ULL,0x0000000000000001ULL,
+	0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFDULL,0xFFFFFFFFFFFFFFFFULL
+	};
+static const BN_ULONG _nist_p_224[][BN_NIST_224_TOP] = {
 	{0x0000000000000001ULL,0xFFFFFFFF00000000ULL,
-	0xFFFFFFFFFFFFFFFFULL,0x00000000FFFFFFFFULL};
-const static BN_ULONG _nist_p_256[] =
+	 0xFFFFFFFFFFFFFFFFULL,0x00000000FFFFFFFFULL},
+	{0x0000000000000002ULL,0xFFFFFFFE00000000ULL,
+	 0xFFFFFFFFFFFFFFFFULL,0x00000001FFFFFFFFULL} /* this one is "carry-full" */
+	};
+static const BN_ULONG _nist_p_224_sqr[] = {
+	0x0000000000000001ULL,0xFFFFFFFE00000000ULL,
+	0xFFFFFFFFFFFFFFFFULL,0x0000000200000000ULL,
+	0x0000000000000000ULL,0xFFFFFFFFFFFFFFFEULL,
+	0xFFFFFFFFFFFFFFFFULL
+	};
+static const BN_ULONG _nist_p_256[][BN_NIST_256_TOP] = {
 	{0xFFFFFFFFFFFFFFFFULL,0x00000000FFFFFFFFULL,
-	0x0000000000000000ULL,0xFFFFFFFF00000001ULL};
-const static BN_ULONG _nist_p_384[] =
-	{0x00000000FFFFFFFFULL,0xFFFFFFFF00000000ULL,
-	0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFFULL,
-	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL};
-const static BN_ULONG _nist_p_521[] =
+	 0x0000000000000000ULL,0xFFFFFFFF00000001ULL},
+	{0xFFFFFFFFFFFFFFFEULL,0x00000001FFFFFFFFULL,
+	 0x0000000000000000ULL,0xFFFFFFFE00000002ULL},
+	{0xFFFFFFFFFFFFFFFDULL,0x00000002FFFFFFFFULL,
+	 0x0000000000000000ULL,0xFFFFFFFD00000003ULL},
+	{0xFFFFFFFFFFFFFFFCULL,0x00000003FFFFFFFFULL,
+	 0x0000000000000000ULL,0xFFFFFFFC00000004ULL},
+	{0xFFFFFFFFFFFFFFFBULL,0x00000004FFFFFFFFULL,
+	 0x0000000000000000ULL,0xFFFFFFFB00000005ULL},
+	};
+static const BN_ULONG _nist_p_256_sqr[] = {
+	0x0000000000000001ULL,0xFFFFFFFE00000000ULL,
+	0xFFFFFFFFFFFFFFFFULL,0x00000001FFFFFFFEULL,
+	0x00000001FFFFFFFEULL,0x00000001FFFFFFFEULL,
+	0xFFFFFFFE00000001ULL,0xFFFFFFFE00000002ULL
+	};
+static const BN_ULONG _nist_p_384[][BN_NIST_384_TOP] = {
+	{0x00000000FFFFFFFFULL,0xFFFFFFFF00000000ULL,0xFFFFFFFFFFFFFFFEULL,
+	 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
+	{0x00000001FFFFFFFEULL,0xFFFFFFFE00000000ULL,0xFFFFFFFFFFFFFFFDULL,
+	 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
+	{0x00000002FFFFFFFDULL,0xFFFFFFFD00000000ULL,0xFFFFFFFFFFFFFFFCULL,
+	 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
+	{0x00000003FFFFFFFCULL,0xFFFFFFFC00000000ULL,0xFFFFFFFFFFFFFFFBULL,
+	 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
+	{0x00000004FFFFFFFBULL,0xFFFFFFFB00000000ULL,0xFFFFFFFFFFFFFFFAULL,
+	 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
+	};
+static const BN_ULONG _nist_p_384_sqr[] = {
+	0xFFFFFFFE00000001ULL,0x0000000200000000ULL,0xFFFFFFFE00000000ULL,
+	0x0000000200000000ULL,0x0000000000000001ULL,0x0000000000000000ULL,
+	0x00000001FFFFFFFEULL,0xFFFFFFFE00000000ULL,0xFFFFFFFFFFFFFFFDULL,
+	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL
+	};
+static const BN_ULONG _nist_p_521[] =
 	{0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
 	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
 	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
 	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
 	0x00000000000001FFULL};
+static const BN_ULONG _nist_p_521_sqr[] = {
+	0x0000000000000001ULL,0x0000000000000000ULL,0x0000000000000000ULL,
+	0x0000000000000000ULL,0x0000000000000000ULL,0x0000000000000000ULL,
+	0x0000000000000000ULL,0x0000000000000000ULL,0xFFFFFFFFFFFFFC00ULL,
+	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
+	0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
+	0xFFFFFFFFFFFFFFFFULL,0x000000000003FFFFULL
+	};
 #elif BN_BITS2 == 32
-const static BN_ULONG _nist_p_192[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFE,
-	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
-const static BN_ULONG _nist_p_224[] = {0x00000001,0x00000000,0x00000000,
-	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
-const static BN_ULONG _nist_p_256[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
-	0x00000000,0x00000000,0x00000000,0x00000001,0xFFFFFFFF};
-const static BN_ULONG _nist_p_384[] = {0xFFFFFFFF,0x00000000,0x00000000,
-	0xFFFFFFFF,0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
-	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
-const static BN_ULONG _nist_p_521[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
+static const BN_ULONG _nist_p_192[][BN_NIST_192_TOP] = {
+	{0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+	{0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+	{0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFC,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}
+	};
+static const BN_ULONG _nist_p_192_sqr[] = {
+	0x00000001,0x00000000,0x00000002,0x00000000,0x00000001,0x00000000,
+	0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF
+	};
+static const BN_ULONG _nist_p_224[][BN_NIST_224_TOP] = {
+	{0x00000001,0x00000000,0x00000000,0xFFFFFFFF,
+	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+	{0x00000002,0x00000000,0x00000000,0xFFFFFFFE,
+	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}
+	};
+static const BN_ULONG _nist_p_224_sqr[] = {
+	0x00000001,0x00000000,0x00000000,0xFFFFFFFE,
+	0xFFFFFFFF,0xFFFFFFFF,0x00000000,0x00000002,
+	0x00000000,0x00000000,0xFFFFFFFE,0xFFFFFFFF,
+	0xFFFFFFFF,0xFFFFFFFF
+	};
+static const BN_ULONG _nist_p_256[][BN_NIST_256_TOP] = {
+	{0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0x00000000,
+	 0x00000000,0x00000000,0x00000001,0xFFFFFFFF},
+	{0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFF,0x00000001,
+	 0x00000000,0x00000000,0x00000002,0xFFFFFFFE},
+	{0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFF,0x00000002,
+	 0x00000000,0x00000000,0x00000003,0xFFFFFFFD},
+	{0xFFFFFFFC,0xFFFFFFFF,0xFFFFFFFF,0x00000003,
+	 0x00000000,0x00000000,0x00000004,0xFFFFFFFC},
+	{0xFFFFFFFB,0xFFFFFFFF,0xFFFFFFFF,0x00000004,
+	 0x00000000,0x00000000,0x00000005,0xFFFFFFFB},
+	};
+static const BN_ULONG _nist_p_256_sqr[] = {
+	0x00000001,0x00000000,0x00000000,0xFFFFFFFE,
+	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFE,0x00000001,
+	0xFFFFFFFE,0x00000001,0xFFFFFFFE,0x00000001,
+	0x00000001,0xFFFFFFFE,0x00000002,0xFFFFFFFE
+	};
+static const BN_ULONG _nist_p_384[][BN_NIST_384_TOP] = {
+	{0xFFFFFFFF,0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFE,0xFFFFFFFF,
+	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+	{0xFFFFFFFE,0x00000001,0x00000000,0xFFFFFFFE,0xFFFFFFFD,0xFFFFFFFF,
+	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+	{0xFFFFFFFD,0x00000002,0x00000000,0xFFFFFFFD,0xFFFFFFFC,0xFFFFFFFF,
+	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+	{0xFFFFFFFC,0x00000003,0x00000000,0xFFFFFFFC,0xFFFFFFFB,0xFFFFFFFF,
+	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+	{0xFFFFFFFB,0x00000004,0x00000000,0xFFFFFFFB,0xFFFFFFFA,0xFFFFFFFF,
+	 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+	};
+static const BN_ULONG _nist_p_384_sqr[] = {
+	0x00000001,0xFFFFFFFE,0x00000000,0x00000002,0x00000000,0xFFFFFFFE,
+	0x00000000,0x00000002,0x00000001,0x00000000,0x00000000,0x00000000,
+	0xFFFFFFFE,0x00000001,0x00000000,0xFFFFFFFE,0xFFFFFFFD,0xFFFFFFFF,
+	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF
+	};
+static const BN_ULONG _nist_p_521[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
 	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
 	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
 	0xFFFFFFFF,0x000001FF};
-#elif BN_BITS2 == 16
-const static BN_ULONG _nist_p_192[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFE,
-	0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF};
-const static BN_ULONG _nist_p_224[] = {0x0001,0x0000,0x0000,0x0000,0x0000,
-	0x0000,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF};
-const static BN_ULONG _nist_p_256[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
-	0xFFFF,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0000,0xFFFF,
-	0xFFFF};
-const static BN_ULONG _nist_p_384[] = {0xFFFF,0xFFFF,0x0000,0x0000,0x0000,
-	0x0000,0xFFFF,0xFFFF,0xFFFE,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
-	0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF};
-const static BN_ULONG _nist_p_521[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
-	0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
-	0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
-	0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0x01FF};
-#elif BN_BITS2 == 8
-const static BN_ULONG _nist_p_192[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFE,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF};
-const static BN_ULONG _nist_p_224[] = {0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-	0x00,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
-const static BN_ULONG _nist_p_256[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-	0x00,0x00,0x01,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF};
-const static BN_ULONG _nist_p_384[] = {0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,
-	0x00,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
-const static BN_ULONG _nist_p_521[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-	0xFF,0x01};
+static const BN_ULONG _nist_p_521_sqr[] = {
+	0x00000001,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,
+	0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,
+	0x00000000,0x00000000,0x00000000,0x00000000,0xFFFFFC00,0xFFFFFFFF,
+	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
+	0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
+	0xFFFFFFFF,0xFFFFFFFF,0x0003FFFF
+	};
+#else
+#error "unsupported BN_BITS2"
 #endif
 
+
+static const BIGNUM _bignum_nist_p_192 =
+	{
+	(BN_ULONG *)_nist_p_192[0],
+	BN_NIST_192_TOP,
+	BN_NIST_192_TOP,
+	0,
+	BN_FLG_STATIC_DATA
+	};
+
+static const BIGNUM _bignum_nist_p_224 =
+	{
+	(BN_ULONG *)_nist_p_224[0],
+	BN_NIST_224_TOP,
+	BN_NIST_224_TOP,
+	0,
+	BN_FLG_STATIC_DATA
+	};
+
+static const BIGNUM _bignum_nist_p_256 =
+	{
+	(BN_ULONG *)_nist_p_256[0],
+	BN_NIST_256_TOP,
+	BN_NIST_256_TOP,
+	0,
+	BN_FLG_STATIC_DATA
+	};
+
+static const BIGNUM _bignum_nist_p_384 =
+	{
+	(BN_ULONG *)_nist_p_384[0],
+	BN_NIST_384_TOP,
+	BN_NIST_384_TOP,
+	0,
+	BN_FLG_STATIC_DATA
+	};
+
+static const BIGNUM _bignum_nist_p_521 =
+	{
+	(BN_ULONG *)_nist_p_521,
+	BN_NIST_521_TOP,
+	BN_NIST_521_TOP,
+	0,
+	BN_FLG_STATIC_DATA
+	};
+
+
 const BIGNUM *BN_get0_nist_prime_192(void)
 	{
-	static BIGNUM const_nist_192 = { (BN_ULONG *)_nist_p_192,
-		BN_NIST_192_TOP, BN_NIST_192_TOP, 0, BN_FLG_STATIC_DATA };
-	return &const_nist_192;
+	return &_bignum_nist_p_192;
 	}
 
 const BIGNUM *BN_get0_nist_prime_224(void)
 	{
-	static BIGNUM const_nist_224 = { (BN_ULONG *)_nist_p_224,
-		BN_NIST_224_TOP, BN_NIST_224_TOP, 0, BN_FLG_STATIC_DATA };
-	return &const_nist_224;
+	return &_bignum_nist_p_224;
 	}
 
 const BIGNUM *BN_get0_nist_prime_256(void)
 	{
-	static BIGNUM const_nist_256 = { (BN_ULONG *)_nist_p_256,
-		BN_NIST_256_TOP, BN_NIST_256_TOP, 0, BN_FLG_STATIC_DATA };
-	return &const_nist_256;
+	return &_bignum_nist_p_256;
 	}
 
 const BIGNUM *BN_get0_nist_prime_384(void)
 	{
-	static BIGNUM const_nist_384 = { (BN_ULONG *)_nist_p_384,
-		BN_NIST_384_TOP, BN_NIST_384_TOP, 0, BN_FLG_STATIC_DATA };
-	return &const_nist_384;
+	return &_bignum_nist_p_384;
 	}
 
 const BIGNUM *BN_get0_nist_prime_521(void)
 	{
-	static BIGNUM const_nist_521 = { (BN_ULONG *)_nist_p_521,
-		BN_NIST_521_TOP, BN_NIST_521_TOP, 0, BN_FLG_STATIC_DATA };
-	return &const_nist_521;
+	return &_bignum_nist_p_521;
 	}
 
-/* some misc internal functions */
-static BN_ULONG _256_data[BN_NIST_256_TOP*6];
-static int _is_set_256_data = 0;
-static void _init_256_data(void);
 
-static BN_ULONG _384_data[BN_NIST_384_TOP*8];
-static int _is_set_384_data = 0;
-static void _init_384_data(void);
+static void nist_cp_bn_0(BN_ULONG *buf, BN_ULONG *a, int top, int max)
+	{
+	int i;
+	BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
+
+#ifdef BN_DEBUG
+	OPENSSL_assert(top <= max);
+#endif
+	for (i = (top); i != 0; i--)
+		*_tmp1++ = *_tmp2++;
+	for (i = (max) - (top); i != 0; i--)
+		*_tmp1++ = (BN_ULONG) 0;
+	}
+
+static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
+	{ 
+	int i;
+	BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
+	for (i = (top); i != 0; i--)
+		*_tmp1++ = *_tmp2++;
+	}
 
-#define BN_NIST_ADD_ONE(a)	while (!(++(*(a)))) ++(a);
-#define __buf_0			(BN_ULONG)0
-#define __buf_0_1		(BN_ULONG)0
-#define __buf_0_2		(BN_ULONG)0
 #if BN_BITS2 == 64
-#define BN_64_BIT_BUF(n)	BN_ULONG __buf_##n = (BN_ULONG)0;
-#define BN_CP_64_TO_BUF(n)	__buf_##n = (a)[(n)];
-#define BN_CP_64_FROM_BUF(a,n)	*(a)++ = __buf_##n;
-#define BN_CASE_64_BIT(n,a)	case (n): __buf_##n = (a)[(n)];
-#if	UINT_MAX == 4294967295UL
-#define	nist32	unsigned int
-#define BN_32_BIT_BUF(n)	nist32 __buf_##n = (nist32)0;
-#define BN_CP_32_TO_BUF(n)	__buf_##n = ((nist32 *)(a))[(n)];
-#define BN_CP_32_FROM_BUF(a,n)	*((nist32)(a))++ = __buf_##n;
-#define BN_CASE_32_BIT(n,a)	case (n): __buf_##n = ((nist32)(a))[(n)];
-#elif	ULONG_MAX == 4294967295UL
-#define	nist32	unsigned long
-#define BN_32_BIT_BUF(n)	nist32 __buf_##n = (nist32)0;
-#define BN_CP_32_TO_BUF(n)	__buf_##n = ((nist32 *)(a))[(n)];
-#define BN_CP_32_FROM_BUF(a,n)	*((nist32)(a))++ = __buf_##n;
-#define BN_CASE_32_BIT(n,a)	case (n): __buf_##n = ((nist32)(a))[(n)];
+#define bn_cp_64(to, n, from, m)	(to)[n] = (m>=0)?((from)[m]):0;
+#define bn_64_set_0(to, n)		(to)[n] = (BN_ULONG)0;
+/*
+ * two following macros are implemented under assumption that they
+ * are called in a sequence with *ascending* n, i.e. as they are...
+ */
+#define bn_cp_32_naked(to, n, from, m)	(((n)&1)?(to[(n)/2]|=((m)&1)?(from[(m)/2]&BN_MASK2h):(from[(m)/2]<<32))\
+						:(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l)))
+#define bn_32_set_0(to, n)		(((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0));
+#define bn_cp_32(to,n,from,m)		((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n)
 #else
-#define	NO_32_BIT_TYPE
-#endif
-#elif BN_BITS2 == 32
-#define BN_64_BIT_BUF(n)	BN_ULONG __buf_##n##_1 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_2 = (BN_ULONG)0;
-#define BN_CP_64_TO_BUF(n)	__buf_##n##_2 = (a)[2*(n)+1];\
-				__buf_##n##_1 = (a)[2*(n)];
-#define BN_CP_64_FROM_BUF(a,n)	*(a)++ = __buf_##n##_1;\
-				*(a)++ = __buf_##n##_2;
-#define BN_CASE_64_BIT(n,a)	case 2*(n)+1: __buf_##n##_2 = (a)[2*(n)+1];\
-				case 2*(n):   __buf_##n##_1 = (a)[2*(n)];
-				
-#define BN_32_BIT_BUF(n)	BN_ULONG __buf_##n = (BN_ULONG)0;
-#define BN_CP_32_TO_BUF(n)	__buf_##n = (a)[(n)];
-#define BN_CP_32_FROM_BUF(a,n)	*(a)++ = __buf_##n;
-#define BN_CASE_32_BIT(n,a)	case (n): __buf_##n = (a)[(n)];
-#elif BN_BITS2 == 16
-#define __buf_0_3		(BN_ULONG)0
-#define __buf_0_4		(BN_ULONG)0
-#define BN_64_BIT_BUF(n)	BN_ULONG __buf_##n##_1 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_2 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_3 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_4 = (BN_ULONG)0;
-#define BN_CP_64_TO_BUF(n)	__buf_##n##_4 = (a)[4*(n)+3];\
-				__buf_##n##_3 = (a)[4*(n)+2];\
-				__buf_##n##_2 = (a)[4*(n)+1];\
-				__buf_##n##_1 = (a)[4*(n)];
-#define BN_CP_64_FROM_BUF(a,n)	*(a)++ = __buf_##n##_1;\
-				*(a)++ = __buf_##n##_2;\
-				*(a)++ = __buf_##n##_3;\
-				*(a)++ = __buf_##n##_4;
-#define BN_CASE_64_BIT(n,a)	case 4*(n)+3: __buf_##n##_4 = (a)[4*(n)+3];\
-				case 4*(n)+2: __buf_##n##_3 = (a)[4*(n)+2];\
-				case 4*(n)+1: __buf_##n##_2 = (a)[4*(n)+1];\
-				case 4*(n):   __buf_##n##_1 = (a)[4*(n)];
-#define BN_32_BIT_BUF(n)	BN_ULONG __buf_##n##_1 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_2 = (BN_ULONG)0;
-#define BN_CP_32_TO_BUF(n)	__buf_##n##_1 = (a)[2*(n)];\
-				__buf_##n##_2 = (a)[2*(n)+1];
-#define BN_CP_32_FROM_BUF(a,n)	*(a)++ = __buf_##n##_1;\
-				*(a)++ = __buf_##n##_2;
-#define BN_CASE_32_BIT(n,a)	case 2*(n)+1: __buf_##n##_2 = (a)[2*(n)+1];\
-				case 2*(n):   __buf_##n##_1 = (a)[2*(n)];
-#elif BN_BITS2 == 8
-#define __buf_0_3		(BN_ULONG)0
-#define __buf_0_4		(BN_ULONG)0
-#define __buf_0_5		(BN_ULONG)0
-#define __buf_0_6		(BN_ULONG)0
-#define __buf_0_7		(BN_ULONG)0
-#define __buf_0_8		(BN_ULONG)0
-#define BN_64_BIT_BUF(n)	BN_ULONG __buf_##n##_1 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_2 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_3 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_4 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_5 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_6 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_7 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_8 = (BN_ULONG)0;
-#define BN_CP_64_TO_BUF(n)	__buf_##n##_8 = (a)[8*(n)+7];\
-				__buf_##n##_7 = (a)[8*(n)+6];\
-				__buf_##n##_6 = (a)[8*(n)+5];\
-				__buf_##n##_5 = (a)[8*(n)+4];\
-				__buf_##n##_4 = (a)[8*(n)+3];\
-				__buf_##n##_3 = (a)[8*(n)+2];\
-				__buf_##n##_2 = (a)[8*(n)+1];\
-				__buf_##n##_1 = (a)[8*(n)];
-#define BN_CP_64_FROM_BUF(a,n)	*(a)++ = __buf_##n##_1;\
-				*(a)++ = __buf_##n##_2;\
-				*(a)++ = __buf_##n##_3;\
-				*(a)++ = __buf_##n##_4;\
-				*(a)++ = __buf_##n##_5;\
-				*(a)++ = __buf_##n##_6;\
-				*(a)++ = __buf_##n##_7;\
-				*(a)++ = __buf_##n##_8;
-#define BN_CASE_64_BIT(n,a)	case 8*(n)+7: __buf_##n##_8 = (a)[8*(n)+7];\
-				case 8*(n)+6: __buf_##n##_7 = (a)[8*(n)+6];\
-				case 8*(n)+5: __buf_##n##_6 = (a)[8*(n)+5];\
-				case 8*(n)+4: __buf_##n##_5 = (a)[8*(n)+4];\
-				case 8*(n)+3: __buf_##n##_4 = (a)[8*(n)+3];\
-				case 8*(n)+2: __buf_##n##_3 = (a)[8*(n)+2];\
-				case 8*(n)+1: __buf_##n##_2 = (a)[8*(n)+1];\
-				case 8*(n):   __buf_##n##_1 = (a)[8*(n)];
-#define BN_32_BIT_BUF(n)	BN_ULONG __buf_##n##_1 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_2 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_3 = (BN_ULONG)0;\
-				BN_ULONG __buf_##n##_4 = (BN_ULONG)0;
-#define BN_CP_32_TO_BUF(n)	__buf_##n##_1 = (a)[4*(n)];\
-				__buf_##n##_2 = (a)[4*(n)+1];\
-				__buf_##n##_3 = (a)[4*(n)+2];\
-				__buf_##n##_4 = (a)[4*(n)+3];
-#define BN_CP_32_FROM_BUF(a,n)	*(a)++ = __buf_##n##_1;\
-				*(a)++ = __buf_##n##_2;\
-				*(a)++ = __buf_##n##_3;\
-				*(a)++ = __buf_##n##_4;
-#define BN_CASE_32_BIT(n,a)	case 4*(n)+3: __buf_##n##_4 = (a)[4*(n)+3];\
-				case 4*(n)+2: __buf_##n##_3 = (a)[4*(n)+2];\
-				case 4*(n)+1: __buf_##n##_2 = (a)[4*(n)+1];\
-				case 4*(n):   __buf_##n##_1 = (a)[4*(n)];
+#define bn_cp_64(to, n, from, m) \
+	{ \
+	bn_cp_32(to, (n)*2, from, (m)*2); \
+	bn_cp_32(to, (n)*2+1, from, (m)*2+1); \
+	}
+#define bn_64_set_0(to, n) \
+	{ \
+	bn_32_set_0(to, (n)*2); \
+	bn_32_set_0(to, (n)*2+1); \
+	}
+#if BN_BITS2 == 32
+#define bn_cp_32(to, n, from, m)	(to)[n] = (m>=0)?((from)[m]):0;
+#define bn_32_set_0(to, n)		(to)[n] = (BN_ULONG)0;
 #endif
+#endif /* BN_BITS2 != 64 */
 
 
-#define BN_192_SET(d,a1,a2,a3) \
-	{\
-	register BN_ULONG *td = (d);\
-	BN_CP_64_FROM_BUF(td,a3); BN_CP_64_FROM_BUF(td,a2);\
-	BN_CP_64_FROM_BUF(td,a1);\
+#define nist_set_192(to, from, a1, a2, a3) \
+	{ \
+	bn_cp_64(to, 0, from, (a3) - 3) \
+	bn_cp_64(to, 1, from, (a2) - 3) \
+	bn_cp_64(to, 2, from, (a1) - 3) \
 	}
 
 int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	BN_CTX *ctx)
 	{
-	int      top;
-	BN_ULONG carry = 0;
-	register BN_ULONG *r_d, *a_d;
-	BN_ULONG t_d[BN_NIST_192_TOP];
-	BN_64_BIT_BUF(3)  BN_64_BIT_BUF(4)
-	BN_64_BIT_BUF(5)
-
-	top = BN_ucmp(field, a);
-	if (top == 0)
+	int      top = a->top, i;
+	int      carry;
+	register BN_ULONG *r_d, *a_d = a->d;
+	BN_ULONG t_d[BN_NIST_192_TOP],
+	         buf[BN_NIST_192_TOP],
+		 c_d[BN_NIST_192_TOP],
+		*res;
+	size_t   mask;
+	static const BIGNUM _bignum_nist_p_192_sqr = {
+		(BN_ULONG *)_nist_p_192_sqr,
+		sizeof(_nist_p_192_sqr)/sizeof(_nist_p_192_sqr[0]),
+		sizeof(_nist_p_192_sqr)/sizeof(_nist_p_192_sqr[0]),
+		0,BN_FLG_STATIC_DATA };
+
+	field = &_bignum_nist_p_192; /* just to make sure */
+
+ 	if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_192_sqr)>=0)
+		return BN_nnmod(r, a, field, ctx);
+
+	i = BN_ucmp(field, a);
+	if (i == 0)
 		{
 		BN_zero(r);
 		return 1;
 		}
-	else if (top > 0)
-		return (r == a)? 1 : (BN_copy(r ,a) != NULL);
+	else if (i > 0)
+		return (r == a) ? 1 : (BN_copy(r ,a) != NULL);
 
 	if (r != a)
-		if (!BN_ncopy(r, a, BN_NIST_192_TOP))
-			return 0;
-
-	r_d = r->d;
-	a_d = a->d;
-	top = a->top-1;
-
-	switch (top)
 		{
-		BN_CASE_64_BIT(5, a_d)
-		BN_CASE_64_BIT(4, a_d)
-		BN_CASE_64_BIT(3, a_d)
-			break;
-		default: /* a->top == field->top */
-			return BN_usub(r, a, field);
+		if (!bn_wexpand(r, BN_NIST_192_TOP))
+			return 0;
+		r_d = r->d;
+		nist_cp_bn(r_d, a_d, BN_NIST_192_TOP);
 		}
+	else
+		r_d = a_d;
 
-	BN_192_SET(t_d,0,3,3)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP))
-		++carry;
-
-	BN_192_SET(t_d,4,4,0)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP))
-		++carry;
+	nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
 
-	BN_192_SET(t_d,5,5,5)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP))
-		++carry;
+	nist_set_192(t_d, buf, 0, 3, 3);
+	carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
+	nist_set_192(t_d, buf, 4, 4, 0);
+	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
+	nist_set_192(t_d, buf, 5, 5, 5)
+	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
 
-	while (carry)
-		{
-		if (bn_sub_words(r_d, r_d, _nist_p_192, BN_NIST_192_TOP))
-			--carry; 
-		}
+	if (carry > 0)
+		carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP);
+	else
+		carry = 1;
+
+	/*
+	 * we need 'if (carry==0 || result>=modulus) result-=modulus;'
+	 * as comparison implies subtraction, we can write
+	 * 'tmp=result-modulus; if (!carry || !borrow) result=tmp;'
+	 * this is what happens below, but without explicit if:-) a.
+	 */
+	mask  = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_192[0],BN_NIST_192_TOP);
+	mask &= 0-(size_t)carry;
+	res   = (BN_ULONG *)(((size_t)c_d&~mask) | ((size_t)r_d&mask));
+	nist_cp_bn(r_d, res, BN_NIST_192_TOP);
 	r->top = BN_NIST_192_TOP;
 	bn_correct_top(r);
-	if (BN_ucmp(r, field) >= 0)
-		{
-		bn_sub_words(r_d, r_d, _nist_p_192, BN_NIST_192_TOP);
-		bn_correct_top(r);
-		}
 
-	bn_check_top(r);
 	return 1;
 	}
 
-#define BN_224_SET(d,a1,a2,a3,a4,a5,a6,a7) \
-	{\
-	register BN_ULONG *td = (d);\
-	BN_CP_32_FROM_BUF(td,a7); BN_CP_32_FROM_BUF(td,a6);\
-	BN_CP_32_FROM_BUF(td,a5); BN_CP_32_FROM_BUF(td,a4);\
-	BN_CP_32_FROM_BUF(td,a3); BN_CP_32_FROM_BUF(td,a2);\
-	BN_CP_32_FROM_BUF(td,a1);\
+typedef BN_ULONG (*bn_addsub_f)(BN_ULONG *,const BN_ULONG *,const BN_ULONG *,int);
+
+#define nist_set_224(to, from, a1, a2, a3, a4, a5, a6, a7) \
+	{ \
+	bn_cp_32(to, 0, from, (a7) - 7) \
+	bn_cp_32(to, 1, from, (a6) - 7) \
+	bn_cp_32(to, 2, from, (a5) - 7) \
+	bn_cp_32(to, 3, from, (a4) - 7) \
+	bn_cp_32(to, 4, from, (a3) - 7) \
+	bn_cp_32(to, 5, from, (a2) - 7) \
+	bn_cp_32(to, 6, from, (a1) - 7) \
 	}
 
 int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	BN_CTX *ctx)
 	{
-#ifndef NO_32_BIT_TYPE
-	int	tmp_int;
-	int	carry = 0;
-	BN_ULONG *r_d, *a_d;
-	BN_ULONG t_d[BN_NIST_224_TOP];
-	BN_32_BIT_BUF(7)  BN_32_BIT_BUF(8)
-	BN_32_BIT_BUF(9)  BN_32_BIT_BUF(10)
-	BN_32_BIT_BUF(11) BN_32_BIT_BUF(12)
-	BN_32_BIT_BUF(13)
-
-	tmp_int = BN_ucmp(field, a);
-	if (tmp_int == 0)
+	int	top = a->top, i;
+	int	carry;
+	BN_ULONG *r_d, *a_d = a->d;
+	BN_ULONG t_d[BN_NIST_224_TOP],
+	         buf[BN_NIST_224_TOP],
+		 c_d[BN_NIST_224_TOP],
+		*res;
+	size_t   mask;
+	union { bn_addsub_f f; size_t p; } u;
+	static const BIGNUM _bignum_nist_p_224_sqr = {
+		(BN_ULONG *)_nist_p_224_sqr,
+		sizeof(_nist_p_224_sqr)/sizeof(_nist_p_224_sqr[0]),
+		sizeof(_nist_p_224_sqr)/sizeof(_nist_p_224_sqr[0]),
+		0,BN_FLG_STATIC_DATA };
+
+
+	field = &_bignum_nist_p_224; /* just to make sure */
+
+ 	if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_224_sqr)>=0)
+		return BN_nnmod(r, a, field, ctx);
+
+	i = BN_ucmp(field, a);
+	if (i == 0)
 		{
 		BN_zero(r);
 		return 1;
 		}
-	else if (tmp_int > 0)
+	else if (i > 0)
 		return (r == a)? 1 : (BN_copy(r ,a) != NULL);
 
 	if (r != a)
-		if (!BN_ncopy(r, a, BN_NIST_224_TOP))
-			return 0;
-
-	r_d = r->d;
-	a_d = a->d;
-
-	tmp_int = a->top-1;
-
-	switch (tmp_int)
 		{
-		BN_CASE_32_BIT(13, a_d)
-		BN_CASE_32_BIT(12, a_d)
-		BN_CASE_32_BIT(11, a_d)
-		BN_CASE_32_BIT(10, a_d)
-		BN_CASE_32_BIT(9,  a_d)
-		BN_CASE_32_BIT(8,  a_d)
-		BN_CASE_32_BIT(7,  a_d)
-			break;
-		default: /* a->top == field->top */
-			return BN_usub(r, a, field);
+		if (!bn_wexpand(r, BN_NIST_224_TOP))
+			return 0;
+		r_d = r->d;
+		nist_cp_bn(r_d, a_d, BN_NIST_224_TOP);
 		}
-
-	BN_224_SET(t_d,10,9,8,7,0,0,0)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP))
-		++carry;
-	BN_224_SET(t_d,0,13,12,11,0,0,0)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP))
-		++carry;
-	BN_224_SET(t_d,13,12,11,10,9,8,7)
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP))
-		--carry;
-	BN_224_SET(t_d,0,0,0,0,13,12,11)
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP))
-		--carry;
-
+	else
+		r_d = a_d;
+
+#if BN_BITS2==64
+	/* copy upper 256 bits of 448 bit number ... */
+	nist_cp_bn_0(t_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
+	/* ... and right shift by 32 to obtain upper 224 bits */
+	nist_set_224(buf, t_d, 14, 13, 12, 11, 10, 9, 8);
+	/* truncate lower part to 224 bits too */
+	r_d[BN_NIST_224_TOP-1] &= BN_MASK2l;
+#else
+	nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
+#endif
+	nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0);
+	carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
+	nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0);
+	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
+	nist_set_224(t_d, buf, 13, 12, 11, 10, 9, 8, 7);
+	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
+	nist_set_224(t_d, buf, 0, 0, 0, 0, 13, 12, 11);
+	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
+
+#if BN_BITS2==64
+	carry = (int)(r_d[BN_NIST_224_TOP-1]>>32);
+#endif
+	u.f = bn_sub_words;
 	if (carry > 0)
-		while (carry)
-			{
-			if (bn_sub_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP))
-				--carry;
-			}
-	else if (carry < 0)
-		while (carry)
-			{
-			if (bn_add_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP))
-				++carry;
-			}
-
-	r->top = BN_NIST_224_TOP;
-	bn_correct_top(r);
-	if (BN_ucmp(r, field) >= 0)
 		{
-		bn_sub_words(r_d, r_d, _nist_p_224, BN_NIST_224_TOP);
-		bn_correct_top(r);
-		}
-	bn_check_top(r);
-	return 1;
-#else
-	return 0;
+		carry = (int)bn_sub_words(r_d,r_d,_nist_p_224[carry-1],BN_NIST_224_TOP);
+#if BN_BITS2==64
+		carry=(int)(~(r_d[BN_NIST_224_TOP-1]>>32))&1;
 #endif
-	}
-
-static void _init_256_data(void)
-	{
-	int	i;
-	BN_ULONG *tmp1 = _256_data;
-	const BN_ULONG *tmp2 = tmp1;
-
-	memcpy(tmp1, _nist_p_256, BN_NIST_256_TOP * sizeof(BN_ULONG));
-	tmp1 += BN_NIST_256_TOP;
-
-	for (i=0; i<5; i++)
+		}
+	else if (carry < 0)
 		{
-		bn_add_words(tmp1, _nist_p_256, tmp2, BN_NIST_256_TOP);
-		tmp2  = tmp1;
-		tmp1 += BN_NIST_256_TOP;
+		/* it's a bit more comlicated logic in this case.
+		 * if bn_add_words yields no carry, then result
+		 * has to be adjusted by unconditionally *adding*
+		 * the modulus. but if it does, then result has
+		 * to be compared to the modulus and conditionally
+		 * adjusted by *subtracting* the latter. */
+		carry = (int)bn_add_words(r_d,r_d,_nist_p_224[-carry-1],BN_NIST_224_TOP);
+		mask = 0-(size_t)carry;
+		u.p = ((size_t)bn_sub_words&mask) | ((size_t)bn_add_words&~mask);
 		}
-	_is_set_256_data = 1;
+	else
+		carry = 1;
+
+	/* otherwise it's effectively same as in BN_nist_mod_192... */
+	mask  = 0-(size_t)(*u.f)(c_d,r_d,_nist_p_224[0],BN_NIST_224_TOP);
+	mask &= 0-(size_t)carry;
+	res   = (BN_ULONG *)(((size_t)c_d&~mask) | ((size_t)r_d&mask));
+	nist_cp_bn(r_d, res, BN_NIST_224_TOP);
+	r->top = BN_NIST_224_TOP;
+	bn_correct_top(r);
+
+	return 1;
 	}
 
-#define BN_256_SET(d,a1,a2,a3,a4,a5,a6,a7,a8) \
-	{\
-	register BN_ULONG *td = (d);\
-	BN_CP_32_FROM_BUF(td,a8); BN_CP_32_FROM_BUF(td,a7);\
-	BN_CP_32_FROM_BUF(td,a6); BN_CP_32_FROM_BUF(td,a5);\
-	BN_CP_32_FROM_BUF(td,a4); BN_CP_32_FROM_BUF(td,a3);\
-	BN_CP_32_FROM_BUF(td,a2); BN_CP_32_FROM_BUF(td,a1);\
+#define nist_set_256(to, from, a1, a2, a3, a4, a5, a6, a7, a8) \
+	{ \
+	bn_cp_32(to, 0, from, (a8) - 8) \
+	bn_cp_32(to, 1, from, (a7) - 8) \
+	bn_cp_32(to, 2, from, (a6) - 8) \
+	bn_cp_32(to, 3, from, (a5) - 8) \
+	bn_cp_32(to, 4, from, (a4) - 8) \
+	bn_cp_32(to, 5, from, (a3) - 8) \
+	bn_cp_32(to, 6, from, (a2) - 8) \
+	bn_cp_32(to, 7, from, (a1) - 8) \
 	}
 
 int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	BN_CTX *ctx)
 	{
-#ifndef NO_32_BIT_TYPE
-	int	tmp_int;
+	int	i, top = a->top;
 	int	carry = 0;
-	register BN_ULONG *a_d, *r_d;
-	BN_ULONG t_d[BN_NIST_256_TOP];
-	BN_ULONG t_d2[BN_NIST_256_TOP];
-	BN_32_BIT_BUF(8)  BN_32_BIT_BUF(9)
-	BN_32_BIT_BUF(10) BN_32_BIT_BUF(11)
-	BN_32_BIT_BUF(12) BN_32_BIT_BUF(13)
-	BN_32_BIT_BUF(14) BN_32_BIT_BUF(15)
-
-	if (!_is_set_256_data)
-		{
-		CRYPTO_w_lock(CRYPTO_LOCK_BN);
-		
-		if (!_is_set_256_data)
-			_init_256_data();
-		
-		CRYPTO_w_unlock(CRYPTO_LOCK_BN);
-		}
-	
-	tmp_int = BN_ucmp(field, a);
-	if (tmp_int == 0)
+	register BN_ULONG *a_d = a->d, *r_d;
+	BN_ULONG t_d[BN_NIST_256_TOP],
+	         buf[BN_NIST_256_TOP],
+		 c_d[BN_NIST_256_TOP],
+		*res;
+	size_t   mask;
+	union { bn_addsub_f f; size_t p; } u;
+	static const BIGNUM _bignum_nist_p_256_sqr = {
+		(BN_ULONG *)_nist_p_256_sqr,
+		sizeof(_nist_p_256_sqr)/sizeof(_nist_p_256_sqr[0]),
+		sizeof(_nist_p_256_sqr)/sizeof(_nist_p_256_sqr[0]),
+		0,BN_FLG_STATIC_DATA };
+
+	field = &_bignum_nist_p_256; /* just to make sure */
+
+ 	if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_256_sqr)>=0)
+		return BN_nnmod(r, a, field, ctx);
+
+	i = BN_ucmp(field, a);
+	if (i == 0)
 		{
 		BN_zero(r);
 		return 1;
 		}
-	else if (tmp_int > 0)
+	else if (i > 0)
 		return (r == a)? 1 : (BN_copy(r ,a) != NULL);
 
 	if (r != a)
-		if (!BN_ncopy(r, a, BN_NIST_256_TOP))
-			return 0;
-
-	tmp_int = a->top-1;
-
-	a_d = a->d;
-	r_d = r->d;
-	switch (tmp_int)
 		{
-		BN_CASE_32_BIT(15, a_d)
-		BN_CASE_32_BIT(14, a_d)
-		BN_CASE_32_BIT(13, a_d)
-		BN_CASE_32_BIT(12, a_d)
-		BN_CASE_32_BIT(11, a_d)
-		BN_CASE_32_BIT(10, a_d)
-		BN_CASE_32_BIT(9,  a_d)
-		BN_CASE_32_BIT(8,  a_d)
-			break;
-		default: /* a->top == field->top */
-			return BN_usub(r, a, field);
+		if (!bn_wexpand(r, BN_NIST_256_TOP))
+			return 0;
+		r_d = r->d;
+		nist_cp_bn(r_d, a_d, BN_NIST_256_TOP);
 		}
+	else
+		r_d = a_d;
+
+	nist_cp_bn_0(buf, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
 
 	/*S1*/
-	BN_256_SET(t_d,15,14,13,12,11,0,0,0)
+	nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0);
 	/*S2*/
-	BN_256_SET(t_d2,0,15,14,13,12,0,0,0)
-	if (bn_add_words(t_d, t_d, t_d2, BN_NIST_256_TOP))
-		carry = 2;
+	nist_set_256(c_d, buf, 0, 15, 14, 13, 12, 0, 0, 0);
+	carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
 	/* left shift */
 		{
 		register BN_ULONG *ap,t,c;
 		ap = t_d;
 		c=0;
-		for (tmp_int=BN_NIST_256_TOP; tmp_int != 0; --tmp_int)
+		for (i = BN_NIST_256_TOP; i != 0; --i)
 			{
 			t= *ap;
 			*(ap++)=((t<<1)|c)&BN_MASK2;
 			c=(t & BN_TBIT)?1:0;
 			}
-		if (c)
-			++carry;
+		carry <<= 1;
+		carry  |= c;
 		}
-
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		++carry;
+	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*S3*/
-	BN_256_SET(t_d,15,14,0,0,0,10,9,8)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		++carry;
+	nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8);
+	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*S4*/
-	BN_256_SET(t_d,8,13,15,14,13,11,10,9)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		++carry;
+	nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9);
+	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D1*/
-	BN_256_SET(t_d,10,8,0,0,0,13,12,11)
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		--carry;
+	nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11);
+	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D2*/
-	BN_256_SET(t_d,11,9,0,0,15,14,13,12)
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		--carry;
+	nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12);
+	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D3*/
-	BN_256_SET(t_d,12,0,10,9,8,15,14,13)
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		--carry;
+	nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13);
+	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D4*/
-	BN_256_SET(t_d,13,0,11,10,9,0,15,14)
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
-		--carry;
-	
-	if (carry)
+	nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14);
+	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
+
+	/* see BN_nist_mod_224 for explanation */
+	u.f = bn_sub_words;
+	if (carry > 0)
+		carry = (int)bn_sub_words(r_d,r_d,_nist_p_256[carry-1],BN_NIST_256_TOP);
+	else if (carry < 0)
 		{
-		if (carry > 0)
-			bn_sub_words(r_d, r_d, _256_data + BN_NIST_256_TOP *
-				--carry, BN_NIST_256_TOP);
-		else
-			{
-			carry = -carry;
-			bn_add_words(r_d, r_d, _256_data + BN_NIST_256_TOP *
-				--carry, BN_NIST_256_TOP);
-			}
+		carry = (int)bn_add_words(r_d,r_d,_nist_p_256[-carry-1],BN_NIST_256_TOP);
+		mask = 0-(size_t)carry;
+		u.p = ((size_t)bn_sub_words&mask) | ((size_t)bn_add_words&~mask);
 		}
+	else
+		carry = 1;
 
+	mask  = 0-(size_t)(*u.f)(c_d,r_d,_nist_p_256[0],BN_NIST_256_TOP);
+	mask &= 0-(size_t)carry;
+	res   = (BN_ULONG *)(((size_t)c_d&~mask) | ((size_t)r_d&mask));
+	nist_cp_bn(r_d, res, BN_NIST_256_TOP);
 	r->top = BN_NIST_256_TOP;
 	bn_correct_top(r);
-	if (BN_ucmp(r, field) >= 0)
-		{
-		bn_sub_words(r_d, r_d, _nist_p_256, BN_NIST_256_TOP);
-		bn_correct_top(r);
-		}
-	bn_check_top(r);
-	return 1;
-#else
-	return 0;
-#endif
-	}
-
-static void _init_384_data(void)
-	{
-	int	i;
-	BN_ULONG *tmp1 = _384_data;
-	const BN_ULONG *tmp2 = tmp1;
-
-	memcpy(tmp1, _nist_p_384, BN_NIST_384_TOP * sizeof(BN_ULONG));
-	tmp1 += BN_NIST_384_TOP;
 
-	for (i=0; i<7; i++)
-		{
-		bn_add_words(tmp1, _nist_p_384, tmp2, BN_NIST_384_TOP);
-		tmp2  = tmp1;
-		tmp1 += BN_NIST_384_TOP;
-		}
-	_is_set_384_data = 1;
+	return 1;
 	}
 
-#define BN_384_SET(d,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12) \
-	{\
-	register BN_ULONG *td = (d);\
-	BN_CP_32_FROM_BUF(td,a12); BN_CP_32_FROM_BUF(td,a11);\
-	BN_CP_32_FROM_BUF(td,a10); BN_CP_32_FROM_BUF(td,a9);\
-	BN_CP_32_FROM_BUF(td,a8);  BN_CP_32_FROM_BUF(td,a7);\
-	BN_CP_32_FROM_BUF(td,a6);  BN_CP_32_FROM_BUF(td,a5);\
-	BN_CP_32_FROM_BUF(td,a4);  BN_CP_32_FROM_BUF(td,a3);\
-	BN_CP_32_FROM_BUF(td,a2);  BN_CP_32_FROM_BUF(td,a1);\
+#define nist_set_384(to,from,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12) \
+	{ \
+	bn_cp_32(to, 0, from,  (a12) - 12) \
+	bn_cp_32(to, 1, from,  (a11) - 12) \
+	bn_cp_32(to, 2, from,  (a10) - 12) \
+	bn_cp_32(to, 3, from,  (a9) - 12)  \
+	bn_cp_32(to, 4, from,  (a8) - 12)  \
+	bn_cp_32(to, 5, from,  (a7) - 12)  \
+	bn_cp_32(to, 6, from,  (a6) - 12)  \
+	bn_cp_32(to, 7, from,  (a5) - 12)  \
+	bn_cp_32(to, 8, from,  (a4) - 12)  \
+	bn_cp_32(to, 9, from,  (a3) - 12)  \
+	bn_cp_32(to, 10, from, (a2) - 12)  \
+	bn_cp_32(to, 11, from, (a1) - 12)  \
 	}
 
 int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	BN_CTX *ctx)
 	{
-#ifndef NO_32_BIT_TYPE
-	int	tmp_int;
+	int	i, top = a->top;
 	int	carry = 0;
-	register BN_ULONG *r_d, *a_d;
-	BN_ULONG t_d[BN_NIST_384_TOP];
-	BN_32_BIT_BUF(12) BN_32_BIT_BUF(13)
-	BN_32_BIT_BUF(14) BN_32_BIT_BUF(15)
-	BN_32_BIT_BUF(16) BN_32_BIT_BUF(17)
-	BN_32_BIT_BUF(18) BN_32_BIT_BUF(19)
-	BN_32_BIT_BUF(20) BN_32_BIT_BUF(21)
-	BN_32_BIT_BUF(22) BN_32_BIT_BUF(23)
-
-	if (!_is_set_384_data)
-		{
-		CRYPTO_w_lock(CRYPTO_LOCK_BN);
-		
-		if (!_is_set_384_data)
-			_init_384_data();
-
-		CRYPTO_w_unlock(CRYPTO_LOCK_BN);
-		}
-
-	tmp_int = BN_ucmp(field, a);
-	if (tmp_int == 0)
+	register BN_ULONG *r_d, *a_d = a->d;
+	BN_ULONG t_d[BN_NIST_384_TOP],
+	         buf[BN_NIST_384_TOP],
+		 c_d[BN_NIST_384_TOP],
+		*res;
+	size_t	 mask;
+	union { bn_addsub_f f; size_t p; } u;
+	static const BIGNUM _bignum_nist_p_384_sqr = {
+		(BN_ULONG *)_nist_p_384_sqr,
+		sizeof(_nist_p_384_sqr)/sizeof(_nist_p_384_sqr[0]),
+		sizeof(_nist_p_384_sqr)/sizeof(_nist_p_384_sqr[0]),
+		0,BN_FLG_STATIC_DATA };
+
+
+	field = &_bignum_nist_p_384; /* just to make sure */
+
+ 	if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_384_sqr)>=0)
+		return BN_nnmod(r, a, field, ctx);
+
+	i = BN_ucmp(field, a);
+	if (i == 0)
 		{
 		BN_zero(r);
 		return 1;
 		}
-	else if (tmp_int > 0)
+	else if (i > 0)
 		return (r == a)? 1 : (BN_copy(r ,a) != NULL);
 
 	if (r != a)
-		if (!BN_ncopy(r, a, BN_NIST_384_TOP))
-			return 0;
-
-	r_d = r->d;
-	a_d = a->d;
-	tmp_int = a->top-1;
-
-	switch (tmp_int)
 		{
-		BN_CASE_32_BIT(23, a_d)
-		BN_CASE_32_BIT(22, a_d)
-		BN_CASE_32_BIT(21, a_d)
-		BN_CASE_32_BIT(20, a_d)
-		BN_CASE_32_BIT(19, a_d)
-		BN_CASE_32_BIT(18, a_d)
-		BN_CASE_32_BIT(17, a_d)
-		BN_CASE_32_BIT(16, a_d)
-		BN_CASE_32_BIT(15, a_d)
-		BN_CASE_32_BIT(14, a_d)
-		BN_CASE_32_BIT(13, a_d)
-		BN_CASE_32_BIT(12, a_d)
-			break;
-		default: /* a->top == field->top */
-			return BN_usub(r, a, field);
+		if (!bn_wexpand(r, BN_NIST_384_TOP))
+			return 0;
+		r_d = r->d;
+		nist_cp_bn(r_d, a_d, BN_NIST_384_TOP);
 		}
+	else
+		r_d = a_d;
+
+	nist_cp_bn_0(buf, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
 
 	/*S1*/
-	BN_256_SET(t_d,0,0,0,0,0,23,22,21)
+	nist_set_256(t_d, buf, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
 		/* left shift */
 		{
 		register BN_ULONG *ap,t,c;
 		ap = t_d;
 		c=0;
-		for (tmp_int=BN_NIST_256_TOP; tmp_int != 0; --tmp_int)
+		for (i = 3; i != 0; --i)
 			{
 			t= *ap;
 			*(ap++)=((t<<1)|c)&BN_MASK2;
 			c=(t & BN_TBIT)?1:0;
 			}
+		*ap=c;
 		}
-	if (bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), 
-		t_d, BN_NIST_256_TOP))
-		++carry;
-	/*S2*/
-	BN_384_SET(t_d,23,22,21,20,19,18,17,16,15,14,13,12)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		++carry;
+	carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), 
+		t_d, BN_NIST_256_TOP);
+	/*S2 */
+	carry += (int)bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP);
 	/*S3*/
-	BN_384_SET(t_d,20,19,18,17,16,15,14,13,12,23,22,21)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		++carry;
+	nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21);
+	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*S4*/
-	BN_384_SET(t_d,19,18,17,16,15,14,13,12,20,0,23,0)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		++carry;
+	nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0);
+	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*S5*/
-	BN_256_SET(t_d,0,0,0,0,23,22,21,20)
-	if (bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), 
-		t_d, BN_NIST_256_TOP))
-		++carry;
+	nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0);
+	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*S6*/
-	BN_384_SET(t_d,0,0,0,0,0,0,23,22,21,0,0,20)
-	if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		++carry;
+	nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20);
+	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*D1*/
-	BN_384_SET(t_d,22,21,20,19,18,17,16,15,14,13,12,23)
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		--carry;
+	nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23);
+	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*D2*/
-	BN_384_SET(t_d,0,0,0,0,0,0,0,23,22,21,20,0)
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		--carry;
+	nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0);
+	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*D3*/
-	BN_384_SET(t_d,0,0,0,0,0,0,0,23,23,0,0,0)
-	if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP))
-		--carry;
-	
-	if (carry)
+	nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0);
+	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
+
+	/* see BN_nist_mod_224 for explanation */
+	u.f = bn_sub_words;
+	if (carry > 0)
+		carry = (int)bn_sub_words(r_d,r_d,_nist_p_384[carry-1],BN_NIST_384_TOP);
+	else if (carry < 0)
 		{
-		if (carry > 0)
-			bn_sub_words(r_d, r_d, _384_data + BN_NIST_384_TOP *
-				--carry, BN_NIST_384_TOP);
-		else
-			{
-			carry = -carry;
-			bn_add_words(r_d, r_d, _384_data + BN_NIST_384_TOP *
-				--carry, BN_NIST_384_TOP);
-			}
+		carry = (int)bn_add_words(r_d,r_d,_nist_p_384[-carry-1],BN_NIST_384_TOP);
+		mask = 0-(size_t)carry;
+		u.p = ((size_t)bn_sub_words&mask) | ((size_t)bn_add_words&~mask);
 		}
+	else
+		carry = 1;
 
+	mask  = 0-(size_t)(*u.f)(c_d,r_d,_nist_p_384[0],BN_NIST_384_TOP);
+	mask &= 0-(size_t)carry;
+	res   = (BN_ULONG *)(((size_t)c_d&~mask) | ((size_t)r_d&mask));
+	nist_cp_bn(r_d, res, BN_NIST_384_TOP);
 	r->top = BN_NIST_384_TOP;
 	bn_correct_top(r);
-	if (BN_ucmp(r, field) >= 0)
-		{
-		bn_sub_words(r_d, r_d, _nist_p_384, BN_NIST_384_TOP);
-		bn_correct_top(r);
-		}
-	bn_check_top(r);
+
 	return 1;
-#else
-	return 0;
-#endif
 	}
 
+#define BN_NIST_521_RSHIFT	(521%BN_BITS2)
+#define BN_NIST_521_LSHIFT	(BN_BITS2-BN_NIST_521_RSHIFT)
+#define BN_NIST_521_TOP_MASK	((BN_ULONG)BN_MASK2>>BN_NIST_521_LSHIFT)
+
 int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	BN_CTX *ctx)
 	{
-#if BN_BITS2 == 64
-#define BN_NIST_521_TOP_MASK	(BN_ULONG)0x1FF
-#elif BN_BITS2 == 32
-#define BN_NIST_521_TOP_MASK	(BN_ULONG)0x1FF
-#elif BN_BITS2 == 16
-#define BN_NIST_521_TOP_MASK	(BN_ULONG)0x1FF
-#elif BN_BITS2 == 8
-#define BN_NIST_521_TOP_MASK	(BN_ULONG)0x1
-#endif
-	int	top, ret = 0;
-	BN_ULONG *r_d;
-	BIGNUM	*tmp;
-
-	/* check whether a reduction is necessary */
-	top = a->top;
-	if (top < BN_NIST_521_TOP  || ( top == BN_NIST_521_TOP &&
-           (!(a->d[BN_NIST_521_TOP-1] & ~(BN_NIST_521_TOP_MASK)))))
+	int	top = a->top, i;
+	BN_ULONG *r_d, *a_d = a->d,
+		 t_d[BN_NIST_521_TOP],
+		 val,tmp,*res;
+	size_t	mask;
+	static const BIGNUM _bignum_nist_p_521_sqr = {
+		(BN_ULONG *)_nist_p_521_sqr,
+		sizeof(_nist_p_521_sqr)/sizeof(_nist_p_521_sqr[0]),
+		sizeof(_nist_p_521_sqr)/sizeof(_nist_p_521_sqr[0]),
+		0,BN_FLG_STATIC_DATA };
+
+	field = &_bignum_nist_p_521; /* just to make sure */
+
+ 	if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_521_sqr)>=0)
+		return BN_nnmod(r, a, field, ctx);
+
+	i = BN_ucmp(field, a);
+	if (i == 0)
+		{
+		BN_zero(r);
+		return 1;
+		}
+	else if (i > 0)
 		return (r == a)? 1 : (BN_copy(r ,a) != NULL);
 
-	BN_CTX_start(ctx);
-	tmp = BN_CTX_get(ctx);
-	if (!tmp)
-		goto err;
-
-	if (!BN_ncopy(tmp, a, BN_NIST_521_TOP))
-		return 0;
-	if (!BN_rshift(r, a, 521))
-		return 0;
-
-	if (tmp->top == BN_NIST_521_TOP)
-		tmp->d[BN_NIST_521_TOP-1]  &= BN_NIST_521_TOP_MASK;
-
-	bn_correct_top(tmp);
-	if (!BN_uadd(r, tmp, r))
-		return 0;
-	top = r->top;
-	r_d = r->d;
-	if (top == BN_NIST_521_TOP  && 
-           (r_d[BN_NIST_521_TOP-1] & ~(BN_NIST_521_TOP_MASK)))
+	if (r != a)
 		{
-		BN_NIST_ADD_ONE(r_d)
-		r_d[BN_NIST_521_TOP-1] &= BN_NIST_521_TOP_MASK; 
+		if (!bn_wexpand(r,BN_NIST_521_TOP))
+			return 0;
+		r_d = r->d;
+		nist_cp_bn(r_d,a_d, BN_NIST_521_TOP);
 		}
-	bn_correct_top(r);
+	else
+		r_d = a_d;
 
-	ret = 1;
-err:
-	BN_CTX_end(ctx);
+	/* upper 521 bits, copy ... */
+	nist_cp_bn_0(t_d,a_d + (BN_NIST_521_TOP-1), top - (BN_NIST_521_TOP-1),BN_NIST_521_TOP);
+	/* ... and right shift */
+	for (val=t_d[0],i=0; i<BN_NIST_521_TOP-1; i++)
+		{
+		tmp = val>>BN_NIST_521_RSHIFT;
+		val = t_d[i+1];
+		t_d[i] = (tmp | val<<BN_NIST_521_LSHIFT) & BN_MASK2;
+		}
+	t_d[i] = val>>BN_NIST_521_RSHIFT;
+	/* lower 521 bits */
+	r_d[i] &= BN_NIST_521_TOP_MASK;
+
+	bn_add_words(r_d,r_d,t_d,BN_NIST_521_TOP);
+	mask = 0-(size_t)bn_sub_words(t_d,r_d,_nist_p_521,BN_NIST_521_TOP);
+	res  = (BN_ULONG *)(((size_t)t_d&~mask) | ((size_t)r_d&mask));
+	nist_cp_bn(r_d,res,BN_NIST_521_TOP);
+	r->top = BN_NIST_521_TOP;
+	bn_correct_top(r);
 
-	bn_check_top(r);
-	return ret;
+	return 1;
 	}
diff --git a/crypto/bn/bn_prime.c b/crypto/bn/bn_prime.c
index d03403a600..7b25979dd1 100644
--- a/crypto/bn/bn_prime.c
+++ b/crypto/bn/bn_prime.c
@@ -258,7 +258,8 @@ int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
 
 	/* first look for small factors */
 	if (!BN_is_odd(a))
-		return 0;
+		/* a is even => a is prime if and only if a == 2 */
+		return BN_is_word(a, 2);
 	if (do_trial_division)
 		{
 		for (i = 1; i < NUMPRIMES; i++)
@@ -376,14 +377,15 @@ static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
 static int probable_prime(BIGNUM *rnd, int bits)
 	{
 	int i;
-	BN_ULONG mods[NUMPRIMES];
-	BN_ULONG delta,d;
+	prime_t mods[NUMPRIMES];
+	BN_ULONG delta,maxdelta;
 
 again:
 	if (!BN_rand(rnd,bits,1,1)) return(0);
 	/* we now have a random number 'rand' to test. */
 	for (i=1; i<NUMPRIMES; i++)
-		mods[i]=BN_mod_word(rnd,(BN_ULONG)primes[i]);
+		mods[i]=(prime_t)BN_mod_word(rnd,(BN_ULONG)primes[i]);
+	maxdelta=BN_MASK2 - primes[NUMPRIMES-1];
 	delta=0;
 	loop: for (i=1; i<NUMPRIMES; i++)
 		{
@@ -391,12 +393,8 @@ again:
 		 * that gcd(rnd-1,primes) == 1 (except for 2) */
 		if (((mods[i]+delta)%primes[i]) <= 1)
 			{
-			d=delta;
 			delta+=2;
-			/* perhaps need to check for overflow of
-			 * delta (but delta can be up to 2^32)
-			 * 21-May-98 eay - added overflow check */
-			if (delta < d) goto again;
+			if (delta > maxdelta) goto again;
 			goto loop;
 			}
 		}
diff --git a/crypto/bn/bn_prime.h b/crypto/bn/bn_prime.h
index b7cf9a9bfe..51d2194feb 100644
--- a/crypto/bn/bn_prime.h
+++ b/crypto/bn/bn_prime.h
@@ -58,10 +58,12 @@
 
 #ifndef EIGHT_BIT
 #define NUMPRIMES 2048
+typedef unsigned short prime_t;
 #else
 #define NUMPRIMES 54
+typedef unsigned char prime_t;
 #endif
-static const unsigned int primes[NUMPRIMES]=
+static const prime_t primes[NUMPRIMES]=
 	{
 	   2,   3,   5,   7,  11,  13,  17,  19,
 	  23,  29,  31,  37,  41,  43,  47,  53,
diff --git a/crypto/bn/bn_prime.pl b/crypto/bn/bn_prime.pl
index e583d1d53b..3fafb6f3e9 100644
--- a/crypto/bn/bn_prime.pl
+++ b/crypto/bn/bn_prime.pl
@@ -101,10 +101,12 @@ for ($i=0; $i <= $#primes; $i++)
 
 printf "#ifndef EIGHT_BIT\n";
 printf "#define NUMPRIMES %d\n",$num;
+printf "typedef unsigned short prime_t;\n";
 printf "#else\n";
 printf "#define NUMPRIMES %d\n",$eight;
+printf "typedef unsigned char prime_t;\n";
 printf "#endif\n";
-print "static const unsigned int primes[NUMPRIMES]=\n\t{\n\t";
+print "static const prime_t primes[NUMPRIMES]=\n\t{\n\t";
 $init=0;
 for ($i=0; $i <= $#primes; $i++)
 	{
diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c
index 5fb8473f37..bebb466d08 100644
--- a/crypto/bn/bn_print.c
+++ b/crypto/bn/bn_print.c
@@ -62,7 +62,7 @@
 #include <openssl/buffer.h>
 #include "bn_lcl.h"
 
-static const char *Hex="0123456789ABCDEF";
+static const char Hex[]="0123456789ABCDEF";
 
 /* Must 'OPENSSL_free' the returned data */
 char *BN_bn2hex(const BIGNUM *a)
@@ -134,7 +134,7 @@ char *BN_bn2dec(const BIGNUM *a)
 		}
 	else
 		{
-		if (BN_get_sign(t))
+		if (BN_is_negative(t))
 			*p++ = '-';
 
 		i=0;
@@ -294,6 +294,27 @@ err:
 	return(0);
 	}
 
+int BN_asc2bn(BIGNUM **bn, const char *a)
+	{
+	const char *p = a;
+	if (*p == '-')
+		p++;
+
+	if (p[0] == '0' && (p[1] == 'X' || p[1] == 'x'))
+		{		
+		if (!BN_hex2bn(bn, p + 2))
+			return 0;
+		}
+	else
+		{
+		if (!BN_dec2bn(bn, p))
+			return 0;
+		}
+	if (*a == '-')
+		(*bn)->neg = 1;
+	return 1;
+	}
+
 #ifndef OPENSSL_NO_BIO
 #ifndef OPENSSL_NO_FP_API
 int BN_print_fp(FILE *fp, const BIGNUM *a)
diff --git a/crypto/bn/bn_rand.c b/crypto/bn/bn_rand.c
index 323bfa74bc..b376c28ff3 100644
--- a/crypto/bn/bn_rand.c
+++ b/crypto/bn/bn_rand.c
@@ -134,7 +134,7 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
 	buf=(unsigned char *)OPENSSL_malloc(bytes);
 	if (buf == NULL)
 		{
-		BNerr(BN_F_BN_RAND,ERR_R_MALLOC_FAILURE);
+		BNerr(BN_F_BNRAND,ERR_R_MALLOC_FAILURE);
 		goto err;
 		}
 
@@ -227,7 +227,7 @@ int     BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom)
 
 
 /* random number r:  0 <= r < range */
-static int bn_rand_range(int pseudo, BIGNUM *r, BIGNUM *range)
+static int bn_rand_range(int pseudo, BIGNUM *r, const BIGNUM *range)
 	{
 	int (*bn_rand)(BIGNUM *, int, int, int) = pseudo ? BN_pseudo_rand : BN_rand;
 	int n;
@@ -294,12 +294,12 @@ static int bn_rand_range(int pseudo, BIGNUM *r, BIGNUM *range)
 	}
 
 
-int	BN_rand_range(BIGNUM *r, BIGNUM *range)
+int	BN_rand_range(BIGNUM *r, const BIGNUM *range)
 	{
 	return bn_rand_range(0, r, range);
 	}
 
-int	BN_pseudo_rand_range(BIGNUM *r, BIGNUM *range)
+int	BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range)
 	{
 	return bn_rand_range(1, r, range);
 	}
diff --git a/crypto/bn/bn_recp.c b/crypto/bn/bn_recp.c
index a08489e04a..2e8efb8dae 100644
--- a/crypto/bn/bn_recp.c
+++ b/crypto/bn/bn_recp.c
@@ -191,7 +191,7 @@ int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
 		{
 		if (j++ > 2)
 			{
-			BNerr(BN_F_BN_MOD_MUL_RECIPROCAL,BN_R_BAD_RECIPROCAL);
+			BNerr(BN_F_BN_DIV_RECP,BN_R_BAD_RECIPROCAL);
 			goto err;
 			}
 		if (!BN_usub(r,r,&(recp->N))) goto err;
@@ -204,8 +204,8 @@ int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
 	ret=1;
 err:
 	BN_CTX_end(ctx);
-	if(dv) bn_check_top(dv);
-	if(rem) bn_check_top(rem);
+	bn_check_top(dv);
+	bn_check_top(rem);
 	return(ret);
 	} 
 
diff --git a/crypto/bn/bn_shift.c b/crypto/bn/bn_shift.c
index de9312dce2..c4d301afc4 100644
--- a/crypto/bn/bn_shift.c
+++ b/crypto/bn/bn_shift.c
@@ -177,7 +177,7 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
 	nw=n/BN_BITS2;
 	rb=n%BN_BITS2;
 	lb=BN_BITS2-rb;
-	if (nw > a->top || a->top == 0)
+	if (nw >= a->top || a->top == 0)
 		{
 		BN_zero(r);
 		return(1);
diff --git a/crypto/bn/bn_sqr.c b/crypto/bn/bn_sqr.c
index 3b4b3f0d38..270d0cd348 100644
--- a/crypto/bn/bn_sqr.c
+++ b/crypto/bn/bn_sqr.c
@@ -148,8 +148,8 @@ int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
 	if (rr != r) BN_copy(r,rr);
 	ret = 1;
  err:
-	if(rr) bn_check_top(rr);
-	if(tmp) bn_check_top(tmp);
+	bn_check_top(rr);
+	bn_check_top(tmp);
 	BN_CTX_end(ctx);
 	return(ret);
 	}
diff --git a/crypto/bn/bn_sqrt.c b/crypto/bn/bn_sqrt.c
index 924ee274df..6beaf9e5e5 100644
--- a/crypto/bn/bn_sqrt.c
+++ b/crypto/bn/bn_sqrt.c
@@ -1,4 +1,4 @@
-/* crypto/bn/bn_mod.c */
+/* crypto/bn/bn_sqrt.c */
 /* Written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
  * and Bodo Moeller for the OpenSSL project. */
 /* ====================================================================
@@ -83,7 +83,8 @@ BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 				goto end;
 			if (!BN_set_word(ret, BN_is_bit_set(a, 0)))
 				{
-				BN_free(ret);
+				if (ret != in)
+					BN_free(ret);
 				return NULL;
 				}
 			bn_check_top(ret);
@@ -102,7 +103,8 @@ BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 			goto end;
 		if (!BN_set_word(ret, BN_is_one(a)))
 			{
-			BN_free(ret);
+			if (ret != in)
+				BN_free(ret);
 			return NULL;
 			}
 		bn_check_top(ret);
diff --git a/crypto/bn/bn_word.c b/crypto/bn/bn_word.c
index 1bcb37e292..ee7b87c45c 100644
--- a/crypto/bn/bn_word.c
+++ b/crypto/bn/bn_word.c
@@ -69,6 +69,9 @@ BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w)
 #endif
 	int i;
 
+	if (w == 0)
+		return (BN_ULONG)-1;
+
 	bn_check_top(a);
 	w&=BN_MASK2;
 	for (i=a->top-1; i>=0; i--)
@@ -94,7 +97,7 @@ BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w)
 
 	if (!w)
 		/* actually this an error (division by zero) */
-		return 0;
+		return (BN_ULONG)-1;
 	if (a->top == 0)
 		return 0;
 
@@ -102,7 +105,7 @@ BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w)
 	j = BN_BITS2 - BN_num_bits_word(w);
 	w <<= j;
 	if (!BN_lshift(a, a, j))
-		return 0;
+		return (BN_ULONG)-1;
 
 	for (i=a->top-1; i>=0; i--)
 		{
@@ -175,7 +178,13 @@ int BN_sub_word(BIGNUM *a, BN_ULONG w)
 	/* degenerate case: w is zero */
 	if (!w) return 1;
 	/* degenerate case: a is zero */
-	if(BN_is_zero(a)) return BN_set_word(a,w);
+	if(BN_is_zero(a))
+		{
+		i = BN_set_word(a,w);
+		if (i != 0)
+			BN_set_negative(a, 1);
+		return i;
+		}
 	/* handle 'a' when negative */
 	if (a->neg)
 		{
diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
index 9169cc8813..0cd99c5b4b 100644
--- a/crypto/bn/bntest.c
+++ b/crypto/bn/bntest.c
@@ -106,6 +106,7 @@ int test_mont(BIO *bp,BN_CTX *ctx);
 int test_mod(BIO *bp,BN_CTX *ctx);
 int test_mod_mul(BIO *bp,BN_CTX *ctx);
 int test_mod_exp(BIO *bp,BN_CTX *ctx);
+int test_mod_exp_mont_consttime(BIO *bp,BN_CTX *ctx);
 int test_exp(BIO *bp,BN_CTX *ctx);
 int test_gf2m_add(BIO *bp);
 int test_gf2m_mod(BIO *bp);
@@ -183,116 +184,120 @@ int main(int argc, char *argv[])
 
 	message(out,"BN_add");
 	if (!test_add(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_sub");
 	if (!test_sub(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_lshift1");
 	if (!test_lshift1(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_lshift (fixed)");
 	if (!test_lshift(out,ctx,BN_bin2bn(lst,sizeof(lst)-1,NULL)))
 	    goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_lshift");
 	if (!test_lshift(out,ctx,NULL)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_rshift1");
 	if (!test_rshift1(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_rshift");
 	if (!test_rshift(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_sqr");
 	if (!test_sqr(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mul");
 	if (!test_mul(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_div");
 	if (!test_div(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_div_word");
 	if (!test_div_word(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_div_recp");
 	if (!test_div_recp(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mod");
 	if (!test_mod(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mod_mul");
 	if (!test_mod_mul(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mont");
 	if (!test_mont(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mod_exp");
 	if (!test_mod_exp(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
+
+	message(out,"BN_mod_exp_mont_consttime");
+	if (!test_mod_exp_mont_consttime(out,ctx)) goto err;
+	(void)BIO_flush(out);
 
 	message(out,"BN_exp");
 	if (!test_exp(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_kronecker");
 	if (!test_kron(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_mod_sqrt");
 	if (!test_sqrt(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_add");
 	if (!test_gf2m_add(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod");
 	if (!test_gf2m_mod(out)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_mul");
 	if (!test_gf2m_mod_mul(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_sqr");
 	if (!test_gf2m_mod_sqr(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_inv");
 	if (!test_gf2m_mod_inv(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_div");
 	if (!test_gf2m_mod_div(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_exp");
 	if (!test_gf2m_mod_exp(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_sqrt");
 	if (!test_gf2m_mod_sqrt(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	message(out,"BN_GF2m_mod_solve_quad");
 	if (!test_gf2m_mod_solve_quad(out,ctx)) goto err;
-	BIO_flush(out);
+	(void)BIO_flush(out);
 
 	BN_CTX_free(ctx);
 	BIO_free(out);
@@ -302,7 +307,7 @@ int main(int argc, char *argv[])
 err:
 	BIO_puts(out,"1\n"); /* make sure the Perl script fed by bc notices
 	                      * the failure, see test_bn in test/Makefile.ssl*/
-	BIO_flush(out);
+	(void)BIO_flush(out);
 	ERR_load_crypto_strings();
 	ERR_print_errors_fp(stderr);
 	EXIT(1);
@@ -481,7 +486,7 @@ static void print_word(BIO *bp,BN_ULONG w)
 		return;
 		}
 #endif
-	BIO_printf(bp,"%lX",w);
+	BIO_printf(bp,BN_HEX_FMT1,w);
 	}
 
 int test_div_word(BIO *bp)
@@ -727,6 +732,8 @@ int test_mont(BIO *bp, BN_CTX *ctx)
 	BN_init(&n);
 
 	mont=BN_MONT_CTX_new();
+	if (mont == NULL)
+		return 0;
 
 	BN_bntest_rand(&a,100,0,0); /**/
 	BN_bntest_rand(&b,100,0,0); /**/
@@ -921,6 +928,57 @@ int test_mod_exp(BIO *bp, BN_CTX *ctx)
 		BN_bntest_rand(b,2+i,0,0); /**/
 
 		if (!BN_mod_exp(d,a,b,c,ctx))
+			return(0);
+
+		if (bp != NULL)
+			{
+			if (!results)
+				{
+				BN_print(bp,a);
+				BIO_puts(bp," ^ ");
+				BN_print(bp,b);
+				BIO_puts(bp," % ");
+				BN_print(bp,c);
+				BIO_puts(bp," - ");
+				}
+			BN_print(bp,d);
+			BIO_puts(bp,"\n");
+			}
+		BN_exp(e,a,b,ctx);
+		BN_sub(e,e,d);
+		BN_div(a,b,e,c,ctx);
+		if(!BN_is_zero(b))
+		    {
+		    fprintf(stderr,"Modulo exponentiation test failed!\n");
+		    return 0;
+		    }
+		}
+	BN_free(a);
+	BN_free(b);
+	BN_free(c);
+	BN_free(d);
+	BN_free(e);
+	return(1);
+	}
+
+int test_mod_exp_mont_consttime(BIO *bp, BN_CTX *ctx)
+	{
+	BIGNUM *a,*b,*c,*d,*e;
+	int i;
+
+	a=BN_new();
+	b=BN_new();
+	c=BN_new();
+	d=BN_new();
+	e=BN_new();
+
+	BN_bntest_rand(c,30,0,1); /* must be odd for montgomery */
+	for (i=0; i<num2; i++)
+		{
+		BN_bntest_rand(a,20+i*5,0,0); /**/
+		BN_bntest_rand(b,2+i,0,0); /**/
+
+		if (!BN_mod_exp_mont_consttime(d,a,b,c,ctx,NULL))
 			return(00);
 
 		if (bp != NULL)
@@ -971,8 +1029,8 @@ int test_exp(BIO *bp, BN_CTX *ctx)
 		BN_bntest_rand(a,20+i*5,0,0); /**/
 		BN_bntest_rand(b,2+i,0,0); /**/
 
-		if (!BN_exp(d,a,b,ctx))
-			return(00);
+		if (BN_exp(d,a,b,ctx) <= 0)
+			return(0);
 
 		if (bp != NULL)
 			{
@@ -1060,8 +1118,8 @@ int test_gf2m_mod(BIO *bp)
 	{
 	BIGNUM *a,*b[2],*c,*d,*e;
 	int i, j, ret = 0;
-	unsigned int p0[] = {163,7,6,3,0};
-	unsigned int p1[] = {193,15,0};
+	int p0[] = {163,7,6,3,0,-1};
+	int p1[] = {193,15,0,-1};
 
 	a=BN_new();
 	b[0]=BN_new();
@@ -1118,8 +1176,8 @@ int test_gf2m_mod_mul(BIO *bp,BN_CTX *ctx)
 	{
 	BIGNUM *a,*b[2],*c,*d,*e,*f,*g,*h;
 	int i, j, ret = 0;
-	unsigned int p0[] = {163,7,6,3,0};
-	unsigned int p1[] = {193,15,0};
+	int p0[] = {163,7,6,3,0,-1};
+	int p1[] = {193,15,0,-1};
 
 	a=BN_new();
 	b[0]=BN_new();
@@ -1189,8 +1247,8 @@ int test_gf2m_mod_sqr(BIO *bp,BN_CTX *ctx)
 	{
 	BIGNUM *a,*b[2],*c,*d;
 	int i, j, ret = 0;
-	unsigned int p0[] = {163,7,6,3,0};
-	unsigned int p1[] = {193,15,0};
+	int p0[] = {163,7,6,3,0,-1};
+	int p1[] = {193,15,0,-1};
 
 	a=BN_new();
 	b[0]=BN_new();
@@ -1248,8 +1306,8 @@ int test_gf2m_mod_inv(BIO *bp,BN_CTX *ctx)
 	{
 	BIGNUM *a,*b[2],*c,*d;
 	int i, j, ret = 0;
-	unsigned int p0[] = {163,7,6,3,0};
-	unsigned int p1[] = {193,15,0};
+	int p0[] = {163,7,6,3,0,-1};
+	int p1[] = {193,15,0,-1};
 
 	a=BN_new();
 	b[0]=BN_new();
@@ -1303,8 +1361,8 @@ int test_gf2m_mod_div(BIO *bp,BN_CTX *ctx)
 	{
 	BIGNUM *a,*b[2],*c,*d,*e,*f;
 	int i, j, ret = 0;
-	unsigned int p0[] = {163,7,6,3,0};
-	unsigned int p1[] = {193,15,0};
+	int p0[] = {163,7,6,3,0,-1};
+	int p1[] = {193,15,0,-1};
 
 	a=BN_new();
 	b[0]=BN_new();
@@ -1366,8 +1424,8 @@ int test_gf2m_mod_exp(BIO *bp,BN_CTX *ctx)
 	{
 	BIGNUM *a,*b[2],*c,*d,*e,*f;
 	int i, j, ret = 0;
-	unsigned int p0[] = {163,7,6,3,0};
-	unsigned int p1[] = {193,15,0};
+	int p0[] = {163,7,6,3,0,-1};
+	int p1[] = {193,15,0,-1};
 
 	a=BN_new();
 	b[0]=BN_new();
@@ -1437,8 +1495,8 @@ int test_gf2m_mod_sqrt(BIO *bp,BN_CTX *ctx)
 	{
 	BIGNUM *a,*b[2],*c,*d,*e,*f;
 	int i, j, ret = 0;
-	unsigned int p0[] = {163,7,6,3,0};
-	unsigned int p1[] = {193,15,0};
+	int p0[] = {163,7,6,3,0,-1};
+	int p1[] = {193,15,0,-1};
 
 	a=BN_new();
 	b[0]=BN_new();
@@ -1496,8 +1554,8 @@ int test_gf2m_mod_solve_quad(BIO *bp,BN_CTX *ctx)
 	{
 	BIGNUM *a,*b[2],*c,*d,*e;
 	int i, j, s = 0, t, ret = 0;
-	unsigned int p0[] = {163,7,6,3,0};
-	unsigned int p1[] = {193,15,0};
+	int p0[] = {163,7,6,3,0,-1};
+	int p1[] = {193,15,0,-1};
 
 	a=BN_new();
 	b[0]=BN_new();
diff --git a/crypto/bn/expspeed.c b/crypto/bn/expspeed.c
index 07a1bcf51c..4d5f221f33 100644
--- a/crypto/bn/expspeed.c
+++ b/crypto/bn/expspeed.c
@@ -321,7 +321,7 @@ void do_mul_exp(BIGNUM *r, BIGNUM *a, BIGNUM *b, BIGNUM *c, BN_CTX *ctx)
 #else /* TEST_SQRT */
 			"2*sqrt [prime == %d (mod 64)] %4d %4d mod %4d"
 #endif
-			" -> %8.3fms %5.1f (%ld)\n",
+			" -> %8.6fms %5.1f (%ld)\n",
 #ifdef TEST_SQRT
 			P_MOD_64,
 #endif
diff --git a/crypto/bn/exptest.c b/crypto/bn/exptest.c
index 37aec55b89..074a8e882a 100644
--- a/crypto/bn/exptest.c
+++ b/crypto/bn/exptest.c
@@ -77,7 +77,7 @@ int main(int argc, char *argv[])
 	BIO *out=NULL;
 	int i,ret;
 	unsigned char c;
-	BIGNUM *r_mont,*r_recp,*r_simple,*a,*b,*m;
+	BIGNUM *r_mont,*r_mont_const,*r_recp,*r_simple,*a,*b,*m;
 
 	RAND_seed(rnd_seed, sizeof rnd_seed); /* or BN_rand may fail, and we don't
 	                                       * even check its return value
@@ -88,6 +88,7 @@ int main(int argc, char *argv[])
 	ctx=BN_CTX_new();
 	if (ctx == NULL) EXIT(1);
 	r_mont=BN_new();
+	r_mont_const=BN_new();
 	r_recp=BN_new();
 	r_simple=BN_new();
 	a=BN_new();
@@ -143,8 +144,17 @@ int main(int argc, char *argv[])
 			EXIT(1);
 			}
 
+		ret=BN_mod_exp_mont_consttime(r_mont_const,a,b,m,ctx,NULL);
+		if (ret <= 0)
+			{
+			printf("BN_mod_exp_mont_consttime() problems\n");
+			ERR_print_errors(out);
+			EXIT(1);
+			}
+
 		if (BN_cmp(r_simple, r_mont) == 0
-		    && BN_cmp(r_simple,r_recp) == 0)
+		    && BN_cmp(r_simple,r_recp) == 0
+			&& BN_cmp(r_simple,r_mont_const) == 0)
 			{
 			printf(".");
 			fflush(stdout);
@@ -153,6 +163,8 @@ int main(int argc, char *argv[])
 		  	{
 			if (BN_cmp(r_simple,r_mont) != 0)
 				printf("\nsimple and mont results differ\n");
+			if (BN_cmp(r_simple,r_mont_const) != 0)
+				printf("\nsimple and mont const time results differ\n");
 			if (BN_cmp(r_simple,r_recp) != 0)
 				printf("\nsimple and recp results differ\n");
 
@@ -162,18 +174,20 @@ int main(int argc, char *argv[])
 			printf("\nsimple   =");	BN_print(out,r_simple);
 			printf("\nrecp     =");	BN_print(out,r_recp);
 			printf("\nmont     ="); BN_print(out,r_mont);
+			printf("\nmont_ct  ="); BN_print(out,r_mont_const);
 			printf("\n");
 			EXIT(1);
 			}
 		}
 	BN_free(r_mont);
+	BN_free(r_mont_const);
 	BN_free(r_recp);
 	BN_free(r_simple);
 	BN_free(a);
 	BN_free(b);
 	BN_free(m);
 	BN_CTX_free(ctx);
-	ERR_remove_state(0);
+	ERR_remove_thread_state(NULL);
 	CRYPTO_mem_leaks(out);
 	BIO_free(out);
 	printf(" done\n");