summaryrefslogtreecommitdiff
path: root/crypto/bn
diff options
context:
space:
mode:
Diffstat (limited to 'crypto/bn')
-rw-r--r--crypto/bn/.cvsignore5
-rw-r--r--crypto/bn/Makefile.ssl338
-rw-r--r--crypto/bn/asm/.cvsignore2
-rw-r--r--crypto/bn/asm/alpha-mont.pl317
-rw-r--r--crypto/bn/asm/armv4-mont.pl200
-rw-r--r--crypto/bn/asm/bn-586.pl203
-rw-r--r--crypto/bn/asm/co-586.pl3
-rw-r--r--crypto/bn/asm/ia64-mont.pl764
-rw-r--r--crypto/bn/asm/ia64.S35
-rw-r--r--crypto/bn/asm/mips3-mont.pl327
-rw-r--r--crypto/bn/asm/parisc-mont.pl993
-rw-r--r--crypto/bn/asm/ppc-mont.pl328
-rw-r--r--crypto/bn/asm/ppc.pl254
-rw-r--r--crypto/bn/asm/ppc64-mont.pl1086
-rw-r--r--crypto/bn/asm/s390x-mont.pl225
-rwxr-xr-xcrypto/bn/asm/s390x.S678
-rw-r--r--crypto/bn/asm/sparcv8plus.S31
-rw-r--r--crypto/bn/asm/sparcv9-mont.pl606
-rwxr-xr-xcrypto/bn/asm/sparcv9a-mont.pl882
-rw-r--r--crypto/bn/asm/via-mont.pl242
-rwxr-xr-xcrypto/bn/asm/x86-mont.pl591
-rw-r--r--crypto/bn/asm/x86_64-gcc.c34
-rwxr-xr-xcrypto/bn/asm/x86_64-mont.pl330
-rw-r--r--crypto/bn/bn.h310
-rw-r--r--crypto/bn/bn_asm.c352
-rw-r--r--crypto/bn/bn_blind.c259
-rwxr-xr-xcrypto/bn/bn_const.c402
-rw-r--r--crypto/bn/bn_ctx.c13
-rw-r--r--crypto/bn/bn_div.c265
-rw-r--r--crypto/bn/bn_err.c125
-rw-r--r--crypto/bn/bn_exp.c247
-rw-r--r--crypto/bn/bn_gcd.c164
-rw-r--r--crypto/bn/bn_gf2m.c153
-rw-r--r--crypto/bn/bn_lcl.h98
-rw-r--r--crypto/bn/bn_lib.c60
-rw-r--r--crypto/bn/bn_mont.c293
-rw-r--r--crypto/bn/bn_mul.c25
-rw-r--r--crypto/bn/bn_nist.c1200
-rw-r--r--crypto/bn/bn_prime.c16
-rw-r--r--crypto/bn/bn_prime.h4
-rw-r--r--crypto/bn/bn_prime.pl4
-rw-r--r--crypto/bn/bn_print.c25
-rw-r--r--crypto/bn/bn_rand.c8
-rw-r--r--crypto/bn/bn_recp.c6
-rw-r--r--crypto/bn/bn_shift.c2
-rw-r--r--crypto/bn/bn_sqr.c4
-rw-r--r--crypto/bn/bn_sqrt.c8
-rw-r--r--crypto/bn/bn_word.c15
-rw-r--r--crypto/bn/bntest.c154
-rw-r--r--crypto/bn/expspeed.c2
-rw-r--r--crypto/bn/exptest.c20
51 files changed, 10923 insertions, 1785 deletions
diff --git a/crypto/bn/.cvsignore b/crypto/bn/.cvsignore
index c6d03a9dbc..ebe4b61bb3 100644
--- a/crypto/bn/.cvsignore
+++ b/crypto/bn/.cvsignore
@@ -1,2 +1,7 @@
lib
Makefile.save
+*.flc
+semantic.cache
+co-*.s
+bn-*.s
+*-mont.s
diff --git a/crypto/bn/Makefile.ssl b/crypto/bn/Makefile.ssl
deleted file mode 100644
index 817bfa54cd..0000000000
--- a/crypto/bn/Makefile.ssl
+++ /dev/null
@@ -1,338 +0,0 @@
-#
-# SSLeay/crypto/bn/Makefile
-#
-
-DIR= bn
-TOP= ../..
-CC= cc
-CPP= $(CC) -E
-INCLUDES= -I.. -I$(TOP) -I../../include
-CFLAG=-g
-INSTALL_PREFIX=
-OPENSSLDIR= /usr/local/ssl
-INSTALLTOP=/usr/local/ssl
-MAKE= make -f Makefile.ssl
-MAKEDEPPROG= makedepend
-MAKEDEPEND= $(TOP)/util/domd $(TOP) -MD $(MAKEDEPPROG)
-MAKEFILE= Makefile.ssl
-AR= ar r
-
-BN_ASM= bn_asm.o
-# or use
-#BN_ASM= bn86-elf.o
-
-CFLAGS= $(INCLUDES) $(CFLAG)
-ASFLAGS= $(INCLUDES) $(ASFLAG)
-AFLAGS= $(ASFLAGS)
-
-GENERAL=Makefile
-TEST=bntest.c exptest.c
-APPS=
-
-LIB=$(TOP)/libcrypto.a
-LIBSRC= bn_add.c bn_div.c bn_exp.c bn_lib.c bn_ctx.c bn_mul.c bn_mod.c \
- bn_print.c bn_rand.c bn_shift.c bn_word.c bn_blind.c \
- bn_kron.c bn_sqrt.c bn_gcd.c bn_prime.c bn_err.c bn_sqr.c bn_asm.c \
- bn_recp.c bn_mont.c bn_mpi.c bn_exp2.c bn_gf2m.c bn_nist.c \
- bn_depr.c
-
-LIBOBJ= bn_add.o bn_div.o bn_exp.o bn_lib.o bn_ctx.o bn_mul.o bn_mod.o \
- bn_print.o bn_rand.o bn_shift.o bn_word.o bn_blind.o \
- bn_kron.o bn_sqrt.o bn_gcd.o bn_prime.o bn_err.o bn_sqr.o $(BN_ASM) \
- bn_recp.o bn_mont.o bn_mpi.o bn_exp2.o bn_gf2m.o bn_nist.o \
- bn_depr.o
-
-SRC= $(LIBSRC)
-
-EXHEADER= bn.h
-HEADER= bn_lcl.h bn_prime.h $(EXHEADER)
-
-ALL= $(GENERAL) $(SRC) $(HEADER)
-
-top:
- (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
-
-all: lib
-
-bn_prime.h: bn_prime.pl
- $(PERL) bn_prime.pl >bn_prime.h
-
-divtest: divtest.c ../../libcrypto.a
- cc -I../../include divtest.c -o divtest ../../libcrypto.a
-
-bnbug: bnbug.c ../../libcrypto.a top
- cc -g -I../../include bnbug.c -o bnbug ../../libcrypto.a
-
-lib: $(LIBOBJ)
- $(AR) $(LIB) $(LIBOBJ)
- $(RANLIB) $(LIB) || echo Never mind.
- @touch lib
-
-# ELF
-bn86-elf.s: asm/bn-586.pl ../perlasm/x86asm.pl
- (cd asm; $(PERL) bn-586.pl elf $(CFLAGS) > ../$@)
-co86-elf.s: asm/co-586.pl ../perlasm/x86asm.pl
- (cd asm; $(PERL) co-586.pl elf $(CFLAGS) > ../$@)
-# COFF
-bn86-cof.s: asm/bn-586.pl ../perlasm/x86asm.pl
- (cd asm; $(PERL) bn-586.pl coff $(CFLAGS) > ../$@)
-co86-cof.s: asm/co-586.pl ../perlasm/x86asm.pl
- (cd asm; $(PERL) co-586.pl coff $(CFLAGS) > ../$@)
-# a.out
-bn86-out.s: asm/bn-586.pl ../perlasm/x86asm.pl
- (cd asm; $(PERL) bn-586.pl a.out $(CFLAGS) > ../$@)
-co86-out.s: asm/co-586.pl ../perlasm/x86asm.pl
- (cd asm; $(PERL) co-586.pl a.out $(CFLAGS) > ../$@)
-
-sparcv8.o: asm/sparcv8.S
-sparcv8plus.o: asm/sparcv8plus.S
-mips3.o: asm/mips3.s
-
-x86_64-gcc.o: asm/x86_64-gcc.c
-
-bn-ia64.s: asm/ia64.S
- $(CC) $(CFLAGS) -E asm/ia64.S > $@
-
-# GNU assembler fails to compile PA-RISC2 modules, insist on calling
-# vendor assembler...
-pa-risc2W.o: asm/pa-risc2W.s
- /usr/ccs/bin/as -o pa-risc2W.o asm/pa-risc2W.s
-pa-risc2.o: asm/pa-risc2.s
- /usr/ccs/bin/as -o pa-risc2.o asm/pa-risc2.s
-
-# ppc - AIX, Linux, MacOS X...
-linux_ppc32.s: asm/ppc.pl; $(PERL) $< $@
-linux_ppc64.s: asm/ppc.pl; $(PERL) $< $@
-aix_ppc32.s: asm/ppc.pl; $(PERL) asm/ppc.pl $@
-aix_ppc64.s: asm/ppc.pl; $(PERL) asm/ppc.pl $@
-osx_ppc32.s: asm/ppc.pl; $(PERL) $< $@
-
-files:
- $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO
-
-links:
- @sh $(TOP)/util/point.sh Makefile.ssl Makefile
- @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
- @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
- @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
-
-install:
- @headerlist="$(EXHEADER)"; for i in $$headerlist ; \
- do \
- (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
- chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
- done;
-
-exptest:
- rm -f exptest
- gcc -I../../include -g2 -ggdb -o exptest exptest.c ../../libcrypto.a
-
-div:
- rm -f a.out
- gcc -I.. -g div.c ../../libcrypto.a
-
-tags:
- ctags $(SRC)
-
-tests:
-
-lint:
- lint -DLINT $(INCLUDES) $(SRC)>fluff
-
-depend:
- $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
-
-dclean:
- $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
- mv -f Makefile.new $(MAKEFILE)
-
-clean:
- rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
-
-# DO NOT DELETE THIS LINE -- make depend depends on it.
-
-bn_add.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_add.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_add.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_add.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_add.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_add.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_add.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_add.c bn_lcl.h
-bn_asm.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_asm.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_asm.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_asm.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_asm.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_asm.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_asm.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_asm.c bn_lcl.h
-bn_blind.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_blind.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_blind.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_blind.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_blind.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_blind.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_blind.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_blind.c bn_lcl.h
-bn_ctx.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_ctx.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_ctx.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_ctx.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_ctx.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_ctx.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_ctx.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_ctx.c bn_lcl.h
-bn_depr.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_depr.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_depr.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_depr.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_depr.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_depr.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
-bn_depr.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-bn_depr.o: ../cryptlib.h bn_depr.c bn_lcl.h
-bn_div.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_div.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_div.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_div.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_div.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_div.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_div.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_div.c bn_lcl.h
-bn_err.o: ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_err.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
-bn_err.o: ../../include/openssl/err.h ../../include/openssl/lhash.h
-bn_err.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
-bn_err.o: ../../include/openssl/ossl_typ.h ../../include/openssl/safestack.h
-bn_err.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-bn_err.o: bn_err.c
-bn_exp.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_exp.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_exp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h
-bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_exp2.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_exp2.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_exp2.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_exp2.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp2.c bn_lcl.h
-bn_gcd.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_gcd.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_gcd.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_gcd.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_gcd.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_gcd.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_gcd.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_gcd.c bn_lcl.h
-bn_gf2m.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_gf2m.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_gf2m.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_gf2m.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_gf2m.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_gf2m.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_gf2m.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_gf2m.c bn_lcl.h
-bn_kron.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_kron.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_kron.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_kron.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_kron.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_kron.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_kron.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_kron.c bn_lcl.h
-bn_lib.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_lib.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_lib.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_lib.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_lib.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_lib.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_lib.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_lib.c
-bn_mod.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_mod.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_mod.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_mod.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_mod.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_mod.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_mod.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mod.c
-bn_mont.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_mont.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_mont.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_mont.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_mont.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_mont.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_mont.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mont.c
-bn_mpi.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_mpi.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_mpi.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_mpi.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_mpi.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_mpi.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_mpi.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mpi.c
-bn_mul.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_mul.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_mul.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_mul.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_mul.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_mul.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_mul.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_mul.c
-bn_nist.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_nist.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_nist.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_nist.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_nist.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_nist.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_nist.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_nist.c
-bn_prime.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_prime.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_prime.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_prime.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_prime.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_prime.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
-bn_prime.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-bn_prime.o: ../cryptlib.h bn_lcl.h bn_prime.c bn_prime.h
-bn_print.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_print.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_print.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_print.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_print.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_print.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_print.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_print.c
-bn_rand.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_rand.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_rand.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_rand.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_rand.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_rand.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
-bn_rand.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
-bn_rand.o: ../cryptlib.h bn_lcl.h bn_rand.c
-bn_recp.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_recp.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_recp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_recp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_recp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_recp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_recp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_recp.c
-bn_shift.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_shift.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_shift.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_shift.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_shift.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_shift.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_shift.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_shift.c
-bn_sqr.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_sqr.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_sqr.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_sqr.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_sqr.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_sqr.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_sqr.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_sqr.c
-bn_sqrt.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_sqrt.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_sqrt.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_sqrt.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_sqrt.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_sqrt.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_sqrt.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_sqrt.c
-bn_word.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h
-bn_word.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
-bn_word.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
-bn_word.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h
-bn_word.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
-bn_word.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
-bn_word.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_lcl.h bn_word.c
diff --git a/crypto/bn/asm/.cvsignore b/crypto/bn/asm/.cvsignore
index 671eb02019..26475028f5 100644
--- a/crypto/bn/asm/.cvsignore
+++ b/crypto/bn/asm/.cvsignore
@@ -2,3 +2,5 @@ bn86unix.cpp
co86unix.cpp
bn86-elf.s
co86-elf.s
+*.flc
+semantic.cache
diff --git a/crypto/bn/asm/alpha-mont.pl b/crypto/bn/asm/alpha-mont.pl
new file mode 100644
index 0000000000..7a2cc3173b
--- /dev/null
+++ b/crypto/bn/asm/alpha-mont.pl
@@ -0,0 +1,317 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# On 21264 RSA sign performance improves by 70/35/20/15 percent for
+# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
+# instructed to '-tune host' code with in-line assembler. Other
+# benchmarks improve by 15-20%. To anchor it to something else, the
+# code provides approximately the same performance per GHz as AMD64.
+# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
+# difference.
+
+# int bn_mul_mont(
+$rp="a0"; # BN_ULONG *rp,
+$ap="a1"; # const BN_ULONG *ap,
+$bp="a2"; # const BN_ULONG *bp,
+$np="a3"; # const BN_ULONG *np,
+$n0="a4"; # const BN_ULONG *n0,
+$num="a5"; # int num);
+
+$lo0="t0";
+$hi0="t1";
+$lo1="t2";
+$hi1="t3";
+$aj="t4";
+$bi="t5";
+$nj="t6";
+$tp="t7";
+$alo="t8";
+$ahi="t9";
+$nlo="t10";
+$nhi="t11";
+$tj="t12";
+$i="s3";
+$j="s4";
+$m1="s5";
+
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+
+.text
+
+.set noat
+.set noreorder
+
+.globl bn_mul_mont
+.align 5
+.ent bn_mul_mont
+bn_mul_mont:
+ lda sp,-40(sp)
+ stq ra,0(sp)
+ stq s3,8(sp)
+ stq s4,16(sp)
+ stq s5,24(sp)
+ stq fp,32(sp)
+ mov sp,fp
+ .mask 0x0400f000,-40
+ .frame fp,40,ra
+ .prologue 0
+
+ .align 4
+ .set reorder
+ sextl $num,$num
+ mov 0,v0
+ cmplt $num,4,AT
+ bne AT,.Lexit
+
+ ldq $hi0,0($ap) # ap[0]
+ s8addq $num,16,AT
+ ldq $aj,8($ap)
+ subq sp,AT,sp
+ ldq $bi,0($bp) # bp[0]
+ mov -4096,AT
+ ldq $n0,0($n0)
+ and sp,AT,sp
+
+ mulq $hi0,$bi,$lo0
+ ldq $hi1,0($np) # np[0]
+ umulh $hi0,$bi,$hi0
+ ldq $nj,8($np)
+
+ mulq $lo0,$n0,$m1
+
+ mulq $hi1,$m1,$lo1
+ umulh $hi1,$m1,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,AT
+ addq $hi1,AT,$hi1
+
+ mulq $aj,$bi,$alo
+ mov 2,$j
+ umulh $aj,$bi,$ahi
+ mov sp,$tp
+
+ mulq $nj,$m1,$nlo
+ s8addq $j,$ap,$aj
+ umulh $nj,$m1,$nhi
+ s8addq $j,$np,$nj
+.align 4
+.L1st:
+ .set noreorder
+ ldq $aj,($aj)
+ addl $j,1,$j
+ ldq $nj,($nj)
+ lda $tp,8($tp)
+
+ addq $alo,$hi0,$lo0
+ mulq $aj,$bi,$alo
+ cmpult $lo0,$hi0,AT
+ addq $nlo,$hi1,$lo1
+
+ mulq $nj,$m1,$nlo
+ addq $ahi,AT,$hi0
+ cmpult $lo1,$hi1,v0
+ cmplt $j,$num,$tj
+
+ umulh $aj,$bi,$ahi
+ addq $nhi,v0,$hi1
+ addq $lo1,$lo0,$lo1
+ s8addq $j,$ap,$aj
+
+ umulh $nj,$m1,$nhi
+ cmpult $lo1,$lo0,v0
+ addq $hi1,v0,$hi1
+ s8addq $j,$np,$nj
+
+ stq $lo1,-8($tp)
+ nop
+ unop
+ bne $tj,.L1st
+ .set reorder
+
+ addq $alo,$hi0,$lo0
+ addq $nlo,$hi1,$lo1
+ cmpult $lo0,$hi0,AT
+ cmpult $lo1,$hi1,v0
+ addq $ahi,AT,$hi0
+ addq $nhi,v0,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,v0
+ addq $hi1,v0,$hi1
+
+ stq $lo1,0($tp)
+
+ addq $hi1,$hi0,$hi1
+ cmpult $hi1,$hi0,AT
+ stq $hi1,8($tp)
+ stq AT,16($tp)
+
+ mov 1,$i
+.align 4
+.Louter:
+ s8addq $i,$bp,$bi
+ ldq $hi0,($ap)
+ ldq $aj,8($ap)
+ ldq $bi,($bi)
+ ldq $hi1,($np)
+ ldq $nj,8($np)
+ ldq $tj,(sp)
+
+ mulq $hi0,$bi,$lo0
+ umulh $hi0,$bi,$hi0
+
+ addq $lo0,$tj,$lo0
+ cmpult $lo0,$tj,AT
+ addq $hi0,AT,$hi0
+
+ mulq $lo0,$n0,$m1
+
+ mulq $hi1,$m1,$lo1
+ umulh $hi1,$m1,$hi1
+
+ addq $lo1,$lo0,$lo1
+ cmpult $lo1,$lo0,AT
+ mov 2,$j
+ addq $hi1,AT,$hi1
+
+ mulq $aj,$bi,$alo
+ mov sp,$tp
+ umulh $aj,$bi,$ahi
+
+ mulq $nj,$m1,$nlo
+ s8addq $j,$ap,$aj
+ umulh $nj,$m1,$nhi
+.align 4
+.Linner:
+ .set noreorder
+ ldq $tj,8($tp) #L0
+ nop #U1
+ ldq $aj,($aj) #L1
+ s8addq $j,$np,$nj #U0
+
+ ldq $nj,($nj) #L0
+ nop #U1
+ addq $alo,$hi0,$lo0 #L1
+ lda $tp,8($tp)
+
+ mulq $aj,$bi,$alo #U1
+ cmpult $lo0,$hi0,AT #L0
+ addq $nlo,$hi1,$lo1 #L1
+ addl $j,1,$j
+
+ mulq $nj,$m1,$nlo #U1
+ addq $ahi,AT,$hi0 #L0
+ addq $lo0,$tj,$lo0 #L1
+ cmpult $lo1,$hi1,v0 #U0
+
+ umulh $aj,$bi,$ahi #U1
+ cmpult $lo0,$tj,AT #L0
+ addq $lo1,$lo0,$lo1 #L1
+ addq $nhi,v0,$hi1 #U0
+
+ umulh $nj,$m1,$nhi #U1
+ s8addq $j,$ap,$aj #L0
+ cmpult $lo1,$lo0,v0 #L1
+ cmplt $j,$num,$tj #U0 # borrow $tj
+
+ addq $hi0,AT,$hi0 #L0
+ addq $hi1,v0,$hi1 #U1
+ stq $lo1,-8($tp) #L1
+ bne $tj,.Linner #U0
+ .set reorder
+
+ ldq $tj,8($tp)
+ addq $alo,$hi0,$lo0
+ addq $nlo,$hi1,$lo1
+ cmpult $lo0,$hi0,AT
+ cmpult $lo1,$hi1,v0
+ addq $ahi,AT,$hi0
+ addq $nhi,v0,$hi1
+
+ addq $lo0,$tj,$lo0
+ cmpult $lo0,$tj,AT
+ addq $hi0,AT,$hi0
+
+ ldq $tj,16($tp)
+ addq $lo1,$lo0,$j
+ cmpult $j,$lo0,v0
+ addq $hi1,v0,$hi1
+
+ addq $hi1,$hi0,$lo1
+ stq $j,($tp)
+ cmpult $lo1,$hi0,$hi1
+ addq $lo1,$tj,$lo1
+ cmpult $lo1,$tj,AT
+ addl $i,1,$i
+ addq $hi1,AT,$hi1
+ stq $lo1,8($tp)
+ cmplt $i,$num,$tj # borrow $tj
+ stq $hi1,16($tp)
+ bne $tj,.Louter
+
+ s8addq $num,sp,$tj # &tp[num]
+ mov $rp,$bp # put rp aside
+ mov sp,$tp
+ mov sp,$ap
+ mov 0,$hi0 # clear borrow bit
+
+.align 4
+.Lsub: ldq $lo0,($tp)
+ ldq $lo1,($np)
+ lda $tp,8($tp)
+ lda $np,8($np)
+ subq $lo0,$lo1,$lo1 # tp[i]-np[i]
+ cmpult $lo0,$lo1,AT
+ subq $lo1,$hi0,$lo0
+ cmpult $lo1,$lo0,$hi0
+ or $hi0,AT,$hi0
+ stq $lo0,($rp)
+ cmpult $tp,$tj,v0
+ lda $rp,8($rp)
+ bne v0,.Lsub
+
+ subq $hi1,$hi0,$hi0 # handle upmost overflow bit
+ mov sp,$tp
+ mov $bp,$rp # restore rp
+
+ and sp,$hi0,$ap
+ bic $bp,$hi0,$bp
+ bis $bp,$ap,$ap # ap=borrow?tp:rp
+
+.align 4
+.Lcopy: ldq $aj,($ap) # copy or in-place refresh
+ lda $tp,8($tp)
+ lda $rp,8($rp)
+ lda $ap,8($ap)
+ stq zero,-8($tp) # zap tp
+ cmpult $tp,$tj,AT
+ stq $aj,-8($rp)
+ bne AT,.Lcopy
+ mov 1,v0
+
+.Lexit:
+ .set noreorder
+ mov fp,sp
+ /*ldq ra,0(sp)*/
+ ldq s3,8(sp)
+ ldq s4,16(sp)
+ ldq s5,24(sp)
+ ldq fp,32(sp)
+ lda sp,40(sp)
+ ret (ra)
+.end bn_mul_mont
+.rdata
+.asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
new file mode 100644
index 0000000000..05d5dc1a48
--- /dev/null
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -0,0 +1,200 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2007.
+
+# Montgomery multiplication for ARMv4.
+#
+# Performance improvement naturally varies among CPU implementations
+# and compilers. The code was observed to provide +65-35% improvement
+# [depending on key length, less for longer keys] on ARM920T, and
+# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
+# base and compiler generated code with in-lined umull and even umlal
+# instructions. The latter means that this code didn't really have an
+# "advantage" of utilizing some "secret" instruction.
+#
+# The code is interoperable with Thumb ISA and is rather compact, less
+# than 1/2KB. Windows CE port would be trivial, as it's exclusively
+# about decorations, ABI and instruction syntax are identical.
+
+$num="r0"; # starts as num argument, but holds &tp[num-1]
+$ap="r1";
+$bp="r2"; $bi="r2"; $rp="r2";
+$np="r3";
+$tp="r4";
+$aj="r5";
+$nj="r6";
+$tj="r7";
+$n0="r8";
+########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
+$alo="r10"; # sl, gcc uses it to keep @GOT
+$ahi="r11"; # fp
+$nlo="r12"; # ip
+########### # r13 is stack pointer
+$nhi="r14"; # lr
+########### # r15 is program counter
+
+#### argument block layout relative to &tp[num-1], a.k.a. $num
+$_rp="$num,#12*4";
+# ap permanently resides in r1
+$_bp="$num,#13*4";
+# np permanently resides in r3
+$_n0="$num,#14*4";
+$_num="$num,#15*4"; $_bpend=$_num;
+
+$code=<<___;
+.text
+
+.global bn_mul_mont
+.type bn_mul_mont,%function
+
+.align 2
+bn_mul_mont:
+ stmdb sp!,{r0,r2} @ sp points at argument block
+ ldr $num,[sp,#3*4] @ load num
+ cmp $num,#2
+ movlt r0,#0
+ addlt sp,sp,#2*4
+ blt .Labrt
+
+ stmdb sp!,{r4-r12,lr} @ save 10 registers
+
+ mov $num,$num,lsl#2 @ rescale $num for byte count
+ sub sp,sp,$num @ alloca(4*num)
+ sub sp,sp,#4 @ +extra dword
+ sub $num,$num,#4 @ "num=num-1"
+ add $tp,$bp,$num @ &bp[num-1]
+
+ add $num,sp,$num @ $num to point at &tp[num-1]
+ ldr $n0,[$_n0] @ &n0
+ ldr $bi,[$bp] @ bp[0]
+ ldr $aj,[$ap],#4 @ ap[0],ap++
+ ldr $nj,[$np],#4 @ np[0],np++
+ ldr $n0,[$n0] @ *n0
+ str $tp,[$_bpend] @ save &bp[num]
+
+ umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
+ str $n0,[$_n0] @ save n0 value
+ mul $n0,$alo,$n0 @ "tp[0]"*n0
+ mov $nlo,#0
+ umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
+ mov $tp,sp
+
+.L1st:
+ ldr $aj,[$ap],#4 @ ap[j],ap++
+ mov $alo,$ahi
+ mov $ahi,#0
+ umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
+ ldr $nj,[$np],#4 @ np[j],np++
+ mov $nhi,#0
+ umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
+ adds $nlo,$nlo,$alo
+ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
+ adc $nlo,$nhi,#0
+ cmp $tp,$num
+ bne .L1st
+
+ adds $nlo,$nlo,$ahi
+ mov $nhi,#0
+ adc $nhi,$nhi,#0
+ ldr $tp,[$_bp] @ restore bp
+ str $nlo,[$num] @ tp[num-1]=
+ ldr $n0,[$_n0] @ restore n0
+ str $nhi,[$num,#4] @ tp[num]=
+
+.Louter:
+ sub $tj,$num,sp @ "original" $num-1 value
+ sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
+ sub $np,$np,$tj @ "rewind" np to &np[1]
+ ldr $bi,[$tp,#4]! @ *(++bp)
+ ldr $aj,[$ap,#-4] @ ap[0]
+ ldr $nj,[$np,#-4] @ np[0]
+ ldr $alo,[sp] @ tp[0]
+ ldr $tj,[sp,#4] @ tp[1]
+
+ mov $ahi,#0
+ umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
+ str $tp,[$_bp] @ save bp
+ mul $n0,$alo,$n0
+ mov $nlo,#0
+ umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
+ mov $tp,sp
+
+.Linner:
+ ldr $aj,[$ap],#4 @ ap[j],ap++
+ adds $alo,$ahi,$tj @ +=tp[j]
+ mov $ahi,#0
+ umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
+ ldr $nj,[$np],#4 @ np[j],np++
+ mov $nhi,#0
+ umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
+ ldr $tj,[$tp,#8] @ tp[j+1]
+ adc $ahi,$ahi,#0
+ adds $nlo,$nlo,$alo
+ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
+ adc $nlo,$nhi,#0
+ cmp $tp,$num
+ bne .Linner
+
+ adds $nlo,$nlo,$ahi
+ mov $nhi,#0
+ adc $nhi,$nhi,#0
+ adds $nlo,$nlo,$tj
+ adc $nhi,$nhi,#0
+ ldr $tp,[$_bp] @ restore bp
+ ldr $tj,[$_bpend] @ restore &bp[num]
+ str $nlo,[$num] @ tp[num-1]=
+ ldr $n0,[$_n0] @ restore n0
+ str $nhi,[$num,#4] @ tp[num]=
+
+ cmp $tp,$tj
+ bne .Louter
+
+ ldr $rp,[$_rp] @ pull rp
+ add $num,$num,#4 @ $num to point at &tp[num]
+ sub $aj,$num,sp @ "original" num value
+ mov $tp,sp @ "rewind" $tp
+ mov $ap,$tp @ "borrow" $ap
+ sub $np,$np,$aj @ "rewind" $np to &np[0]
+
+ subs $tj,$tj,$tj @ "clear" carry flag
+.Lsub: ldr $tj,[$tp],#4
+ ldr $nj,[$np],#4
+ sbcs $tj,$tj,$nj @ tp[j]-np[j]
+ str $tj,[$rp],#4 @ rp[j]=
+ teq $tp,$num @ preserve carry
+ bne .Lsub
+ sbcs $nhi,$nhi,#0 @ upmost carry
+ mov $tp,sp @ "rewind" $tp
+ sub $rp,$rp,$aj @ "rewind" $rp
+
+ and $ap,$tp,$nhi
+ bic $np,$rp,$nhi
+ orr $ap,$ap,$np @ ap=borrow?tp:rp
+
+.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
+ str sp,[$tp],#4 @ zap tp
+ str $tj,[$rp],#4
+ cmp $tp,$num
+ bne .Lcopy
+
+ add sp,$num,#4 @ skip over tp[num+1]
+ ldmia sp!,{r4-r12,lr} @ restore registers
+ add sp,sp,#2*4 @ skip over {r0,r2}
+ mov r0,#1
+.Labrt: tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+.size bn_mul_mont,.-bn_mul_mont
+.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/bn-586.pl b/crypto/bn/asm/bn-586.pl
index 26c2685a72..332ef3e91d 100644
--- a/crypto/bn/asm/bn-586.pl
+++ b/crypto/bn/asm/bn-586.pl
@@ -1,6 +1,7 @@
#!/usr/local/bin/perl
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
@@ -24,38 +25,25 @@ sub bn_mul_add_words
{
local($name)=@_;
- &function_begin($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
- &comment("");
- $Low="eax";
- $High="edx";
- $a="ebx";
- $w="ebp";
- $r="edi";
- $c="esi";
-
- &xor($c,$c); # clear carry
- &mov($r,&wparam(0)); #
-
- &mov("ecx",&wparam(2)); #
- &mov($a,&wparam(1)); #
-
- &and("ecx",0xfffffff8); # num / 8
- &mov($w,&wparam(3)); #
-
- &push("ecx"); # Up the stack for a tmp variable
-
- &jz(&label("maw_finish"));
+ $r="eax";
+ $a="edx";
+ $c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
- &jnc(&label("maw_loop"));
+ &jnc(&label("maw_non_sse2"));
- &movd("mm0",$w); # mm0 = w
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+ &movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
-
- &set_label("maw_sse2_loop",0);
+ &jmp(&label("maw_sse2_entry"));
+
+ &set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
@@ -112,42 +100,82 @@ sub bn_mul_add_words
&psrlq("mm1",32); # mm1 = carry6
&paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
&movd(&DWP(28,$r,"",0),"mm1");
- &add($r,32);
+ &lea($r,&DWP(32,$r));
&psrlq("mm1",32); # mm1 = carry_out
- &sub("ecx",8);
+ &sub($c,8);
+ &jz(&label("maw_sse2_exit"));
+ &set_label("maw_sse2_entry");
+ &test($c,0xfffffff8);
+ &jnz(&label("maw_sse2_unrolled"));
+
+ &set_label("maw_sse2_loop",4);
+ &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
+ &movd("mm3",&DWP(0,$r)); # mm3 = r[i]
+ &pmuludq("mm2","mm0"); # a[i] *= w
+ &lea($a,&DWP(4,$a));
+ &paddq("mm1","mm3"); # carry += r[i]
+ &paddq("mm1","mm2"); # carry += a[i]*w
+ &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
+ &sub($c,1);
+ &psrlq("mm1",32); # carry = carry_high
+ &lea($r,&DWP(4,$r));
&jnz(&label("maw_sse2_loop"));
-
- &movd($c,"mm1"); # c = carry_out
+ &set_label("maw_sse2_exit");
+ &movd("eax","mm1"); # c = carry_out
&emms();
+ &ret();
- &jmp(&label("maw_finish"));
+ &set_label("maw_non_sse2",16);
}
- &set_label("maw_loop",0);
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
+
+ &comment("");
+ $Low="eax";
+ $High="edx";
+ $a="ebx";
+ $w="ebp";
+ $r="edi";
+ $c="esi";
+
+ &xor($c,$c); # clear carry
+ &mov($r,&wparam(0)); #
+
+ &mov("ecx",&wparam(2)); #
+ &mov($a,&wparam(1)); #
+
+ &and("ecx",0xfffffff8); # num / 8
+ &mov($w,&wparam(3)); #
- &mov(&swtmp(0),"ecx"); #
+ &push("ecx"); # Up the stack for a tmp variable
+
+ &jz(&label("maw_finish"));
+
+ &set_label("maw_loop",16);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
- &mov("eax",&DWP($i,$a,"",0)); # *a
+ &mov("eax",&DWP($i,$a)); # *a
&mul($w); # *a * w
- &add("eax",$c); # L(t)+= *r
- &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
+ &add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
- &add("eax",$c); # L(t)+=c
+ &add("eax",&DWP($i,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
- &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
+ &mov(&DWP($i,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
- &mov("ecx",&swtmp(0)); #
- &add($a,32);
- &add($r,32);
&sub("ecx",8);
+ &lea($a,&DWP(32,$a));
+ &lea($r,&DWP(32,$r));
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
@@ -160,16 +188,15 @@ sub bn_mul_add_words
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
- &mov("eax",&DWP($i*4,$a,"",0));# *a
+ &mov("eax",&DWP($i*4,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
- &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
- &add("eax",$c);
+ &add("eax",&DWP($i*4,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
- &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
- &mov($c,"edx"); # c= H(t);
+ &mov(&DWP($i*4,$r),"eax"); # *r= L(t);
+ &mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
@@ -184,7 +211,45 @@ sub bn_mul_words
{
local($name)=@_;
- &function_begin($name,"");
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+ $r="eax";
+ $a="edx";
+ $c="ecx";
+
+ if ($sse2) {
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt(&DWP(0,"eax"),26);
+ &jnc(&label("mw_non_sse2"));
+
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+ &movd("mm0",&wparam(3)); # mm0 = w
+ &pxor("mm1","mm1"); # mm1 = carry = 0
+
+ &set_label("mw_sse2_loop",16);
+ &movd("mm2",&DWP(0,$a)); # mm2 = a[i]
+ &pmuludq("mm2","mm0"); # a[i] *= w
+ &lea($a,&DWP(4,$a));
+ &paddq("mm1","mm2"); # carry += a[i]*w
+ &movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
+ &sub($c,1);
+ &psrlq("mm1",32); # carry = carry_high
+ &lea($r,&DWP(4,$r));
+ &jnz(&label("mw_sse2_loop"));
+
+ &movd("eax","mm1"); # return carry
+ &emms();
+ &ret();
+ &set_label("mw_non_sse2",16);
+ }
+
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
&comment("");
$Low="eax";
@@ -257,7 +322,40 @@ sub bn_sqr_words
{
local($name)=@_;
- &function_begin($name,"");
+ &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+ $r="eax";
+ $a="edx";
+ $c="ecx";
+
+ if ($sse2) {
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt(&DWP(0,"eax"),26);
+ &jnc(&label("sqr_non_sse2"));
+
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &mov($c,&wparam(2));
+
+ &set_label("sqr_sse2_loop",16);
+ &movd("mm0",&DWP(0,$a)); # mm0 = a[i]
+ &pmuludq("mm0","mm0"); # a[i] *= a[i]
+ &lea($a,&DWP(4,$a)); # a++
+ &movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
+ &sub($c,1);
+ &lea($r,&DWP(8,$r)); # r += 2
+ &jnz(&label("sqr_sse2_loop"));
+
+ &emms();
+ &ret();
+ &set_label("sqr_non_sse2",16);
+ }
+
+ # function_begin prologue
+ &push("ebp");
+ &push("ebx");
+ &push("esi");
+ &push("edi");
&comment("");
$r="esi";
@@ -313,12 +411,13 @@ sub bn_div_words
{
local($name)=@_;
- &function_begin($name,"");
+ &function_begin_B($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
- &mov("ebx",&wparam(2)); #
- &div("ebx");
- &function_end($name);
+ &mov("ecx",&wparam(2)); #
+ &div("ecx");
+ &ret();
+ &function_end_B($name);
}
sub bn_add_words
diff --git a/crypto/bn/asm/co-586.pl b/crypto/bn/asm/co-586.pl
index 5d962cb957..57101a6bd7 100644
--- a/crypto/bn/asm/co-586.pl
+++ b/crypto/bn/asm/co-586.pl
@@ -1,6 +1,7 @@
#!/usr/local/bin/perl
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
diff --git a/crypto/bn/asm/ia64-mont.pl b/crypto/bn/asm/ia64-mont.pl
new file mode 100644
index 0000000000..af903a2bf8
--- /dev/null
+++ b/crypto/bn/asm/ia64-mont.pl
@@ -0,0 +1,764 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+# stalls after ldf8, xma and getf.sig outside inner loop and
+# improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+# once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+# acute interest, because upcoming Tukwila's individual cores are
+# reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# January 2010
+#
+# Shorter vector support is implemented by zero-padding ap and np
+# vectors up to 8 elements, or 512 bits. This means that 256-bit
+# inputs will be processed only 2 times faster than 512-bit inputs,
+# not 4 [as one would expect, because algorithm complexity is n^2].
+# The reason for padding is that inputs shorter than 512 bits won't
+# be processed faster anyway, because minimal critical path of the
+# core loop happens to match 512-bit timing. Either way, it resulted
+# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
+# 1024-bit one [in comparison to original version of *this* module].
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+# sign verify sign/s verify/s
+# rsa 512 bits 0.000302s 0.000024s 3312.3 41332.2
+# rsa 1024 bits 0.000816s 0.000058s 1225.2 17172.0
+# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
+# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
+# dsa 512 bits 0.000254s 0.000206s 3944.6 4865.1
+# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
+# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
+#
+# ... and *without*:
+#
+# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
+# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
+# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
+# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
+# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
+# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
+# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
+#
+# As it can be seen, RSA sign performance improves by 120-30%,
+# hereafter less for longer keys, while verify - by 72-13%.
+# DSA performance improves by 100-30%.
+
+if ($^O eq "hpux") {
+ $ADDP="addp4";
+ for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+// const BN_ULONG *bp,const BN_ULONG *np,
+// const BN_ULONG *n0p,int num);
+.global bn_mul_mont#
+.proc bn_mul_mont#
+.align 64;;
+bn_mul_mont:
+ .prologue
+ .body
+{ .mmi; cmp4.le p6,p7=2,r37;;
+(p6) cmp4.lt.unc p8,p9=8,r37
+ mov ret0=r0 };;
+{ .bbb;
+(p9) br.cond.dptk.many bn_mul_mont_8
+(p8) br.cond.dpnt.many bn_mul_mont_general
+(p7) br.ret.spnt.many b0 };;
+.endp bn_mul_mont#
+
+prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
+
+rptr=r8; aptr=r9; bptr=r14; nptr=r15;
+tptr=r16; // &tp[0]
+tp_1=r17; // &tp[-1]
+num=r18; len=r19; lc=r20;
+topbit=r21; // carry bit from tmp[num]
+
+n0=f6;
+m0=f7;
+bi=f8;
+
+.local bn_mul_mont_general#
+.proc bn_mul_mont_general#
+.align 64;;
+bn_mul_mont_general:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ $ADDP aptr=0,in1
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; .vframe prevsp
+ mov prevsp=sp
+ $ADDP bptr=0,in2
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+ .body
+ .rotf alo[6],nlo[4],ahi[8],nhi[6]
+ .rotr a[3],n[3],t[2]
+
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 alo[4]=[aptr],16 // ap[0]
+ $ADDP r30=8,in1 };;
+{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
+ ldf8 alo[2]=[aptr],16 // ap[2]
+ $ADDP in4=0,in4 };;
+{ .mmi; ldf8 alo[1]=[r30] // ap[3]
+ ldf8 n0=[in4] // n0
+ $ADDP rptr=0,in0 }
+{ .mmi; $ADDP nptr=0,in3
+ mov r31=16
+ zxt4 num=in5 };;
+{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
+ shladd len=num,3,r0
+ shladd r31=num,3,r31 };;
+{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
+ add lc=-5,num
+ sub r31=sp,r31 };;
+{ .mfb; and sp=-16,r31 // alloca
+ xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
+ nop.b 0 }
+{ .mfb; nop.m 0
+ xmpy.lu alo[4]=alo[4],bi
+ brp.loop.imp .L1st_ctop,.L1st_cend-16
+ };;
+{ .mfi; nop.m 0
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
+ add tp_1=8,sp }
+{ .mfi; nop.m 0
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20001f<<16
+ // ------^----- (p40) at first (p23)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; nop.m 0
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
+ mov ar.lc=lc }
+{ .mfi; nop.m 0
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+.align 32
+.L1st_ctop:
+.pred.rel "mutex",p40,p42
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23) }
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p23) st8 [tp_1]=n[2],8
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mmb; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ br.ctop.sptk .L1st_ctop };;
+.L1st_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ add num=-1,num };; // num--
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ sub aptr=aptr,len };; // rewind
+{ .mmi; .pred.rel "mutex",p40,p42
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+ sub nptr=nptr,len };;
+{ .mmi; .pred.rel "mutex",p39,p41
+(p39) add topbit=r0,r0
+(p41) add topbit=r0,r0,1
+ nop.i 0 }
+{ .mmi; st8 [tp_1]=n[0]
+ add tptr=16,sp
+ add tp_1=8,sp };;
+
+.Louter:
+{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
+ ldf8 ahi[3]=[tptr] // tp[0]
+ add r30=8,aptr };;
+{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
+ ldf8 alo[3]=[r30],16 // ap[1]
+ add r31=8,nptr };;
+{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
+ xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
+ brp.loop.imp .Linner_ctop,.Linner_cend-16
+ }
+{ .mfb; ldf8 alo[1]=[r30] // ap[3]
+ xma.lu alo[4]=alo[4],bi,ahi[3]
+ clrrrb.pr };;
+{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
+ xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
+ nop.i 0 }
+{ .mfi; ldf8 nlo[1]=[r31] // np[1]
+ xma.lu alo[3]=alo[3],bi,ahi[2]
+ mov pr.rot=0x20101f<<16
+ // ------^----- (p40) at first (p23)
+ // --------^--- (p30) at first (p22)
+ // ----------^^ p[16:20]=1
+ };;
+{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
+ xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
+ mov ar.lc=lc }
+{ .mfi;
+ fcvt.fxu.s1 nhi[1]=f0
+ mov ar.ec=8 };;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
+// in latter case accounts for two-tick pipeline stall, which means
+// that its performance would be ~20% lower than optimal one. No
+// attempt was made to address this, because original Itanium is
+// hardly represented out in the wild...
+.align 32
+.Linner_ctop:
+.pred.rel "mutex",p40,p42
+.pred.rel "mutex",p30,p32
+{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
+ (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
+ (p40) add n[2]=n[2],a[2] } // (p23)
+{ .mfi; (p16) nop.m 0
+ (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
+ (p42) add n[2]=n[2],a[2],1 };; // (p23)
+{ .mfi; (p21) getf.sig a[0]=alo[5]
+ (p16) nop.f 0
+ (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
+{ .mfi; (p21) ld8 t[0]=[tptr],8
+ (p16) nop.f 0
+ (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
+{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
+ (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
+ (p30) add a[1]=a[1],t[1] } // (p22)
+{ .mfi; (p16) nop.m 0
+ (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
+ (p32) add a[1]=a[1],t[1],1 };; // (p22)
+{ .mmi; (p21) getf.sig n[0]=nlo[3]
+ (p16) nop.m 0
+ (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
+{ .mmb; (p23) st8 [tp_1]=n[2],8
+ (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
+ br.ctop.sptk .Linner_ctop };;
+.Linner_cend:
+
+{ .mmi; getf.sig a[0]=ahi[6] // (p24)
+ getf.sig n[0]=nhi[4]
+ nop.i 0 };;
+
+{ .mmi; .pred.rel "mutex",p31,p33
+(p31) add a[0]=a[0],topbit
+(p33) add a[0]=a[0],topbit,1
+ mov topbit=r0 };;
+{ .mfi; .pred.rel "mutex",p31,p33
+(p31) cmp.ltu p32,p30=a[0],topbit
+(p33) cmp.leu p32,p30=a[0],topbit
+ }
+{ .mfi; .pred.rel "mutex",p40,p42
+(p40) add n[0]=n[0],a[0]
+(p42) add n[0]=n[0],a[0],1
+ };;
+{ .mmi; .pred.rel "mutex",p44,p46
+(p40) cmp.ltu p41,p39=n[0],a[0]
+(p42) cmp.leu p41,p39=n[0],a[0]
+(p32) add topbit=r0,r0,1 }
+
+{ .mmi; st8 [tp_1]=n[0],8
+ cmp4.ne p6,p0=1,num
+ sub aptr=aptr,len };; // rewind
+{ .mmi; sub nptr=nptr,len
+(p41) add topbit=r0,r0,1
+ add tptr=16,sp }
+{ .mmb; add tp_1=8,sp
+ add num=-1,num // num--
+(p6) br.cond.sptk.many .Louter };;
+
+{ .mbb; add lc=4,lc
+ brp.loop.imp .Lsub_ctop,.Lsub_cend-16
+ clrrrb.pr };;
+{ .mii; nop.m 0
+ mov pr.rot=0x10001<<16
+ // ------^---- (p33) at first (p17)
+ mov ar.lc=lc }
+{ .mii; nop.m 0
+ mov ar.ec=3
+ nop.i 0 };;
+
+.Lsub_ctop:
+.pred.rel "mutex",p33,p35
+{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
+ (p16) nop.f 0
+ (p33) sub n[1]=t[1],n[1] } // (p17)
+{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
+ (p16) nop.f 0
+ (p35) sub n[1]=t[1],n[1],1 };; // (p17)
+{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
+ (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
+ (p18) nop.b 0 }
+{ .mib; (p18) nop.m 0
+ (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
+ br.ctop.sptk .Lsub_ctop };;
+.Lsub_cend:
+
+{ .mmb; .pred.rel "mutex",p34,p36
+(p34) sub topbit=topbit,r0 // (p19)
+(p36) sub topbit=topbit,r0,1
+ brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
+ }
+{ .mmb; sub rptr=rptr,len // rewind
+ sub tptr=tptr,len
+ clrrrb.pr };;
+{ .mmi; and aptr=tptr,topbit
+ andcm bptr=rptr,topbit
+ mov pr.rot=1<<16 };;
+{ .mii; or nptr=aptr,bptr
+ mov ar.lc=lc
+ mov ar.ec=3 };;
+
+.Lcopy_ctop:
+{ .mmb; (p16) ld8 n[0]=[nptr],8
+ (p18) st8 [tptr]=r0,8
+ (p16) nop.b 0 }
+{ .mmb; (p16) nop.m 0
+ (p18) st8 [rptr]=n[2],8
+ br.ctop.sptk .Lcopy_ctop };;
+.Lcopy_cend:
+
+{ .mmi; mov ret0=1 // signal "handled"
+ rum 1<<5 // clear um.mfh
+ mov ar.lc=prevlc }
+{ .mib; .restore sp
+ mov sp=prevsp
+ mov pr=prevpr,-2
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_general#
+
+a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
+n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
+t0=r15;
+
+ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
+ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
+
+.local bn_mul_mont_8#
+.proc bn_mul_mont_8#
+.align 64
+.skip 48;; // aligns loop body
+bn_mul_mont_8:
+ .prologue
+{ .mmi; .save ar.pfs,prevfs
+ alloc prevfs=ar.pfs,6,2,0,8
+ .vframe prevsp
+ mov prevsp=sp
+ .save ar.lc,prevlc
+ mov prevlc=ar.lc }
+{ .mmi; add r17=-6*16,sp
+ add sp=-7*16,sp
+ .save pr,prevpr
+ mov prevpr=pr };;
+
+{ .mmi; .save.gf 0,0x10
+ stf.spill [sp]=f16,-16
+ .save.gf 0,0x20
+ stf.spill [r17]=f17,32
+ add r16=-5*16,prevsp};;
+{ .mmi; .save.gf 0,0x40
+ stf.spill [r16]=f18,32
+ .save.gf 0,0x80
+ stf.spill [r17]=f19,32
+ $ADDP aptr=0,in1 };;
+{ .mmi; .save.gf 0,0x100
+ stf.spill [r16]=f20,32
+ .save.gf 0,0x200
+ stf.spill [r17]=f21,32
+ $ADDP r29=8,in1 };;
+{ .mmi; .save.gf 0,0x400
+ stf.spill [r16]=f22
+ .save.gf 0,0x800
+ stf.spill [r17]=f23
+ $ADDP rptr=0,in0 };;
+
+ .body
+ .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
+ .rotr t[8]
+
+// load input vectors padding them to 8 elements
+{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
+ ldf8 ai1=[r29],16 // ap[1]
+ $ADDP bptr=0,in2 }
+{ .mmi; $ADDP r30=8,in2
+ $ADDP nptr=0,in3
+ $ADDP r31=8,in3 };;
+{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
+ ldf8 bj[6]=[r30],16 // bp[1]
+ cmp4.le p4,p5=3,in5 }
+{ .mmi; ldf8 ni0=[nptr],16 // np[0]
+ ldf8 ni1=[r31],16 // np[1]
+ cmp4.le p6,p7=4,in5 };;
+
+{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
+ (p5)fcvt.fxu ai2=f0
+ cmp4.le p8,p9=5,in5 }
+{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
+ (p7)fcvt.fxu ai3=f0
+ cmp4.le p10,p11=6,in5 }
+{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
+ (p5)fcvt.fxu bj[5]=f0
+ cmp4.le p12,p13=7,in5 }
+{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
+ (p7)fcvt.fxu bj[4]=f0
+ cmp4.le p14,p15=8,in5 }
+{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
+ (p5)fcvt.fxu ni2=f0
+ addp4 r28=-1,in5 }
+{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
+ (p7)fcvt.fxu ni3=f0
+ $ADDP in4=0,in4 };;
+
+{ .mfi; ldf8 n0=[in4]
+ fcvt.fxu tf[1]=f0
+ nop.i 0 }
+
+{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
+ (p9)fcvt.fxu ai4=f0
+ mov t[0]=r0 }
+{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
+ (p11)fcvt.fxu ai5=f0
+ mov t[1]=r0 }
+{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
+ (p9)fcvt.fxu bj[3]=f0
+ mov t[2]=r0 }
+{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
+ (p11)fcvt.fxu bj[2]=f0
+ mov t[3]=r0 }
+{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
+ (p9)fcvt.fxu ni4=f0
+ mov t[4]=r0 }
+{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
+ (p11)fcvt.fxu ni5=f0
+ mov t[5]=r0 };;
+
+{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
+ (p13)fcvt.fxu ai6=f0
+ mov t[6]=r0 }
+{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
+ (p15)fcvt.fxu ai7=f0
+ mov t[7]=r0 }
+{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
+ (p13)fcvt.fxu bj[1]=f0
+ mov ar.lc=r28 }
+{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
+ (p15)fcvt.fxu bj[0]=f0
+ mov ar.ec=2 }
+{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
+ (p13)fcvt.fxu ni6=f0
+ mov pr.rot=1<<16 }
+{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
+ (p15)fcvt.fxu ni7=f0
+ brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
+ };;
+
+// The loop is scheduled for 32*(n+1) ticks on Itanium 2. Actual
+// measurement with help of Interval Time Counter indicate that the
+// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
+// addressing the issue is problematic, because I don't have access
+// to platform-specific instruction-level profiler. On Itanium it
+// should run in 56*(n+1) ticks, because of higher xma latency...
+.Louter_8_ctop:
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 0:
+ (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
+ (p40) add a3=a3,n3 } // (p17) a3+=n3
+{ .mfi; (p42) add a3=a3,n3,1
+ (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig a7=alo[8] // 1:
+ (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
+ (p50) add t[6]=t[6],a3,1 };;
+{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
+ (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
+ (p40) cmp.ltu p43,p41=a3,n3 }
+{ .mfi; (p42) cmp.leu p43,p41=a3,n3
+ (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n5=nlo[6] // 3:
+ (p48) cmp.ltu p51,p49=t[6],a3
+ (p50) cmp.leu p51,p49=t[6],a3 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 4:
+ (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
+ (p41) add a4=a4,n4 } // (p17) a4+=n4
+{ .mfi; (p43) add a4=a4,n4,1
+ (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
+ (p16) nop.i 0 };;
+{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
+ (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
+ (p51) add t[5]=t[5],a4,1 };;
+{ .mfi; (p16) nop.m 0 // 6:
+ (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
+ (p41) cmp.ltu p42,p40=a4,n4 }
+{ .mfi; (p43) cmp.leu p42,p40=a4,n4
+ (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n6=nlo[7] // 7:
+ (p49) cmp.ltu p50,p48=t[5],a4
+ (p51) cmp.leu p50,p48=t[5],a4 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 8:
+ (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
+ (p40) add a5=a5,n5 } // (p17) a5+=n5
+{ .mfi; (p42) add a5=a5,n5,1
+ (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a1=alo[1] // 9:
+ (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
+ (p50) add t[4]=t[4],a5,1 };;
+{ .mfi; (p16) nop.m 0 // 10:
+ (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
+ (p40) cmp.ltu p43,p41=a5,n5 }
+{ .mfi; (p42) cmp.leu p43,p41=a5,n5
+ (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
+ (p16) nop.i 0 };;
+{ .mii; (p17) getf.sig n7=nlo[8] // 11:
+ (p48) cmp.ltu p51,p49=t[4],a5
+ (p50) cmp.leu p51,p49=t[4],a5 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
+ (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
+ (p41) add a6=a6,n6 } // (p17) a6+=n6
+{ .mfi; (p43) add a6=a6,n6,1
+ (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a2=alo[2] // 13:
+ (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
+ (p51) add t[3]=t[3],a6,1 };;
+{ .mfi; (p16) nop.m 0 // 14:
+ (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
+ (p41) cmp.ltu p42,p40=a6,n6 }
+{ .mfi; (p43) cmp.leu p42,p40=a6,n6
+ (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
+ (p16) nop.i 0 };;
+{ .mii; (p16) nop.m 0 // 15:
+ (p49) cmp.ltu p50,p48=t[3],a6
+ (p51) cmp.leu p50,p48=t[3],a6 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 16:
+ (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
+ (p40) add a7=a7,n7 } // (p17) a7+=n7
+{ .mfi; (p42) add a7=a7,n7,1
+ (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a3=alo[3] // 17:
+ (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
+ (p50) add t[2]=t[2],a7,1 };;
+{ .mfi; (p16) nop.m 0 // 18:
+ (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
+ (p40) cmp.ltu p43,p41=a7,n7 }
+{ .mfi; (p42) cmp.leu p43,p41=a7,n7
+ (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n1=nlo[1] // 19:
+ (p48) cmp.ltu p51,p49=t[2],a7
+ (p50) cmp.leu p51,p49=t[2],a7 };;
+ .pred.rel "mutex",p41,p43
+ .pred.rel "mutex",p49,p51
+{ .mfi; (p16) nop.m 0 // 20:
+ (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
+ (p41) add a8=a8,n8 } // (p17) a8+=n8
+{ .mfi; (p43) add a8=a8,n8,1
+ (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a4=alo[4] // 21:
+ (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
+ (p51) add t[1]=t[1],a8,1 };;
+{ .mfi; (p16) nop.m 0 // 22:
+ (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
+ (p41) cmp.ltu p42,p40=a8,n8 }
+{ .mfi; (p43) cmp.leu p42,p40=a8,n8
+ (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n2=nlo[2] // 23:
+ (p49) cmp.ltu p50,p48=t[1],a8
+ (p51) cmp.leu p50,p48=t[1],a8 };;
+{ .mfi; (p16) nop.m 0 // 24:
+ (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
+ (p16) add a1=a1,n1 } // (p16) a1+=n1
+{ .mfi; (p16) nop.m 0
+ (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
+ (p17) mov t[0]=r0 };;
+{ .mii; (p16) getf.sig a5=alo[5] // 25:
+ (p16) add t0=t[7],a1 // (p16) t[7]+=a1
+ (p42) add t[0]=t[0],r0,1 };;
+{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
+ (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
+ (p50) add t[0]=t[0],r0,1 }
+{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
+ (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig n3=nlo[3] // 27:
+ (p16) cmp.ltu.unc p50,p48=t0,a1
+ (p16) nop.i 0 };;
+ .pred.rel "mutex",p40,p42
+ .pred.rel "mutex",p48,p50
+{ .mfi; (p16) nop.m 0 // 28:
+ (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
+ (p40) add a2=a2,n2 } // (p16) a2+=n2
+{ .mfi; (p42) add a2=a2,n2,1
+ (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
+ (p16) nop.i 0 };;
+{ .mii; (p16) getf.sig a6=alo[6] // 29:
+ (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
+ (p50) add t[6]=t[6],a2,1 };;
+{ .mfi; (p16) nop.m 0 // 30:
+ (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
+ (p40) cmp.ltu p41,p39=a2,n2 }
+{ .mfi; (p42) cmp.leu p41,p39=a2,n2
+ (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
+ (p16) nop.i 0 };;
+{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
+ (p16) nop.f 0
+ (p48) cmp.ltu p49,p47=t[6],a2 }
+{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
+ (p16) nop.f 0
+ br.ctop.sptk.many .Louter_8_ctop };;
+.Louter_8_cend:
+
+// move np[8] to GPR bank and subtract it from carrybit|tmp[8]
+// carrybit|tmp[8] layout upon exit from above loop is:
+// t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t[0]|t0 (least significant)
+{ .mmi; getf.sig n1=ni0
+ getf.sig n2=ni1
+ add r16=-7*16,prevsp}
+{ .mmi; getf.sig n3=ni2
+ getf.sig n4=ni3
+ add r17=-6*16,prevsp};;
+{ .mmi; getf.sig n5=ni4
+ getf.sig n6=ni5
+ add r18=-5*16,prevsp}
+{ .mmi; getf.sig n7=ni6
+ getf.sig n8=ni7
+ sub n1=t0,n1 };;
+{ .mmi; cmp.gtu p34,p32=n1,t0;;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n2=t[0],n2
+ (p34)sub n2=t[0],n2,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n2,t[0]
+ (p34)cmp.geu p35,p33=n2,t[0];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n3=t[7],n3 }
+{ .mmi; (p35)sub n3=t[7],n3,1;;
+ (p33)cmp.gtu p34,p32=n3,t[7]
+ (p35)cmp.geu p34,p32=n3,t[7] };;
+ .pred.rel "mutex",p32,p34
+{ .mii; (p32)sub n4=t[6],n4
+ (p34)sub n4=t[6],n4,1;;
+ (p32)cmp.gtu p35,p33=n4,t[6] }
+{ .mmi; (p34)cmp.geu p35,p33=n4,t[6];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub n5=t[5],n5
+ (p35)sub n5=t[5],n5,1 };;
+{ .mii; (p33)cmp.gtu p34,p32=n5,t[5]
+ (p35)cmp.geu p34,p32=n5,t[5];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n6=t[4],n6 }
+{ .mmi; (p34)sub n6=t[4],n6,1;;
+ (p32)cmp.gtu p35,p33=n6,t[4]
+ (p34)cmp.geu p35,p33=n6,t[4] };;
+ .pred.rel "mutex",p33,p35
+{ .mii; (p33)sub n7=t[3],n7
+ (p35)sub n7=t[3],n7,1;;
+ (p33)cmp.gtu p34,p32=n7,t[3] }
+{ .mmi; (p35)cmp.geu p34,p32=n7,t[3];;
+ .pred.rel "mutex",p32,p34
+ (p32)sub n8=t[2],n8
+ (p34)sub n8=t[2],n8,1 };;
+{ .mii; (p32)cmp.gtu p35,p33=n8,t[2]
+ (p34)cmp.geu p35,p33=n8,t[2];;
+ .pred.rel "mutex",p33,p35
+ (p33)sub a8=t[1],r0 }
+{ .mmi; (p35)sub a8=t[1],r0,1;;
+ (p33)cmp.gtu p34,p32=a8,t[1]
+ (p35)cmp.geu p34,p32=a8,t[1] };;
+
+// save the result, either tmp[num] or tmp[num]-np[num]
+ .pred.rel "mutex",p32,p34
+{ .mmi; (p32)st8 [rptr]=n1,8
+ (p34)st8 [rptr]=t0,8
+ add r19=-4*16,prevsp};;
+{ .mmb; (p32)st8 [rptr]=n2,8
+ (p34)st8 [rptr]=t[0],8
+ (p5)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n3,8
+ (p34)st8 [rptr]=t[7],8
+ (p7)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n4,8
+ (p34)st8 [rptr]=t[6],8
+ (p9)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n5,8
+ (p34)st8 [rptr]=t[5],8
+ (p11)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n6,8
+ (p34)st8 [rptr]=t[4],8
+ (p13)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n7,8
+ (p34)st8 [rptr]=t[3],8
+ (p15)br.cond.dpnt.few .Ldone };;
+{ .mmb; (p32)st8 [rptr]=n8,8
+ (p34)st8 [rptr]=t[2],8
+ nop.b 0 };;
+.Ldone: // epilogue
+{ .mmi; ldf.fill f16=[r16],64
+ ldf.fill f17=[r17],64
+ nop.i 0 }
+{ .mmi; ldf.fill f18=[r18],64
+ ldf.fill f19=[r19],64
+ mov pr=prevpr,-2 };;
+{ .mmi; ldf.fill f20=[r16]
+ ldf.fill f21=[r17]
+ mov ar.lc=prevlc }
+{ .mmi; ldf.fill f22=[r18]
+ ldf.fill f23=[r19]
+ mov ret0=1 } // signal "handled"
+{ .mib; rum 1<<5
+ .restore sp
+ mov sp=prevsp
+ br.ret.sptk.many b0 };;
+.endp bn_mul_mont_8#
+
+.type copyright#,\@object
+copyright:
+stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S
index 7b82b820e6..951abc53ea 100644
--- a/crypto/bn/asm/ia64.S
+++ b/crypto/bn/asm/ia64.S
@@ -171,21 +171,21 @@
.skip 32 // makes the loop body aligned at 64-byte boundary
bn_add_words:
.prologue
- .fframe 0
.save ar.pfs,r2
{ .mii; alloc r2=ar.pfs,4,12,0,16
cmp4.le p6,p0=r35,r0 };;
{ .mfb; mov r8=r0 // return value
(p6) br.ret.spnt.many b0 };;
- .save ar.lc,r3
{ .mib; sub r10=r35,r0,1
+ .save ar.lc,r3
mov r3=ar.lc
brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
}
- .body
{ .mib; ADDP r14=0,r32 // rp
+ .save pr,r9
mov r9=pr };;
+ .body
{ .mii; ADDP r15=0,r33 // ap
mov ar.lc=r10
mov ar.ec=6 }
@@ -224,21 +224,21 @@ bn_add_words:
.skip 32 // makes the loop body aligned at 64-byte boundary
bn_sub_words:
.prologue
- .fframe 0
.save ar.pfs,r2
{ .mii; alloc r2=ar.pfs,4,12,0,16
cmp4.le p6,p0=r35,r0 };;
{ .mfb; mov r8=r0 // return value
(p6) br.ret.spnt.many b0 };;
- .save ar.lc,r3
{ .mib; sub r10=r35,r0,1
+ .save ar.lc,r3
mov r3=ar.lc
brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
}
- .body
{ .mib; ADDP r14=0,r32 // rp
+ .save pr,r9
mov r9=pr };;
+ .body
{ .mii; ADDP r15=0,r33 // ap
mov ar.lc=r10
mov ar.ec=6 }
@@ -283,7 +283,6 @@ bn_sub_words:
.skip 32 // makes the loop body aligned at 64-byte boundary
bn_mul_words:
.prologue
- .fframe 0
.save ar.pfs,r2
#ifdef XMA_TEMPTATION
{ .mfi; alloc r2=ar.pfs,4,0,0,0 };;
@@ -294,9 +293,10 @@ bn_mul_words:
cmp4.le p6,p0=r34,r0
(p6) br.ret.spnt.many b0 };;
- .save ar.lc,r3
{ .mii; sub r10=r34,r0,1
+ .save ar.lc,r3
mov r3=ar.lc
+ .save pr,r9
mov r9=pr };;
.body
@@ -397,22 +397,21 @@ bn_mul_words:
.skip 48 // makes the loop body aligned at 64-byte boundary
bn_mul_add_words:
.prologue
- .fframe 0
.save ar.pfs,r2
- .save ar.lc,r3
- .save pr,r9
{ .mmi; alloc r2=ar.pfs,4,4,0,8
cmp4.le p6,p0=r34,r0
+ .save ar.lc,r3
mov r3=ar.lc };;
{ .mib; mov r8=r0 // return value
sub r10=r34,r0,1
(p6) br.ret.spnt.many b0 };;
- .body
{ .mib; setf.sig f8=r35 // w
+ .save pr,r9
mov r9=pr
brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
}
+ .body
{ .mmi; ADDP r14=0,r32 // rp
ADDP r15=0,r33 // ap
mov ar.lc=r10 }
@@ -466,7 +465,6 @@ bn_mul_add_words:
.skip 32 // makes the loop body aligned at 64-byte boundary
bn_sqr_words:
.prologue
- .fframe 0
.save ar.pfs,r2
{ .mii; alloc r2=ar.pfs,3,0,0,0
sxt4 r34=r34 };;
@@ -476,9 +474,10 @@ bn_sqr_words:
nop.f 0x0
(p6) br.ret.spnt.many b0 };;
- .save ar.lc,r3
{ .mii; sub r10=r34,r0,1
+ .save ar.lc,r3
mov r3=ar.lc
+ .save pr,r9
mov r9=pr };;
.body
@@ -545,7 +544,6 @@ bn_sqr_words:
.align 64
bn_sqr_comba8:
.prologue
- .fframe 0
.save ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
{ .mii; alloc r2=ar.pfs,2,1,0,0
@@ -617,7 +615,6 @@ bn_sqr_comba8:
.align 64
bn_mul_comba8:
.prologue
- .fframe 0
.save ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
{ .mii; alloc r2=ar.pfs,3,0,0,0
@@ -1175,7 +1172,6 @@ bn_mul_comba8:
.align 64
bn_sqr_comba4:
.prologue
- .fframe 0
.save ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
{ .mii; alloc r2=ar.pfs,2,1,0,0
@@ -1208,7 +1204,6 @@ bn_sqr_comba4:
.align 64
bn_mul_comba4:
.prologue
- .fframe 0
.save ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
{ .mii; alloc r2=ar.pfs,3,0,0,0
@@ -1411,11 +1406,11 @@ equ=p24
.align 64
bn_div_words:
.prologue
- .fframe 0
.save ar.pfs,r2
- .save b0,r3
{ .mii; alloc r2=ar.pfs,3,5,0,8
+ .save b0,r3
mov r3=b0
+ .save pr,r10
mov r10=pr };;
{ .mmb; cmp.eq p6,p0=r34,r0
mov r8=-1
diff --git a/crypto/bn/asm/mips3-mont.pl b/crypto/bn/asm/mips3-mont.pl
new file mode 100644
index 0000000000..8f9156e02a
--- /dev/null
+++ b/crypto/bn/asm/mips3-mont.pl
@@ -0,0 +1,327 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This module doesn't present direct interest for OpenSSL, because it
+# doesn't provide better performance for longer keys. While 512-bit
+# RSA private key operations are 40% faster, 1024-bit ones are hardly
+# faster at all, while longer key operations are slower by up to 20%.
+# It might be of interest to embedded system developers though, as
+# it's smaller than 1KB, yet offers ~3x improvement over compiler
+# generated code.
+#
+# The module targets N32 and N64 MIPS ABIs and currently is a bit
+# IRIX-centric, i.e. is likely to require adaptation for other OSes.
+
+# int bn_mul_mont(
+$rp="a0"; # BN_ULONG *rp,
+$ap="a1"; # const BN_ULONG *ap,
+$bp="a2"; # const BN_ULONG *bp,
+$np="a3"; # const BN_ULONG *np,
+$n0="a4"; # const BN_ULONG *n0,
+$num="a5"; # int num);
+
+$lo0="a6";
+$hi0="a7";
+$lo1="v0";
+$hi1="v1";
+$aj="t0";
+$bi="t1";
+$nj="t2";
+$tp="t3";
+$alo="s0";
+$ahi="s1";
+$nlo="s2";
+$nhi="s3";
+$tj="s4";
+$i="s5";
+$j="s6";
+$fp="t8";
+$m1="t9";
+
+$FRAME=8*(2+8);
+
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+
+.text
+
+.set noat
+.set reorder
+
+.align 5
+.globl bn_mul_mont
+.ent bn_mul_mont
+bn_mul_mont:
+ .set noreorder
+ PTR_SUB sp,64
+ move $fp,sp
+ .frame $fp,64,ra
+ slt AT,$num,4
+ li v0,0
+ beqzl AT,.Lproceed
+ nop
+ jr ra
+ PTR_ADD sp,$fp,64
+ .set reorder
+.align 5
+.Lproceed:
+ ld $n0,0($n0)
+ ld $bi,0($bp) # bp[0]
+ ld $aj,0($ap) # ap[0]
+ ld $nj,0($np) # np[0]
+ PTR_SUB sp,16 # place for two extra words
+ sll $num,3
+ li AT,-4096
+ PTR_SUB sp,$num
+ and sp,AT
+
+ sd s0,0($fp)
+ sd s1,8($fp)
+ sd s2,16($fp)
+ sd s3,24($fp)
+ sd s4,32($fp)
+ sd s5,40($fp)
+ sd s6,48($fp)
+ sd s7,56($fp)
+
+ dmultu $aj,$bi
+ ld $alo,8($ap)
+ ld $nlo,8($np)
+ mflo $lo0
+ mfhi $hi0
+ dmultu $lo0,$n0
+ mflo $m1
+
+ dmultu $alo,$bi
+ mflo $alo
+ mfhi $ahi
+
+ dmultu $nj,$m1
+ mflo $lo1
+ mfhi $hi1
+ dmultu $nlo,$m1
+ daddu $lo1,$lo0
+ sltu AT,$lo1,$lo0
+ daddu $hi1,AT
+ mflo $nlo
+ mfhi $nhi
+
+ move $tp,sp
+ li $j,16
+.align 4
+.L1st:
+ .set noreorder
+ PTR_ADD $aj,$ap,$j
+ ld $aj,($aj)
+ PTR_ADD $nj,$np,$j
+ ld $nj,($nj)
+
+ dmultu $aj,$bi
+ daddu $lo0,$alo,$hi0
+ daddu $lo1,$nlo,$hi1
+ sltu AT,$lo0,$hi0
+ sltu s7,$lo1,$hi1
+ daddu $hi0,$ahi,AT
+ daddu $hi1,$nhi,s7
+ mflo $alo
+ mfhi $ahi
+
+ daddu $lo1,$lo0
+ sltu AT,$lo1,$lo0
+ dmultu $nj,$m1
+ daddu $hi1,AT
+ addu $j,8
+ sd $lo1,($tp)
+ sltu s7,$j,$num
+ mflo $nlo
+ mfhi $nhi
+
+ bnez s7,.L1st
+ PTR_ADD $tp,8
+ .set reorder
+
+ daddu $lo0,$alo,$hi0
+ sltu AT,$lo0,$hi0
+ daddu $hi0,$ahi,AT
+
+ daddu $lo1,$nlo,$hi1
+ sltu s7,$lo1,$hi1
+ daddu $hi1,$nhi,s7
+ daddu $lo1,$lo0
+ sltu AT,$lo1,$lo0
+ daddu $hi1,AT
+
+ sd $lo1,($tp)
+
+ daddu $hi1,$hi0
+ sltu AT,$hi1,$hi0
+ sd $hi1,8($tp)
+ sd AT,16($tp)
+
+ li $i,8
+.align 4
+.Louter:
+ PTR_ADD $bi,$bp,$i
+ ld $bi,($bi)
+ ld $aj,($ap)
+ ld $alo,8($ap)
+ ld $tj,(sp)
+
+ dmultu $aj,$bi
+ ld $nj,($np)
+ ld $nlo,8($np)
+ mflo $lo0
+ mfhi $hi0
+ daddu $lo0,$tj
+ dmultu $lo0,$n0
+ sltu AT,$lo0,$tj
+ daddu $hi0,AT
+ mflo $m1
+
+ dmultu $alo,$bi
+ mflo $alo
+ mfhi $ahi
+
+ dmultu $nj,$m1
+ mflo $lo1
+ mfhi $hi1
+
+ dmultu $nlo,$m1
+ daddu $lo1,$lo0
+ sltu AT,$lo1,$lo0
+ daddu $hi1,AT
+ mflo $nlo
+ mfhi $nhi
+
+ move $tp,sp
+ li $j,16
+ ld $tj,8($tp)
+.align 4
+.Linner:
+ .set noreorder
+ PTR_ADD $aj,$ap,$j
+ ld $aj,($aj)
+ PTR_ADD $nj,$np,$j
+ ld $nj,($nj)
+
+ dmultu $aj,$bi
+ daddu $lo0,$alo,$hi0
+ daddu $lo1,$nlo,$hi1
+ sltu AT,$lo0,$hi0
+ sltu s7,$lo1,$hi1
+ daddu $hi0,$ahi,AT
+ daddu $hi1,$nhi,s7
+ mflo $alo
+ mfhi $ahi
+
+ daddu $lo0,$tj
+ addu $j,8
+ dmultu $nj,$m1
+ sltu AT,$lo0,$tj
+ daddu $lo1,$lo0
+ daddu $hi0,AT
+ sltu s7,$lo1,$lo0
+ ld $tj,16($tp)
+ daddu $hi1,s7
+ sltu AT,$j,$num
+ mflo $nlo
+ mfhi $nhi
+ sd $lo1,($tp)
+ bnez AT,.Linner
+ PTR_ADD $tp,8
+ .set reorder
+
+ daddu $lo0,$alo,$hi0
+ sltu AT,$lo0,$hi0
+ daddu $hi0,$ahi,AT
+ daddu $lo0,$tj
+ sltu s7,$lo0,$tj
+ daddu $hi0,s7
+
+ ld $tj,16($tp)
+ daddu $lo1,$nlo,$hi1
+ sltu AT,$lo1,$hi1
+ daddu $hi1,$nhi,AT
+ daddu $lo1,$lo0
+ sltu s7,$lo1,$lo0
+ daddu $hi1,s7
+ sd $lo1,($tp)
+
+ daddu $lo1,$hi1,$hi0
+ sltu $hi1,$lo1,$hi0
+ daddu $lo1,$tj
+ sltu AT,$lo1,$tj
+ daddu $hi1,AT
+ sd $lo1,8($tp)
+ sd $hi1,16($tp)
+
+ addu $i,8
+ sltu s7,$i,$num
+ bnez s7,.Louter
+
+ .set noreorder
+ PTR_ADD $tj,sp,$num # &tp[num]
+ move $tp,sp
+ move $ap,sp
+ li $hi0,0 # clear borrow bit
+
+.align 4
+.Lsub: ld $lo0,($tp)
+ ld $lo1,($np)
+ PTR_ADD $tp,8
+ PTR_ADD $np,8
+ dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
+ sgtu AT,$lo1,$lo0
+ dsubu $lo0,$lo1,$hi0
+ sgtu $hi0,$lo0,$lo1
+ sd $lo0,($rp)
+ or $hi0,AT
+ sltu AT,$tp,$tj
+ bnez AT,.Lsub
+ PTR_ADD $rp,8
+
+ dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
+ move $tp,sp
+ PTR_SUB $rp,$num # restore rp
+ not $hi1,$hi0
+
+ and $ap,$hi0,sp
+ and $bp,$hi1,$rp
+ or $ap,$ap,$bp # ap=borrow?tp:rp
+
+.align 4
+.Lcopy: ld $aj,($ap)
+ PTR_ADD $ap,8
+ PTR_ADD $tp,8
+ sd zero,-8($tp)
+ sltu AT,$tp,$tj
+ sd $aj,($rp)
+ bnez AT,.Lcopy
+ PTR_ADD $rp,8
+
+ ld s0,0($fp)
+ ld s1,8($fp)
+ ld s2,16($fp)
+ ld s3,24($fp)
+ ld s4,32($fp)
+ ld s5,40($fp)
+ ld s6,48($fp)
+ ld s7,56($fp)
+ li v0,1
+ jr ra
+ PTR_ADD sp,$fp,64
+ .set reorder
+END(bn_mul_mont)
+.rdata
+.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/parisc-mont.pl b/crypto/bn/asm/parisc-mont.pl
new file mode 100644
index 0000000000..4a766a87fb
--- /dev/null
+++ b/crypto/bn/asm/parisc-mont.pl
@@ -0,0 +1,993 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# On PA-7100LC this module performs ~90-50% better, less for longer
+# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
+# that compiler utilized xmpyu instruction to perform 32x32=64-bit
+# multiplication, which in turn means that "baseline" performance was
+# optimal in respect to instruction set capabilities. Fair comparison
+# with vendor compiler is problematic, because OpenSSL doesn't define
+# BN_LLONG [presumably] for historical reasons, which drives compiler
+# toward 4 times 16x16=32-bit multiplicatons [plus complementary
+# shifts and additions] instead. This means that you should observe
+# several times improvement over code generated by vendor compiler
+# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
+# improvement coefficient was never collected on PA-7100LC, or any
+# other 1.1 CPU, because I don't have access to such machine with
+# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
+# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
+# of ~5x on PA-8600.
+#
+# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
+# reportedly ~2x faster than vendor compiler generated code [according
+# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
+# this implementation is actually 32-bit one, in the sense that it
+# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
+# 64-bit BN_LONGs... How do they interoperate then? No problem. This
+# module picks halves of 64-bit values in reverse order and pretends
+# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
+# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
+# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
+# i.e. there is no "wider" multiplication like on most other 64-bit
+# platforms. This means that even being effectively 32-bit, this
+# implementation performs "64-bit" computational task in same amount
+# of arithmetic operations, most notably multiplications. It requires
+# more memory references, most notably to tp[num], but this doesn't
+# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
+# 2.0 code path, provides virtually same performance as pa-risc2[W].s:
+# it's ~10% better for shortest key length and ~10% worse for longest
+# one.
+#
+# In case it wasn't clear. The module has two distinct code paths:
+# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
+# additions and 64-bit integer loads, not to mention specific
+# instruction scheduling. In 64-bit build naturally only 2.0 code path
+# is assembled. In 32-bit application context both code paths are
+# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
+# is taken automatically. Also, in 32-bit build the module imposes
+# couple of limitations: vector lengths has to be even and vector
+# addresses has to be 64-bit aligned. Normally neither is a problem:
+# most common key lengths are even and vectors are commonly malloc-ed,
+# which ensures alignment.
+#
+# Special thanks to polarhome.com for providing HP-UX account on
+# PA-RISC 1.1 machine, and to correspondent who chose to remain
+# anonymous for testing the code on PA-RISC 2.0 machine.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+ $LEVEL ="2.0W";
+ $SIZE_T =8;
+ $FRAME_MARKER =80;
+ $SAVED_RP =16;
+ $PUSH ="std";
+ $PUSHMA ="std,ma";
+ $POP ="ldd";
+ $POPMB ="ldd,mb";
+ $BN_SZ =$SIZE_T;
+} else {
+ $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
+ $SIZE_T =4;
+ $FRAME_MARKER =48;
+ $SAVED_RP =20;
+ $PUSH ="stw";
+ $PUSHMA ="stwm";
+ $POP ="ldw";
+ $POPMB ="ldwm";
+ $BN_SZ =$SIZE_T;
+ if (open CONF,"<${dir}../../opensslconf.h") {
+ while(<CONF>) {
+ if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
+ $BN_SZ=8;
+ $LEVEL="2.0";
+ last;
+ }
+ }
+ close CONF;
+ }
+}
+
+$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
+ # [+ argument transfer]
+$LOCALS=$FRAME-$FRAME_MARKER;
+$FRAME+=32; # local variables
+
+$tp="%r31";
+$ti1="%r29";
+$ti0="%r28";
+
+$rp="%r26";
+$ap="%r25";
+$bp="%r24";
+$np="%r23";
+$n0="%r22"; # passed through stack in 32-bit
+$num="%r21"; # passed through stack in 32-bit
+$idx="%r20";
+$arrsz="%r19";
+
+$nm1="%r7";
+$nm0="%r6";
+$ab1="%r5";
+$ab0="%r4";
+
+$fp="%r3";
+$hi1="%r2";
+$hi0="%r1";
+
+$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s
+
+$fm0="%fr4"; $fti=$fm0;
+$fbi="%fr5L";
+$fn0="%fr5R";
+$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
+$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
+
+$code=<<___;
+ .LEVEL $LEVEL
+ .SPACE \$TEXT\$
+ .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+ .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+ .ALIGN 64
+bn_mul_mont
+ .PROC
+ .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
+ .ENTRY
+ $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
+ $PUSHMA %r3,$FRAME(%sp)
+ $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
+ $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
+ $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
+ $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
+ $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
+ $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
+ $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
+ ldo -$FRAME(%sp),$fp
+___
+$code.=<<___ if ($SIZE_T==4);
+ ldw `-$FRAME_MARKER-4`($fp),$n0
+ ldw `-$FRAME_MARKER-8`($fp),$num
+ nop
+ nop ; alignment
+___
+$code.=<<___ if ($BN_SZ==4);
+ comiclr,<= 6,$num,%r0 ; are vectors long enough?
+ b L\$abort
+ ldi 0,%r28 ; signal "unhandled"
+ add,ev %r0,$num,$num ; is $num even?
+ b L\$abort
+ nop
+ or $ap,$np,$ti1
+ extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
+ b L\$abort
+ nop
+ nop ; alignment
+ nop
+
+ fldws 0($n0),${fn0}
+ fldws,ma 4($bp),${fbi} ; bp[0]
+___
+$code.=<<___ if ($BN_SZ==8);
+ comib,> 3,$num,L\$abort ; are vectors long enough?
+ ldi 0,%r28 ; signal "unhandled"
+ addl $num,$num,$num ; I operate on 32-bit values
+
+ fldws 4($n0),${fn0} ; only low part of n0
+ fldws 4($bp),${fbi} ; bp[0] in flipped word order
+___
+$code.=<<___;
+ fldds 0($ap),${fai} ; ap[0,1]
+ fldds 0($np),${fni} ; np[0,1]
+
+ sh2addl $num,%r0,$arrsz
+ ldi 31,$hi0
+ ldo 36($arrsz),$hi1 ; space for tp[num+1]
+ andcm $hi1,$hi0,$hi1 ; align
+ addl $hi1,%sp,%sp
+ $PUSH $fp,-$SIZE_T(%sp)
+
+ ldo `$LOCALS+16`($fp),$xfer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ addl $arrsz,$ap,$ap ; point at the end
+ addl $arrsz,$np,$np
+ subi 0,$arrsz,$idx ; j=0
+ ldo 8($idx),$idx ; j++++
+
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ fstds ${fab1},0($xfer)
+ fstds ${fnm1},8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+___
+$code.=<<___ if ($BN_SZ==4);
+ mtctl $hi0,%cr11 ; $hi0 still holds 31
+ extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
+ b L\$parisc11
+ nop
+___
+$code.=<<___; # PA-RISC 2.0 code-path
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $ab0,63,32,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ ldo 8($idx),$idx ; j++++
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ extrd,u $nm0,31,32,$hi1
+
+L\$1st
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ stw $nm1,-4($tp) ; tp[j-1]
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,63,32,$ab1
+ addl $hi1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ addl $ab1,$nm1,$nm1
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ extrd,u $ab0,31,32,$hi0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[1]
+___
+$code.=<<___ if ($BN_SZ==8);
+ fldws 0($bp),${fbi} ; bp[1] in flipped word order
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $hi0,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ addl $hi1,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2]
+ flddx $idx($np),${fni} ; np[2]
+ ldo 8($idx),$idx ; j++++
+ ldd -16($xfer),$ab0 ; 33-bit value
+ ldd -8($xfer),$nm0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ extrd,u $ab0,31,32,$ti0 ; carry bit
+ extrd,u $ab0,63,32,$ab0
+ fstds ${fab1},0($xfer)
+ addl $ti0,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ addl $ab0,$nm0,$nm0 ; low part is discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ extrd,u $nm0,31,32,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+L\$inner
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ flddx $idx($np),${fni} ; np[j,j+1]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldw 4($tp),$ti0 ; tp[j]
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldd -16($xfer),$ab0
+ fstds ${fab0},-16($xfer)
+ addl $hi0,$ti0,$ti0
+ addl $ti0,$ab0,$ab0
+ ldd -8($xfer),$nm0
+ fstds ${fnm0},-8($xfer)
+ extrd,u $ab0,31,32,$hi0
+ extrd,u $nm1,31,32,$hi1
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ addl $ab0,$nm0,$nm0
+ stw,ma $nm0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner ; j++++
+ extrd,u $nm0,31,32,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldd 0($xfer),$ab1
+ fstds ${fab1},0($xfer)
+ addl $hi0,$ti1,$ti1
+ addl $ti1,$ab1,$ab1
+ ldd 8($xfer),$nm1
+ fstds ${fnm1},8($xfer)
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+ ldw 4($tp),$ti0 ; tp[j]
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ ldd -16($xfer),$ab0
+ ldd -8($xfer),$nm0
+ extrd,u $nm1,31,32,$hi1
+
+ addl $hi0,$ab0,$ab0
+ addl $ti0,$ab0,$ab0
+ stw $nm1,-4($tp) ; tp[j-1]
+ extrd,u $ab0,31,32,$hi0
+ ldw 8($tp),$ti1 ; tp[j]
+ extrd,u $ab0,63,32,$ab0
+ addl $hi1,$nm0,$nm0
+ ldd 0($xfer),$ab1
+ addl $ab0,$nm0,$nm0
+ ldd,mb 8($xfer),$nm1
+ extrd,u $nm0,31,32,$hi1
+ stw,ma $nm0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone ; i--
+ subi 0,$arrsz,$idx ; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+ fldws,ma 4($bp),${fbi} ; bp[i]
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldi 12,$ti0 ; bp[i] in flipped word order
+ addl,ev %r0,$num,$num
+ ldi -4,$ti0
+ addl $ti0,$bp,$bp
+ fldws 0($bp),${fbi}
+___
+$code.=<<___;
+ flddx $idx($ap),${fai} ; ap[0]
+ addl $hi0,$ab1,$ab1
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ addl $hi1,$hi0,$hi0
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone
+ addl $hi0,$ab1,$ab1
+ addl $ti1,$ab1,$ab1
+ extrd,u $ab1,31,32,$hi0
+ extrd,u $ab1,63,32,$ab1
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ addl $hi1,$nm1,$nm1
+ addl $ab1,$nm1,$nm1
+ extrd,u $nm1,31,32,$hi1
+ stw $nm1,-4($tp) ; tp[j-1]
+
+ addl $hi1,$hi0,$hi0
+ addl $ti0,$hi0,$hi0
+ extrd,u $hi0,31,32,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+___
+$code.=<<___ if ($BN_SZ==4);
+ ldws,ma 4($tp),$ti0
+ extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
+ b L\$sub_pa11
+ addl $tp,$arrsz,$tp
+L\$sub
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+___
+$code.=<<___ if ($BN_SZ==8);
+ ldd,ma 8($tp),$ti0
+L\$sub
+ ldd $idx($np),$hi0
+ shrpd $ti0,$ti0,32,$ti0 ; flip word order
+ std $ti0,-8($tp) ; save flipped value
+ sub,db $ti0,$hi0,$hi1
+ ldd,ma 8($tp),$ti0
+ addib,<> 8,$idx,L\$sub
+ std,ma $hi1,8($rp)
+
+ extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
+ sub,db $ti0,%r0,$hi1
+ ldo -8($tp),$tp
+___
+$code.=<<___;
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy
+ ldd $idx($np),$hi0
+ std,ma %r0,8($tp)
+ addib,<> 8,$idx,.-8 ; L\$copy
+ std,ma $hi0,8($rp)
+___
+
+if ($BN_SZ==4) { # PA-RISC 1.1 code-path
+$ablo=$ab0;
+$abhi=$ab1;
+$nmlo0=$nm0;
+$nmhi0=$nm1;
+$nmlo1="%r9";
+$nmhi1="%r8";
+
+$code.=<<___;
+ b L\$done
+ nop
+
+ .ALIGN 8
+L\$parisc11
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -12($xfer),$ablo
+ ldw -16($xfer),$hi0
+ ldw -4($xfer),$nmlo0
+ ldw -8($xfer),$nmhi0
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+
+ ldo 8($idx),$idx ; j++++
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ addc %r0,$nmhi0,$hi1
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+ nop
+
+L\$1st_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 12($xfer),$nmlo1
+ addc %r0,$abhi,$hi0
+ ldw 8($xfer),$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fab1},0($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ fstds ${fnm1},8($xfer)
+ add $hi1,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$hi1
+ ldw -16($xfer),$abhi
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
+ ldw -4($xfer),$nmlo0
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -8($xfer),$nmhi0
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ fstds ${fab0},-16($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ fstds ${fnm0},-8($xfer)
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$1st_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ ldw 8($xfer),$nmhi1
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ add $hi0,$ablo,$ablo
+ fstds ${fab1},0($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -8($xfer),$nmhi0
+ addc %r0,$nmhi1,$hi1
+ ldw -4($xfer),$nmlo0
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ ldo -1($num),$num ; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[1]
+ flddx $idx($ap),${fai} ; ap[0,1]
+ flddx $idx($np),${fni} ; np[0,1]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ xmpyu ${fn0},${fab0}R,${fm0}
+ ldo `$LOCALS+32+4`($fp),$tp
+L\$outer_pa11
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
+ fstds ${fab0},-16($xfer) ; 33-bit value
+ fstds ${fnm0},-8($xfer)
+ flddx $idx($ap),${fai} ; ap[2,3]
+ flddx $idx($np),${fni} ; np[2,3]
+ ldw -16($xfer),$abhi ; carry bit actually
+ ldo 8($idx),$idx ; j++++
+ ldw -12($xfer),$ablo
+ ldw -8($xfer),$nmhi0
+ ldw -4($xfer),$nmlo0
+ ldw 0($xfer),$hi0 ; high part
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ fstds ${fab1},0($xfer)
+ addl $abhi,$hi0,$hi0 ; account carry bit
+ fstds ${fnm1},8($xfer)
+ add $ablo,$nmlo0,$nmlo0 ; discarded
+ ldw 0($tp),$ti1 ; tp[1]
+ addc %r0,$nmhi0,$hi1
+ fstds ${fab0},-16($xfer)
+ fstds ${fnm0},-8($xfer)
+ ldw 4($xfer),$ablo
+ ldw 0($xfer),$abhi
+
+L\$inner_pa11
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
+ flddx $idx($ap),${fai} ; ap[j,j+1]
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
+ flddx $idx($np),${fni} ; np[j,j+1]
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ ldw 12($xfer),$nmlo1
+ add $ti1,$ablo,$ablo
+ ldw 8($xfer),$nmhi1
+ addc %r0,$abhi,$hi0
+ fstds ${fab1},0($xfer)
+ add $ablo,$nmlo1,$nmlo1
+ fstds ${fnm1},8($xfer)
+ addc %r0,$nmhi1,$nmhi1
+ ldw -12($xfer),$ablo
+ add $hi1,$nmlo1,$nmlo1
+ ldw -16($xfer),$abhi
+ addc %r0,$nmhi1,$hi1
+
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
+ ldw 8($tp),$ti1 ; tp[j]
+ xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
+ ldw -4($xfer),$nmlo0
+ add $hi0,$ablo,$ablo
+ ldw -8($xfer),$nmhi0
+ addc %r0,$abhi,$abhi
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ add $ti0,$ablo,$ablo
+ fstds ${fab0},-16($xfer)
+ addc %r0,$abhi,$hi0
+ fstds ${fnm0},-8($xfer)
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldw 0($xfer),$abhi
+ add $hi1,$nmlo0,$nmlo0
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+ addib,<> 8,$idx,L\$inner_pa11 ; j++++
+ addc %r0,$nmhi0,$hi1
+
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
+ ldw 12($xfer),$nmlo1
+ xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
+ ldw 8($xfer),$nmhi1
+ add $hi0,$ablo,$ablo
+ ldw 4($tp),$ti0 ; tp[j]
+ addc %r0,$abhi,$abhi
+ fstds ${fab1},0($xfer)
+ add $ti1,$ablo,$ablo
+ fstds ${fnm1},8($xfer)
+ addc %r0,$abhi,$hi0
+ ldw -16($xfer),$abhi
+ add $ablo,$nmlo1,$nmlo1
+ ldw -12($xfer),$ablo
+ addc %r0,$nmhi1,$nmhi1
+ ldw -8($xfer),$nmhi0
+ add $hi1,$nmlo1,$nmlo1
+ ldw -4($xfer),$nmlo0
+ addc %r0,$nmhi1,$hi1
+
+ add $hi0,$ablo,$ablo
+ stw $nmlo1,-4($tp) ; tp[j-1]
+ addc %r0,$abhi,$abhi
+ add $ti0,$ablo,$ablo
+ ldw 8($tp),$ti1 ; tp[j]
+ addc %r0,$abhi,$hi0
+ ldw 0($xfer),$abhi
+ add $ablo,$nmlo0,$nmlo0
+ ldw 4($xfer),$ablo
+ addc %r0,$nmhi0,$nmhi0
+ ldws,mb 8($xfer),$nmhi1
+ add $hi1,$nmlo0,$nmlo0
+ ldw 4($xfer),$nmlo1
+ addc %r0,$nmhi0,$hi1
+ stws,ma $nmlo0,8($tp) ; tp[j-1]
+
+ addib,= -1,$num,L\$outerdone_pa11; i--
+ subi 0,$arrsz,$idx ; j=0
+
+ fldws,ma 4($bp),${fbi} ; bp[i]
+ flddx $idx($ap),${fai} ; ap[0]
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ flddx $idx($np),${fni} ; np[0]
+ fldws 8($xfer),${fti}R ; tp[0]
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldo 8($idx),$idx ; j++++
+ xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
+ xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ fstws,mb ${fab0}L,-8($xfer) ; save high part
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ fcpy,sgl %fr0,${fti}L ; zero high part
+ fcpy,sgl %fr0,${fab0}L
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
+ fcnvxf,dbl,dbl ${fab0},${fab0}
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+ xmpyu ${fn0},${fab0}R,${fm0}
+
+ b L\$outer_pa11
+ ldo `$LOCALS+32+4`($fp),$tp
+
+L\$outerdone_pa11
+ add $hi0,$ablo,$ablo
+ addc %r0,$abhi,$abhi
+ add $ti1,$ablo,$ablo
+ addc %r0,$abhi,$hi0
+
+ ldw 4($tp),$ti0 ; tp[j]
+
+ add $hi1,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$nmhi1
+ add $ablo,$nmlo1,$nmlo1
+ addc %r0,$nmhi1,$hi1
+ stw $nmlo1,-4($tp) ; tp[j-1]
+
+ add $hi1,$hi0,$hi0
+ addc %r0,%r0,$hi1
+ add $ti0,$hi0,$hi0
+ addc %r0,$hi1,$hi1
+ stw $hi0,0($tp)
+ stw $hi1,4($tp)
+
+ ldo `$LOCALS+32+4`($fp),$tp
+ sub %r0,%r0,%r0 ; clear borrow
+ ldw -4($tp),$ti0
+ addl $tp,$arrsz,$tp
+L\$sub_pa11
+ ldwx $idx($np),$hi0
+ subb $ti0,$hi0,$hi1
+ ldwx $idx($tp),$ti0
+ addib,<> 4,$idx,L\$sub_pa11
+ stws,ma $hi1,4($rp)
+
+ subb $ti0,%r0,$hi1
+ ldo -4($tp),$tp
+ and $tp,$hi1,$ap
+ andcm $rp,$hi1,$bp
+ or $ap,$bp,$np
+
+ sub $rp,$arrsz,$rp ; rewind rp
+ subi 0,$arrsz,$idx
+ ldo `$LOCALS+32`($fp),$tp
+L\$copy_pa11
+ ldwx $idx($np),$hi0
+ stws,ma %r0,4($tp)
+ addib,<> 4,$idx,L\$copy_pa11
+ stws,ma $hi0,4($rp)
+
+ nop ; alignment
+L\$done
+___
+}
+
+$code.=<<___;
+ ldi 1,%r28 ; signal "handled"
+ ldo $FRAME($fp),%sp ; destroy tp[num+1]
+
+ $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
+ $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
+ $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
+ $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
+ $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
+ $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
+ $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
+ $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
+L\$abort
+ bv (%r2)
+ .EXIT
+ $POPMB -$FRAME(%sp),%r3
+ .PROCEND
+ .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "ldd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
+ { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
+ { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+ $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $std = sub {
+ my ($mod,$args) = @_;
+ my $orig = "std$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
+ { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
+ $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
+ $opcode|=(1<<5) if ($mod =~ /^,m/);
+ $opcode|=(1<<13) if ($mod =~ /^,mb/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $extrd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "extrd$mod\t$args";
+
+ # I only have ",u" completer, it's implicitly encoded...
+ if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
+ { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+ my $len=32-$3;
+ $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
+ $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
+ { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+ my $len=32-$2;
+ $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
+ $opcode |= (1<<13) if ($mod =~ /,\**=/);
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+ my ($mod,$args) = @_;
+ my $orig = "shrpd$mod\t$args";
+
+ if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
+ { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+ my $cpos=63-$3;
+ $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+ }
+ else { "\t".$orig; }
+};
+
+my $sub = sub {
+ my ($mod,$args) = @_;
+ my $orig = "sub$mod\t$args";
+
+ if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
+ my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
+ $opcode|=(1<<10); # e1
+ $opcode|=(1<<8); # e2
+ $opcode|=(1<<5); # d
+ sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
+ }
+ else { "\t".$orig; }
+};
+
+sub assemble {
+ my ($mnemonic,$mod,$args)=@_;
+ my $opcode = eval("\$$mnemonic");
+
+ ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/ge;
+ # flip word order in 64-bit mode...
+ s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
+ # assemble 2.0 instructions in 32-bit mode...
+ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
new file mode 100644
index 0000000000..9257b2cd71
--- /dev/null
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -0,0 +1,328 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# April 2006
+
+# "Teaser" Montgomery multiplication module for PowerPC. It's possible
+# to gain a bit more by modulo-scheduling outer loop, then dedicated
+# squaring procedure should give further 20% and code can be adapted
+# for 32-bit application running on 64-bit CPU. As for the latter.
+# It won't be able to achieve "native" 64-bit performance, because in
+# 32-bit application context every addc instruction will have to be
+# expanded as addc, twice right shift by 32 and finally adde, etc.
+# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
+# for 64-bit application running on PPC970/G5 is:
+#
+# 512-bit +65%
+# 1024-bit +35%
+# 2048-bit +18%
+# 4096-bit +4%
+
+$flavour = shift;
+
+if ($flavour =~ /32/) {
+ $BITS= 32;
+ $BNSZ= $BITS/8;
+ $SIZE_T=4;
+ $RZONE= 224;
+ $FRAME= $SIZE_T*16;
+
+ $LD= "lwz"; # load
+ $LDU= "lwzu"; # load and update
+ $LDX= "lwzx"; # load indexed
+ $ST= "stw"; # store
+ $STU= "stwu"; # store and update
+ $STX= "stwx"; # store indexed
+ $STUX= "stwux"; # store indexed and update
+ $UMULL= "mullw"; # unsigned multiply low
+ $UMULH= "mulhwu"; # unsigned multiply high
+ $UCMP= "cmplw"; # unsigned compare
+ $SHRI= "srwi"; # unsigned shift right by immediate
+ $PUSH= $ST;
+ $POP= $LD;
+} elsif ($flavour =~ /64/) {
+ $BITS= 64;
+ $BNSZ= $BITS/8;
+ $SIZE_T=8;
+ $RZONE= 288;
+ $FRAME= $SIZE_T*16;
+
+ # same as above, but 64-bit mnemonics...
+ $LD= "ld"; # load
+ $LDU= "ldu"; # load and update
+ $LDX= "ldx"; # load indexed
+ $ST= "std"; # store
+ $STU= "stdu"; # store and update
+ $STX= "stdx"; # store indexed
+ $STUX= "stdux"; # store indexed and update
+ $UMULL= "mulld"; # unsigned multiply low
+ $UMULH= "mulhdu"; # unsigned multiply high
+ $UCMP= "cmpld"; # unsigned compare
+ $SHRI= "srdi"; # unsigned shift right by immediate
+ $PUSH= $ST;
+ $POP= $LD;
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$sp="r1";
+$toc="r2";
+$rp="r3"; $ovf="r3";
+$ap="r4";
+$bp="r5";
+$np="r6";
+$n0="r7";
+$num="r8";
+$rp="r9"; # $rp is reassigned
+$aj="r10";
+$nj="r11";
+$tj="r12";
+# non-volatile registers
+$i="r14";
+$j="r15";
+$tp="r16";
+$m0="r17";
+$m1="r18";
+$lo0="r19";
+$hi0="r20";
+$lo1="r21";
+$hi1="r22";
+$alo="r23";
+$ahi="r24";
+$nlo="r25";
+#
+$nhi="r0";
+
+$code=<<___;
+.machine "any"
+.text
+
+.globl .bn_mul_mont_int
+.align 4
+.bn_mul_mont_int:
+ cmpwi $num,4
+ mr $rp,r3 ; $rp is reassigned
+ li r3,0
+ bltlr
+___
+$code.=<<___ if ($BNSZ==4);
+ cmpwi $num,32 ; longer key performance is not better
+ bgelr
+___
+$code.=<<___;
+ slwi $num,$num,`log($BNSZ)/log(2)`
+ li $tj,-4096
+ addi $ovf,$num,`$FRAME+$RZONE`
+ subf $ovf,$ovf,$sp ; $sp-$ovf
+ and $ovf,$ovf,$tj ; minimize TLB usage
+ subf $ovf,$sp,$ovf ; $ovf-$sp
+ srwi $num,$num,`log($BNSZ)/log(2)`
+ $STUX $sp,$sp,$ovf
+
+ $PUSH r14,`4*$SIZE_T`($sp)
+ $PUSH r15,`5*$SIZE_T`($sp)
+ $PUSH r16,`6*$SIZE_T`($sp)
+ $PUSH r17,`7*$SIZE_T`($sp)
+ $PUSH r18,`8*$SIZE_T`($sp)
+ $PUSH r19,`9*$SIZE_T`($sp)
+ $PUSH r20,`10*$SIZE_T`($sp)
+ $PUSH r21,`11*$SIZE_T`($sp)
+ $PUSH r22,`12*$SIZE_T`($sp)
+ $PUSH r23,`13*$SIZE_T`($sp)
+ $PUSH r24,`14*$SIZE_T`($sp)
+ $PUSH r25,`15*$SIZE_T`($sp)
+
+ $LD $n0,0($n0) ; pull n0[0] value
+ addi $num,$num,-2 ; adjust $num for counter register
+
+ $LD $m0,0($bp) ; m0=bp[0]
+ $LD $aj,0($ap) ; ap[0]
+ addi $tp,$sp,$FRAME
+ $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
+ $UMULH $hi0,$aj,$m0
+
+ $LD $aj,$BNSZ($ap) ; ap[1]
+ $LD $nj,0($np) ; np[0]
+
+ $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
+
+ $UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
+ $UMULH $ahi,$aj,$m0
+
+ $UMULL $lo1,$nj,$m1 ; np[0]*m1
+ $UMULH $hi1,$nj,$m1
+ $LD $nj,$BNSZ($np) ; np[1]
+ addc $lo1,$lo1,$lo0
+ addze $hi1,$hi1
+
+ $UMULL $nlo,$nj,$m1 ; np[1]*m1
+ $UMULH $nhi,$nj,$m1
+
+ mtctr $num
+ li $j,`2*$BNSZ`
+.align 4
+L1st:
+ $LDX $aj,$ap,$j ; ap[j]
+ addc $lo0,$alo,$hi0
+ $LDX $nj,$np,$j ; np[j]
+ addze $hi0,$ahi
+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
+ addc $lo1,$nlo,$hi1
+ $UMULH $ahi,$aj,$m0
+ addze $hi1,$nhi
+ $UMULL $nlo,$nj,$m1 ; np[j]*m1
+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
+ $UMULH $nhi,$nj,$m1
+ addze $hi1,$hi1
+ $ST $lo1,0($tp) ; tp[j-1]
+
+ addi $j,$j,$BNSZ ; j++
+ addi $tp,$tp,$BNSZ ; tp++
+ bdnz- L1st
+;L1st
+ addc $lo0,$alo,$hi0
+ addze $hi0,$ahi
+
+ addc $lo1,$nlo,$hi1
+ addze $hi1,$nhi
+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
+ addze $hi1,$hi1
+ $ST $lo1,0($tp) ; tp[j-1]
+
+ li $ovf,0
+ addc $hi1,$hi1,$hi0
+ addze $ovf,$ovf ; upmost overflow bit
+ $ST $hi1,$BNSZ($tp)
+
+ li $i,$BNSZ
+.align 4
+Louter:
+ $LDX $m0,$bp,$i ; m0=bp[i]
+ $LD $aj,0($ap) ; ap[0]
+ addi $tp,$sp,$FRAME
+ $LD $tj,$FRAME($sp) ; tp[0]
+ $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
+ $UMULH $hi0,$aj,$m0
+ $LD $aj,$BNSZ($ap) ; ap[1]
+ $LD $nj,0($np) ; np[0]
+ addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
+ addze $hi0,$hi0
+ $UMULL $m1,$lo0,$n0 ; tp[0]*n0
+ $UMULH $ahi,$aj,$m0
+ $UMULL $lo1,$nj,$m1 ; np[0]*m1
+ $UMULH $hi1,$nj,$m1
+ $LD $nj,$BNSZ($np) ; np[1]
+ addc $lo1,$lo1,$lo0
+ $UMULL $nlo,$nj,$m1 ; np[1]*m1
+ addze $hi1,$hi1
+ $UMULH $nhi,$nj,$m1
+
+ mtctr $num
+ li $j,`2*$BNSZ`
+.align 4
+Linner:
+ $LDX $aj,$ap,$j ; ap[j]
+ addc $lo0,$alo,$hi0
+ $LD $tj,$BNSZ($tp) ; tp[j]
+ addze $hi0,$ahi
+ $LDX $nj,$np,$j ; np[j]
+ addc $lo1,$nlo,$hi1
+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
+ addze $hi1,$nhi
+ $UMULH $ahi,$aj,$m0
+ addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
+ $UMULL $nlo,$nj,$m1 ; np[j]*m1
+ addze $hi0,$hi0
+ $UMULH $nhi,$nj,$m1
+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
+ addi $j,$j,$BNSZ ; j++
+ addze $hi1,$hi1
+ $ST $lo1,0($tp) ; tp[j-1]
+ addi $tp,$tp,$BNSZ ; tp++
+ bdnz- Linner
+;Linner
+ $LD $tj,$BNSZ($tp) ; tp[j]
+ addc $lo0,$alo,$hi0
+ addze $hi0,$ahi
+ addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
+ addze $hi0,$hi0
+
+ addc $lo1,$nlo,$hi1
+ addze $hi1,$nhi
+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
+ addze $hi1,$hi1
+ $ST $lo1,0($tp) ; tp[j-1]
+
+ addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
+ li $ovf,0
+ adde $hi1,$hi1,$hi0
+ addze $ovf,$ovf
+ $ST $hi1,$BNSZ($tp)
+;
+ slwi $tj,$num,`log($BNSZ)/log(2)`
+ $UCMP $i,$tj
+ addi $i,$i,$BNSZ
+ ble- Louter
+
+ addi $num,$num,2 ; restore $num
+ subfc $j,$j,$j ; j=0 and "clear" XER[CA]
+ addi $tp,$sp,$FRAME
+ mtctr $num
+
+.align 4
+Lsub: $LDX $tj,$tp,$j
+ $LDX $nj,$np,$j
+ subfe $aj,$nj,$tj ; tp[j]-np[j]
+ $STX $aj,$rp,$j
+ addi $j,$j,$BNSZ
+ bdnz- Lsub
+
+ li $j,0
+ mtctr $num
+ subfe $ovf,$j,$ovf ; handle upmost overflow bit
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
+
+.align 4
+Lcopy: ; copy or in-place refresh
+ $LDX $tj,$ap,$j
+ $STX $tj,$rp,$j
+ $STX $j,$tp,$j ; zap at once
+ addi $j,$j,$BNSZ
+ bdnz- Lcopy
+
+ $POP r14,`4*$SIZE_T`($sp)
+ $POP r15,`5*$SIZE_T`($sp)
+ $POP r16,`6*$SIZE_T`($sp)
+ $POP r17,`7*$SIZE_T`($sp)
+ $POP r18,`8*$SIZE_T`($sp)
+ $POP r19,`9*$SIZE_T`($sp)
+ $POP r20,`10*$SIZE_T`($sp)
+ $POP r21,`11*$SIZE_T`($sp)
+ $POP r22,`12*$SIZE_T`($sp)
+ $POP r23,`13*$SIZE_T`($sp)
+ $POP r24,`14*$SIZE_T`($sp)
+ $POP r25,`15*$SIZE_T`($sp)
+ $POP $sp,0($sp)
+ li r3,1
+ blr
+ .long 0
+.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl
index 307c7ccb35..37c65d3511 100644
--- a/crypto/bn/asm/ppc.pl
+++ b/crypto/bn/asm/ppc.pl
@@ -100,9 +100,9 @@
# me a note at schari@us.ibm.com
#
-$opf = shift;
+$flavour = shift;
-if ($opf =~ /32\.s/) {
+if ($flavour =~ /32/) {
$BITS= 32;
$BNSZ= $BITS/8;
$ISA= "\"ppc\"";
@@ -116,7 +116,7 @@ if ($opf =~ /32\.s/) {
$UDIV= "divwu"; # unsigned divide
$UCMPI= "cmplwi"; # unsigned compare with immediate
$UCMP= "cmplw"; # unsigned compare
- $COUNTZ="cntlzw"; # count leading zeros
+ $CNTLZ= "cntlzw"; # count leading zeros
$SHL= "slw"; # shift left
$SHR= "srw"; # unsigned shift right
$SHRI= "srwi"; # unsigned shift right by immediate
@@ -124,7 +124,8 @@ if ($opf =~ /32\.s/) {
$CLRU= "clrlwi"; # clear upper bits
$INSR= "insrwi"; # insert right
$ROTL= "rotlwi"; # rotate left by immediate
-} elsif ($opf =~ /64\.s/) {
+ $TR= "tw"; # conditional trap
+} elsif ($flavour =~ /64/) {
$BITS= 64;
$BNSZ= $BITS/8;
$ISA= "\"ppc64\"";
@@ -139,7 +140,7 @@ if ($opf =~ /32\.s/) {
$UDIV= "divdu"; # unsigned divide
$UCMPI= "cmpldi"; # unsigned compare with immediate
$UCMP= "cmpld"; # unsigned compare
- $COUNTZ="cntlzd"; # count leading zeros
+ $CNTLZ= "cntlzd"; # count leading zeros
$SHL= "sld"; # shift left
$SHR= "srd"; # unsigned shift right
$SHRI= "srdi"; # unsigned shift right by immediate
@@ -147,93 +148,17 @@ if ($opf =~ /32\.s/) {
$CLRU= "clrldi"; # clear upper bits
$INSR= "insrdi"; # insert right
$ROTL= "rotldi"; # rotate left by immediate
-} else { die "nonsense $opf"; }
+ $TR= "td"; # conditional trap
+} else { die "nonsense $flavour"; }
-( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
-# function entry points from the AIX code
-#
-# There are other, more elegant, ways to handle this. We (IBM) chose
-# this approach as it plays well with scripts we run to 'namespace'
-# OpenSSL .i.e. we add a prefix to all the public symbols so we can
-# co-exist in the same process with other implementations of OpenSSL.
-# 'cleverer' ways of doing these substitutions tend to hide data we
-# need to be obvious.
-#
-my @items = ("bn_sqr_comba4",
- "bn_sqr_comba8",
- "bn_mul_comba4",
- "bn_mul_comba8",
- "bn_sub_words",
- "bn_add_words",
- "bn_div_words",
- "bn_sqr_words",
- "bn_mul_words",
- "bn_mul_add_words");
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-if ($opf =~ /linux/) { do_linux(); }
-elsif ($opf =~ /aix/) { do_aix(); }
-elsif ($opf =~ /osx/) { do_osx(); }
-else { do_bsd(); }
-
-sub do_linux {
- $d=&data();
-
- if ($BITS==64) {
- foreach $t (@items) {
- $d =~ s/\.$t:/\
-\t.section\t".opd","aw"\
-\t.align\t3\
-\t.globl\t$t\
-$t:\
-\t.quad\t.$t,.TOC.\@tocbase,0\
-\t.size\t$t,24\
-\t.previous\n\
-\t.type\t.$t,\@function\
-\t.globl\t.$t\
-.$t:/g;
- }
- }
- else {
- foreach $t (@items) {
- $d=~s/\.$t/$t/g;
- }
- }
- # hide internal labels to avoid pollution of name table...
- $d=~s/Lppcasm_/.Lppcasm_/gm;
- print $d;
-}
-
-sub do_aix {
- # AIX assembler is smart enough to please the linker without
- # making us do something special...
- print &data();
-}
-
-# MacOSX 32 bit
-sub do_osx {
- $d=&data();
- # Change the bn symbol prefix from '.' to '_'
- foreach $t (@items) {
- $d=~s/\.$t/_$t/g;
- }
- # Change .machine to something OS X asm will accept
- $d=~s/\.machine.*/.text/g;
- $d=~s/\#/;/g; # change comment from '#' to ';'
- print $d;
-}
-
-# BSD (Untested)
-sub do_bsd {
- $d=&data();
- foreach $t (@items) {
- $d=~s/\.$t/_$t/g;
- }
- print $d;
-}
-
-sub data {
- local($data)=<<EOF;
+$data=<<EOF;
#--------------------------------------------------------------------
#
#
@@ -295,33 +220,20 @@ sub data {
#
# Defines to be used in the assembly code.
#
-.set r0,0 # we use it as storage for value of 0
-.set SP,1 # preserved
-.set RTOC,2 # preserved
-.set r3,3 # 1st argument/return value
-.set r4,4 # 2nd argument/volatile register
-.set r5,5 # 3rd argument/volatile register
-.set r6,6 # ...
-.set r7,7
-.set r8,8
-.set r9,9
-.set r10,10
-.set r11,11
-.set r12,12
-.set r13,13 # not used, nor any other "below" it...
-
-.set BO_IF_NOT,4
-.set BO_IF,12
-.set BO_dCTR_NZERO,16
-.set BO_dCTR_ZERO,18
-.set BO_ALWAYS,20
-.set CR0_LT,0;
-.set CR0_GT,1;
-.set CR0_EQ,2
-.set CR1_FX,4;
-.set CR1_FEX,5;
-.set CR1_VX,6
-.set LR,8
+#.set r0,0 # we use it as storage for value of 0
+#.set SP,1 # preserved
+#.set RTOC,2 # preserved
+#.set r3,3 # 1st argument/return value
+#.set r4,4 # 2nd argument/volatile register
+#.set r5,5 # 3rd argument/volatile register
+#.set r6,6 # ...
+#.set r7,7
+#.set r8,8
+#.set r9,9
+#.set r10,10
+#.set r11,11
+#.set r12,12
+#.set r13,13 # not used, nor any other "below" it...
# Declare function names to be global
# NOTE: For gcc these names MUST be changed to remove
@@ -342,7 +254,7 @@ sub data {
# .text section
- .machine $ISA
+ .machine "any"
#
# NOTE: The following label name should be changed to
@@ -476,7 +388,7 @@ sub data {
$ST r9,`6*$BNSZ`(r3) #r[6]=c1
$ST r10,`7*$BNSZ`(r3) #r[7]=c2
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -901,7 +813,7 @@ sub data {
$ST r9, `15*$BNSZ`(r3) #r[15]=c1;
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
@@ -1053,7 +965,7 @@ sub data {
$ST r10,`6*$BNSZ`(r3) #r[6]=c1
$ST r11,`7*$BNSZ`(r3) #r[7]=c2
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -1589,7 +1501,7 @@ sub data {
adde r10,r10,r9
$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -1621,7 +1533,7 @@ sub data {
subfc. r7,r0,r6 # If r6 is 0 then result is 0.
# if r6 > 0 then result !=0
# In either case carry bit is set.
- bc BO_IF,CR0_EQ,Lppcasm_sub_adios
+ beq Lppcasm_sub_adios
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
addi r5,r5,-$BNSZ
@@ -1633,11 +1545,11 @@ Lppcasm_sub_mainloop:
# if carry = 1 this is r7-r8. Else it
# is r7-r8 -1 as we need.
$STU r6,$BNSZ(r3)
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop
+ bdnz- Lppcasm_sub_mainloop
Lppcasm_sub_adios:
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
@@ -1668,7 +1580,7 @@ Lppcasm_sub_adios:
# check for r6 = 0. Is this needed?
#
addic. r6,r6,0 #test r6 and clear carry bit.
- bc BO_IF,CR0_EQ,Lppcasm_add_adios
+ beq Lppcasm_add_adios
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
addi r5,r5,-$BNSZ
@@ -1678,10 +1590,10 @@ Lppcasm_add_mainloop:
$LDU r8,$BNSZ(r5)
adde r8,r7,r8
$STU r8,$BNSZ(r3)
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop
+ bdnz- Lppcasm_add_mainloop
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -1705,29 +1617,24 @@ Lppcasm_add_adios:
# r5 = d
$UCMPI 0,r5,0 # compare r5 and 0
- bc BO_IF_NOT,CR0_EQ,Lppcasm_div1 # proceed if d!=0
+ bne Lppcasm_div1 # proceed if d!=0
li r3,-1 # d=0 return -1
- bclr BO_ALWAYS,CR0_LT
+ blr
Lppcasm_div1:
xor r0,r0,r0 #r0=0
- $COUNTZ r7,r5 #r7 = num leading 0s in d.
- subfic r8,r7,$BITS #r8 = BN_num_bits_word(d)
- cmpi 0,0,r8,$BITS #
- bc BO_IF,CR0_EQ,Lppcasm_div2 #proceed if (r8==$BITS)
- li r9,1 # r9=1
- $SHL r10,r9,r8 # r9<<=r8
- $UCMP 0,r3,r10 #
- bc BO_IF,CR0_GT,Lppcasm_div2 #or if (h > (1<<r8))
- $UDIV r3,r3,r0 #if not assert(0) divide by 0!
- #that's how we signal overflow
- bclr BO_ALWAYS,CR0_LT #return. NEVER REACHED.
+ li r8,$BITS
+ $CNTLZ. r7,r5 #r7 = num leading 0s in d.
+ beq Lppcasm_div2 #proceed if no leading zeros
+ subf r8,r7,r8 #r8 = BN_num_bits_word(d)
+ $SHR. r9,r3,r8 #are there any bits above r8'th?
+ $TR 16,r9,r0 #if there're, signal to dump core...
Lppcasm_div2:
$UCMP 0,r3,r5 #h>=d?
- bc BO_IF,CR0_LT,Lppcasm_div3 #goto Lppcasm_div3 if not
+ blt Lppcasm_div3 #goto Lppcasm_div3 if not
subf r3,r5,r3 #h-=d ;
Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
cmpi 0,0,r7,0 # is (i == 0)?
- bc BO_IF,CR0_EQ,Lppcasm_div4
+ beq Lppcasm_div4
$SHL r3,r3,r7 # h = (h<< i)
$SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
$SHL r5,r5,r7 # d<<=i
@@ -1744,7 +1651,7 @@ Lppcasm_divouterloop:
$SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
# compute here for innerloop.
$UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
- bc BO_IF_NOT,CR0_EQ,Lppcasm_div5 # goto Lppcasm_div5 if not
+ bne Lppcasm_div5 # goto Lppcasm_div5 if not
li r8,-1
$CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
@@ -1765,9 +1672,9 @@ Lppcasm_divinnerloop:
# the following 2 instructions do that
$SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
- $UCMP 1,r6,r7 # compare (tl <= r7)
- bc BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit
- bc BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit
+ $UCMP cr1,r6,r7 # compare (tl <= r7)
+ bne Lppcasm_divinnerexit
+ ble cr1,Lppcasm_divinnerexit
addi r8,r8,-1 #q--
subf r12,r9,r12 #th -=dh
$CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
@@ -1776,14 +1683,14 @@ Lppcasm_divinnerloop:
Lppcasm_divinnerexit:
$SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
$SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
- $UCMP 1,r4,r11 # compare l and tl
+ $UCMP cr1,r4,r11 # compare l and tl
add r12,r12,r10 # th+=t
- bc BO_IF_NOT,CR1_FX,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
+ bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
addi r12,r12,1 # th++
Lppcasm_div7:
subf r11,r11,r4 #r11=l-tl
- $UCMP 1,r3,r12 #compare h and th
- bc BO_IF_NOT,CR1_FX,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
+ $UCMP cr1,r3,r12 #compare h and th
+ bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
addi r8,r8,-1 # q--
add r3,r5,r3 # h+=d
Lppcasm_div8:
@@ -1794,12 +1701,12 @@ Lppcasm_div8:
# the following 2 instructions will do this.
$INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
$ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
- bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ;
+ bdz Lppcasm_div9 #if (count==0) break ;
$SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
b Lppcasm_divouterloop
Lppcasm_div9:
or r3,r8,r0
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -1825,7 +1732,7 @@ Lppcasm_div9:
# No unrolling done here. Not performance critical.
addic. r5,r5,0 #test r5.
- bc BO_IF,CR0_EQ,Lppcasm_sqr_adios
+ beq Lppcasm_sqr_adios
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
mtctr r5
@@ -1836,9 +1743,9 @@ Lppcasm_sqr_mainloop:
$UMULH r8,r6,r6
$STU r7,$BNSZ(r3)
$STU r8,$BNSZ(r3)
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop
+ bdnz- Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
@@ -1861,7 +1768,7 @@ Lppcasm_sqr_adios:
xor r0,r0,r0
xor r12,r12,r12 # used for carry
rlwinm. r7,r5,30,2,31 # num >> 2
- bc BO_IF,CR0_EQ,Lppcasm_mw_REM
+ beq Lppcasm_mw_REM
mtctr r7
Lppcasm_mw_LOOP:
#mul(rp[0],ap[0],w,c1);
@@ -1899,11 +1806,11 @@ Lppcasm_mw_LOOP:
addi r3,r3,`4*$BNSZ`
addi r4,r4,`4*$BNSZ`
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP
+ bdnz- Lppcasm_mw_LOOP
Lppcasm_mw_REM:
andi. r5,r5,0x3
- bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+ beq Lppcasm_mw_OVER
#mul(rp[0],ap[0],w,c1);
$LD r8,`0*$BNSZ`(r4)
$UMULL r9,r6,r8
@@ -1915,7 +1822,7 @@ Lppcasm_mw_REM:
addi r5,r5,-1
cmpli 0,0,r5,0
- bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+ beq Lppcasm_mw_OVER
#mul(rp[1],ap[1],w,c1);
@@ -1929,7 +1836,7 @@ Lppcasm_mw_REM:
addi r5,r5,-1
cmpli 0,0,r5,0
- bc BO_IF,CR0_EQ,Lppcasm_mw_OVER
+ beq Lppcasm_mw_OVER
#mul_add(rp[2],ap[2],w,c1);
$LD r8,`2*$BNSZ`(r4)
@@ -1942,7 +1849,7 @@ Lppcasm_mw_REM:
Lppcasm_mw_OVER:
addi r3,r12,0
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
#
@@ -1967,7 +1874,7 @@ Lppcasm_mw_OVER:
xor r0,r0,r0 #r0 = 0
xor r12,r12,r12 #r12 = 0 . used for carry
rlwinm. r7,r5,30,2,31 # num >> 2
- bc BO_IF,CR0_EQ,Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
+ beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
mtctr r7
Lppcasm_maw_mainloop:
#mul_add(rp[0],ap[0],w,c1);
@@ -2020,11 +1927,11 @@ Lppcasm_maw_mainloop:
$ST r11,`3*$BNSZ`(r3)
addi r3,r3,`4*$BNSZ`
addi r4,r4,`4*$BNSZ`
- bc BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop
+ bdnz- Lppcasm_maw_mainloop
Lppcasm_maw_leftover:
andi. r5,r5,0x3
- bc BO_IF,CR0_EQ,Lppcasm_maw_adios
+ beq Lppcasm_maw_adios
addi r3,r3,-$BNSZ
addi r4,r4,-$BNSZ
#mul_add(rp[0],ap[0],w,c1);
@@ -2039,7 +1946,7 @@ Lppcasm_maw_leftover:
addze r12,r10
$ST r9,0(r3)
- bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+ bdz Lppcasm_maw_adios
#mul_add(rp[1],ap[1],w,c1);
$LDU r8,$BNSZ(r4)
$UMULL r9,r6,r8
@@ -2051,7 +1958,7 @@ Lppcasm_maw_leftover:
addze r12,r10
$ST r9,0(r3)
- bc BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+ bdz Lppcasm_maw_adios
#mul_add(rp[2],ap[2],w,c1);
$LDU r8,$BNSZ(r4)
$UMULL r9,r6,r8
@@ -2065,17 +1972,10 @@ Lppcasm_maw_leftover:
Lppcasm_maw_adios:
addi r3,r12,0
- bclr BO_ALWAYS,CR0_LT
+ blr
.long 0x00000000
.align 4
EOF
- $data =~ s/\`([^\`]*)\`/eval $1/gem;
-
- # if some assembler chokes on some simplified mnemonic,
- # this is the spot to fix it up, e.g.:
- # GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
- $data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
- # assembler X doesn't accept li, load immediate value
- #$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
- return($data);
-}
+$data =~ s/\`([^\`]*)\`/eval $1/gem;
+print $data;
+close STDOUT;
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
new file mode 100644
index 0000000000..f040466f43
--- /dev/null
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -0,0 +1,1086 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# December 2007
+
+# The reason for undertaken effort is basically following. Even though
+# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
+# performance was observed to be less than impressive, essentially as
+# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
+# Well, it's not surprising that IBM had to make some sacrifices to
+# boost the clock frequency that much, but no overall improvement?
+# Having observed how much difference did switching to FPU make on
+# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
+# Unfortunately the resulting performance improvement is not as
+# impressive, ~30%, and in absolute terms is still very far from what
+# one would expect from 4.7GHz CPU. There is a chance that I'm doing
+# something wrong, but in the lack of assembler level micro-profiling
+# data or at least decent platform guide I can't tell... Or better
+# results might be achieved with VMX... Anyway, this module provides
+# *worse* performance on other PowerPC implementations, ~40-15% slower
+# on PPC970 depending on key length and ~40% slower on Power 5 for all
+# key lengths. As it's obviously inappropriate as "best all-round"
+# alternative, it has to be complemented with run-time CPU family
+# detection. Oh! It should also be noted that unlike other PowerPC
+# implementation IALU ppc-mont.pl module performs *suboptimaly* on
+# >=1024-bit key lengths on Power 6. It should also be noted that
+# *everything* said so far applies to 64-bit builds! As far as 32-bit
+# application executed on 64-bit CPU goes, this module is likely to
+# become preferred choice, because it's easy to adapt it for such
+# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
+
+# February 2008
+
+# Micro-profiling assisted optimization results in ~15% improvement
+# over original ppc64-mont.pl version, or overall ~50% improvement
+# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
+# Power 6 CPU, this module is 5-150% faster depending on key length,
+# [hereafter] more for longer keys. But if compared to ppc-mont.pl
+# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
+# in absolute terms, but it's apparently the way Power 6 is...
+
+# December 2009
+
+# Adapted for 32-bit build this module delivers 25-120%, yes, more
+# than *twice* for longer keys, performance improvement over 32-bit
+# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
+# even 64-bit integer operations and the trouble is that most PPC
+# operating systems don't preserve upper halves of general purpose
+# registers upon 32-bit signal delivery. They do preserve them upon
+# context switch, but not signalling:-( This means that asynchronous
+# signals have to be blocked upon entry to this subroutine. Signal
+# masking (and of course complementary unmasking) has quite an impact
+# on performance, naturally larger for shorter keys. It's so severe
+# that 512-bit key performance can be as low as 1/3 of expected one.
+# This is why this routine can be engaged for longer key operations
+# only on these OSes, see crypto/ppccap.c for further details. MacOS X
+# is an exception from this and doesn't require signal masking, and
+# that's where above improvement coefficients were collected. For
+# others alternative would be to break dependence on upper halves of
+# GPRs by sticking to 32-bit integer operations...
+
+$flavour = shift;
+
+if ($flavour =~ /32/) {
+ $SIZE_T=4;
+ $RZONE= 224;
+ $FRAME= $SIZE_T*12+8*12;
+ $fname= "bn_mul_mont_fpu64";
+
+ $STUX= "stwux"; # store indexed and update
+ $PUSH= "stw";
+ $POP= "lwz";
+} elsif ($flavour =~ /64/) {
+ $SIZE_T=8;
+ $RZONE= 288;
+ $FRAME= $SIZE_T*12+8*12;
+ $fname= "bn_mul_mont_fpu64";
+
+ # same as above, but 64-bit mnemonics...
+ $STUX= "stdux"; # store indexed and update
+ $PUSH= "std";
+ $POP= "ld";
+} else { die "nonsense $flavour"; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=($FRAME+63)&~63;
+$TRANSFER=16*8;
+
+$carry="r0";
+$sp="r1";
+$toc="r2";
+$rp="r3"; $ovf="r3";
+$ap="r4";
+$bp="r5";
+$np="r6";
+$n0="r7";
+$num="r8";
+$rp="r9"; # $rp is reassigned
+$tp="r10";
+$j="r11";
+$i="r12";
+# non-volatile registers
+$nap_d="r14"; # interleaved ap and np in double format
+$a0="r15"; # ap[0]
+$t0="r16"; # temporary registers
+$t1="r17";
+$t2="r18";
+$t3="r19";
+$t4="r20";
+$t5="r21";
+$t6="r22";
+$t7="r23";
+
+# PPC offers enough register bank capacity to unroll inner loops twice
+#
+# ..A3A2A1A0
+# dcba
+# -----------
+# A0a
+# A0b
+# A0c
+# A0d
+# A1a
+# A1b
+# A1c
+# A1d
+# A2a
+# A2b
+# A2c
+# A2d
+# A3a
+# A3b
+# A3c
+# A3d
+# ..a
+# ..b
+#
+$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
+$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
+$dota="f8"; $dotb="f9";
+$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
+$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17";
+$T0a="f18"; $T0b="f19";
+$T1a="f20"; $T1b="f21";
+$T2a="f22"; $T2b="f23";
+$T3a="f24"; $T3b="f25";
+
+# sp----------->+-------------------------------+
+# | saved sp |
+# +-------------------------------+
+# | |
+# +-------------------------------+
+# | 10 saved gpr, r14-r23 |
+# . .
+# . .
+# +12*size_t +-------------------------------+
+# | 12 saved fpr, f14-f25 |
+# . .
+# . .
+# +12*8 +-------------------------------+
+# | padding to 64 byte boundary |
+# . .
+# +X +-------------------------------+
+# | 16 gpr<->fpr transfer zone |
+# . .
+# . .
+# +16*8 +-------------------------------+
+# | __int64 tmp[-1] |
+# +-------------------------------+
+# | __int64 tmp[num] |
+# . .
+# . .
+# . .
+# +(num+1)*8 +-------------------------------+
+# | padding to 64 byte boundary |
+# . .
+# +X +-------------------------------+
+# | double nap_d[4*num] |
+# . .
+# . .
+# . .
+# +-------------------------------+
+
+$code=<<___;
+.machine "any"
+.text
+
+.globl .$fname
+.align 5
+.$fname:
+ cmpwi $num,`3*8/$SIZE_T`
+ mr $rp,r3 ; $rp is reassigned
+ li r3,0 ; possible "not handled" return code
+ bltlr-
+ andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
+ bnelr-
+
+ slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
+ li $i,-4096
+ slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
+ add $tp,$tp,$num ; place for tp[num+1]
+ addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
+ subf $tp,$tp,$sp ; $sp-$tp
+ and $tp,$tp,$i ; minimize TLB usage
+ subf $tp,$sp,$tp ; $tp-$sp
+ $STUX $sp,$sp,$tp ; alloca
+
+ $PUSH r14,`2*$SIZE_T`($sp)
+ $PUSH r15,`3*$SIZE_T`($sp)
+ $PUSH r16,`4*$SIZE_T`($sp)
+ $PUSH r17,`5*$SIZE_T`($sp)
+ $PUSH r18,`6*$SIZE_T`($sp)
+ $PUSH r19,`7*$SIZE_T`($sp)
+ $PUSH r20,`8*$SIZE_T`($sp)
+ $PUSH r21,`9*$SIZE_T`($sp)
+ $PUSH r22,`10*$SIZE_T`($sp)
+ $PUSH r23,`11*$SIZE_T`($sp)
+ stfd f14,`12*$SIZE_T+0`($sp)
+ stfd f15,`12*$SIZE_T+8`($sp)
+ stfd f16,`12*$SIZE_T+16`($sp)
+ stfd f17,`12*$SIZE_T+24`($sp)
+ stfd f18,`12*$SIZE_T+32`($sp)
+ stfd f19,`12*$SIZE_T+40`($sp)
+ stfd f20,`12*$SIZE_T+48`($sp)
+ stfd f21,`12*$SIZE_T+56`($sp)
+ stfd f22,`12*$SIZE_T+64`($sp)
+ stfd f23,`12*$SIZE_T+72`($sp)
+ stfd f24,`12*$SIZE_T+80`($sp)
+ stfd f25,`12*$SIZE_T+88`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
+ ld $a0,0($ap) ; pull ap[0] value
+ ld $n0,0($n0) ; pull n0[0] value
+ ld $t3,0($bp) ; bp[0]
+___
+$code.=<<___ if ($SIZE_T==4);
+ mr $t1,$n0
+ lwz $a0,0($ap) ; pull ap[0,1] value
+ lwz $t0,4($ap)
+ lwz $n0,0($t1) ; pull n0[0,1] value
+ lwz $t1,4($t1)
+ lwz $t3,0($bp) ; bp[0,1]
+ lwz $t2,4($bp)
+ insrdi $a0,$t0,32,0
+ insrdi $n0,$t1,32,0
+ insrdi $t3,$t2,32,0
+___
+$code.=<<___;
+ addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
+ li $i,-64
+ add $nap_d,$tp,$num
+ and $nap_d,$nap_d,$i ; align to 64 bytes
+
+ mulld $t7,$a0,$t3 ; ap[0]*bp[0]
+ ; nap_d is off by 1, because it's used with stfdu/lfdu
+ addi $nap_d,$nap_d,-8
+ srwi $j,$num,`3+1` ; counter register, num/2
+ mulld $t7,$t7,$n0 ; tp[0]*n0
+ addi $j,$j,-1
+ addi $tp,$sp,`$FRAME+$TRANSFER-8`
+ li $carry,0
+ mtctr $j
+
+ ; transfer bp[0] to FPU as 4x16-bit values
+ extrdi $t0,$t3,16,48
+ extrdi $t1,$t3,16,32
+ extrdi $t2,$t3,16,16
+ extrdi $t3,$t3,16,0
+ std $t0,`$FRAME+0`($sp)
+ std $t1,`$FRAME+8`($sp)
+ std $t2,`$FRAME+16`($sp)
+ std $t3,`$FRAME+24`($sp)
+ ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
+ extrdi $t4,$t7,16,48
+ extrdi $t5,$t7,16,32
+ extrdi $t6,$t7,16,16
+ extrdi $t7,$t7,16,0
+ std $t4,`$FRAME+32`($sp)
+ std $t5,`$FRAME+40`($sp)
+ std $t6,`$FRAME+48`($sp)
+ std $t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
+ lwz $t0,4($ap) ; load a[j] as 32-bit word pair
+ lwz $t1,0($ap)
+ lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
+ lwz $t3,8($ap)
+ lwz $t4,4($np) ; load n[j] as 32-bit word pair
+ lwz $t5,0($np)
+ lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
+ lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
+ lfd $ba,`$FRAME+0`($sp)
+ lfd $bb,`$FRAME+8`($sp)
+ lfd $bc,`$FRAME+16`($sp)
+ lfd $bd,`$FRAME+24`($sp)
+ lfd $na,`$FRAME+32`($sp)
+ lfd $nb,`$FRAME+40`($sp)
+ lfd $nc,`$FRAME+48`($sp)
+ lfd $nd,`$FRAME+56`($sp)
+ std $t0,`$FRAME+64`($sp)
+ std $t1,`$FRAME+72`($sp)
+ std $t2,`$FRAME+80`($sp)
+ std $t3,`$FRAME+88`($sp)
+ std $t4,`$FRAME+96`($sp)
+ std $t5,`$FRAME+104`($sp)
+ std $t6,`$FRAME+112`($sp)
+ std $t7,`$FRAME+120`($sp)
+ fcfid $ba,$ba
+ fcfid $bb,$bb
+ fcfid $bc,$bc
+ fcfid $bd,$bd
+ fcfid $na,$na
+ fcfid $nb,$nb
+ fcfid $nc,$nc
+ fcfid $nd,$nd
+
+ lfd $A0,`$FRAME+64`($sp)
+ lfd $A1,`$FRAME+72`($sp)
+ lfd $A2,`$FRAME+80`($sp)
+ lfd $A3,`$FRAME+88`($sp)
+ lfd $N0,`$FRAME+96`($sp)
+ lfd $N1,`$FRAME+104`($sp)
+ lfd $N2,`$FRAME+112`($sp)
+ lfd $N3,`$FRAME+120`($sp)
+ fcfid $A0,$A0
+ fcfid $A1,$A1
+ fcfid $A2,$A2
+ fcfid $A3,$A3
+ fcfid $N0,$N0
+ fcfid $N1,$N1
+ fcfid $N2,$N2
+ fcfid $N3,$N3
+ addi $ap,$ap,16
+ addi $np,$np,16
+
+ fmul $T1a,$A1,$ba
+ fmul $T1b,$A1,$bb
+ stfd $A0,8($nap_d) ; save a[j] in double format
+ stfd $A1,16($nap_d)
+ fmul $T2a,$A2,$ba
+ fmul $T2b,$A2,$bb
+ stfd $A2,24($nap_d) ; save a[j+1] in double format
+ stfd $A3,32($nap_d)
+ fmul $T3a,$A3,$ba
+ fmul $T3b,$A3,$bb
+ stfd $N0,40($nap_d) ; save n[j] in double format
+ stfd $N1,48($nap_d)
+ fmul $T0a,$A0,$ba
+ fmul $T0b,$A0,$bb
+ stfd $N2,56($nap_d) ; save n[j+1] in double format
+ stfdu $N3,64($nap_d)
+
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+
+.align 5
+L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
+ lwz $t0,4($ap) ; load a[j] as 32-bit word pair
+ lwz $t1,0($ap)
+ lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
+ lwz $t3,8($ap)
+ lwz $t4,4($np) ; load n[j] as 32-bit word pair
+ lwz $t5,0($np)
+ lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
+ lwz $t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+ lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
+ lwz $t1,4($ap)
+ lwz $t2,8($ap)
+ lwz $t3,12($ap)
+ lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
+ lwz $t5,4($np)
+ lwz $t6,8($np)
+ lwz $t7,12($np)
+___
+$code.=<<___;
+ std $t0,`$FRAME+64`($sp)
+ std $t1,`$FRAME+72`($sp)
+ std $t2,`$FRAME+80`($sp)
+ std $t3,`$FRAME+88`($sp)
+ std $t4,`$FRAME+96`($sp)
+ std $t5,`$FRAME+104`($sp)
+ std $t6,`$FRAME+112`($sp)
+ std $t7,`$FRAME+120`($sp)
+ ld $t0,`$FRAME+0`($sp)
+ ld $t1,`$FRAME+8`($sp)
+ ld $t2,`$FRAME+16`($sp)
+ ld $t3,`$FRAME+24`($sp)
+ ld $t4,`$FRAME+32`($sp)
+ ld $t5,`$FRAME+40`($sp)
+ ld $t6,`$FRAME+48`($sp)
+ ld $t7,`$FRAME+56`($sp)
+ lfd $A0,`$FRAME+64`($sp)
+ lfd $A1,`$FRAME+72`($sp)
+ lfd $A2,`$FRAME+80`($sp)
+ lfd $A3,`$FRAME+88`($sp)
+ lfd $N0,`$FRAME+96`($sp)
+ lfd $N1,`$FRAME+104`($sp)
+ lfd $N2,`$FRAME+112`($sp)
+ lfd $N3,`$FRAME+120`($sp)
+ fcfid $A0,$A0
+ fcfid $A1,$A1
+ fcfid $A2,$A2
+ fcfid $A3,$A3
+ fcfid $N0,$N0
+ fcfid $N1,$N1
+ fcfid $N2,$N2
+ fcfid $N3,$N3
+ addi $ap,$ap,16
+ addi $np,$np,16
+
+ fmul $T1a,$A1,$ba
+ fmul $T1b,$A1,$bb
+ fmul $T2a,$A2,$ba
+ fmul $T2b,$A2,$bb
+ stfd $A0,8($nap_d) ; save a[j] in double format
+ stfd $A1,16($nap_d)
+ fmul $T3a,$A3,$ba
+ fmul $T3b,$A3,$bb
+ fmadd $T0a,$A0,$ba,$dota
+ fmadd $T0b,$A0,$bb,$dotb
+ stfd $A2,24($nap_d) ; save a[j+1] in double format
+ stfd $A3,32($nap_d)
+
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ stfd $N0,40($nap_d) ; save n[j] in double format
+ stfd $N1,48($nap_d)
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ add $t0,$t0,$carry ; can not overflow
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+ stfd $N2,56($nap_d) ; save n[j+1] in double format
+ stfdu $N3,64($nap_d)
+ srdi $carry,$t0,16
+ add $t1,$t1,$carry
+ srdi $carry,$t1,16
+
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ insrdi $t0,$t1,16,32
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ add $t2,$t2,$carry
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ srdi $carry,$t2,16
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+ insrdi $t0,$t2,16,16
+ add $t3,$t3,$carry
+ srdi $carry,$t3,16
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ insrdi $t0,$t3,16,0 ; 0..63 bits
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ add $t4,$t4,$carry
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ srdi $carry,$t4,16
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+ add $t5,$t5,$carry
+ srdi $carry,$t5,16
+ insrdi $t4,$t5,16,32
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ add $t6,$t6,$carry
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ srdi $carry,$t6,16
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ insrdi $t4,$t6,16,16
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+ add $t7,$t7,$carry
+ insrdi $t4,$t7,16,0 ; 64..127 bits
+ srdi $carry,$t7,16 ; upper 33 bits
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+ std $t0,8($tp) ; tp[j-1]
+ stdu $t4,16($tp) ; tp[j]
+ bdnz- L1st
+
+ fctid $dota,$dota
+ fctid $dotb,$dotb
+
+ ld $t0,`$FRAME+0`($sp)
+ ld $t1,`$FRAME+8`($sp)
+ ld $t2,`$FRAME+16`($sp)
+ ld $t3,`$FRAME+24`($sp)
+ ld $t4,`$FRAME+32`($sp)
+ ld $t5,`$FRAME+40`($sp)
+ ld $t6,`$FRAME+48`($sp)
+ ld $t7,`$FRAME+56`($sp)
+ stfd $dota,`$FRAME+64`($sp)
+ stfd $dotb,`$FRAME+72`($sp)
+
+ add $t0,$t0,$carry ; can not overflow
+ srdi $carry,$t0,16
+ add $t1,$t1,$carry
+ srdi $carry,$t1,16
+ insrdi $t0,$t1,16,32
+ add $t2,$t2,$carry
+ srdi $carry,$t2,16
+ insrdi $t0,$t2,16,16
+ add $t3,$t3,$carry
+ srdi $carry,$t3,16
+ insrdi $t0,$t3,16,0 ; 0..63 bits
+ add $t4,$t4,$carry
+ srdi $carry,$t4,16
+ add $t5,$t5,$carry
+ srdi $carry,$t5,16
+ insrdi $t4,$t5,16,32
+ add $t6,$t6,$carry
+ srdi $carry,$t6,16
+ insrdi $t4,$t6,16,16
+ add $t7,$t7,$carry
+ insrdi $t4,$t7,16,0 ; 64..127 bits
+ srdi $carry,$t7,16 ; upper 33 bits
+ ld $t6,`$FRAME+64`($sp)
+ ld $t7,`$FRAME+72`($sp)
+
+ std $t0,8($tp) ; tp[j-1]
+ stdu $t4,16($tp) ; tp[j]
+
+ add $t6,$t6,$carry ; can not overflow
+ srdi $carry,$t6,16
+ add $t7,$t7,$carry
+ insrdi $t6,$t7,48,0
+ srdi $ovf,$t7,48
+ std $t6,8($tp) ; tp[num-1]
+
+ slwi $t7,$num,2
+ subf $nap_d,$t7,$nap_d ; rewind pointer
+
+ li $i,8 ; i=1
+.align 5
+Louter:
+___
+$code.=<<___ if ($SIZE_T==8);
+ ldx $t3,$bp,$i ; bp[i]
+___
+$code.=<<___ if ($SIZE_T==4);
+ add $t0,$bp,$i
+ lwz $t3,0($t0) ; bp[i,i+1]
+ lwz $t0,4($t0)
+ insrdi $t3,$t0,32,0
+___
+$code.=<<___;
+ ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
+ mulld $t7,$a0,$t3 ; ap[0]*bp[i]
+
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
+ li $carry,0
+ mulld $t7,$t7,$n0 ; tp[0]*n0
+ mtctr $j
+
+ ; transfer bp[i] to FPU as 4x16-bit values
+ extrdi $t0,$t3,16,48
+ extrdi $t1,$t3,16,32
+ extrdi $t2,$t3,16,16
+ extrdi $t3,$t3,16,0
+ std $t0,`$FRAME+0`($sp)
+ std $t1,`$FRAME+8`($sp)
+ std $t2,`$FRAME+16`($sp)
+ std $t3,`$FRAME+24`($sp)
+ ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
+ extrdi $t4,$t7,16,48
+ extrdi $t5,$t7,16,32
+ extrdi $t6,$t7,16,16
+ extrdi $t7,$t7,16,0
+ std $t4,`$FRAME+32`($sp)
+ std $t5,`$FRAME+40`($sp)
+ std $t6,`$FRAME+48`($sp)
+ std $t7,`$FRAME+56`($sp)
+
+ lfd $A0,8($nap_d) ; load a[j] in double format
+ lfd $A1,16($nap_d)
+ lfd $A2,24($nap_d) ; load a[j+1] in double format
+ lfd $A3,32($nap_d)
+ lfd $N0,40($nap_d) ; load n[j] in double format
+ lfd $N1,48($nap_d)
+ lfd $N2,56($nap_d) ; load n[j+1] in double format
+ lfdu $N3,64($nap_d)
+
+ lfd $ba,`$FRAME+0`($sp)
+ lfd $bb,`$FRAME+8`($sp)
+ lfd $bc,`$FRAME+16`($sp)
+ lfd $bd,`$FRAME+24`($sp)
+ lfd $na,`$FRAME+32`($sp)
+ lfd $nb,`$FRAME+40`($sp)
+ lfd $nc,`$FRAME+48`($sp)
+ lfd $nd,`$FRAME+56`($sp)
+
+ fcfid $ba,$ba
+ fcfid $bb,$bb
+ fcfid $bc,$bc
+ fcfid $bd,$bd
+ fcfid $na,$na
+ fcfid $nb,$nb
+ fcfid $nc,$nc
+ fcfid $nd,$nd
+
+ fmul $T1a,$A1,$ba
+ fmul $T1b,$A1,$bb
+ fmul $T2a,$A2,$ba
+ fmul $T2b,$A2,$bb
+ fmul $T3a,$A3,$ba
+ fmul $T3b,$A3,$bb
+ fmul $T0a,$A0,$ba
+ fmul $T0b,$A0,$bb
+
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ lfd $A0,8($nap_d) ; load a[j] in double format
+ lfd $A1,16($nap_d)
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ lfd $A2,24($nap_d) ; load a[j+1] in double format
+ lfd $A3,32($nap_d)
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+
+.align 5
+Linner:
+ fmul $T1a,$A1,$ba
+ fmul $T1b,$A1,$bb
+ fmul $T2a,$A2,$ba
+ fmul $T2b,$A2,$bb
+ lfd $N0,40($nap_d) ; load n[j] in double format
+ lfd $N1,48($nap_d)
+ fmul $T3a,$A3,$ba
+ fmul $T3b,$A3,$bb
+ fmadd $T0a,$A0,$ba,$dota
+ fmadd $T0b,$A0,$bb,$dotb
+ lfd $N2,56($nap_d) ; load n[j+1] in double format
+ lfdu $N3,64($nap_d)
+
+ fmadd $T1a,$A0,$bc,$T1a
+ fmadd $T1b,$A0,$bd,$T1b
+ fmadd $T2a,$A1,$bc,$T2a
+ fmadd $T2b,$A1,$bd,$T2b
+ lfd $A0,8($nap_d) ; load a[j] in double format
+ lfd $A1,16($nap_d)
+ fmadd $T3a,$A2,$bc,$T3a
+ fmadd $T3b,$A2,$bd,$T3b
+ fmul $dota,$A3,$bc
+ fmul $dotb,$A3,$bd
+ lfd $A2,24($nap_d) ; load a[j+1] in double format
+ lfd $A3,32($nap_d)
+
+ fmadd $T1a,$N1,$na,$T1a
+ fmadd $T1b,$N1,$nb,$T1b
+ ld $t0,`$FRAME+0`($sp)
+ ld $t1,`$FRAME+8`($sp)
+ fmadd $T2a,$N2,$na,$T2a
+ fmadd $T2b,$N2,$nb,$T2b
+ ld $t2,`$FRAME+16`($sp)
+ ld $t3,`$FRAME+24`($sp)
+ fmadd $T3a,$N3,$na,$T3a
+ fmadd $T3b,$N3,$nb,$T3b
+ add $t0,$t0,$carry ; can not overflow
+ ld $t4,`$FRAME+32`($sp)
+ ld $t5,`$FRAME+40`($sp)
+ fmadd $T0a,$N0,$na,$T0a
+ fmadd $T0b,$N0,$nb,$T0b
+ srdi $carry,$t0,16
+ add $t1,$t1,$carry
+ srdi $carry,$t1,16
+ ld $t6,`$FRAME+48`($sp)
+ ld $t7,`$FRAME+56`($sp)
+
+ fmadd $T1a,$N0,$nc,$T1a
+ fmadd $T1b,$N0,$nd,$T1b
+ insrdi $t0,$t1,16,32
+ ld $t1,8($tp) ; tp[j]
+ fmadd $T2a,$N1,$nc,$T2a
+ fmadd $T2b,$N1,$nd,$T2b
+ add $t2,$t2,$carry
+ fmadd $T3a,$N2,$nc,$T3a
+ fmadd $T3b,$N2,$nd,$T3b
+ srdi $carry,$t2,16
+ insrdi $t0,$t2,16,16
+ fmadd $dota,$N3,$nc,$dota
+ fmadd $dotb,$N3,$nd,$dotb
+ add $t3,$t3,$carry
+ ldu $t2,16($tp) ; tp[j+1]
+ srdi $carry,$t3,16
+ insrdi $t0,$t3,16,0 ; 0..63 bits
+ add $t4,$t4,$carry
+
+ fctid $T0a,$T0a
+ fctid $T0b,$T0b
+ srdi $carry,$t4,16
+ fctid $T1a,$T1a
+ fctid $T1b,$T1b
+ add $t5,$t5,$carry
+ fctid $T2a,$T2a
+ fctid $T2b,$T2b
+ srdi $carry,$t5,16
+ insrdi $t4,$t5,16,32
+ fctid $T3a,$T3a
+ fctid $T3b,$T3b
+ add $t6,$t6,$carry
+ srdi $carry,$t6,16
+ insrdi $t4,$t6,16,16
+
+ stfd $T0a,`$FRAME+0`($sp)
+ stfd $T0b,`$FRAME+8`($sp)
+ add $t7,$t7,$carry
+ addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
+ stfd $T1a,`$FRAME+16`($sp)
+ stfd $T1b,`$FRAME+24`($sp)
+ insrdi $t4,$t7,16,0 ; 64..127 bits
+ srdi $carry,$t7,16 ; upper 33 bits
+ stfd $T2a,`$FRAME+32`($sp)
+ stfd $T2b,`$FRAME+40`($sp)
+ adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
+ stfd $T3a,`$FRAME+48`($sp)
+ stfd $T3b,`$FRAME+56`($sp)
+ addze $carry,$carry
+ std $t3,-16($tp) ; tp[j-1]
+ std $t5,-8($tp) ; tp[j]
+ bdnz- Linner
+
+ fctid $dota,$dota
+ fctid $dotb,$dotb
+ ld $t0,`$FRAME+0`($sp)
+ ld $t1,`$FRAME+8`($sp)
+ ld $t2,`$FRAME+16`($sp)
+ ld $t3,`$FRAME+24`($sp)
+ ld $t4,`$FRAME+32`($sp)
+ ld $t5,`$FRAME+40`($sp)
+ ld $t6,`$FRAME+48`($sp)
+ ld $t7,`$FRAME+56`($sp)
+ stfd $dota,`$FRAME+64`($sp)
+ stfd $dotb,`$FRAME+72`($sp)
+
+ add $t0,$t0,$carry ; can not overflow
+ srdi $carry,$t0,16
+ add $t1,$t1,$carry
+ srdi $carry,$t1,16
+ insrdi $t0,$t1,16,32
+ add $t2,$t2,$carry
+ ld $t1,8($tp) ; tp[j]
+ srdi $carry,$t2,16
+ insrdi $t0,$t2,16,16
+ add $t3,$t3,$carry
+ ldu $t2,16($tp) ; tp[j+1]
+ srdi $carry,$t3,16
+ insrdi $t0,$t3,16,0 ; 0..63 bits
+ add $t4,$t4,$carry
+ srdi $carry,$t4,16
+ add $t5,$t5,$carry
+ srdi $carry,$t5,16
+ insrdi $t4,$t5,16,32
+ add $t6,$t6,$carry
+ srdi $carry,$t6,16
+ insrdi $t4,$t6,16,16
+ add $t7,$t7,$carry
+ insrdi $t4,$t7,16,0 ; 64..127 bits
+ srdi $carry,$t7,16 ; upper 33 bits
+ ld $t6,`$FRAME+64`($sp)
+ ld $t7,`$FRAME+72`($sp)
+
+ addc $t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t0,$t0,32,0
+ extrdi $t1,$t1,32,0
+ adde $t0,$t0,$t1
+___
+$code.=<<___;
+ adde $t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
+ extrdi $t4,$t4,32,0
+ extrdi $t2,$t2,32,0
+ adde $t4,$t4,$t2
+___
+$code.=<<___;
+ addze $carry,$carry
+
+ std $t3,-16($tp) ; tp[j-1]
+ std $t5,-8($tp) ; tp[j]
+
+ add $carry,$carry,$ovf ; comsume upmost overflow
+ add $t6,$t6,$carry ; can not overflow
+ srdi $carry,$t6,16
+ add $t7,$t7,$carry
+ insrdi $t6,$t7,48,0
+ srdi $ovf,$t7,48
+ std $t6,0($tp) ; tp[num-1]
+
+ slwi $t7,$num,2
+ addi $i,$i,8
+ subf $nap_d,$t7,$nap_d ; rewind pointer
+ cmpw $i,$num
+ blt- Louter
+___
+
+$code.=<<___ if ($SIZE_T==8);
+ subf $np,$num,$np ; rewind np
+ addi $j,$j,1 ; restore counter
+ subfc $i,$i,$i ; j=0 and "clear" XER[CA]
+ addi $tp,$sp,`$FRAME+$TRANSFER+8`
+ addi $t4,$sp,`$FRAME+$TRANSFER+16`
+ addi $t5,$np,8
+ addi $t6,$rp,8
+ mtctr $j
+
+.align 4
+Lsub: ldx $t0,$tp,$i
+ ldx $t1,$np,$i
+ ldx $t2,$t4,$i
+ ldx $t3,$t5,$i
+ subfe $t0,$t1,$t0 ; tp[j]-np[j]
+ subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
+ stdx $t0,$rp,$i
+ stdx $t2,$t6,$i
+ addi $i,$i,16
+ bdnz- Lsub
+
+ li $i,0
+ subfe $ovf,$i,$ovf ; handle upmost overflow bit
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
+ addi $t7,$ap,8
+ mtctr $j
+
+.align 4
+Lcopy: ; copy or in-place refresh
+ ldx $t0,$ap,$i
+ ldx $t1,$t7,$i
+ std $i,8($nap_d) ; zap nap_d
+ std $i,16($nap_d)
+ std $i,24($nap_d)
+ std $i,32($nap_d)
+ std $i,40($nap_d)
+ std $i,48($nap_d)
+ std $i,56($nap_d)
+ stdu $i,64($nap_d)
+ stdx $t0,$rp,$i
+ stdx $t1,$t6,$i
+ stdx $i,$tp,$i ; zap tp at once
+ stdx $i,$t4,$i
+ addi $i,$i,16
+ bdnz- Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+ subf $np,$num,$np ; rewind np
+ addi $j,$j,1 ; restore counter
+ subfc $i,$i,$i ; j=0 and "clear" XER[CA]
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ addi $np,$np,-4
+ addi $rp,$rp,-4
+ addi $ap,$sp,`$FRAME+$TRANSFER+4`
+ mtctr $j
+
+.align 4
+Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
+ ldu $t2,16($tp)
+ lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
+ lwz $t5,8($np)
+ lwz $t6,12($np)
+ lwzu $t7,16($np)
+ extrdi $t1,$t0,32,0
+ extrdi $t3,$t2,32,0
+ subfe $t4,$t4,$t0 ; tp[j]-np[j]
+ stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
+ subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
+ stw $t1,8($ap)
+ subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
+ stw $t2,12($ap)
+ subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
+ stwu $t3,16($ap)
+ stw $t4,4($rp)
+ stw $t5,8($rp)
+ stw $t6,12($rp)
+ stwu $t7,16($rp)
+ bdnz- Lsub
+
+ li $i,0
+ subfe $ovf,$i,$ovf ; handle upmost overflow bit
+ addi $tp,$sp,`$FRAME+$TRANSFER+4`
+ subf $rp,$num,$rp ; rewind rp
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
+ addi $tp,$sp,`$FRAME+$TRANSFER`
+ mtctr $j
+
+.align 4
+Lcopy: ; copy or in-place refresh
+ lwz $t0,4($ap)
+ lwz $t1,8($ap)
+ lwz $t2,12($ap)
+ lwzu $t3,16($ap)
+ std $i,8($nap_d) ; zap nap_d
+ std $i,16($nap_d)
+ std $i,24($nap_d)
+ std $i,32($nap_d)
+ std $i,40($nap_d)
+ std $i,48($nap_d)
+ std $i,56($nap_d)
+ stdu $i,64($nap_d)
+ stw $t0,4($rp)
+ stw $t1,8($rp)
+ stw $t2,12($rp)
+ stwu $t3,16($rp)
+ std $i,8($tp) ; zap tp at once
+ stdu $i,16($tp)
+ bdnz- Lcopy
+___
+
+$code.=<<___;
+ $POP r14,`2*$SIZE_T`($sp)
+ $POP r15,`3*$SIZE_T`($sp)
+ $POP r16,`4*$SIZE_T`($sp)
+ $POP r17,`5*$SIZE_T`($sp)
+ $POP r18,`6*$SIZE_T`($sp)
+ $POP r19,`7*$SIZE_T`($sp)
+ $POP r20,`8*$SIZE_T`($sp)
+ $POP r21,`9*$SIZE_T`($sp)
+ $POP r22,`10*$SIZE_T`($sp)
+ $POP r23,`11*$SIZE_T`($sp)
+ lfd f14,`12*$SIZE_T+0`($sp)
+ lfd f15,`12*$SIZE_T+8`($sp)
+ lfd f16,`12*$SIZE_T+16`($sp)
+ lfd f17,`12*$SIZE_T+24`($sp)
+ lfd f18,`12*$SIZE_T+32`($sp)
+ lfd f19,`12*$SIZE_T+40`($sp)
+ lfd f20,`12*$SIZE_T+48`($sp)
+ lfd f21,`12*$SIZE_T+56`($sp)
+ lfd f22,`12*$SIZE_T+64`($sp)
+ lfd f23,`12*$SIZE_T+72`($sp)
+ lfd f24,`12*$SIZE_T+80`($sp)
+ lfd f25,`12*$SIZE_T+88`($sp)
+ $POP $sp,0($sp)
+ li r3,1 ; signal "handled"
+ blr
+ .long 0
+.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl
new file mode 100644
index 0000000000..d23251033b
--- /dev/null
+++ b/crypto/bn/asm/s390x-mont.pl
@@ -0,0 +1,225 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# April 2007.
+#
+# Performance improvement over vanilla C code varies from 85% to 45%
+# depending on key length and benchmark. Unfortunately in this context
+# these are not very impressive results [for code that utilizes "wide"
+# 64x64=128-bit multiplication, which is not commonly available to C
+# programmers], at least hand-coded bn_asm.c replacement is known to
+# provide 30-40% better results for longest keys. Well, on a second
+# thought it's not very surprising, because z-CPUs are single-issue
+# and _strictly_ in-order execution, while bn_mul_mont is more or less
+# dependent on CPU ability to pipe-line instructions and have several
+# of them "in-flight" at the same time. I mean while other methods,
+# for example Karatsuba, aim to minimize amount of multiplications at
+# the cost of other operations increase, bn_mul_mont aim to neatly
+# "overlap" multiplications and the other operations [and on most
+# platforms even minimize the amount of the other operations, in
+# particular references to memory]. But it's possible to improve this
+# module performance by implementing dedicated squaring code-path and
+# possibly by unrolling loops...
+
+# January 2009.
+#
+# Reschedule to minimize/avoid Address Generation Interlock hazard,
+# make inner loops counter-based.
+
+$mn0="%r0";
+$num="%r1";
+
+# int bn_mul_mont(
+$rp="%r2"; # BN_ULONG *rp,
+$ap="%r3"; # const BN_ULONG *ap,
+$bp="%r4"; # const BN_ULONG *bp,
+$np="%r5"; # const BN_ULONG *np,
+$n0="%r6"; # const BN_ULONG *n0,
+#$num="160(%r15)" # int num);
+
+$bi="%r2"; # zaps rp
+$j="%r7";
+
+$ahi="%r8";
+$alo="%r9";
+$nhi="%r10";
+$nlo="%r11";
+$AHI="%r12";
+$NHI="%r13";
+$count="%r14";
+$sp="%r15";
+
+$code.=<<___;
+.text
+.globl bn_mul_mont
+.type bn_mul_mont,\@function
+bn_mul_mont:
+ lgf $num,164($sp) # pull $num
+ sla $num,3 # $num to enumerate bytes
+ la $bp,0($num,$bp)
+
+ stg %r2,16($sp)
+
+ cghi $num,16 #
+ lghi %r2,0 #
+ blr %r14 # if($num<16) return 0;
+ cghi $num,128 #
+ bhr %r14 # if($num>128) return 0;
+
+ stmg %r3,%r15,24($sp)
+
+ lghi $rp,-160-8 # leave room for carry bit
+ lcgr $j,$num # -$num
+ lgr %r0,$sp
+ la $rp,0($rp,$sp)
+ la $sp,0($j,$rp) # alloca
+ stg %r0,0($sp) # back chain
+
+ sra $num,3 # restore $num
+ la $bp,0($j,$bp) # restore $bp
+ ahi $num,-1 # adjust $num for inner loop
+ lg $n0,0($n0) # pull n0
+
+ lg $bi,0($bp)
+ lg $alo,0($ap)
+ mlgr $ahi,$bi # ap[0]*bp[0]
+ lgr $AHI,$ahi
+
+ lgr $mn0,$alo # "tp[0]"*n0
+ msgr $mn0,$n0
+
+ lg $nlo,0($np) #
+ mlgr $nhi,$mn0 # np[0]*m1
+ algr $nlo,$alo # +="tp[0]"
+ lghi $NHI,0
+ alcgr $NHI,$nhi
+
+ la $j,8(%r0) # j=1
+ lr $count,$num
+
+.align 16
+.L1st:
+ lg $alo,0($j,$ap)
+ mlgr $ahi,$bi # ap[j]*bp[0]
+ algr $alo,$AHI
+ lghi $AHI,0
+ alcgr $AHI,$ahi
+
+ lg $nlo,0($j,$np)
+ mlgr $nhi,$mn0 # np[j]*m1
+ algr $nlo,$NHI
+ lghi $NHI,0
+ alcgr $nhi,$NHI # +="tp[j]"
+ algr $nlo,$alo
+ alcgr $NHI,$nhi
+
+ stg $nlo,160-8($j,$sp) # tp[j-1]=
+ la $j,8($j) # j++
+ brct $count,.L1st
+
+ algr $NHI,$AHI
+ lghi $AHI,0
+ alcgr $AHI,$AHI # upmost overflow bit
+ stg $NHI,160-8($j,$sp)
+ stg $AHI,160($j,$sp)
+ la $bp,8($bp) # bp++
+
+.Louter:
+ lg $bi,0($bp) # bp[i]
+ lg $alo,0($ap)
+ mlgr $ahi,$bi # ap[0]*bp[i]
+ alg $alo,160($sp) # +=tp[0]
+ lghi $AHI,0
+ alcgr $AHI,$ahi
+
+ lgr $mn0,$alo
+ msgr $mn0,$n0 # tp[0]*n0
+
+ lg $nlo,0($np) # np[0]
+ mlgr $nhi,$mn0 # np[0]*m1
+ algr $nlo,$alo # +="tp[0]"
+ lghi $NHI,0
+ alcgr $NHI,$nhi
+
+ la $j,8(%r0) # j=1
+ lr $count,$num
+
+.align 16
+.Linner:
+ lg $alo,0($j,$ap)
+ mlgr $ahi,$bi # ap[j]*bp[i]
+ algr $alo,$AHI
+ lghi $AHI,0
+ alcgr $ahi,$AHI
+ alg $alo,160($j,$sp)# +=tp[j]
+ alcgr $AHI,$ahi
+
+ lg $nlo,0($j,$np)
+ mlgr $nhi,$mn0 # np[j]*m1
+ algr $nlo,$NHI
+ lghi $NHI,0
+ alcgr $nhi,$NHI
+ algr $nlo,$alo # +="tp[j]"
+ alcgr $NHI,$nhi
+
+ stg $nlo,160-8($j,$sp) # tp[j-1]=
+ la $j,8($j) # j++
+ brct $count,.Linner
+
+ algr $NHI,$AHI
+ lghi $AHI,0
+ alcgr $AHI,$AHI
+ alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
+ lghi $ahi,0
+ alcgr $AHI,$ahi # new upmost overflow bit
+ stg $NHI,160-8($j,$sp)
+ stg $AHI,160($j,$sp)
+
+ la $bp,8($bp) # bp++
+ clg $bp,160+8+32($j,$sp) # compare to &bp[num]
+ jne .Louter
+
+ lg $rp,160+8+16($j,$sp) # reincarnate rp
+ la $ap,160($sp)
+ ahi $num,1 # restore $num, incidentally clears "borrow"
+
+ la $j,0(%r0)
+ lr $count,$num
+.Lsub: lg $alo,0($j,$ap)
+ slbg $alo,0($j,$np)
+ stg $alo,0($j,$rp)
+ la $j,8($j)
+ brct $count,.Lsub
+ lghi $ahi,0
+ slbgr $AHI,$ahi # handle upmost carry
+
+ ngr $ap,$AHI
+ lghi $np,-1
+ xgr $np,$AHI
+ ngr $np,$rp
+ ogr $ap,$np # ap=borrow?tp:rp
+
+ la $j,0(%r0)
+ lgr $count,$num
+.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
+ stg $j,160($j,$sp) # zap tp
+ stg $alo,0($j,$rp)
+ la $j,8($j)
+ brct $count,.Lcopy
+
+ la %r1,160+8+48($j,$sp)
+ lmg %r6,%r15,0(%r1)
+ lghi %r2,1 # signal "processed"
+ br %r14
+.size bn_mul_mont,.-bn_mul_mont
+.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S
new file mode 100755
index 0000000000..8f45f5d513
--- /dev/null
+++ b/crypto/bn/asm/s390x.S
@@ -0,0 +1,678 @@
+.ident "s390x.S, version 1.0"
+// ====================================================================
+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// project.
+//
+// Rights for redistribution and usage in source and binary forms are
+// granted according to the OpenSSL license. Warranty of any kind is
+// disclaimed.
+// ====================================================================
+
+.text
+
+#define zero %r0
+
+// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+.globl bn_mul_add_words
+.type bn_mul_add_words,@function
+.align 4
+bn_mul_add_words:
+ lghi zero,0 // zero = 0
+ la %r1,0(%r2) // put rp aside
+ lghi %r2,0 // i=0;
+ ltgfr %r4,%r4
+ bler %r14 // if (len<=0) return 0;
+
+ stmg %r6,%r10,48(%r15)
+ lghi %r8,0 // carry = 0
+ srag %r10,%r4,2 // cnt=len/4
+ jz .Loop1_madd
+
+.Loop4_madd:
+ lg %r7,0(%r2,%r3) // ap[i]
+ mlgr %r6,%r5 // *=w
+ algr %r7,%r8 // +=carry
+ alcgr %r6,zero
+ alg %r7,0(%r2,%r1) // +=rp[i]
+ alcgr %r6,zero
+ stg %r7,0(%r2,%r1) // rp[i]=
+
+ lg %r9,8(%r2,%r3)
+ mlgr %r8,%r5
+ algr %r9,%r6
+ alcgr %r8,zero
+ alg %r9,8(%r2,%r1)
+ alcgr %r8,zero
+ stg %r9,8(%r2,%r1)
+
+ lg %r7,16(%r2,%r3)
+ mlgr %r6,%r5
+ algr %r7,%r8
+ alcgr %r6,zero
+ alg %r7,16(%r2,%r1)
+ alcgr %r6,zero
+ stg %r7,16(%r2,%r1)
+
+ lg %r9,24(%r2,%r3)
+ mlgr %r8,%r5
+ algr %r9,%r6
+ alcgr %r8,zero
+ alg %r9,24(%r2,%r1)
+ alcgr %r8,zero
+ stg %r9,24(%r2,%r1)
+
+ la %r2,32(%r2) // i+=4
+ brct %r10,.Loop4_madd
+
+ lghi %r10,3
+ nr %r4,%r10 // cnt=len%4
+ jz .Lend_madd
+
+.Loop1_madd:
+ lg %r7,0(%r2,%r3) // ap[i]
+ mlgr %r6,%r5 // *=w
+ algr %r7,%r8 // +=carry
+ alcgr %r6,zero
+ alg %r7,0(%r2,%r1) // +=rp[i]
+ alcgr %r6,zero
+ stg %r7,0(%r2,%r1) // rp[i]=
+
+ lgr %r8,%r6
+ la %r2,8(%r2) // i++
+ brct %r4,.Loop1_madd
+
+.Lend_madd:
+ lgr %r2,%r8
+ lmg %r6,%r10,48(%r15)
+ br %r14
+.size bn_mul_add_words,.-bn_mul_add_words
+
+// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
+.globl bn_mul_words
+.type bn_mul_words,@function
+.align 4
+bn_mul_words:
+ lghi zero,0 // zero = 0
+ la %r1,0(%r2) // put rp aside
+ lghi %r2,0 // i=0;
+ ltgfr %r4,%r4
+ bler %r14 // if (len<=0) return 0;
+
+ stmg %r6,%r10,48(%r15)
+ lghi %r8,0 // carry = 0
+ srag %r10,%r4,2 // cnt=len/4
+ jz .Loop1_mul
+
+.Loop4_mul:
+ lg %r7,0(%r2,%r3) // ap[i]
+ mlgr %r6,%r5 // *=w
+ algr %r7,%r8 // +=carry
+ alcgr %r6,zero
+ stg %r7,0(%r2,%r1) // rp[i]=
+
+ lg %r9,8(%r2,%r3)
+ mlgr %r8,%r5
+ algr %r9,%r6
+ alcgr %r8,zero
+ stg %r9,8(%r2,%r1)
+
+ lg %r7,16(%r2,%r3)
+ mlgr %r6,%r5
+ algr %r7,%r8
+ alcgr %r6,zero
+ stg %r7,16(%r2,%r1)
+
+ lg %r9,24(%r2,%r3)
+ mlgr %r8,%r5
+ algr %r9,%r6
+ alcgr %r8,zero
+ stg %r9,24(%r2,%r1)
+
+ la %r2,32(%r2) // i+=4
+ brct %r10,.Loop4_mul
+
+ lghi %r10,3
+ nr %r4,%r10 // cnt=len%4
+ jz .Lend_mul
+
+.Loop1_mul:
+ lg %r7,0(%r2,%r3) // ap[i]
+ mlgr %r6,%r5 // *=w
+ algr %r7,%r8 // +=carry
+ alcgr %r6,zero
+ stg %r7,0(%r2,%r1) // rp[i]=
+
+ lgr %r8,%r6
+ la %r2,8(%r2) // i++
+ brct %r4,.Loop1_mul
+
+.Lend_mul:
+ lgr %r2,%r8
+ lmg %r6,%r10,48(%r15)
+ br %r14
+.size bn_mul_words,.-bn_mul_words
+
+// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
+.globl bn_sqr_words
+.type bn_sqr_words,@function
+.align 4
+bn_sqr_words:
+ ltgfr %r4,%r4
+ bler %r14
+
+ stmg %r6,%r7,48(%r15)
+ srag %r1,%r4,2 // cnt=len/4
+ jz .Loop1_sqr
+
+.Loop4_sqr:
+ lg %r7,0(%r3)
+ mlgr %r6,%r7
+ stg %r7,0(%r2)
+ stg %r6,8(%r2)
+
+ lg %r7,8(%r3)
+ mlgr %r6,%r7
+ stg %r7,16(%r2)
+ stg %r6,24(%r2)
+
+ lg %r7,16(%r3)
+ mlgr %r6,%r7
+ stg %r7,32(%r2)
+ stg %r6,40(%r2)
+
+ lg %r7,24(%r3)
+ mlgr %r6,%r7
+ stg %r7,48(%r2)
+ stg %r6,56(%r2)
+
+ la %r3,32(%r3)
+ la %r2,64(%r2)
+ brct %r1,.Loop4_sqr
+
+ lghi %r1,3
+ nr %r4,%r1 // cnt=len%4
+ jz .Lend_sqr
+
+.Loop1_sqr:
+ lg %r7,0(%r3)
+ mlgr %r6,%r7
+ stg %r7,0(%r2)
+ stg %r6,8(%r2)
+
+ la %r3,8(%r3)
+ la %r2,16(%r2)
+ brct %r4,.Loop1_sqr
+
+.Lend_sqr:
+ lmg %r6,%r7,48(%r15)
+ br %r14
+.size bn_sqr_words,.-bn_sqr_words
+
+// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
+.globl bn_div_words
+.type bn_div_words,@function
+.align 4
+bn_div_words:
+ dlgr %r2,%r4
+ lgr %r2,%r3
+ br %r14
+.size bn_div_words,.-bn_div_words
+
+// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+.globl bn_add_words
+.type bn_add_words,@function
+.align 4
+bn_add_words:
+ la %r1,0(%r2) // put rp aside
+ lghi %r2,0 // i=0
+ ltgfr %r5,%r5
+ bler %r14 // if (len<=0) return 0;
+
+ stg %r6,48(%r15)
+ lghi %r6,3
+ nr %r6,%r5 // len%4
+ sra %r5,2 // len/4, use sra because it sets condition code
+ jz .Loop1_add // carry is incidentally cleared if branch taken
+ algr %r2,%r2 // clear carry
+
+.Loop4_add:
+ lg %r0,0(%r2,%r3)
+ alcg %r0,0(%r2,%r4)
+ stg %r0,0(%r2,%r1)
+ lg %r0,8(%r2,%r3)
+ alcg %r0,8(%r2,%r4)
+ stg %r0,8(%r2,%r1)
+ lg %r0,16(%r2,%r3)
+ alcg %r0,16(%r2,%r4)
+ stg %r0,16(%r2,%r1)
+ lg %r0,24(%r2,%r3)
+ alcg %r0,24(%r2,%r4)
+ stg %r0,24(%r2,%r1)
+
+ la %r2,32(%r2) // i+=4
+ brct %r5,.Loop4_add
+
+ la %r6,1(%r6) // see if len%4 is zero ...
+ brct %r6,.Loop1_add // without touching condition code:-)
+
+.Lexit_add:
+ lghi %r2,0
+ alcgr %r2,%r2
+ lg %r6,48(%r15)
+ br %r14
+
+.Loop1_add:
+ lg %r0,0(%r2,%r3)
+ alcg %r0,0(%r2,%r4)
+ stg %r0,0(%r2,%r1)
+
+ la %r2,8(%r2) // i++
+ brct %r6,.Loop1_add
+
+ j .Lexit_add
+.size bn_add_words,.-bn_add_words
+
+// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
+.globl bn_sub_words
+.type bn_sub_words,@function
+.align 4
+bn_sub_words:
+ la %r1,0(%r2) // put rp aside
+ lghi %r2,0 // i=0
+ ltgfr %r5,%r5
+ bler %r14 // if (len<=0) return 0;
+
+ stg %r6,48(%r15)
+ lghi %r6,3
+ nr %r6,%r5 // len%4
+ sra %r5,2 // len/4, use sra because it sets condition code
+ jnz .Loop4_sub // borrow is incidentally cleared if branch taken
+ slgr %r2,%r2 // clear borrow
+
+.Loop1_sub:
+ lg %r0,0(%r2,%r3)
+ slbg %r0,0(%r2,%r4)
+ stg %r0,0(%r2,%r1)
+
+ la %r2,8(%r2) // i++
+ brct %r6,.Loop1_sub
+ j .Lexit_sub
+
+.Loop4_sub:
+ lg %r0,0(%r2,%r3)
+ slbg %r0,0(%r2,%r4)
+ stg %r0,0(%r2,%r1)
+ lg %r0,8(%r2,%r3)
+ slbg %r0,8(%r2,%r4)
+ stg %r0,8(%r2,%r1)
+ lg %r0,16(%r2,%r3)
+ slbg %r0,16(%r2,%r4)
+ stg %r0,16(%r2,%r1)
+ lg %r0,24(%r2,%r3)
+ slbg %r0,24(%r2,%r4)
+ stg %r0,24(%r2,%r1)
+
+ la %r2,32(%r2) // i+=4
+ brct %r5,.Loop4_sub
+
+ la %r6,1(%r6) // see if len%4 is zero ...
+ brct %r6,.Loop1_sub // without touching condition code:-)
+
+.Lexit_sub:
+ lghi %r2,0
+ slbgr %r2,%r2
+ lcgr %r2,%r2
+ lg %r6,48(%r15)
+ br %r14
+.size bn_sub_words,.-bn_sub_words
+
+#define c1 %r1
+#define c2 %r5
+#define c3 %r8
+
+#define mul_add_c(ai,bi,c1,c2,c3) \
+ lg %r7,ai*8(%r3); \
+ mlg %r6,bi*8(%r4); \
+ algr c1,%r7; \
+ alcgr c2,%r6; \
+ alcgr c3,zero
+
+// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+.globl bn_mul_comba8
+.type bn_mul_comba8,@function
+.align 4
+bn_mul_comba8:
+ stmg %r6,%r8,48(%r15)
+
+ lghi c1,0
+ lghi c2,0
+ lghi c3,0
+ lghi zero,0
+
+ mul_add_c(0,0,c1,c2,c3);
+ stg c1,0*8(%r2)
+ lghi c1,0
+
+ mul_add_c(0,1,c2,c3,c1);
+ mul_add_c(1,0,c2,c3,c1);
+ stg c2,1*8(%r2)
+ lghi c2,0
+
+ mul_add_c(2,0,c3,c1,c2);
+ mul_add_c(1,1,c3,c1,c2);
+ mul_add_c(0,2,c3,c1,c2);
+ stg c3,2*8(%r2)
+ lghi c3,0
+
+ mul_add_c(0,3,c1,c2,c3);
+ mul_add_c(1,2,c1,c2,c3);
+ mul_add_c(2,1,c1,c2,c3);
+ mul_add_c(3,0,c1,c2,c3);
+ stg c1,3*8(%r2)
+ lghi c1,0
+
+ mul_add_c(4,0,c2,c3,c1);
+ mul_add_c(3,1,c2,c3,c1);
+ mul_add_c(2,2,c2,c3,c1);
+ mul_add_c(1,3,c2,c3,c1);
+ mul_add_c(0,4,c2,c3,c1);
+ stg c2,4*8(%r2)
+ lghi c2,0
+
+ mul_add_c(0,5,c3,c1,c2);
+ mul_add_c(1,4,c3,c1,c2);
+ mul_add_c(2,3,c3,c1,c2);
+ mul_add_c(3,2,c3,c1,c2);
+ mul_add_c(4,1,c3,c1,c2);
+ mul_add_c(5,0,c3,c1,c2);
+ stg c3,5*8(%r2)
+ lghi c3,0
+
+ mul_add_c(6,0,c1,c2,c3);
+ mul_add_c(5,1,c1,c2,c3);
+ mul_add_c(4,2,c1,c2,c3);
+ mul_add_c(3,3,c1,c2,c3);
+ mul_add_c(2,4,c1,c2,c3);
+ mul_add_c(1,5,c1,c2,c3);
+ mul_add_c(0,6,c1,c2,c3);
+ stg c1,6*8(%r2)
+ lghi c1,0
+
+ mul_add_c(0,7,c2,c3,c1);
+ mul_add_c(1,6,c2,c3,c1);
+ mul_add_c(2,5,c2,c3,c1);
+ mul_add_c(3,4,c2,c3,c1);
+ mul_add_c(4,3,c2,c3,c1);
+ mul_add_c(5,2,c2,c3,c1);
+ mul_add_c(6,1,c2,c3,c1);
+ mul_add_c(7,0,c2,c3,c1);
+ stg c2,7*8(%r2)
+ lghi c2,0
+
+ mul_add_c(7,1,c3,c1,c2);
+ mul_add_c(6,2,c3,c1,c2);
+ mul_add_c(5,3,c3,c1,c2);
+ mul_add_c(4,4,c3,c1,c2);
+ mul_add_c(3,5,c3,c1,c2);
+ mul_add_c(2,6,c3,c1,c2);
+ mul_add_c(1,7,c3,c1,c2);
+ stg c3,8*8(%r2)
+ lghi c3,0
+
+ mul_add_c(2,7,c1,c2,c3);
+ mul_add_c(3,6,c1,c2,c3);
+ mul_add_c(4,5,c1,c2,c3);
+ mul_add_c(5,4,c1,c2,c3);
+ mul_add_c(6,3,c1,c2,c3);
+ mul_add_c(7,2,c1,c2,c3);
+ stg c1,9*8(%r2)
+ lghi c1,0
+
+ mul_add_c(7,3,c2,c3,c1);
+ mul_add_c(6,4,c2,c3,c1);
+ mul_add_c(5,5,c2,c3,c1);
+ mul_add_c(4,6,c2,c3,c1);
+ mul_add_c(3,7,c2,c3,c1);
+ stg c2,10*8(%r2)
+ lghi c2,0
+
+ mul_add_c(4,7,c3,c1,c2);
+ mul_add_c(5,6,c3,c1,c2);
+ mul_add_c(6,5,c3,c1,c2);
+ mul_add_c(7,4,c3,c1,c2);
+ stg c3,11*8(%r2)
+ lghi c3,0
+
+ mul_add_c(7,5,c1,c2,c3);
+ mul_add_c(6,6,c1,c2,c3);
+ mul_add_c(5,7,c1,c2,c3);
+ stg c1,12*8(%r2)
+ lghi c1,0
+
+
+ mul_add_c(6,7,c2,c3,c1);
+ mul_add_c(7,6,c2,c3,c1);
+ stg c2,13*8(%r2)
+ lghi c2,0
+
+ mul_add_c(7,7,c3,c1,c2);
+ stg c3,14*8(%r2)
+ stg c1,15*8(%r2)
+
+ lmg %r6,%r8,48(%r15)
+ br %r14
+.size bn_mul_comba8,.-bn_mul_comba8
+
+// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
+.globl bn_mul_comba4
+.type bn_mul_comba4,@function
+.align 4
+bn_mul_comba4:
+ stmg %r6,%r8,48(%r15)
+
+ lghi c1,0
+ lghi c2,0
+ lghi c3,0
+ lghi zero,0
+
+ mul_add_c(0,0,c1,c2,c3);
+ stg c1,0*8(%r3)
+ lghi c1,0
+
+ mul_add_c(0,1,c2,c3,c1);
+ mul_add_c(1,0,c2,c3,c1);
+ stg c2,1*8(%r2)
+ lghi c2,0
+
+ mul_add_c(2,0,c3,c1,c2);
+ mul_add_c(1,1,c3,c1,c2);
+ mul_add_c(0,2,c3,c1,c2);
+ stg c3,2*8(%r2)
+ lghi c3,0
+
+ mul_add_c(0,3,c1,c2,c3);
+ mul_add_c(1,2,c1,c2,c3);
+ mul_add_c(2,1,c1,c2,c3);
+ mul_add_c(3,0,c1,c2,c3);
+ stg c1,3*8(%r2)
+ lghi c1,0
+
+ mul_add_c(3,1,c2,c3,c1);
+ mul_add_c(2,2,c2,c3,c1);
+ mul_add_c(1,3,c2,c3,c1);
+ stg c2,4*8(%r2)
+ lghi c2,0
+
+ mul_add_c(2,3,c3,c1,c2);
+ mul_add_c(3,2,c3,c1,c2);
+ stg c3,5*8(%r2)
+ lghi c3,0
+
+ mul_add_c(3,3,c1,c2,c3);
+ stg c1,6*8(%r2)
+ stg c2,7*8(%r2)
+
+ stmg %r6,%r8,48(%r15)
+ br %r14
+.size bn_mul_comba4,.-bn_mul_comba4
+
+#define sqr_add_c(ai,c1,c2,c3) \
+ lg %r7,ai*8(%r3); \
+ mlgr %r6,%r7; \
+ algr c1,%r7; \
+ alcgr c2,%r6; \
+ alcgr c3,zero
+
+#define sqr_add_c2(ai,aj,c1,c2,c3) \
+ lg %r7,ai*8(%r3); \
+ mlg %r6,aj*8(%r3); \
+ algr c1,%r7; \
+ alcgr c2,%r6; \
+ alcgr c3,zero; \
+ algr c1,%r7; \
+ alcgr c2,%r6; \
+ alcgr c3,zero
+
+// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
+.globl bn_sqr_comba8
+.type bn_sqr_comba8,@function
+.align 4
+bn_sqr_comba8:
+ stmg %r6,%r8,48(%r15)
+
+ lghi c1,0
+ lghi c2,0
+ lghi c3,0
+ lghi zero,0
+
+ sqr_add_c(0,c1,c2,c3);
+ stg c1,0*8(%r2)
+ lghi c1,0
+
+ sqr_add_c2(1,0,c2,c3,c1);
+ stg c2,1*8(%r2)
+ lghi c2,0
+
+ sqr_add_c(1,c3,c1,c2);
+ sqr_add_c2(2,0,c3,c1,c2);
+ stg c3,2*8(%r2)
+ lghi c3,0
+
+ sqr_add_c2(3,0,c1,c2,c3);
+ sqr_add_c2(2,1,c1,c2,c3);
+ stg c1,3*8(%r2)
+ lghi c1,0
+
+ sqr_add_c(2,c2,c3,c1);
+ sqr_add_c2(3,1,c2,c3,c1);
+ sqr_add_c2(4,0,c2,c3,c1);
+ stg c2,4*8(%r2)
+ lghi c2,0
+
+ sqr_add_c2(5,0,c3,c1,c2);
+ sqr_add_c2(4,1,c3,c1,c2);
+ sqr_add_c2(3,2,c3,c1,c2);
+ stg c3,5*8(%r2)
+ lghi c3,0
+
+ sqr_add_c(3,c1,c2,c3);
+ sqr_add_c2(4,2,c1,c2,c3);
+ sqr_add_c2(5,1,c1,c2,c3);
+ sqr_add_c2(6,0,c1,c2,c3);
+ stg c1,6*8(%r2)
+ lghi c1,0
+
+ sqr_add_c2(7,0,c2,c3,c1);
+ sqr_add_c2(6,1,c2,c3,c1);
+ sqr_add_c2(5,2,c2,c3,c1);
+ sqr_add_c2(4,3,c2,c3,c1);
+ stg c2,7*8(%r2)
+ lghi c2,0
+
+ sqr_add_c(4,c3,c1,c2);
+ sqr_add_c2(5,3,c3,c1,c2);
+ sqr_add_c2(6,2,c3,c1,c2);
+ sqr_add_c2(7,1,c3,c1,c2);
+ stg c3,8*8(%r2)
+ lghi c3,0
+
+ sqr_add_c2(7,2,c1,c2,c3);
+ sqr_add_c2(6,3,c1,c2,c3);
+ sqr_add_c2(5,4,c1,c2,c3);
+ stg c1,9*8(%r2)
+ lghi c1,0
+
+ sqr_add_c(5,c2,c3,c1);
+ sqr_add_c2(6,4,c2,c3,c1);
+ sqr_add_c2(7,3,c2,c3,c1);
+ stg c2,10*8(%r2)
+ lghi c2,0
+
+ sqr_add_c2(7,4,c3,c1,c2);
+ sqr_add_c2(6,5,c3,c1,c2);
+ stg c3,11*8(%r2)
+ lghi c3,0
+
+ sqr_add_c(6,c1,c2,c3);
+ sqr_add_c2(7,5,c1,c2,c3);
+ stg c1,12*8(%r2)
+ lghi c1,0
+
+ sqr_add_c2(7,6,c2,c3,c1);
+ stg c2,13*8(%r2)
+ lghi c2,0
+
+ sqr_add_c(7,c3,c1,c2);
+ stg c3,14*8(%r2)
+ stg c1,15*8(%r2)
+
+ lmg %r6,%r8,48(%r15)
+ br %r14
+.size bn_sqr_comba8,.-bn_sqr_comba8
+
+// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
+.globl bn_sqr_comba4
+.type bn_sqr_comba4,@function
+.align 4
+bn_sqr_comba4:
+ stmg %r6,%r8,48(%r15)
+
+ lghi c1,0
+ lghi c2,0
+ lghi c3,0
+ lghi zero,0
+
+ sqr_add_c(0,c1,c2,c3);
+ stg c1,0*8(%r2)
+ lghi c1,0
+
+ sqr_add_c2(1,0,c2,c3,c1);
+ stg c2,1*8(%r2)
+ lghi c2,0
+
+ sqr_add_c(1,c3,c1,c2);
+ sqr_add_c2(2,0,c3,c1,c2);
+ stg c3,2*8(%r2)
+ lghi c3,0
+
+ sqr_add_c2(3,0,c1,c2,c3);
+ sqr_add_c2(2,1,c1,c2,c3);
+ stg c1,3*8(%r2)
+ lghi c1,0
+
+ sqr_add_c(2,c2,c3,c1);
+ sqr_add_c2(3,1,c2,c3,c1);
+ stg c2,4*8(%r2)
+ lghi c2,0
+
+ sqr_add_c2(3,2,c3,c1,c2);
+ stg c3,5*8(%r2)
+ lghi c3,0
+
+ sqr_add_c(3,c1,c2,c3);
+ stg c1,6*8(%r2)
+ stg c2,7*8(%r2)
+
+ lmg %r6,%r8,48(%r15)
+ br %r14
+.size bn_sqr_comba4,.-bn_sqr_comba4
diff --git a/crypto/bn/asm/sparcv8plus.S b/crypto/bn/asm/sparcv8plus.S
index 0074dfdb75..63de1860f2 100644
--- a/crypto/bn/asm/sparcv8plus.S
+++ b/crypto/bn/asm/sparcv8plus.S
@@ -144,6 +144,19 @@
* }
*/
+#if defined(__SUNPRO_C) && defined(__sparcv9)
+ /* They've said -xarch=v9 at command line */
+ .register %g2,#scratch
+ .register %g3,#scratch
+# define FRAME_SIZE -192
+#elif defined(__GNUC__) && defined(__arch64__)
+ /* They've said -m64 at command line */
+ .register %g2,#scratch
+ .register %g3,#scratch
+# define FRAME_SIZE -192
+#else
+# define FRAME_SIZE -96
+#endif
/*
* GNU assembler can't stand stuw:-(
*/
@@ -162,10 +175,14 @@
* BN_ULONG w;
*/
bn_mul_add_words:
+ sra %o2,%g0,%o2 ! signx %o2
brgz,a %o2,.L_bn_mul_add_words_proceed
lduw [%o1],%g2
retl
clr %o0
+ nop
+ nop
+ nop
.L_bn_mul_add_words_proceed:
srl %o3,%g0,%o3 ! clruw %o3
@@ -260,10 +277,14 @@ bn_mul_add_words:
* BN_ULONG w;
*/
bn_mul_words:
+ sra %o2,%g0,%o2 ! signx %o2
brgz,a %o2,.L_bn_mul_words_proceeed
lduw [%o1],%g2
retl
clr %o0
+ nop
+ nop
+ nop
.L_bn_mul_words_proceeed:
srl %o3,%g0,%o3 ! clruw %o3
@@ -344,10 +365,14 @@ bn_mul_words:
* int n;
*/
bn_sqr_words:
+ sra %o2,%g0,%o2 ! signx %o2
brgz,a %o2,.L_bn_sqr_words_proceeed
lduw [%o1],%g2
retl
clr %o0
+ nop
+ nop
+ nop
.L_bn_sqr_words_proceeed:
andcc %o2,-4,%g0
@@ -445,6 +470,7 @@ bn_div_words:
* int n;
*/
bn_add_words:
+ sra %o3,%g0,%o3 ! signx %o3
brgz,a %o3,.L_bn_add_words_proceed
lduw [%o1],%o4
retl
@@ -454,7 +480,6 @@ bn_add_words:
andcc %o3,-4,%g0
bz,pn %icc,.L_bn_add_words_tail
addcc %g0,0,%g0 ! clear carry flag
- nop
.L_bn_add_words_loop: ! wow! 32 aligned!
dec 4,%o3
@@ -523,6 +548,7 @@ bn_add_words:
* int n;
*/
bn_sub_words:
+ sra %o3,%g0,%o3 ! signx %o3
brgz,a %o3,.L_bn_sub_words_proceed
lduw [%o1],%o4
retl
@@ -532,7 +558,6 @@ bn_sub_words:
andcc %o3,-4,%g0
bz,pn %icc,.L_bn_sub_words_tail
addcc %g0,0,%g0 ! clear carry flag
- nop
.L_bn_sub_words_loop: ! wow! 32 aligned!
dec 4,%o3
@@ -607,8 +632,6 @@ bn_sub_words:
* Andy.
*/
-#define FRAME_SIZE -96
-
/*
* Here is register usage map for *all* routines below.
*/
diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl
new file mode 100644
index 0000000000..b8fb1e8a25
--- /dev/null
+++ b/crypto/bn/asm/sparcv9-mont.pl
@@ -0,0 +1,606 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# December 2005
+#
+# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
+# for undertaken effort are multiple. First of all, UltraSPARC is not
+# the whole SPARCv9 universe and other VIS-free implementations deserve
+# optimized code as much. Secondly, newly introduced UltraSPARC T1,
+# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
+# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
+# several integrated RSA/DSA accelerator circuits accessible through
+# kernel driver [only(*)], but having decent user-land software
+# implementation is important too. Finally, reasons like desire to
+# experiment with dedicated squaring procedure. Yes, this module
+# implements one, because it was easiest to draft it in SPARCv9
+# instructions...
+
+# (*) Engine accessing the driver in question is on my TODO list.
+# For reference, acceleator is estimated to give 6 to 10 times
+# improvement on single-threaded RSA sign. It should be noted
+# that 6-10x improvement coefficient does not actually mean
+# something extraordinary in terms of absolute [single-threaded]
+# performance, as SPARCv9 instruction set is by all means least
+# suitable for high performance crypto among other 64 bit
+# platforms. 6-10x factor simply places T1 in same performance
+# domain as say AMD64 and IA-64. Improvement of RSA verify don't
+# appear impressive at all, but it's the sign operation which is
+# far more critical/interesting.
+
+# You might notice that inner loops are modulo-scheduled:-) This has
+# essentially negligible impact on UltraSPARC performance, it's
+# Fujitsu SPARC64 V users who should notice and hopefully appreciate
+# the advantage... Currently this module surpasses sparcv9a-mont.pl
+# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
+# module still have hidden potential [see TODO list there], which is
+# estimated to be larger than 20%...
+
+# int bn_mul_mont(
+$rp="%i0"; # BN_ULONG *rp,
+$ap="%i1"; # const BN_ULONG *ap,
+$bp="%i2"; # const BN_ULONG *bp,
+$np="%i3"; # const BN_ULONG *np,
+$n0="%i4"; # const BN_ULONG *n0,
+$num="%i5"; # int num);
+
+$bits=32;
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64) { $bias=2047; $frame=192; }
+else { $bias=0; $frame=128; }
+
+$car0="%o0";
+$car1="%o1";
+$car2="%o2"; # 1 bit
+$acc0="%o3";
+$acc1="%o4";
+$mask="%g1"; # 32 bits, what a waste...
+$tmp0="%g4";
+$tmp1="%g5";
+
+$i="%l0";
+$j="%l1";
+$mul0="%l2";
+$mul1="%l3";
+$tp="%l4";
+$apj="%l5";
+$npj="%l6";
+$tpj="%l7";
+
+$fname="bn_mul_mont_int";
+
+$code=<<___;
+.section ".text",#alloc,#execinstr
+
+.global $fname
+.align 32
+$fname:
+ cmp %o5,4 ! 128 bits minimum
+ bge,pt %icc,.Lenter
+ sethi %hi(0xffffffff),$mask
+ retl
+ clr %o0
+.align 32
+.Lenter:
+ save %sp,-$frame,%sp
+ sll $num,2,$num ! num*=4
+ or $mask,%lo(0xffffffff),$mask
+ ld [$n0],$n0
+ cmp $ap,$bp
+ and $num,$mask,$num
+ ld [$bp],$mul0 ! bp[0]
+ nop
+
+ add %sp,$bias,%o7 ! real top of stack
+ ld [$ap],$car0 ! ap[0] ! redundant in squaring context
+ sub %o7,$num,%o7
+ ld [$ap+4],$apj ! ap[1]
+ and %o7,-1024,%o7
+ ld [$np],$car1 ! np[0]
+ sub %o7,$bias,%sp ! alloca
+ ld [$np+4],$npj ! np[1]
+ be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
+ mov 12,$j
+
+ mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
+ mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
+ and $car0,$mask,$acc0
+ add %sp,$bias+$frame,$tp
+ ld [$ap+8],$apj !prologue!
+
+ mulx $n0,$acc0,$mul1 ! "t[0]"*n0
+ and $mul1,$mask,$mul1
+
+ mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
+ mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ ld [$np+8],$npj !prologue!
+ srlx $car1,32,$car1
+ mov $tmp0,$acc0 !prologue!
+
+.L1st:
+ mulx $apj,$mul0,$tmp0
+ mulx $npj,$mul1,$tmp1
+ add $acc0,$car0,$car0
+ ld [$ap+$j],$apj ! ap[j]
+ and $car0,$mask,$acc0
+ add $acc1,$car1,$car1
+ ld [$np+$j],$npj ! np[j]
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ add $j,4,$j ! j++
+ mov $tmp0,$acc0
+ st $car1,[$tp]
+ cmp $j,$num
+ mov $tmp1,$acc1
+ srlx $car1,32,$car1
+ bl %icc,.L1st
+ add $tp,4,$tp ! tp++
+!.L1st
+
+ mulx $apj,$mul0,$tmp0 !epilogue!
+ mulx $npj,$mul1,$tmp1
+ add $acc0,$car0,$car0
+ and $car0,$mask,$acc0
+ add $acc1,$car1,$car1
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+
+ add $tmp0,$car0,$car0
+ and $car0,$mask,$acc0
+ add $tmp1,$car1,$car1
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car1
+
+ add $car0,$car1,$car1
+ st $car1,[$tp+8]
+ srlx $car1,32,$car2
+
+ mov 4,$i ! i++
+ ld [$bp+4],$mul0 ! bp[1]
+.Louter:
+ add %sp,$bias+$frame,$tp
+ ld [$ap],$car0 ! ap[0]
+ ld [$ap+4],$apj ! ap[1]
+ ld [$np],$car1 ! np[0]
+ ld [$np+4],$npj ! np[1]
+ ld [$tp],$tmp1 ! tp[0]
+ ld [$tp+4],$tpj ! tp[1]
+ mov 12,$j
+
+ mulx $car0,$mul0,$car0
+ mulx $apj,$mul0,$tmp0 !prologue!
+ add $tmp1,$car0,$car0
+ ld [$ap+8],$apj !prologue!
+ and $car0,$mask,$acc0
+
+ mulx $n0,$acc0,$mul1
+ and $mul1,$mask,$mul1
+
+ mulx $car1,$mul1,$car1
+ mulx $npj,$mul1,$acc1 !prologue!
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ ld [$np+8],$npj !prologue!
+ srlx $car1,32,$car1
+ mov $tmp0,$acc0 !prologue!
+
+.Linner:
+ mulx $apj,$mul0,$tmp0
+ mulx $npj,$mul1,$tmp1
+ add $tpj,$car0,$car0
+ ld [$ap+$j],$apj ! ap[j]
+ add $acc0,$car0,$car0
+ add $acc1,$car1,$car1
+ ld [$np+$j],$npj ! np[j]
+ and $car0,$mask,$acc0
+ ld [$tp+8],$tpj ! tp[j]
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ add $j,4,$j ! j++
+ mov $tmp0,$acc0
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+ mov $tmp1,$acc1
+ cmp $j,$num
+ bl %icc,.Linner
+ add $tp,4,$tp ! tp++
+!.Linner
+
+ mulx $apj,$mul0,$tmp0 !epilogue!
+ mulx $npj,$mul1,$tmp1
+ add $tpj,$car0,$car0
+ add $acc0,$car0,$car0
+ ld [$tp+8],$tpj ! tp[j]
+ and $car0,$mask,$acc0
+ add $acc1,$car1,$car1
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+
+ add $tpj,$car0,$car0
+ add $tmp0,$car0,$car0
+ and $car0,$mask,$acc0
+ add $tmp1,$car1,$car1
+ add $acc0,$car1,$car1
+ st $car1,[$tp+4] ! tp[j-1]
+ srlx $car0,32,$car0
+ add $i,4,$i ! i++
+ srlx $car1,32,$car1
+
+ add $car0,$car1,$car1
+ cmp $i,$num
+ add $car2,$car1,$car1
+ st $car1,[$tp+8]
+
+ srlx $car1,32,$car2
+ bl,a %icc,.Louter
+ ld [$bp+$i],$mul0 ! bp[i]
+!.Louter
+
+ add $tp,12,$tp
+
+.Ltail:
+ add $np,$num,$np
+ add $rp,$num,$rp
+ mov $tp,$ap
+ sub %g0,$num,%o7 ! k=-num
+ ba .Lsub
+ subcc %g0,%g0,%g0 ! clear %icc.c
+.align 16
+.Lsub:
+ ld [$tp+%o7],%o0
+ ld [$np+%o7],%o1
+ subccc %o0,%o1,%o1 ! tp[j]-np[j]
+ add $rp,%o7,$i
+ add %o7,4,%o7
+ brnz %o7,.Lsub
+ st %o1,[$i]
+ subc $car2,0,$car2 ! handle upmost overflow bit
+ and $tp,$car2,$ap
+ andn $rp,$car2,$np
+ or $ap,$np,$ap
+ sub %g0,$num,%o7
+
+.Lcopy:
+ ld [$ap+%o7],%o0 ! copy or in-place refresh
+ st %g0,[$tp+%o7] ! zap tp
+ st %o0,[$rp+%o7]
+ add %o7,4,%o7
+ brnz %o7,.Lcopy
+ nop
+ mov 1,%i0
+ ret
+ restore
+___
+
+########
+######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
+######## code without following dedicated squaring procedure.
+########
+$sbit="%i2"; # re-use $bp!
+
+$code.=<<___;
+.align 32
+.Lbn_sqr_mont:
+ mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
+ mulx $apj,$mul0,$tmp0 !prologue!
+ and $car0,$mask,$acc0
+ add %sp,$bias+$frame,$tp
+ ld [$ap+8],$apj !prologue!
+
+ mulx $n0,$acc0,$mul1 ! "t[0]"*n0
+ srlx $car0,32,$car0
+ and $mul1,$mask,$mul1
+
+ mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
+ mulx $npj,$mul1,$acc1 !prologue!
+ and $car0,1,$sbit
+ ld [$np+8],$npj !prologue!
+ srlx $car0,1,$car0
+ add $acc0,$car1,$car1
+ srlx $car1,32,$car1
+ mov $tmp0,$acc0 !prologue!
+
+.Lsqr_1st:
+ mulx $apj,$mul0,$tmp0
+ mulx $npj,$mul1,$tmp1
+ add $acc0,$car0,$car0 ! ap[j]*a0+c0
+ add $acc1,$car1,$car1
+ ld [$ap+$j],$apj ! ap[j]
+ and $car0,$mask,$acc0
+ ld [$np+$j],$npj ! np[j]
+ srlx $car0,32,$car0
+ add $acc0,$acc0,$acc0
+ or $sbit,$acc0,$acc0
+ mov $tmp1,$acc1
+ srlx $acc0,32,$sbit
+ add $j,4,$j ! j++
+ and $acc0,$mask,$acc0
+ cmp $j,$num
+ add $acc0,$car1,$car1
+ st $car1,[$tp]
+ mov $tmp0,$acc0
+ srlx $car1,32,$car1
+ bl %icc,.Lsqr_1st
+ add $tp,4,$tp ! tp++
+!.Lsqr_1st
+
+ mulx $apj,$mul0,$tmp0 ! epilogue
+ mulx $npj,$mul1,$tmp1
+ add $acc0,$car0,$car0 ! ap[j]*a0+c0
+ add $acc1,$car1,$car1
+ and $car0,$mask,$acc0
+ srlx $car0,32,$car0
+ add $acc0,$acc0,$acc0
+ or $sbit,$acc0,$acc0
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ add $acc0,$car1,$car1
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+
+ add $tmp0,$car0,$car0 ! ap[j]*a0+c0
+ add $tmp1,$car1,$car1
+ and $car0,$mask,$acc0
+ srlx $car0,32,$car0
+ add $acc0,$acc0,$acc0
+ or $sbit,$acc0,$acc0
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ add $acc0,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car1
+
+ add $car0,$car0,$car0
+ or $sbit,$car0,$car0
+ add $car0,$car1,$car1
+ st $car1,[$tp+8]
+ srlx $car1,32,$car2
+
+ ld [%sp+$bias+$frame],$tmp0 ! tp[0]
+ ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
+ ld [%sp+$bias+$frame+8],$tpj ! tp[2]
+ ld [$ap+4],$mul0 ! ap[1]
+ ld [$ap+8],$apj ! ap[2]
+ ld [$np],$car1 ! np[0]
+ ld [$np+4],$npj ! np[1]
+ mulx $n0,$tmp0,$mul1
+
+ mulx $mul0,$mul0,$car0
+ and $mul1,$mask,$mul1
+
+ mulx $car1,$mul1,$car1
+ mulx $npj,$mul1,$acc1
+ add $tmp0,$car1,$car1
+ and $car0,$mask,$acc0
+ ld [$np+8],$npj ! np[2]
+ srlx $car1,32,$car1
+ add $tmp1,$car1,$car1
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ and $car0,1,$sbit
+ add $acc1,$car1,$car1
+ srlx $car0,1,$car0
+ mov 12,$j
+ st $car1,[%sp+$bias+$frame] ! tp[0]=
+ srlx $car1,32,$car1
+ add %sp,$bias+$frame+4,$tp
+
+.Lsqr_2nd:
+ mulx $apj,$mul0,$acc0
+ mulx $npj,$mul1,$acc1
+ add $acc0,$car0,$car0
+ add $tpj,$car1,$car1
+ ld [$ap+$j],$apj ! ap[j]
+ and $car0,$mask,$acc0
+ ld [$np+$j],$npj ! np[j]
+ srlx $car0,32,$car0
+ add $acc1,$car1,$car1
+ ld [$tp+8],$tpj ! tp[j]
+ add $acc0,$acc0,$acc0
+ add $j,4,$j ! j++
+ or $sbit,$acc0,$acc0
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ cmp $j,$num
+ add $acc0,$car1,$car1
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+ bl %icc,.Lsqr_2nd
+ add $tp,4,$tp ! tp++
+!.Lsqr_2nd
+
+ mulx $apj,$mul0,$acc0
+ mulx $npj,$mul1,$acc1
+ add $acc0,$car0,$car0
+ add $tpj,$car1,$car1
+ and $car0,$mask,$acc0
+ srlx $car0,32,$car0
+ add $acc1,$car1,$car1
+ add $acc0,$acc0,$acc0
+ or $sbit,$acc0,$acc0
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ add $acc0,$car1,$car1
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+
+ add $car0,$car0,$car0
+ or $sbit,$car0,$car0
+ add $car0,$car1,$car1
+ add $car2,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car2
+
+ ld [%sp+$bias+$frame],$tmp1 ! tp[0]
+ ld [%sp+$bias+$frame+4],$tpj ! tp[1]
+ ld [$ap+8],$mul0 ! ap[2]
+ ld [$np],$car1 ! np[0]
+ ld [$np+4],$npj ! np[1]
+ mulx $n0,$tmp1,$mul1
+ and $mul1,$mask,$mul1
+ mov 8,$i
+
+ mulx $mul0,$mul0,$car0
+ mulx $car1,$mul1,$car1
+ and $car0,$mask,$acc0
+ add $tmp1,$car1,$car1
+ srlx $car0,32,$car0
+ add %sp,$bias+$frame,$tp
+ srlx $car1,32,$car1
+ and $car0,1,$sbit
+ srlx $car0,1,$car0
+ mov 4,$j
+
+.Lsqr_outer:
+.Lsqr_inner1:
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ add $j,4,$j
+ ld [$tp+8],$tpj
+ cmp $j,$i
+ add $acc1,$car1,$car1
+ ld [$np+$j],$npj
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+ bl %icc,.Lsqr_inner1
+ add $tp,4,$tp
+!.Lsqr_inner1
+
+ add $j,4,$j
+ ld [$ap+$j],$apj ! ap[j]
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ ld [$np+$j],$npj ! np[j]
+ add $acc0,$car1,$car1
+ ld [$tp+8],$tpj ! tp[j]
+ add $acc1,$car1,$car1
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+
+ add $j,4,$j
+ cmp $j,$num
+ be,pn %icc,.Lsqr_no_inner2
+ add $tp,4,$tp
+
+.Lsqr_inner2:
+ mulx $apj,$mul0,$acc0
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ add $acc0,$car0,$car0
+ ld [$ap+$j],$apj ! ap[j]
+ and $car0,$mask,$acc0
+ ld [$np+$j],$npj ! np[j]
+ srlx $car0,32,$car0
+ add $acc0,$acc0,$acc0
+ ld [$tp+8],$tpj ! tp[j]
+ or $sbit,$acc0,$acc0
+ add $j,4,$j ! j++
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ cmp $j,$num
+ add $acc0,$car1,$car1
+ add $acc1,$car1,$car1
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+ bl %icc,.Lsqr_inner2
+ add $tp,4,$tp ! tp++
+
+.Lsqr_no_inner2:
+ mulx $apj,$mul0,$acc0
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ add $acc0,$car0,$car0
+ and $car0,$mask,$acc0
+ srlx $car0,32,$car0
+ add $acc0,$acc0,$acc0
+ or $sbit,$acc0,$acc0
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ add $acc0,$car1,$car1
+ add $acc1,$car1,$car1
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+
+ add $car0,$car0,$car0
+ or $sbit,$car0,$car0
+ add $car0,$car1,$car1
+ add $car2,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car2
+
+ add $i,4,$i ! i++
+ ld [%sp+$bias+$frame],$tmp1 ! tp[0]
+ ld [%sp+$bias+$frame+4],$tpj ! tp[1]
+ ld [$ap+$i],$mul0 ! ap[j]
+ ld [$np],$car1 ! np[0]
+ ld [$np+4],$npj ! np[1]
+ mulx $n0,$tmp1,$mul1
+ and $mul1,$mask,$mul1
+ add $i,4,$tmp0
+
+ mulx $mul0,$mul0,$car0
+ mulx $car1,$mul1,$car1
+ and $car0,$mask,$acc0
+ add $tmp1,$car1,$car1
+ srlx $car0,32,$car0
+ add %sp,$bias+$frame,$tp
+ srlx $car1,32,$car1
+ and $car0,1,$sbit
+ srlx $car0,1,$car0
+
+ cmp $tmp0,$num ! i<num-1
+ bl %icc,.Lsqr_outer
+ mov 4,$j
+
+.Lsqr_last:
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ add $j,4,$j
+ ld [$tp+8],$tpj
+ cmp $j,$i
+ add $acc1,$car1,$car1
+ ld [$np+$j],$npj
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+ bl %icc,.Lsqr_last
+ add $tp,4,$tp
+!.Lsqr_last
+
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ add $acc0,$car1,$car1
+ add $acc1,$car1,$car1
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+
+ add $car0,$car0,$car0 ! recover $car0
+ or $sbit,$car0,$car0
+ add $car0,$car1,$car1
+ add $car2,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car2
+
+ ba .Ltail
+ add $tp,8,$tp
+.type $fname,#function
+.size $fname,(.-$fname)
+.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 32
+___
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/sparcv9a-mont.pl b/crypto/bn/asm/sparcv9a-mont.pl
new file mode 100755
index 0000000000..a14205f2f0
--- /dev/null
+++ b/crypto/bn/asm/sparcv9a-mont.pl
@@ -0,0 +1,882 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
+# Because unlike integer multiplier, which simply stalls whole CPU,
+# FPU is fully pipelined and can effectively emit 48 bit partial
+# product every cycle. Why not blended SPARC v9? One can argue that
+# making this module dependent on UltraSPARC VIS extension limits its
+# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
+# implementations from compatibility matrix. But the rest, whole Sun
+# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
+# VIS extension instructions used in this module. This is considered
+# good enough to not care about HAL SPARC64 users [if any] who have
+# integer-only pure SPARCv9 module to "fall down" to.
+
+# USI&II cores currently exhibit uniform 2x improvement [over pre-
+# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
+# performance improves few percents for shorter keys and worsens few
+# percents for longer keys. This is because USIII integer multiplier
+# is >3x faster than USI&II one, which is harder to match [but see
+# TODO list below]. It should also be noted that SPARC64 V features
+# out-of-order execution, which *might* mean that integer multiplier
+# is pipelined, which in turn *might* be impossible to match... On
+# additional note, SPARC64 V implements FP Multiply-Add instruction,
+# which is perfectly usable in this context... In other words, as far
+# as Fujitsu SPARC64 V goes, talk to the author:-)
+
+# The implementation implies following "non-natural" limitations on
+# input arguments:
+# - num may not be less than 4;
+# - num has to be even;
+# Failure to meet either condition has no fatal effects, simply
+# doesn't give any performance gain.
+
+# TODO:
+# - modulo-schedule inner loop for better performance (on in-order
+# execution core such as UltraSPARC this shall result in further
+# noticeable(!) improvement);
+# - dedicated squaring procedure[?];
+
+######################################################################
+# November 2006
+#
+# Modulo-scheduled inner loops allow to interleave floating point and
+# integer instructions and minimize Read-After-Write penalties. This
+# results in *further* 20-50% perfromance improvement [depending on
+# key length, more for longer keys] on USI&II cores and 30-80% - on
+# USIII&IV.
+
+$fname="bn_mul_mont_fpu";
+$bits=32;
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+
+if ($bits==64) {
+ $bias=2047;
+ $frame=192;
+} else {
+ $bias=0;
+ $frame=128; # 96 rounded up to largest known cache-line
+}
+$locals=64;
+
+# In order to provide for 32-/64-bit ABI duality, I keep integers wider
+# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
+# exclusively for pointers, indexes and other small values...
+# int bn_mul_mont(
+$rp="%i0"; # BN_ULONG *rp,
+$ap="%i1"; # const BN_ULONG *ap,
+$bp="%i2"; # const BN_ULONG *bp,
+$np="%i3"; # const BN_ULONG *np,
+$n0="%i4"; # const BN_ULONG *n0,
+$num="%i5"; # int num);
+
+$tp="%l0"; # t[num]
+$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
+$ap_h="%l2"; # to these four vectors as double-precision FP values.
+$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
+$np_h="%l4"; # loop and L1-cache aliasing is minimized...
+$i="%l5";
+$j="%l6";
+$mask="%l7"; # 16-bit mask, 0xffff
+
+$n0="%g4"; # reassigned(!) to "64-bit" register
+$carry="%i4"; # %i4 reused(!) for a carry bit
+
+# FP register naming chart
+#
+# ..HILO
+# dcba
+# --------
+# LOa
+# LOb
+# LOc
+# LOd
+# HIa
+# HIb
+# HIc
+# HId
+# ..a
+# ..b
+$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
+$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
+$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
+$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
+
+$dota="%f24"; $dotb="%f26";
+
+$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
+$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
+$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
+$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
+
+$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
+
+$code=<<___;
+.section ".text",#alloc,#execinstr
+
+.global $fname
+.align 32
+$fname:
+ save %sp,-$frame-$locals,%sp
+
+ cmp $num,4
+ bl,a,pn %icc,.Lret
+ clr %i0
+ andcc $num,1,%g0 ! $num has to be even...
+ bnz,a,pn %icc,.Lret
+ clr %i0 ! signal "unsupported input value"
+
+ srl $num,1,$num
+ sethi %hi(0xffff),$mask
+ ld [%i4+0],$n0 ! $n0 reassigned, remember?
+ or $mask,%lo(0xffff),$mask
+ ld [%i4+4],%o0
+ sllx %o0,32,%o0
+ or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
+
+ sll $num,3,$num ! num*=8
+
+ add %sp,$bias,%o0 ! real top of stack
+ sll $num,2,%o1
+ add %o1,$num,%o1 ! %o1=num*5
+ sub %o0,%o1,%o0
+ and %o0,-2048,%o0 ! optimize TLB utilization
+ sub %o0,$bias,%sp ! alloca(5*num*8)
+
+ rd %asi,%o7 ! save %asi
+ add %sp,$bias+$frame+$locals,$tp
+ add $tp,$num,$ap_l
+ add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
+ add $ap_l,$num,$ap_h
+ add $ap_h,$num,$np_l
+ add $np_l,$num,$np_h
+
+ wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
+
+ add $rp,$num,$rp ! readjust input pointers to point
+ add $ap,$num,$ap ! at the ends too...
+ add $bp,$num,$bp
+ add $np,$num,$np
+
+ stx %o7,[%sp+$bias+$frame+48] ! save %asi
+
+ sub %g0,$num,$i ! i=-num
+ sub %g0,$num,$j ! j=-num
+
+ add $ap,$j,%o3
+ add $bp,$i,%o4
+
+ ld [%o3+4],%g1 ! bp[0]
+ ld [%o3+0],%o0
+ ld [%o4+4],%g5 ! ap[0]
+ sllx %g1,32,%g1
+ ld [%o4+0],%o1
+ sllx %g5,32,%g5
+ or %g1,%o0,%o0
+ or %g5,%o1,%o1
+
+ add $np,$j,%o5
+
+ mulx %o1,%o0,%o0 ! ap[0]*bp[0]
+ mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
+ stx %o0,[%sp+$bias+$frame+0]
+
+ ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
+ fzeros $alo
+ ld [%o3+4],$ahi_
+ fzeros $ahi
+ ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
+ fzeros $nlo
+ ld [%o5+4],$nhi_
+ fzeros $nhi
+
+ ! transfer b[i] to FPU as 4x16-bit values
+ ldda [%o4+2]%asi,$ba
+ fxtod $alo,$alo
+ ldda [%o4+0]%asi,$bb
+ fxtod $ahi,$ahi
+ ldda [%o4+6]%asi,$bc
+ fxtod $nlo,$nlo
+ ldda [%o4+4]%asi,$bd
+ fxtod $nhi,$nhi
+
+ ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
+ ldda [%sp+$bias+$frame+6]%asi,$na
+ fxtod $ba,$ba
+ ldda [%sp+$bias+$frame+4]%asi,$nb
+ fxtod $bb,$bb
+ ldda [%sp+$bias+$frame+2]%asi,$nc
+ fxtod $bc,$bc
+ ldda [%sp+$bias+$frame+0]%asi,$nd
+ fxtod $bd,$bd
+
+ std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
+ fxtod $na,$na
+ std $ahi,[$ap_h+$j]
+ fxtod $nb,$nb
+ std $nlo,[$np_l+$j] ! save smashed np[j] in double format
+ fxtod $nc,$nc
+ std $nhi,[$np_h+$j]
+ fxtod $nd,$nd
+
+ fmuld $alo,$ba,$aloa
+ fmuld $nlo,$na,$nloa
+ fmuld $alo,$bb,$alob
+ fmuld $nlo,$nb,$nlob
+ fmuld $alo,$bc,$aloc
+ faddd $aloa,$nloa,$nloa
+ fmuld $nlo,$nc,$nloc
+ fmuld $alo,$bd,$alod
+ faddd $alob,$nlob,$nlob
+ fmuld $nlo,$nd,$nlod
+ fmuld $ahi,$ba,$ahia
+ faddd $aloc,$nloc,$nloc
+ fmuld $nhi,$na,$nhia
+ fmuld $ahi,$bb,$ahib
+ faddd $alod,$nlod,$nlod
+ fmuld $nhi,$nb,$nhib
+ fmuld $ahi,$bc,$ahic
+ faddd $ahia,$nhia,$nhia
+ fmuld $nhi,$nc,$nhic
+ fmuld $ahi,$bd,$ahid
+ faddd $ahib,$nhib,$nhib
+ fmuld $nhi,$nd,$nhid
+
+ faddd $ahic,$nhic,$dota ! $nhic
+ faddd $ahid,$nhid,$dotb ! $nhid
+
+ faddd $nloc,$nhia,$nloc
+ faddd $nlod,$nhib,$nlod
+
+ fdtox $nloa,$nloa
+ fdtox $nlob,$nlob
+ fdtox $nloc,$nloc
+ fdtox $nlod,$nlod
+
+ std $nloa,[%sp+$bias+$frame+0]
+ add $j,8,$j
+ std $nlob,[%sp+$bias+$frame+8]
+ add $ap,$j,%o4
+ std $nloc,[%sp+$bias+$frame+16]
+ add $np,$j,%o5
+ std $nlod,[%sp+$bias+$frame+24]
+
+ ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
+ fzeros $alo
+ ld [%o4+4],$ahi_
+ fzeros $ahi
+ ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
+ fzeros $nlo
+ ld [%o5+4],$nhi_
+ fzeros $nhi
+
+ fxtod $alo,$alo
+ fxtod $ahi,$ahi
+ fxtod $nlo,$nlo
+ fxtod $nhi,$nhi
+
+ ldx [%sp+$bias+$frame+0],%o0
+ fmuld $alo,$ba,$aloa
+ ldx [%sp+$bias+$frame+8],%o1
+ fmuld $nlo,$na,$nloa
+ ldx [%sp+$bias+$frame+16],%o2
+ fmuld $alo,$bb,$alob
+ ldx [%sp+$bias+$frame+24],%o3
+ fmuld $nlo,$nb,$nlob
+
+ srlx %o0,16,%o7
+ std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
+ fmuld $alo,$bc,$aloc
+ add %o7,%o1,%o1
+ std $ahi,[$ap_h+$j]
+ faddd $aloa,$nloa,$nloa
+ fmuld $nlo,$nc,$nloc
+ srlx %o1,16,%o7
+ std $nlo,[$np_l+$j] ! save smashed np[j] in double format
+ fmuld $alo,$bd,$alod
+ add %o7,%o2,%o2
+ std $nhi,[$np_h+$j]
+ faddd $alob,$nlob,$nlob
+ fmuld $nlo,$nd,$nlod
+ srlx %o2,16,%o7
+ fmuld $ahi,$ba,$ahia
+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+ faddd $aloc,$nloc,$nloc
+ fmuld $nhi,$na,$nhia
+ !and %o0,$mask,%o0
+ !and %o1,$mask,%o1
+ !and %o2,$mask,%o2
+ !sllx %o1,16,%o1
+ !sllx %o2,32,%o2
+ !sllx %o3,48,%o7
+ !or %o1,%o0,%o0
+ !or %o2,%o0,%o0
+ !or %o7,%o0,%o0 ! 64-bit result
+ srlx %o3,16,%g1 ! 34-bit carry
+ fmuld $ahi,$bb,$ahib
+
+ faddd $alod,$nlod,$nlod
+ fmuld $nhi,$nb,$nhib
+ fmuld $ahi,$bc,$ahic
+ faddd $ahia,$nhia,$nhia
+ fmuld $nhi,$nc,$nhic
+ fmuld $ahi,$bd,$ahid
+ faddd $ahib,$nhib,$nhib
+ fmuld $nhi,$nd,$nhid
+
+ faddd $dota,$nloa,$nloa
+ faddd $dotb,$nlob,$nlob
+ faddd $ahic,$nhic,$dota ! $nhic
+ faddd $ahid,$nhid,$dotb ! $nhid
+
+ faddd $nloc,$nhia,$nloc
+ faddd $nlod,$nhib,$nlod
+
+ fdtox $nloa,$nloa
+ fdtox $nlob,$nlob
+ fdtox $nloc,$nloc
+ fdtox $nlod,$nlod
+
+ std $nloa,[%sp+$bias+$frame+0]
+ std $nlob,[%sp+$bias+$frame+8]
+ addcc $j,8,$j
+ std $nloc,[%sp+$bias+$frame+16]
+ bz,pn %icc,.L1stskip
+ std $nlod,[%sp+$bias+$frame+24]
+
+.align 32 ! incidentally already aligned !
+.L1st:
+ add $ap,$j,%o4
+ add $np,$j,%o5
+ ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
+ fzeros $alo
+ ld [%o4+4],$ahi_
+ fzeros $ahi
+ ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
+ fzeros $nlo
+ ld [%o5+4],$nhi_
+ fzeros $nhi
+
+ fxtod $alo,$alo
+ fxtod $ahi,$ahi
+ fxtod $nlo,$nlo
+ fxtod $nhi,$nhi
+
+ ldx [%sp+$bias+$frame+0],%o0
+ fmuld $alo,$ba,$aloa
+ ldx [%sp+$bias+$frame+8],%o1
+ fmuld $nlo,$na,$nloa
+ ldx [%sp+$bias+$frame+16],%o2
+ fmuld $alo,$bb,$alob
+ ldx [%sp+$bias+$frame+24],%o3
+ fmuld $nlo,$nb,$nlob
+
+ srlx %o0,16,%o7
+ std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
+ fmuld $alo,$bc,$aloc
+ add %o7,%o1,%o1
+ std $ahi,[$ap_h+$j]
+ faddd $aloa,$nloa,$nloa
+ fmuld $nlo,$nc,$nloc
+ srlx %o1,16,%o7
+ std $nlo,[$np_l+$j] ! save smashed np[j] in double format
+ fmuld $alo,$bd,$alod
+ add %o7,%o2,%o2
+ std $nhi,[$np_h+$j]
+ faddd $alob,$nlob,$nlob
+ fmuld $nlo,$nd,$nlod
+ srlx %o2,16,%o7
+ fmuld $ahi,$ba,$ahia
+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+ and %o0,$mask,%o0
+ faddd $aloc,$nloc,$nloc
+ fmuld $nhi,$na,$nhia
+ and %o1,$mask,%o1
+ and %o2,$mask,%o2
+ fmuld $ahi,$bb,$ahib
+ sllx %o1,16,%o1
+ faddd $alod,$nlod,$nlod
+ fmuld $nhi,$nb,$nhib
+ sllx %o2,32,%o2
+ fmuld $ahi,$bc,$ahic
+ sllx %o3,48,%o7
+ or %o1,%o0,%o0
+ faddd $ahia,$nhia,$nhia
+ fmuld $nhi,$nc,$nhic
+ or %o2,%o0,%o0
+ fmuld $ahi,$bd,$ahid
+ or %o7,%o0,%o0 ! 64-bit result
+ faddd $ahib,$nhib,$nhib
+ fmuld $nhi,$nd,$nhid
+ addcc %g1,%o0,%o0
+ faddd $dota,$nloa,$nloa
+ srlx %o3,16,%g1 ! 34-bit carry
+ faddd $dotb,$nlob,$nlob
+ bcs,a %xcc,.+8
+ add %g1,1,%g1
+
+ stx %o0,[$tp] ! tp[j-1]=
+
+ faddd $ahic,$nhic,$dota ! $nhic
+ faddd $ahid,$nhid,$dotb ! $nhid
+
+ faddd $nloc,$nhia,$nloc
+ faddd $nlod,$nhib,$nlod
+
+ fdtox $nloa,$nloa
+ fdtox $nlob,$nlob
+ fdtox $nloc,$nloc
+ fdtox $nlod,$nlod
+
+ std $nloa,[%sp+$bias+$frame+0]
+ std $nlob,[%sp+$bias+$frame+8]
+ std $nloc,[%sp+$bias+$frame+16]
+ std $nlod,[%sp+$bias+$frame+24]
+
+ addcc $j,8,$j
+ bnz,pt %icc,.L1st
+ add $tp,8,$tp
+
+.L1stskip:
+ fdtox $dota,$dota
+ fdtox $dotb,$dotb
+
+ ldx [%sp+$bias+$frame+0],%o0
+ ldx [%sp+$bias+$frame+8],%o1
+ ldx [%sp+$bias+$frame+16],%o2
+ ldx [%sp+$bias+$frame+24],%o3
+
+ srlx %o0,16,%o7
+ std $dota,[%sp+$bias+$frame+32]
+ add %o7,%o1,%o1
+ std $dotb,[%sp+$bias+$frame+40]
+ srlx %o1,16,%o7
+ add %o7,%o2,%o2
+ srlx %o2,16,%o7
+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+ and %o0,$mask,%o0
+ and %o1,$mask,%o1
+ and %o2,$mask,%o2
+ sllx %o1,16,%o1
+ sllx %o2,32,%o2
+ sllx %o3,48,%o7
+ or %o1,%o0,%o0
+ or %o2,%o0,%o0
+ or %o7,%o0,%o0 ! 64-bit result
+ ldx [%sp+$bias+$frame+32],%o4
+ addcc %g1,%o0,%o0
+ ldx [%sp+$bias+$frame+40],%o5
+ srlx %o3,16,%g1 ! 34-bit carry
+ bcs,a %xcc,.+8
+ add %g1,1,%g1
+
+ stx %o0,[$tp] ! tp[j-1]=
+ add $tp,8,$tp
+
+ srlx %o4,16,%o7
+ add %o7,%o5,%o5
+ and %o4,$mask,%o4
+ sllx %o5,16,%o7
+ or %o7,%o4,%o4
+ addcc %g1,%o4,%o4
+ srlx %o5,48,%g1
+ bcs,a %xcc,.+8
+ add %g1,1,%g1
+
+ mov %g1,$carry
+ stx %o4,[$tp] ! tp[num-1]=
+
+ ba .Louter
+ add $i,8,$i
+.align 32
+.Louter:
+ sub %g0,$num,$j ! j=-num
+ add %sp,$bias+$frame+$locals,$tp
+
+ add $ap,$j,%o3
+ add $bp,$i,%o4
+
+ ld [%o3+4],%g1 ! bp[i]
+ ld [%o3+0],%o0
+ ld [%o4+4],%g5 ! ap[0]
+ sllx %g1,32,%g1
+ ld [%o4+0],%o1
+ sllx %g5,32,%g5
+ or %g1,%o0,%o0
+ or %g5,%o1,%o1
+
+ ldx [$tp],%o2 ! tp[0]
+ mulx %o1,%o0,%o0
+ addcc %o2,%o0,%o0
+ mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
+ stx %o0,[%sp+$bias+$frame+0]
+
+ ! transfer b[i] to FPU as 4x16-bit values
+ ldda [%o4+2]%asi,$ba
+ ldda [%o4+0]%asi,$bb
+ ldda [%o4+6]%asi,$bc
+ ldda [%o4+4]%asi,$bd
+
+ ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
+ ldda [%sp+$bias+$frame+6]%asi,$na
+ fxtod $ba,$ba
+ ldda [%sp+$bias+$frame+4]%asi,$nb
+ fxtod $bb,$bb
+ ldda [%sp+$bias+$frame+2]%asi,$nc
+ fxtod $bc,$bc
+ ldda [%sp+$bias+$frame+0]%asi,$nd
+ fxtod $bd,$bd
+ ldd [$ap_l+$j],$alo ! load a[j] in double format
+ fxtod $na,$na
+ ldd [$ap_h+$j],$ahi
+ fxtod $nb,$nb
+ ldd [$np_l+$j],$nlo ! load n[j] in double format
+ fxtod $nc,$nc
+ ldd [$np_h+$j],$nhi
+ fxtod $nd,$nd
+
+ fmuld $alo,$ba,$aloa
+ fmuld $nlo,$na,$nloa
+ fmuld $alo,$bb,$alob
+ fmuld $nlo,$nb,$nlob
+ fmuld $alo,$bc,$aloc
+ faddd $aloa,$nloa,$nloa
+ fmuld $nlo,$nc,$nloc
+ fmuld $alo,$bd,$alod
+ faddd $alob,$nlob,$nlob
+ fmuld $nlo,$nd,$nlod
+ fmuld $ahi,$ba,$ahia
+ faddd $aloc,$nloc,$nloc
+ fmuld $nhi,$na,$nhia
+ fmuld $ahi,$bb,$ahib
+ faddd $alod,$nlod,$nlod
+ fmuld $nhi,$nb,$nhib
+ fmuld $ahi,$bc,$ahic
+ faddd $ahia,$nhia,$nhia
+ fmuld $nhi,$nc,$nhic
+ fmuld $ahi,$bd,$ahid
+ faddd $ahib,$nhib,$nhib
+ fmuld $nhi,$nd,$nhid
+
+ faddd $ahic,$nhic,$dota ! $nhic
+ faddd $ahid,$nhid,$dotb ! $nhid
+
+ faddd $nloc,$nhia,$nloc
+ faddd $nlod,$nhib,$nlod
+
+ fdtox $nloa,$nloa
+ fdtox $nlob,$nlob
+ fdtox $nloc,$nloc
+ fdtox $nlod,$nlod
+
+ std $nloa,[%sp+$bias+$frame+0]
+ std $nlob,[%sp+$bias+$frame+8]
+ std $nloc,[%sp+$bias+$frame+16]
+ add $j,8,$j
+ std $nlod,[%sp+$bias+$frame+24]
+
+ ldd [$ap_l+$j],$alo ! load a[j] in double format
+ ldd [$ap_h+$j],$ahi
+ ldd [$np_l+$j],$nlo ! load n[j] in double format
+ ldd [$np_h+$j],$nhi
+
+ fmuld $alo,$ba,$aloa
+ fmuld $nlo,$na,$nloa
+ fmuld $alo,$bb,$alob
+ fmuld $nlo,$nb,$nlob
+ fmuld $alo,$bc,$aloc
+ ldx [%sp+$bias+$frame+0],%o0
+ faddd $aloa,$nloa,$nloa
+ fmuld $nlo,$nc,$nloc
+ ldx [%sp+$bias+$frame+8],%o1
+ fmuld $alo,$bd,$alod
+ ldx [%sp+$bias+$frame+16],%o2
+ faddd $alob,$nlob,$nlob
+ fmuld $nlo,$nd,$nlod
+ ldx [%sp+$bias+$frame+24],%o3
+ fmuld $ahi,$ba,$ahia
+
+ srlx %o0,16,%o7
+ faddd $aloc,$nloc,$nloc
+ fmuld $nhi,$na,$nhia
+ add %o7,%o1,%o1
+ fmuld $ahi,$bb,$ahib
+ srlx %o1,16,%o7
+ faddd $alod,$nlod,$nlod
+ fmuld $nhi,$nb,$nhib
+ add %o7,%o2,%o2
+ fmuld $ahi,$bc,$ahic
+ srlx %o2,16,%o7
+ faddd $ahia,$nhia,$nhia
+ fmuld $nhi,$nc,$nhic
+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+ ! why?
+ and %o0,$mask,%o0
+ fmuld $ahi,$bd,$ahid
+ and %o1,$mask,%o1
+ and %o2,$mask,%o2
+ faddd $ahib,$nhib,$nhib
+ fmuld $nhi,$nd,$nhid
+ sllx %o1,16,%o1
+ faddd $dota,$nloa,$nloa
+ sllx %o2,32,%o2
+ faddd $dotb,$nlob,$nlob
+ sllx %o3,48,%o7
+ or %o1,%o0,%o0
+ faddd $ahic,$nhic,$dota ! $nhic
+ or %o2,%o0,%o0
+ faddd $ahid,$nhid,$dotb ! $nhid
+ or %o7,%o0,%o0 ! 64-bit result
+ ldx [$tp],%o7
+ faddd $nloc,$nhia,$nloc
+ addcc %o7,%o0,%o0
+ ! end-of-why?
+ faddd $nlod,$nhib,$nlod
+ srlx %o3,16,%g1 ! 34-bit carry
+ fdtox $nloa,$nloa
+ bcs,a %xcc,.+8
+ add %g1,1,%g1
+
+ fdtox $nlob,$nlob
+ fdtox $nloc,$nloc
+ fdtox $nlod,$nlod
+
+ std $nloa,[%sp+$bias+$frame+0]
+ std $nlob,[%sp+$bias+$frame+8]
+ addcc $j,8,$j
+ std $nloc,[%sp+$bias+$frame+16]
+ bz,pn %icc,.Linnerskip
+ std $nlod,[%sp+$bias+$frame+24]
+
+ ba .Linner
+ nop
+.align 32
+.Linner:
+ ldd [$ap_l+$j],$alo ! load a[j] in double format
+ ldd [$ap_h+$j],$ahi
+ ldd [$np_l+$j],$nlo ! load n[j] in double format
+ ldd [$np_h+$j],$nhi
+
+ fmuld $alo,$ba,$aloa
+ fmuld $nlo,$na,$nloa
+ fmuld $alo,$bb,$alob
+ fmuld $nlo,$nb,$nlob
+ fmuld $alo,$bc,$aloc
+ ldx [%sp+$bias+$frame+0],%o0
+ faddd $aloa,$nloa,$nloa
+ fmuld $nlo,$nc,$nloc
+ ldx [%sp+$bias+$frame+8],%o1
+ fmuld $alo,$bd,$alod
+ ldx [%sp+$bias+$frame+16],%o2
+ faddd $alob,$nlob,$nlob
+ fmuld $nlo,$nd,$nlod
+ ldx [%sp+$bias+$frame+24],%o3
+ fmuld $ahi,$ba,$ahia
+
+ srlx %o0,16,%o7
+ faddd $aloc,$nloc,$nloc
+ fmuld $nhi,$na,$nhia
+ add %o7,%o1,%o1
+ fmuld $ahi,$bb,$ahib
+ srlx %o1,16,%o7
+ faddd $alod,$nlod,$nlod
+ fmuld $nhi,$nb,$nhib
+ add %o7,%o2,%o2
+ fmuld $ahi,$bc,$ahic
+ srlx %o2,16,%o7
+ faddd $ahia,$nhia,$nhia
+ fmuld $nhi,$nc,$nhic
+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+ and %o0,$mask,%o0
+ fmuld $ahi,$bd,$ahid
+ and %o1,$mask,%o1
+ and %o2,$mask,%o2
+ faddd $ahib,$nhib,$nhib
+ fmuld $nhi,$nd,$nhid
+ sllx %o1,16,%o1
+ faddd $dota,$nloa,$nloa
+ sllx %o2,32,%o2
+ faddd $dotb,$nlob,$nlob
+ sllx %o3,48,%o7
+ or %o1,%o0,%o0
+ faddd $ahic,$nhic,$dota ! $nhic
+ or %o2,%o0,%o0
+ faddd $ahid,$nhid,$dotb ! $nhid
+ or %o7,%o0,%o0 ! 64-bit result
+ faddd $nloc,$nhia,$nloc
+ addcc %g1,%o0,%o0
+ ldx [$tp+8],%o7 ! tp[j]
+ faddd $nlod,$nhib,$nlod
+ srlx %o3,16,%g1 ! 34-bit carry
+ fdtox $nloa,$nloa
+ bcs,a %xcc,.+8
+ add %g1,1,%g1
+ fdtox $nlob,$nlob
+ addcc %o7,%o0,%o0
+ fdtox $nloc,$nloc
+ bcs,a %xcc,.+8
+ add %g1,1,%g1
+
+ stx %o0,[$tp] ! tp[j-1]
+ fdtox $nlod,$nlod
+
+ std $nloa,[%sp+$bias+$frame+0]
+ std $nlob,[%sp+$bias+$frame+8]
+ std $nloc,[%sp+$bias+$frame+16]
+ addcc $j,8,$j
+ std $nlod,[%sp+$bias+$frame+24]
+ bnz,pt %icc,.Linner
+ add $tp,8,$tp
+
+.Linnerskip:
+ fdtox $dota,$dota
+ fdtox $dotb,$dotb
+
+ ldx [%sp+$bias+$frame+0],%o0
+ ldx [%sp+$bias+$frame+8],%o1
+ ldx [%sp+$bias+$frame+16],%o2
+ ldx [%sp+$bias+$frame+24],%o3
+
+ srlx %o0,16,%o7
+ std $dota,[%sp+$bias+$frame+32]
+ add %o7,%o1,%o1
+ std $dotb,[%sp+$bias+$frame+40]
+ srlx %o1,16,%o7
+ add %o7,%o2,%o2
+ srlx %o2,16,%o7
+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
+ and %o0,$mask,%o0
+ and %o1,$mask,%o1
+ and %o2,$mask,%o2
+ sllx %o1,16,%o1
+ sllx %o2,32,%o2
+ sllx %o3,48,%o7
+ or %o1,%o0,%o0
+ or %o2,%o0,%o0
+ ldx [%sp+$bias+$frame+32],%o4
+ or %o7,%o0,%o0 ! 64-bit result
+ ldx [%sp+$bias+$frame+40],%o5
+ addcc %g1,%o0,%o0
+ ldx [$tp+8],%o7 ! tp[j]
+ srlx %o3,16,%g1 ! 34-bit carry
+ bcs,a %xcc,.+8
+ add %g1,1,%g1
+
+ addcc %o7,%o0,%o0
+ bcs,a %xcc,.+8
+ add %g1,1,%g1
+
+ stx %o0,[$tp] ! tp[j-1]
+ add $tp,8,$tp
+
+ srlx %o4,16,%o7
+ add %o7,%o5,%o5
+ and %o4,$mask,%o4
+ sllx %o5,16,%o7
+ or %o7,%o4,%o4
+ addcc %g1,%o4,%o4
+ srlx %o5,48,%g1
+ bcs,a %xcc,.+8
+ add %g1,1,%g1
+
+ addcc $carry,%o4,%o4
+ stx %o4,[$tp] ! tp[num-1]
+ mov %g1,$carry
+ bcs,a %xcc,.+8
+ add $carry,1,$carry
+
+ addcc $i,8,$i
+ bnz %icc,.Louter
+ nop
+
+ add $tp,8,$tp ! adjust tp to point at the end
+ orn %g0,%g0,%g4
+ sub %g0,$num,%o7 ! n=-num
+ ba .Lsub
+ subcc %g0,%g0,%g0 ! clear %icc.c
+
+.align 32
+.Lsub:
+ ldx [$tp+%o7],%o0
+ add $np,%o7,%g1
+ ld [%g1+0],%o2
+ ld [%g1+4],%o3
+ srlx %o0,32,%o1
+ subccc %o0,%o2,%o2
+ add $rp,%o7,%g1
+ subccc %o1,%o3,%o3
+ st %o2,[%g1+0]
+ add %o7,8,%o7
+ brnz,pt %o7,.Lsub
+ st %o3,[%g1+4]
+ subc $carry,0,%g4
+ sub %g0,$num,%o7 ! n=-num
+ ba .Lcopy
+ nop
+
+.align 32
+.Lcopy:
+ ldx [$tp+%o7],%o0
+ add $rp,%o7,%g1
+ ld [%g1+0],%o2
+ ld [%g1+4],%o3
+ stx %g0,[$tp+%o7]
+ and %o0,%g4,%o0
+ srlx %o0,32,%o1
+ andn %o2,%g4,%o2
+ andn %o3,%g4,%o3
+ or %o2,%o0,%o0
+ or %o3,%o1,%o1
+ st %o0,[%g1+0]
+ add %o7,8,%o7
+ brnz,pt %o7,.Lcopy
+ st %o1,[%g1+4]
+ sub %g0,$num,%o7 ! n=-num
+
+.Lzap:
+ stx %g0,[$ap_l+%o7]
+ stx %g0,[$ap_h+%o7]
+ stx %g0,[$np_l+%o7]
+ stx %g0,[$np_h+%o7]
+ add %o7,8,%o7
+ brnz,pt %o7,.Lzap
+ nop
+
+ ldx [%sp+$bias+$frame+48],%o7
+ wr %g0,%o7,%asi ! restore %asi
+
+ mov 1,%i0
+.Lret:
+ ret
+ restore
+.type $fname,#function
+.size $fname,(.-$fname)
+.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
+.align 32
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+# Below substitution makes it possible to compile without demanding
+# VIS extentions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
+# dare to do this, because VIS capability is detected at run-time now
+# and this routine is not called on CPU not capable to execute it. Do
+# note that fzeros is not the only VIS dependency! Another dependency
+# is implicit and is just _a_ numerical value loaded to %asi register,
+# which assembler can't recognize as VIS specific...
+$code =~ s/fzeros\s+%f([0-9]+)/
+ sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
+ /gem;
+
+print $code;
+# flush
+close STDOUT;
diff --git a/crypto/bn/asm/via-mont.pl b/crypto/bn/asm/via-mont.pl
new file mode 100644
index 0000000000..c046a514c8
--- /dev/null
+++ b/crypto/bn/asm/via-mont.pl
@@ -0,0 +1,242 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# Wrapper around 'rep montmul', VIA-specific instruction accessing
+# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
+# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
+#
+# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
+# different software configurations on 1.5GHz VIA Esther processor.
+# Lines marked with "software integer" denote performance of hand-
+# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
+# refers to hand-coded SSE2 Montgomery multiplication procedure found
+# OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
+# Padlock SDK 2.0.1 available for download from VIA, which naturally
+# utilizes the magic 'repz montmul' instruction. And finally "hardware
+# this" refers to *this* implementation which also uses 'repz montmul'
+#
+# sign verify sign/s verify/s
+# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
+# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
+# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
+# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
+#
+# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
+# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
+# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
+# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
+#
+# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
+# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
+# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
+# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
+#
+# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
+# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
+# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
+# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
+#
+# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
+# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
+# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
+# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
+#
+# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
+# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
+# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
+# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
+#
+# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
+# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
+# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
+# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
+#
+# To give you some other reference point here is output for 2.4GHz P4
+# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
+# SSE2" in above terms.
+#
+# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
+# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
+# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
+# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
+# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
+# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
+# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
+#
+# Conclusions:
+# - VIA SDK leaves a *lot* of room for improvement (which this
+# implementation successfully fills:-);
+# - 'rep montmul' gives up to >3x performance improvement depending on
+# key length;
+# - in terms of absolute performance it delivers approximately as much
+# as modern out-of-order 32-bit cores [again, for longer keys].
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"via-mont.pl");
+
+# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
+$func="bn_mul_mont_padlock";
+
+$pad=16*1; # amount of reserved bytes on top of every vector
+
+# stack layout
+$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
+$A=&DWP(4,"esp");
+$B=&DWP(8,"esp");
+$T=&DWP(12,"esp");
+$M=&DWP(16,"esp");
+$scratch=&DWP(20,"esp");
+$rp=&DWP(24,"esp"); # these are mine
+$sp=&DWP(28,"esp");
+# &DWP(32,"esp") # 32 byte scratch area
+# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
+# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
+# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
+# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
+# Note that SDK suggests to unconditionally allocate 2K per vector. This
+# has quite an impact on performance. It naturally depends on key length,
+# but to give an example 1024 bit private RSA key operations suffer >30%
+# penalty. I allocate only as much as actually required...
+
+&function_begin($func);
+ &xor ("eax","eax");
+ &mov ("ecx",&wparam(5)); # num
+ # meet VIA's limitations for num [note that the specification
+ # expresses them in bits, while we work with amount of 32-bit words]
+ &test ("ecx",3);
+ &jnz (&label("leave")); # num % 4 != 0
+ &cmp ("ecx",8);
+ &jb (&label("leave")); # num < 8
+ &cmp ("ecx",1024);
+ &ja (&label("leave")); # num > 1024
+
+ &pushf ();
+ &cld ();
+
+ &mov ("edi",&wparam(0)); # rp
+ &mov ("eax",&wparam(1)); # ap
+ &mov ("ebx",&wparam(2)); # bp
+ &mov ("edx",&wparam(3)); # np
+ &mov ("esi",&wparam(4)); # n0
+ &mov ("esi",&DWP(0,"esi")); # *n0
+
+ &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
+ &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
+ &neg ("ebp");
+ &add ("ebp","esp");
+ &and ("ebp",-64); # align to cache-line
+ &xchg ("ebp","esp"); # alloca
+
+ &mov ($rp,"edi"); # save rp
+ &mov ($sp,"ebp"); # save esp
+
+ &mov ($mZeroPrime,"esi");
+ &lea ("esi",&DWP(64,"esp")); # tp
+ &mov ($T,"esi");
+ &lea ("edi",&DWP(32,"esp")); # scratch area
+ &mov ($scratch,"edi");
+ &mov ("esi","eax");
+
+ &lea ("ebp",&DWP(-$pad,"ecx"));
+ &shr ("ebp",2); # restore original num value in ebp
+
+ &xor ("eax","eax");
+
+ &mov ("ecx","ebp");
+ &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
+ &data_byte(0xf3,0xab); # rep stosl, bzero
+
+ &mov ("ecx","ebp");
+ &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
+ &mov ($A,"edi");
+ &data_byte(0xf3,0xa5); # rep movsl, memcpy
+ &mov ("ecx",$pad/4);
+ &data_byte(0xf3,0xab); # rep stosl, bzero pad
+ # edi points at the end of padded ap copy...
+
+ &mov ("ecx","ebp");
+ &mov ("esi","ebx");
+ &mov ($B,"edi");
+ &data_byte(0xf3,0xa5); # rep movsl, memcpy
+ &mov ("ecx",$pad/4);
+ &data_byte(0xf3,0xab); # rep stosl, bzero pad
+ # edi points at the end of padded bp copy...
+
+ &mov ("ecx","ebp");
+ &mov ("esi","edx");
+ &mov ($M,"edi");
+ &data_byte(0xf3,0xa5); # rep movsl, memcpy
+ &mov ("ecx",$pad/4);
+ &data_byte(0xf3,0xab); # rep stosl, bzero pad
+ # edi points at the end of padded np copy...
+
+ # let magic happen...
+ &mov ("ecx","ebp");
+ &mov ("esi","esp");
+ &shl ("ecx",5); # convert word counter to bit counter
+ &align (4);
+ &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
+
+ &mov ("ecx","ebp");
+ &lea ("esi",&DWP(64,"esp")); # tp
+ # edi still points at the end of padded np copy...
+ &neg ("ebp");
+ &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
+ &mov ("edi",$rp); # restore rp
+ &xor ("edx","edx"); # i=0 and clear CF
+
+&set_label("sub",8);
+ &mov ("eax",&DWP(0,"esi","edx",4));
+ &sbb ("eax",&DWP(0,"ebp","edx",4));
+ &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
+ &lea ("edx",&DWP(1,"edx")); # i++
+ &loop (&label("sub")); # doesn't affect CF!
+
+ &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
+ &sbb ("eax",0);
+ &and ("esi","eax");
+ &not ("eax");
+ &mov ("ebp","edi");
+ &and ("ebp","eax");
+ &or ("esi","ebp"); # tp=carry?tp:rp
+
+ &mov ("ecx","edx"); # num
+ &xor ("edx","edx"); # i=0
+
+&set_label("copy",8);
+ &mov ("eax",&DWP(0,"esi","edx",4));
+ &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
+ &mov (&DWP(0,"edi","edx",4),"eax");
+ &lea ("edx",&DWP(1,"edx")); # i++
+ &loop (&label("copy"));
+
+ &mov ("ebp",$sp);
+ &xor ("eax","eax");
+
+ &mov ("ecx",64/4);
+ &mov ("edi","esp"); # zap frame including scratch area
+ &data_byte(0xf3,0xab); # rep stosl, bzero
+
+ # zap copies of ap, bp and np
+ &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
+ &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
+ &data_byte(0xf3,0xab); # rep stosl, bzero
+
+ &mov ("esp","ebp");
+ &inc ("eax"); # signal "done"
+ &popf ();
+&set_label("leave");
+&function_end($func);
+
+&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
new file mode 100755
index 0000000000..5cd3cd2ed5
--- /dev/null
+++ b/crypto/bn/asm/x86-mont.pl
@@ -0,0 +1,591 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# This is a "teaser" code, as it can be improved in several ways...
+# First of all non-SSE2 path should be implemented (yes, for now it
+# performs Montgomery multiplication/convolution only on SSE2-capable
+# CPUs such as P4, others fall down to original code). Then inner loop
+# can be unrolled and modulo-scheduled to improve ILP and possibly
+# moved to 128-bit XMM register bank (though it would require input
+# rearrangement and/or increase bus bandwidth utilization). Dedicated
+# squaring procedure should give further performance improvement...
+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
+
+# December 2006
+#
+# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
+# Integer-only code [being equipped with dedicated squaring procedure]
+# gives ~40% on rsa512 sign benchmark...
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&function_begin("bn_mul_mont");
+
+$i="edx";
+$j="ecx";
+$ap="esi"; $tp="esi"; # overlapping variables!!!
+$rp="edi"; $bp="edi"; # overlapping variables!!!
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp"); # stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32; # size of above frame rounded up to 16n
+
+ &xor ("eax","eax");
+ &mov ("edi",&wparam(5)); # int num
+ &cmp ("edi",4);
+ &jl (&label("just_leave"));
+
+ &lea ("esi",&wparam(0)); # put aside pointer to argument block
+ &lea ("edx",&wparam(1)); # load ap
+ &mov ("ebp","esp"); # saved stack pointer!
+ &add ("edi",2); # extra two words on top of tp
+ &neg ("edi");
+ &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
+ &neg ("edi");
+
+ # minimize cache contention by arraning 2K window between stack
+ # pointer and ap argument [np is also position sensitive vector,
+ # but it's assumed to be near ap, as it's allocated at ~same
+ # time].
+ &mov ("eax","esp");
+ &sub ("eax","edx");
+ &and ("eax",2047);
+ &sub ("esp","eax"); # this aligns sp and ap modulo 2048
+
+ &xor ("edx","esp");
+ &and ("edx",2048);
+ &xor ("edx",2048);
+ &sub ("esp","edx"); # this splits them apart modulo 4096
+
+ &and ("esp",-64); # align to cache line
+
+ ################################# load argument block...
+ &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+ &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+ &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+ &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+ &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+ #&mov ("edi",&DWP(5*4,"esi"));# int num
+
+ &mov ("esi",&DWP(0,"esi")); # pull n0[0]
+ &mov ($_rp,"eax"); # ... save a copy of argument block
+ &mov ($_ap,"ebx");
+ &mov ($_bp,"ecx");
+ &mov ($_np,"edx");
+ &mov ($_n0,"esi");
+ &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
+ #&mov ($_num,$num); # redundant as $num is not reused
+ &mov ($_sp,"ebp"); # saved stack pointer!
+
+if($sse2) {
+$acc0="mm0"; # mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+ &picmeup("eax","OPENSSL_ia32cap_P");
+ &bt (&DWP(0,"eax"),26);
+ &jnc (&label("non_sse2"));
+
+ &mov ("eax",-1);
+ &movd ($mask,"eax"); # mask 32 lower bits
+
+ &mov ($ap,$_ap); # load input pointers
+ &mov ($bp,$_bp);
+ &mov ($np,$_np);
+
+ &xor ($i,$i); # i=0
+ &xor ($j,$j); # j=0
+
+ &movd ($mul0,&DWP(0,$bp)); # bp[0]
+ &movd ($mul1,&DWP(0,$ap)); # ap[0]
+ &movd ($car1,&DWP(0,$np)); # np[0]
+
+ &pmuludq($mul1,$mul0); # ap[0]*bp[0]
+ &movq ($car0,$mul1);
+ &movq ($acc0,$mul1); # I wish movd worked for
+ &pand ($acc0,$mask); # inter-register transfers
+
+ &pmuludq($mul1,$_n0q); # *=n0
+
+ &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
+ &paddq ($car1,$acc0);
+
+ &movd ($acc1,&DWP(4,$np)); # np[1]
+ &movd ($acc0,&DWP(4,$ap)); # ap[1]
+
+ &psrlq ($car0,32);
+ &psrlq ($car1,32);
+
+ &inc ($j); # j++
+&set_label("1st",16);
+ &pmuludq($acc0,$mul0); # ap[j]*bp[0]
+ &pmuludq($acc1,$mul1); # np[j]*m1
+ &paddq ($car0,$acc0); # +=c0
+ &paddq ($car1,$acc1); # +=c1
+
+ &movq ($acc0,$car0);
+ &pand ($acc0,$mask);
+ &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
+ &paddq ($car1,$acc0); # +=ap[j]*bp[0];
+ &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
+ &psrlq ($car0,32);
+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
+ &psrlq ($car1,32);
+
+ &lea ($j,&DWP(1,$j));
+ &cmp ($j,$num);
+ &jl (&label("1st"));
+
+ &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
+ &pmuludq($acc1,$mul1); # np[num-1]*m1
+ &paddq ($car0,$acc0); # +=c0
+ &paddq ($car1,$acc1); # +=c1
+
+ &movq ($acc0,$car0);
+ &pand ($acc0,$mask);
+ &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
+
+ &psrlq ($car0,32);
+ &psrlq ($car1,32);
+
+ &paddq ($car1,$car0);
+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
+
+ &inc ($i); # i++
+&set_label("outer");
+ &xor ($j,$j); # j=0
+
+ &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
+ &movd ($mul1,&DWP(0,$ap)); # ap[0]
+ &movd ($temp,&DWP($frame,"esp")); # tp[0]
+ &movd ($car1,&DWP(0,$np)); # np[0]
+ &pmuludq($mul1,$mul0); # ap[0]*bp[i]
+
+ &paddq ($mul1,$temp); # +=tp[0]
+ &movq ($acc0,$mul1);
+ &movq ($car0,$mul1);
+ &pand ($acc0,$mask);
+
+ &pmuludq($mul1,$_n0q); # *=n0
+
+ &pmuludq($car1,$mul1);
+ &paddq ($car1,$acc0);
+
+ &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
+ &movd ($acc1,&DWP(4,$np)); # np[1]
+ &movd ($acc0,&DWP(4,$ap)); # ap[1]
+
+ &psrlq ($car0,32);
+ &psrlq ($car1,32);
+ &paddq ($car0,$temp); # +=tp[1]
+
+ &inc ($j); # j++
+ &dec ($num);
+&set_label("inner");
+ &pmuludq($acc0,$mul0); # ap[j]*bp[i]
+ &pmuludq($acc1,$mul1); # np[j]*m1
+ &paddq ($car0,$acc0); # +=c0
+ &paddq ($car1,$acc1); # +=c1
+
+ &movq ($acc0,$car0);
+ &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+ &pand ($acc0,$mask);
+ &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
+ &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
+ &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
+ &psrlq ($car0,32);
+ &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+ &psrlq ($car1,32);
+ &paddq ($car0,$temp); # +=tp[j+1]
+
+ &dec ($num);
+ &lea ($j,&DWP(1,$j)); # j++
+ &jnz (&label("inner"));
+
+ &mov ($num,$j);
+ &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
+ &pmuludq($acc1,$mul1); # np[num-1]*m1
+ &paddq ($car0,$acc0); # +=c0
+ &paddq ($car1,$acc1); # +=c1
+
+ &movq ($acc0,$car0);
+ &pand ($acc0,$mask);
+ &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
+ &psrlq ($car0,32);
+ &psrlq ($car1,32);
+
+ &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
+ &paddq ($car1,$car0);
+ &paddq ($car1,$temp);
+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
+
+ &lea ($i,&DWP(1,$i)); # i++
+ &cmp ($i,$num);
+ &jle (&label("outer"));
+
+ &emms (); # done with mmx bank
+ &jmp (&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+
+if (0) {
+ &mov ("esp",$_sp);
+ &xor ("eax","eax"); # signal "not fast enough [yet]"
+ &jmp (&label("just_leave"));
+ # While the below code provides competitive performance for
+ # all key lengthes on modern Intel cores, it's still more
+ # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
+ # means compared to the original integer-only assembler.
+ # 512-bit RSA sign is better by ~40%, but that's about all
+ # one can say about all CPUs...
+} else {
+$inp="esi"; # integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+ &mov ($inp,$_ap);
+ &lea ($carry,&DWP(1,$num));
+ &mov ($word,$_bp);
+ &xor ($j,$j); # j=0
+ &mov ("edx",$inp);
+ &and ($carry,1); # see if num is even
+ &sub ("edx",$word); # see if ap==bp
+ &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
+ &or ($carry,"edx");
+ &mov ($word,&DWP(0,$word)); # bp[0]
+ &jz (&label("bn_sqr_mont"));
+ &mov ($_bpend,"eax");
+ &mov ("eax",&DWP(0,$inp));
+ &xor ("edx","edx");
+
+&set_label("mull",16);
+ &mov ($carry,"edx");
+ &mul ($word); # ap[j]*bp[0]
+ &add ($carry,"eax");
+ &lea ($j,&DWP(1,$j));
+ &adc ("edx",0);
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
+ &cmp ($j,$num);
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
+ &jl (&label("mull"));
+
+ &mov ($carry,"edx");
+ &mul ($word); # ap[num-1]*bp[0]
+ &mov ($word,$_n0);
+ &add ("eax",$carry);
+ &mov ($inp,$_np);
+ &adc ("edx",0);
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
+
+ &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
+ &xor ($j,$j);
+ &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
+ &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
+
+ &mov ("eax",&DWP(0,$inp)); # np[0]
+ &mul ($word); # np[0]*m
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
+ &mov ("eax",&DWP(4,$inp)); # np[1]
+ &adc ("edx",0);
+ &inc ($j);
+
+ &jmp (&label("2ndmadd"));
+
+&set_label("1stmadd",16);
+ &mov ($carry,"edx");
+ &mul ($word); # ap[j]*bp[i]
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
+ &lea ($j,&DWP(1,$j));
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
+ &adc ("edx",0);
+ &cmp ($j,$num);
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
+ &jl (&label("1stmadd"));
+
+ &mov ($carry,"edx");
+ &mul ($word); # ap[num-1]*bp[i]
+ &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
+ &mov ($word,$_n0);
+ &adc ("edx",0);
+ &mov ($inp,$_np);
+ &add ($carry,"eax");
+ &adc ("edx",0);
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
+
+ &xor ($j,$j);
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
+ &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
+ &adc ($j,0);
+ &mov ("eax",&DWP(0,$inp)); # np[0]
+ &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
+ &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
+
+ &mul ($word); # np[0]*m
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
+ &mov ("eax",&DWP(4,$inp)); # np[1]
+ &adc ("edx",0);
+ &mov ($j,1);
+
+&set_label("2ndmadd",16);
+ &mov ($carry,"edx");
+ &mul ($word); # np[j]*m
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
+ &lea ($j,&DWP(1,$j));
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
+ &adc ("edx",0);
+ &cmp ($j,$num);
+ &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
+ &jl (&label("2ndmadd"));
+
+ &mov ($carry,"edx");
+ &mul ($word); # np[j]*m
+ &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &adc ("edx",0);
+ &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
+
+ &xor ("eax","eax");
+ &mov ($j,$_bp); # &bp[i]
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
+ &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
+ &lea ($j,&DWP(4,$j));
+ &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
+ &cmp ($j,$_bpend);
+ &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
+ &je (&label("common_tail"));
+
+ &mov ($word,&DWP(0,$j)); # bp[i+1]
+ &mov ($inp,$_ap);
+ &mov ($_bp,$j); # &bp[++i]
+ &xor ($j,$j);
+ &xor ("edx","edx");
+ &mov ("eax",&DWP(0,$inp));
+ &jmp (&label("1stmadd"));
+
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+ &mov ($_num,$num);
+ &mov ($_bp,$j); # i=0
+
+ &mov ("eax",$word); # ap[0]
+ &mul ($word); # ap[0]*ap[0]
+ &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
+ &mov ($sbit,"edx");
+ &shr ("edx",1);
+ &and ($sbit,1);
+ &inc ($j);
+&set_label("sqr",16);
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
+ &mov ($carry,"edx");
+ &mul ($word); # ap[j]*ap[0]
+ &add ("eax",$carry);
+ &lea ($j,&DWP(1,$j));
+ &adc ("edx",0);
+ &lea ($carry,&DWP(0,$sbit,"eax",2));
+ &shr ("eax",31);
+ &cmp ($j,$_num);
+ &mov ($sbit,"eax");
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
+ &jl (&label("sqr"));
+
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
+ &mov ($carry,"edx");
+ &mul ($word); # ap[num-1]*ap[0]
+ &add ("eax",$carry);
+ &mov ($word,$_n0);
+ &adc ("edx",0);
+ &mov ($inp,$_np);
+ &lea ($carry,&DWP(0,$sbit,"eax",2));
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
+ &shr ("eax",31);
+ &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
+
+ &lea ($carry,&DWP(0,"eax","edx",2));
+ &mov ("eax",&DWP(0,$inp)); # np[0]
+ &shr ("edx",31);
+ &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
+ &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
+
+ &mul ($word); # np[0]*m
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
+ &mov ($num,$j);
+ &adc ("edx",0);
+ &mov ("eax",&DWP(4,$inp)); # np[1]
+ &mov ($j,1);
+
+&set_label("3rdmadd",16);
+ &mov ($carry,"edx");
+ &mul ($word); # np[j]*m
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
+ &adc ("edx",0);
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
+
+ &mov ($carry,"edx");
+ &mul ($word); # np[j+1]*m
+ &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
+ &lea ($j,&DWP(2,$j));
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
+ &adc ("edx",0);
+ &cmp ($j,$num);
+ &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
+ &jl (&label("3rdmadd"));
+
+ &mov ($carry,"edx");
+ &mul ($word); # np[j]*m
+ &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
+ &adc ("edx",0);
+ &add ($carry,"eax");
+ &adc ("edx",0);
+ &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
+
+ &mov ($j,$_bp); # i
+ &xor ("eax","eax");
+ &mov ($inp,$_ap);
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
+ &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
+ &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
+ &cmp ($j,$num);
+ &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
+ &je (&label("common_tail"));
+
+ &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
+ &lea ($j,&DWP(1,$j));
+ &mov ("eax",$word);
+ &mov ($_bp,$j); # ++i
+ &mul ($word); # ap[i]*ap[i]
+ &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
+ &adc ("edx",0);
+ &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
+ &xor ($carry,$carry);
+ &cmp ($j,$num);
+ &lea ($j,&DWP(1,$j));
+ &je (&label("sqrlast"));
+
+ &mov ($sbit,"edx"); # zaps $num
+ &shr ("edx",1);
+ &and ($sbit,1);
+&set_label("sqradd",16);
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
+ &mov ($carry,"edx");
+ &mul ($word); # ap[j]*ap[i]
+ &add ("eax",$carry);
+ &lea ($carry,&DWP(0,"eax","eax"));
+ &adc ("edx",0);
+ &shr ("eax",31);
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
+ &lea ($j,&DWP(1,$j));
+ &adc ("eax",0);
+ &add ($carry,$sbit);
+ &adc ("eax",0);
+ &cmp ($j,$_num);
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
+ &mov ($sbit,"eax");
+ &jle (&label("sqradd"));
+
+ &mov ($carry,"edx");
+ &lea ("edx",&DWP(0,$sbit,"edx",2));
+ &shr ($carry,31);
+&set_label("sqrlast");
+ &mov ($word,$_n0);
+ &mov ($inp,$_np);
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
+
+ &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
+ &mov ("eax",&DWP(0,$inp)); # np[0]
+ &adc ($carry,0);
+ &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
+ &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
+
+ &mul ($word); # np[0]*m
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
+ &lea ($num,&DWP(-1,$j));
+ &adc ("edx",0);
+ &mov ($j,1);
+ &mov ("eax",&DWP(4,$inp)); # np[1]
+
+ &jmp (&label("3rdmadd"));
+}
+
+&set_label("common_tail",16);
+ &mov ($np,$_np); # load modulus pointer
+ &mov ($rp,$_rp); # load result pointer
+ &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
+
+ &mov ("eax",&DWP(0,$tp)); # tp[0]
+ &mov ($j,$num); # j=num-1
+ &xor ($i,$i); # i=0 and clear CF!
+
+&set_label("sub",16);
+ &sbb ("eax",&DWP(0,$np,$i,4));
+ &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
+ &dec ($j); # doesn't affect CF!
+ &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
+ &lea ($i,&DWP(1,$i)); # i++
+ &jge (&label("sub"));
+
+ &sbb ("eax",0); # handle upmost overflow bit
+ &and ($tp,"eax");
+ &not ("eax");
+ &mov ($np,$rp);
+ &and ($np,"eax");
+ &or ($tp,$np); # tp=carry?tp:rp
+
+&set_label("copy",16); # copy or in-place refresh
+ &mov ("eax",&DWP(0,$tp,$num,4));
+ &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
+ &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
+ &dec ($num);
+ &jge (&label("copy"));
+
+ &mov ("esp",$_sp); # pull saved stack pointer
+ &mov ("eax",1);
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index d13ec5a468..a0c49ee978 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -1,3 +1,7 @@
+#include "../bn_lcl.h"
+#if !(defined(__GNUC__) && __GNUC__>=2)
+# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
+#else
/*
* x86_64 BIGNUM accelerator version 0.1, December 2002.
*
@@ -51,7 +55,14 @@
* machine.
*/
+#ifdef _WIN64
+#define BN_ULONG unsigned long long
+#else
#define BN_ULONG unsigned long
+#endif
+
+#undef mul
+#undef mul_add
/*
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
@@ -94,7 +105,7 @@
: "a"(a) \
: "cc");
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1=0;
@@ -118,7 +129,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
return(c1);
}
-BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1=0;
@@ -141,7 +152,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
return(c1);
}
-void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
if (n <= 0) return;
@@ -172,21 +183,21 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
return ret;
}
-BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
+BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
if (n <= 0) return 0;
asm (
" subq %2,%2 \n"
- ".align 16 \n"
+ ".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" adcq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" leaq 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n"
- : "+a"(ret),"+c"(n),"+r"(i)
+ : "=&a"(ret),"+c"(n),"=&r"(i)
: "r"(rp),"r"(ap),"r"(bp)
: "cc"
);
@@ -195,21 +206,21 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
}
#ifndef SIMICS
-BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
+BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
if (n <= 0) return 0;
asm (
" subq %2,%2 \n"
- ".align 16 \n"
+ ".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" sbbq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" leaq 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n"
- : "+a"(ret),"+c"(n),"+r"(i)
+ : "=&a"(ret),"+c"(n),"=&r"(i)
: "r"(rp),"r"(ap),"r"(bp)
: "cc"
);
@@ -482,7 +493,7 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
r[7]=c2;
}
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
@@ -558,7 +569,7 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
r[15]=c1;
}
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
@@ -591,3 +602,4 @@ void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
r[6]=c1;
r[7]=c2;
}
+#endif
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
new file mode 100755
index 0000000000..3b7a6f243f
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -0,0 +1,330 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005.
+#
+# Montgomery multiplication routine for x86_64. While it gives modest
+# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
+# than twice, >2x, as fast. Most common rsa1024 sign is improved by
+# respectful 50%. It remains to be seen if loop unrolling and
+# dedicated squaring routine can provide further improvement...
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# int bn_mul_mont(
+$rp="%rdi"; # BN_ULONG *rp,
+$ap="%rsi"; # const BN_ULONG *ap,
+$bp="%rdx"; # const BN_ULONG *bp,
+$np="%rcx"; # const BN_ULONG *np,
+$n0="%r8"; # const BN_ULONG *n0,
+$num="%r9"; # int num);
+$lo0="%r10";
+$hi0="%r11";
+$bp="%r12"; # reassign $bp
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl bn_mul_mont
+.type bn_mul_mont,\@function,6
+.align 16
+bn_mul_mont:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,${num}d
+ lea 2($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+.Lprologue:
+ mov %rdx,$bp # $bp reassigned, remember?
+
+ mov ($n0),$n0 # pull n0[0] value
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$lo0
+ mov %rdx,$hi0
+
+ imulq $n0,%rax # "tp[0]"*n0
+ mov %rax,$m1
+
+ mulq ($np) # np[0]*m1
+ add $lo0,%rax # discarded
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+.L1st:
+ mov ($ap,$j,8),%rax
+ mulq $m0 # ap[j]*bp[0]
+ add $hi0,%rax
+ adc \$0,%rdx
+ mov %rax,$lo0
+ mov ($np,$j,8),%rax
+ mov %rdx,$hi0
+
+ mulq $m1 # np[j]*m1
+ add $hi1,%rax
+ lea 1($j),$j # j++
+ adc \$0,%rdx
+ add $lo0,%rax # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov %rax,-16(%rsp,$j,8) # tp[j-1]
+ cmp $num,$j
+ mov %rdx,$hi1
+ jl .L1st
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+.align 4
+.Louter:
+ xor $j,$j # j=0
+
+ mov ($bp,$i,8),$m0 # m0=bp[i]
+ mov ($ap),%rax # ap[0]
+ mulq $m0 # ap[0]*bp[i]
+ add (%rsp),%rax # ap[0]*bp[i]+tp[0]
+ adc \$0,%rdx
+ mov %rax,$lo0
+ mov %rdx,$hi0
+
+ imulq $n0,%rax # tp[0]*n0
+ mov %rax,$m1
+
+ mulq ($np,$j,8) # np[0]*m1
+ add $lo0,%rax # discarded
+ mov 8(%rsp),$lo0 # tp[1]
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+.align 4
+.Linner:
+ mov ($ap,$j,8),%rax
+ mulq $m0 # ap[j]*bp[i]
+ add $hi0,%rax
+ adc \$0,%rdx
+ add %rax,$lo0 # ap[j]*bp[i]+tp[j]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$hi0
+
+ mulq $m1 # np[j]*m1
+ add $hi1,%rax
+ lea 1($j),$j # j++
+ adc \$0,%rdx
+ add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov (%rsp,$j,8),$lo0
+ cmp $num,$j
+ mov %rax,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+ jl .Linner
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ add $lo0,$hi1 # pull upmost overflow bit
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ cmp $num,$i
+ jl .Louter
+
+ lea (%rsp),$ap # borrow ap for tp
+ lea -1($num),$j # j=num-1
+
+ mov ($ap),%rax # tp[0]
+ xor $i,$i # i=0 and clear CF!
+ jmp .Lsub
+.align 16
+.Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ dec $j # doesn't affect CF!
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+ jge .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ and %rax,$ap
+ not %rax
+ mov $rp,$np
+ and %rax,$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+.align 16
+.Lcopy: # copy or in-place refresh
+ mov ($ap,$j,8),%rax
+ mov %rax,($rp,$j,8) # rp[i]=tp[i]
+ mov $i,(%rsp,$j,8) # zap temporary vector
+ dec $j
+ jge .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+.Lepilogue:
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lprologue(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lepilogue(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+.Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_bn_mul_mont
+ .rva .LSEH_end_bn_mul_mont
+ .rva .LSEH_info_bn_mul_mont
+
+.section .xdata
+.align 8
+.LSEH_info_bn_mul_mont:
+ .byte 9,0,0,0
+ .rva se_handler
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
index acf48b9784..e484b7fc11 100644
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -56,6 +56,59 @@
* [including the GNU Public Licence.]
*/
/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
* Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
*
* Portions of the attached software ("Contribution") are developed by
@@ -77,6 +130,7 @@
#include <stdio.h> /* FILE */
#endif
#include <openssl/ossl_typ.h>
+#include <openssl/crypto.h>
#ifdef __cplusplus
extern "C" {
@@ -90,21 +144,15 @@ extern "C" {
* BN_DEBUG - turn on various debugging alterations to the bignum code
* BN_DEBUG_RAND - uses random poisoning of unused words to trip up
* mismanagement of bignum internals. You must also define BN_DEBUG.
- * BN_STRICT - disables anything (not already caught by BN_DEBUG) that uses the
- * old ambiguity over zero representation. At some point, this behaviour should
- * become standard.
*/
/* #define BN_DEBUG */
/* #define BN_DEBUG_RAND */
-/* #define BN_STRICT */
-
-#ifdef OPENSSL_SYS_VMS
-#undef BN_LLONG /* experimental, so far... */
-#endif
+#ifndef OPENSSL_SMALL_FOOTPRINT
#define BN_MUL_COMBA
#define BN_SQR_COMBA
#define BN_RECURSION
+#endif
/* This next option uses the C libraries (2 word)/(1 word) function.
* If it is not defined, I use my C version (which is slower).
@@ -145,6 +193,8 @@ extern "C" {
#define BN_DEC_FMT1 "%lu"
#define BN_DEC_FMT2 "%019lu"
#define BN_DEC_NUM 19
+#define BN_HEX_FMT1 "%lX"
+#define BN_HEX_FMT2 "%016lX"
#endif
/* This is where the long long data type is 64 bits, but long is 32.
@@ -170,93 +220,72 @@ extern "C" {
#define BN_DEC_FMT1 "%llu"
#define BN_DEC_FMT2 "%019llu"
#define BN_DEC_NUM 19
+#define BN_HEX_FMT1 "%llX"
+#define BN_HEX_FMT2 "%016llX"
#endif
#ifdef THIRTY_TWO_BIT
-#if defined(OPENSSL_SYS_WIN32) && !defined(__GNUC__)
-#define BN_ULLONG unsigned _int64
-#else
-#define BN_ULLONG unsigned long long
+#ifdef BN_LLONG
+# if defined(_WIN32) && !defined(__GNUC__)
+# define BN_ULLONG unsigned __int64
+# define BN_MASK (0xffffffffffffffffI64)
+# else
+# define BN_ULLONG unsigned long long
+# define BN_MASK (0xffffffffffffffffLL)
+# endif
#endif
-#define BN_ULONG unsigned long
-#define BN_LONG long
+#define BN_ULONG unsigned int
+#define BN_LONG int
#define BN_BITS 64
#define BN_BYTES 4
#define BN_BITS2 32
#define BN_BITS4 16
-#ifdef OPENSSL_SYS_WIN32
-/* VC++ doesn't like the LL suffix */
-#define BN_MASK (0xffffffffffffffffL)
-#else
-#define BN_MASK (0xffffffffffffffffLL)
-#endif
#define BN_MASK2 (0xffffffffL)
#define BN_MASK2l (0xffff)
#define BN_MASK2h1 (0xffff8000L)
#define BN_MASK2h (0xffff0000L)
#define BN_TBIT (0x80000000L)
#define BN_DEC_CONV (1000000000L)
-#define BN_DEC_FMT1 "%lu"
-#define BN_DEC_FMT2 "%09lu"
-#define BN_DEC_NUM 9
-#endif
-
-#ifdef SIXTEEN_BIT
-#ifndef BN_DIV2W
-#define BN_DIV2W
-#endif
-#define BN_ULLONG unsigned long
-#define BN_ULONG unsigned short
-#define BN_LONG short
-#define BN_BITS 32
-#define BN_BYTES 2
-#define BN_BITS2 16
-#define BN_BITS4 8
-#define BN_MASK (0xffffffff)
-#define BN_MASK2 (0xffff)
-#define BN_MASK2l (0xff)
-#define BN_MASK2h1 (0xff80)
-#define BN_MASK2h (0xff00)
-#define BN_TBIT (0x8000)
-#define BN_DEC_CONV (100000)
-#define BN_DEC_FMT1 "%u"
-#define BN_DEC_FMT2 "%05u"
-#define BN_DEC_NUM 5
-#endif
-
-#ifdef EIGHT_BIT
-#ifndef BN_DIV2W
-#define BN_DIV2W
-#endif
-#define BN_ULLONG unsigned short
-#define BN_ULONG unsigned char
-#define BN_LONG char
-#define BN_BITS 16
-#define BN_BYTES 1
-#define BN_BITS2 8
-#define BN_BITS4 4
-#define BN_MASK (0xffff)
-#define BN_MASK2 (0xff)
-#define BN_MASK2l (0xf)
-#define BN_MASK2h1 (0xf8)
-#define BN_MASK2h (0xf0)
-#define BN_TBIT (0x80)
-#define BN_DEC_CONV (100)
#define BN_DEC_FMT1 "%u"
-#define BN_DEC_FMT2 "%02u"
-#define BN_DEC_NUM 2
+#define BN_DEC_FMT2 "%09u"
+#define BN_DEC_NUM 9
+#define BN_HEX_FMT1 "%X"
+#define BN_HEX_FMT2 "%08X"
#endif
#define BN_DEFAULT_BITS 1280
#define BN_FLG_MALLOCED 0x01
#define BN_FLG_STATIC_DATA 0x02
+#define BN_FLG_CONSTTIME 0x04 /* avoid leaking exponent information through timing,
+ * BN_mod_exp_mont() will call BN_mod_exp_mont_consttime,
+ * BN_div() will call BN_div_no_branch,
+ * BN_mod_inverse() will call BN_mod_inverse_no_branch.
+ */
+
+#ifndef OPENSSL_NO_DEPRECATED
+#define BN_FLG_EXP_CONSTTIME BN_FLG_CONSTTIME /* deprecated name for the flag */
+ /* avoid leaking exponent information through timings
+ * (BN_mod_exp_mont() will call BN_mod_exp_mont_consttime) */
+#endif
+
#ifndef OPENSSL_NO_DEPRECATED
#define BN_FLG_FREE 0x8000 /* used for debuging */
#endif
#define BN_set_flags(b,n) ((b)->flags|=(n))
#define BN_get_flags(b,n) ((b)->flags&(n))
+/* get a clone of a BIGNUM with changed flags, for *temporary* use only
+ * (the two BIGNUMs cannot not be used in parallel!) */
+#define BN_with_flags(dest,b,n) ((dest)->d=(b)->d, \
+ (dest)->top=(b)->top, \
+ (dest)->dmax=(b)->dmax, \
+ (dest)->neg=(b)->neg, \
+ (dest)->flags=(((dest)->flags & BN_FLG_MALLOCED) \
+ | ((b)->flags & ~BN_FLG_MALLOCED) \
+ | BN_FLG_STATIC_DATA \
+ | (n)))
+
/* Already declared in ossl_typ.h */
#if 0
typedef struct bignum_st BIGNUM;
@@ -278,16 +307,6 @@ struct bignum_st
int flags;
};
-struct bn_blinding_st
- {
- int init;
- BIGNUM *A;
- BIGNUM *Ai;
- BIGNUM *mod; /* just a reference */
- unsigned long thread_id; /* added in OpenSSL 0.9.6j and 0.9.7b;
- * used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
- };
-
/* Used for montgomery multiplication */
struct bn_mont_ctx_st
{
@@ -296,7 +315,8 @@ struct bn_mont_ctx_st
BIGNUM N; /* The modulus */
BIGNUM Ni; /* R*(1/R mod N) - N*Ni = 1
* (Ni is only stored for bignum algorithm) */
- BN_ULONG n0; /* least significant word of Ni */
+ BN_ULONG n0[2];/* least significant word(s) of Ni;
+ (type changed with 0.9.9, was "BN_ULONG n0;" before) */
int flags;
};
@@ -366,11 +386,7 @@ int BN_GENCB_call(BN_GENCB *cb, int a, int b);
/* Note that BN_abs_is_word didn't work reliably for w == 0 until 0.9.8 */
#define BN_abs_is_word(a,w) ((((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w))) || \
(((w) == 0) && ((a)->top == 0)))
-#ifdef BN_STRICT
#define BN_is_zero(a) ((a)->top == 0)
-#else
-#define BN_is_zero(a) BN_abs_is_word(a,0)
-#endif
#define BN_is_one(a) (BN_abs_is_word((a),1) && !(a)->neg)
#define BN_is_word(a,w) (BN_abs_is_word((a),(w)) && (!(w) || !(a)->neg))
#define BN_is_odd(a) (((a)->top > 0) && ((a)->d[0] & 1))
@@ -387,14 +403,6 @@ int BN_GENCB_call(BN_GENCB *cb, int a, int b);
#else
#define BN_zero(a) (BN_set_word((a),0))
#endif
-/* BN_set_sign(BIGNUM *, int) sets the sign of a BIGNUM
- * (0 for a non-negative value, 1 for negative) */
-#define BN_set_sign(a,b) ((a)->neg = (b))
-/* BN_get_sign(BIGNUM *) returns the sign of the BIGNUM */
-#define BN_get_sign(a) ((a)->neg)
-
-/*#define BN_ascii2bn(a) BN_hex2bn(a) */
-/*#define BN_bn2ascii(a) BN_bn2hex(a) */
const BIGNUM *BN_value_one(void);
char * BN_options(void);
@@ -408,16 +416,14 @@ BIGNUM *BN_CTX_get(BN_CTX *ctx);
void BN_CTX_end(BN_CTX *ctx);
int BN_rand(BIGNUM *rnd, int bits, int top,int bottom);
int BN_pseudo_rand(BIGNUM *rnd, int bits, int top,int bottom);
-int BN_rand_range(BIGNUM *rnd, BIGNUM *range);
-int BN_pseudo_rand_range(BIGNUM *rnd, BIGNUM *range);
+int BN_rand_range(BIGNUM *rnd, const BIGNUM *range);
+int BN_pseudo_rand_range(BIGNUM *rnd, const BIGNUM *range);
int BN_num_bits(const BIGNUM *a);
int BN_num_bits_word(BN_ULONG);
BIGNUM *BN_new(void);
void BN_init(BIGNUM *);
void BN_clear_free(BIGNUM *a);
BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b);
-/* BN_ncopy(): like BN_copy() but copies at most the first n BN_ULONGs */
-BIGNUM *BN_ncopy(BIGNUM *a, const BIGNUM *b, size_t n);
void BN_swap(BIGNUM *a, BIGNUM *b);
BIGNUM *BN_bin2bn(const unsigned char *s,int len,BIGNUM *ret);
int BN_bn2bin(const BIGNUM *a, unsigned char *to);
@@ -429,6 +435,16 @@ int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
int BN_sqr(BIGNUM *r, const BIGNUM *a,BN_CTX *ctx);
+/** BN_set_negative sets sign of a BIGNUM
+ * \param b pointer to the BIGNUM object
+ * \param n 0 if the BIGNUM b should be positive and a value != 0 otherwise
+ */
+void BN_set_negative(BIGNUM *b, int n);
+/** BN_is_negative returns 1 if the BIGNUM is negative
+ * \param a pointer to the BIGNUM object
+ * \return 1 if a < 0 and 0 otherwise
+ */
+#define BN_is_negative(a) ((a)->neg != 0)
int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
BN_CTX *ctx);
@@ -465,6 +481,8 @@ int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m,BN_CTX *ctx);
int BN_mod_exp_mont(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+ const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont);
int BN_mod_exp_mont_word(BIGNUM *r, BN_ULONG a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
int BN_mod_exp2_mont(BIGNUM *r, const BIGNUM *a1, const BIGNUM *p1,
@@ -494,6 +512,7 @@ char * BN_bn2hex(const BIGNUM *a);
char * BN_bn2dec(const BIGNUM *a);
int BN_hex2bn(BIGNUM **a, const char *str);
int BN_dec2bn(BIGNUM **a, const char *str);
+int BN_asc2bn(BIGNUM **a, const char *str);
int BN_gcd(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx);
int BN_kronecker(const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx); /* returns -2 for error */
BIGNUM *BN_mod_inverse(BIGNUM *ret,
@@ -532,12 +551,32 @@ int BN_from_montgomery(BIGNUM *r,const BIGNUM *a,
void BN_MONT_CTX_free(BN_MONT_CTX *mont);
int BN_MONT_CTX_set(BN_MONT_CTX *mont,const BIGNUM *mod,BN_CTX *ctx);
BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to,BN_MONT_CTX *from);
+BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
+ const BIGNUM *mod, BN_CTX *ctx);
-BN_BLINDING *BN_BLINDING_new(BIGNUM *A,BIGNUM *Ai,BIGNUM *mod);
+/* BN_BLINDING flags */
+#define BN_BLINDING_NO_UPDATE 0x00000001
+#define BN_BLINDING_NO_RECREATE 0x00000002
+
+BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod);
void BN_BLINDING_free(BN_BLINDING *b);
int BN_BLINDING_update(BN_BLINDING *b,BN_CTX *ctx);
-int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *r, BN_CTX *ctx);
+int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx);
+int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *);
+int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *);
+#ifndef OPENSSL_NO_DEPRECATED
+unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *);
+void BN_BLINDING_set_thread_id(BN_BLINDING *, unsigned long);
+#endif
+CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *);
+unsigned long BN_BLINDING_get_flags(const BN_BLINDING *);
+void BN_BLINDING_set_flags(BN_BLINDING *, unsigned long);
+BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
+ const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
+ int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+ const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx),
+ BN_MONT_CTX *m_ctx);
#ifndef OPENSSL_NO_DEPRECATED
void BN_set_params(int mul,int high,int low,int mont);
@@ -587,24 +626,24 @@ int BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
* t^p[0] + t^p[1] + ... + t^p[k]
* where m = p[0] > p[1] > ... > p[k] = 0.
*/
-int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[]);
+int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]);
/* r = a mod p */
int BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
- const unsigned int p[], BN_CTX *ctx); /* r = (a * b) mod p */
-int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[],
+ const int p[], BN_CTX *ctx); /* r = (a * b) mod p */
+int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[],
BN_CTX *ctx); /* r = (a * a) mod p */
-int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *b, const unsigned int p[],
+int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *b, const int p[],
BN_CTX *ctx); /* r = (1 / b) mod p */
int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
- const unsigned int p[], BN_CTX *ctx); /* r = (a / b) mod p */
+ const int p[], BN_CTX *ctx); /* r = (a / b) mod p */
int BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
- const unsigned int p[], BN_CTX *ctx); /* r = (a ^ b) mod p */
+ const int p[], BN_CTX *ctx); /* r = (a ^ b) mod p */
int BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a,
- const unsigned int p[], BN_CTX *ctx); /* r = sqrt(a) mod p */
+ const int p[], BN_CTX *ctx); /* r = sqrt(a) mod p */
int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
- const unsigned int p[], BN_CTX *ctx); /* r^2 + r = a mod p */
-int BN_GF2m_poly2arr(const BIGNUM *a, unsigned int p[], int max);
-int BN_GF2m_arr2poly(const unsigned int p[], BIGNUM *a);
+ const int p[], BN_CTX *ctx); /* r^2 + r = a mod p */
+int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
+int BN_GF2m_arr2poly(const int p[], BIGNUM *a);
/* faster mod functions for the 'NIST primes'
* 0 <= a < p^2 */
@@ -693,9 +732,11 @@ int RAND_pseudo_bytes(unsigned char *buf,int num);
#define bn_check_top(a) \
do { \
const BIGNUM *_bnum2 = (a); \
- assert((_bnum2->top == 0) || \
+ if (_bnum2 != NULL) { \
+ assert((_bnum2->top == 0) || \
(_bnum2->d[_bnum2->top - 1] != 0)); \
- bn_pollute(_bnum2); \
+ bn_pollute(_bnum2); \
+ } \
} while(0)
#define bn_fix_top(a) bn_check_top(a)
@@ -711,10 +752,12 @@ int RAND_pseudo_bytes(unsigned char *buf,int num);
#define bn_correct_top(a) \
{ \
BN_ULONG *ftl; \
- if ((a)->top > 0) \
+ int tmp_top = (a)->top; \
+ if (tmp_top > 0) \
{ \
- for (ftl= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \
- if (*(ftl--)) break; \
+ for (ftl= &((a)->d[tmp_top-1]); tmp_top > 0; tmp_top--) \
+ if (*(ftl--)) break; \
+ (a)->top = tmp_top; \
} \
bn_pollute(a); \
}
@@ -726,6 +769,18 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
+/* Primes from RFC 2409 */
+BIGNUM *get_rfc2409_prime_768(BIGNUM *bn);
+BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn);
+
+/* Primes from RFC 3526 */
+BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn);
+BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn);
+
int BN_bntest_rand(BIGNUM *rnd, int bits, int top,int bottom);
/* BEGIN ERROR CODES */
@@ -737,28 +792,38 @@ void ERR_load_BN_strings(void);
/* Error codes for the BN functions. */
/* Function codes. */
-#define BN_F_BN_BLINDING_CONVERT 100
-#define BN_F_BN_BLINDING_INVERT 101
+#define BN_F_BNRAND 127
+#define BN_F_BN_BLINDING_CONVERT_EX 100
+#define BN_F_BN_BLINDING_CREATE_PARAM 128
+#define BN_F_BN_BLINDING_INVERT_EX 101
#define BN_F_BN_BLINDING_NEW 102
#define BN_F_BN_BLINDING_UPDATE 103
#define BN_F_BN_BN2DEC 104
#define BN_F_BN_BN2HEX 105
#define BN_F_BN_CTX_GET 116
#define BN_F_BN_CTX_NEW 106
+#define BN_F_BN_CTX_START 129
#define BN_F_BN_DIV 107
+#define BN_F_BN_DIV_NO_BRANCH 138
+#define BN_F_BN_DIV_RECP 130
+#define BN_F_BN_EXP 123
#define BN_F_BN_EXPAND2 108
#define BN_F_BN_EXPAND_INTERNAL 120
-#define BN_F_BN_GF2M_MOD 126
-#define BN_F_BN_GF2M_MOD_DIV 123
-#define BN_F_BN_GF2M_MOD_EXP 127
-#define BN_F_BN_GF2M_MOD_MUL 124
-#define BN_F_BN_GF2M_MOD_SOLVE_QUAD 128
-#define BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR 129
-#define BN_F_BN_GF2M_MOD_SQR 125
+#define BN_F_BN_GF2M_MOD 131
+#define BN_F_BN_GF2M_MOD_EXP 132
+#define BN_F_BN_GF2M_MOD_MUL 133
+#define BN_F_BN_GF2M_MOD_SOLVE_QUAD 134
+#define BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR 135
+#define BN_F_BN_GF2M_MOD_SQR 136
+#define BN_F_BN_GF2M_MOD_SQRT 137
#define BN_F_BN_MOD_EXP2_MONT 118
#define BN_F_BN_MOD_EXP_MONT 109
+#define BN_F_BN_MOD_EXP_MONT_CONSTTIME 124
#define BN_F_BN_MOD_EXP_MONT_WORD 117
+#define BN_F_BN_MOD_EXP_RECP 125
+#define BN_F_BN_MOD_EXP_SIMPLE 126
#define BN_F_BN_MOD_INVERSE 110
+#define BN_F_BN_MOD_INVERSE_NO_BRANCH 139
#define BN_F_BN_MOD_LSHIFT_QUICK 119
#define BN_F_BN_MOD_MUL_RECIPROCAL 111
#define BN_F_BN_MOD_SQRT 121
@@ -780,10 +845,9 @@ void ERR_load_BN_strings(void);
#define BN_R_INVALID_LENGTH 106
#define BN_R_INVALID_RANGE 115
#define BN_R_NOT_A_SQUARE 111
-#define BN_R_NOT_IMPLEMENTED 116
#define BN_R_NOT_INITIALIZED 107
#define BN_R_NO_INVERSE 108
-#define BN_R_NO_SOLUTION 117
+#define BN_R_NO_SOLUTION 116
#define BN_R_P_IS_NOT_PRIME 112
#define BN_R_TOO_MANY_ITERATIONS 113
#define BN_R_TOO_MANY_TEMPORARY_VARIABLES 109
diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c
index be8aa3ffc5..c43c91cc09 100644
--- a/crypto/bn/bn_asm.c
+++ b/crypto/bn/bn_asm.c
@@ -75,6 +75,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
assert(num >= 0);
if (num <= 0) return(c1);
+#ifndef OPENSSL_SMALL_FOOTPRINT
while (num&~3)
{
mul_add(rp[0],ap[0],w,c1);
@@ -83,11 +84,11 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
mul_add(rp[3],ap[3],w,c1);
ap+=4; rp+=4; num-=4;
}
- if (num)
+#endif
+ while (num)
{
- mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
- mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
- mul_add(rp[2],ap[2],w,c1); return c1;
+ mul_add(rp[0],ap[0],w,c1);
+ ap++; rp++; num--;
}
return(c1);
@@ -100,6 +101,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
assert(num >= 0);
if (num <= 0) return(c1);
+#ifndef OPENSSL_SMALL_FOOTPRINT
while (num&~3)
{
mul(rp[0],ap[0],w,c1);
@@ -108,11 +110,11 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
mul(rp[3],ap[3],w,c1);
ap+=4; rp+=4; num-=4;
}
- if (num)
+#endif
+ while (num)
{
- mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
- mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
- mul(rp[2],ap[2],w,c1);
+ mul(rp[0],ap[0],w,c1);
+ ap++; rp++; num--;
}
return(c1);
}
@@ -121,6 +123,8 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
assert(n >= 0);
if (n <= 0) return;
+
+#ifndef OPENSSL_SMALL_FOOTPRINT
while (n&~3)
{
sqr(r[0],r[1],a[0]);
@@ -129,11 +133,11 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
sqr(r[6],r[7],a[3]);
a+=4; r+=8; n-=4;
}
- if (n)
+#endif
+ while (n)
{
- sqr(r[0],r[1],a[0]); if (--n == 0) return;
- sqr(r[2],r[3],a[1]); if (--n == 0) return;
- sqr(r[4],r[5],a[2]);
+ sqr(r[0],r[1],a[0]);
+ a++; r+=2; n--;
}
}
@@ -150,18 +154,20 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
bl=LBITS(w);
bh=HBITS(w);
- for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+ while (num&~3)
{
mul_add(rp[0],ap[0],bl,bh,c);
- if (--num == 0) break;
mul_add(rp[1],ap[1],bl,bh,c);
- if (--num == 0) break;
mul_add(rp[2],ap[2],bl,bh,c);
- if (--num == 0) break;
mul_add(rp[3],ap[3],bl,bh,c);
- if (--num == 0) break;
- ap+=4;
- rp+=4;
+ ap+=4; rp+=4; num-=4;
+ }
+#endif
+ while (num)
+ {
+ mul_add(rp[0],ap[0],bl,bh,c);
+ ap++; rp++; num--;
}
return(c);
}
@@ -177,18 +183,20 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
bl=LBITS(w);
bh=HBITS(w);
- for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+ while (num&~3)
{
mul(rp[0],ap[0],bl,bh,carry);
- if (--num == 0) break;
mul(rp[1],ap[1],bl,bh,carry);
- if (--num == 0) break;
mul(rp[2],ap[2],bl,bh,carry);
- if (--num == 0) break;
mul(rp[3],ap[3],bl,bh,carry);
- if (--num == 0) break;
- ap+=4;
- rp+=4;
+ ap+=4; rp+=4; num-=4;
+ }
+#endif
+ while (num)
+ {
+ mul(rp[0],ap[0],bl,bh,carry);
+ ap++; rp++; num--;
}
return(carry);
}
@@ -197,22 +205,21 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
assert(n >= 0);
if (n <= 0) return;
- for (;;)
+
+#ifndef OPENSSL_SMALL_FOOTPRINT
+ while (n&~3)
{
sqr64(r[0],r[1],a[0]);
- if (--n == 0) break;
-
sqr64(r[2],r[3],a[1]);
- if (--n == 0) break;
-
sqr64(r[4],r[5],a[2]);
- if (--n == 0) break;
-
sqr64(r[6],r[7],a[3]);
- if (--n == 0) break;
-
- a+=4;
- r+=8;
+ a+=4; r+=8; n-=4;
+ }
+#endif
+ while (n)
+ {
+ sqr64(r[0],r[1],a[0]);
+ a++; r+=2; n--;
}
}
@@ -237,7 +244,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
if (d == 0) return(BN_MASK2);
i=BN_num_bits_word(d);
- assert((i == BN_BITS2) || (h > (BN_ULONG)1<<i));
+ assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i));
i=BN_BITS2-i;
if (h >= d) h-=d;
@@ -303,31 +310,30 @@ BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
assert(n >= 0);
if (n <= 0) return((BN_ULONG)0);
- for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+ while (n&~3)
{
ll+=(BN_ULLONG)a[0]+b[0];
r[0]=(BN_ULONG)ll&BN_MASK2;
ll>>=BN_BITS2;
- if (--n <= 0) break;
-
ll+=(BN_ULLONG)a[1]+b[1];
r[1]=(BN_ULONG)ll&BN_MASK2;
ll>>=BN_BITS2;
- if (--n <= 0) break;
-
ll+=(BN_ULLONG)a[2]+b[2];
r[2]=(BN_ULONG)ll&BN_MASK2;
ll>>=BN_BITS2;
- if (--n <= 0) break;
-
ll+=(BN_ULLONG)a[3]+b[3];
r[3]=(BN_ULONG)ll&BN_MASK2;
ll>>=BN_BITS2;
- if (--n <= 0) break;
-
- a+=4;
- b+=4;
- r+=4;
+ a+=4; b+=4; r+=4; n-=4;
+ }
+#endif
+ while (n)
+ {
+ ll+=(BN_ULLONG)a[0]+b[0];
+ r[0]=(BN_ULONG)ll&BN_MASK2;
+ ll>>=BN_BITS2;
+ a++; b++; r++; n--;
}
return((BN_ULONG)ll);
}
@@ -340,7 +346,8 @@ BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
if (n <= 0) return((BN_ULONG)0);
c=0;
- for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+ while (n&~3)
{
t=a[0];
t=(t+c)&BN_MASK2;
@@ -348,35 +355,36 @@ BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
l=(t+b[0])&BN_MASK2;
c+=(l < t);
r[0]=l;
- if (--n <= 0) break;
-
t=a[1];
t=(t+c)&BN_MASK2;
c=(t < c);
l=(t+b[1])&BN_MASK2;
c+=(l < t);
r[1]=l;
- if (--n <= 0) break;
-
t=a[2];
t=(t+c)&BN_MASK2;
c=(t < c);
l=(t+b[2])&BN_MASK2;
c+=(l < t);
r[2]=l;
- if (--n <= 0) break;
-
t=a[3];
t=(t+c)&BN_MASK2;
c=(t < c);
l=(t+b[3])&BN_MASK2;
c+=(l < t);
r[3]=l;
- if (--n <= 0) break;
-
- a+=4;
- b+=4;
- r+=4;
+ a+=4; b+=4; r+=4; n-=4;
+ }
+#endif
+ while(n)
+ {
+ t=a[0];
+ t=(t+c)&BN_MASK2;
+ c=(t < c);
+ l=(t+b[0])&BN_MASK2;
+ c+=(l < t);
+ r[0]=l;
+ a++; b++; r++; n--;
}
return((BN_ULONG)c);
}
@@ -390,36 +398,35 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
assert(n >= 0);
if (n <= 0) return((BN_ULONG)0);
- for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+ while (n&~3)
{
t1=a[0]; t2=b[0];
r[0]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
- if (--n <= 0) break;
-
t1=a[1]; t2=b[1];
r[1]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
- if (--n <= 0) break;
-
t1=a[2]; t2=b[2];
r[2]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
- if (--n <= 0) break;
-
t1=a[3]; t2=b[3];
r[3]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
- if (--n <= 0) break;
-
- a+=4;
- b+=4;
- r+=4;
+ a+=4; b+=4; r+=4; n-=4;
+ }
+#endif
+ while (n)
+ {
+ t1=a[0]; t2=b[0];
+ r[0]=(t1-t2-c)&BN_MASK2;
+ if (t1 != t2) c=(t1 < t2);
+ a++; b++; r++; n--;
}
return(c);
}
-#ifdef BN_MUL_COMBA
+#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
#undef bn_mul_comba8
#undef bn_mul_comba4
@@ -459,6 +466,34 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
#define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+#elif defined(BN_UMULT_LOHI)
+
+#define mul_add_c(a,b,c0,c1,c2) { \
+ BN_ULONG ta=(a),tb=(b); \
+ BN_UMULT_LOHI(t1,t2,ta,tb); \
+ c0 += t1; t2 += (c0<t1)?1:0; \
+ c1 += t2; c2 += (c1<t2)?1:0; \
+ }
+
+#define mul_add_c2(a,b,c0,c1,c2) { \
+ BN_ULONG ta=(a),tb=(b),t0; \
+ BN_UMULT_LOHI(t0,t1,ta,tb); \
+ t2 = t1+t1; c2 += (t2<t1)?1:0; \
+ t1 = t0+t0; t2 += (t1<t0)?1:0; \
+ c0 += t1; t2 += (c0<t1)?1:0; \
+ c1 += t2; c2 += (c1<t2)?1:0; \
+ }
+
+#define sqr_add_c(a,i,c0,c1,c2) { \
+ BN_ULONG ta=(a)[i]; \
+ BN_UMULT_LOHI(t1,t2,ta,ta); \
+ c0 += t1; t2 += (c0<t1)?1:0; \
+ c1 += t2; c2 += (c1<t2)?1:0; \
+ }
+
+#define sqr_add_c2(a,i,j,c0,c1,c2) \
+ mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
#elif defined(BN_UMULT_HIGH)
#define mul_add_c(a,b,c0,c1,c2) { \
@@ -792,18 +827,134 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
r[6]=c1;
r[7]=c2;
}
+
+#ifdef OPENSSL_NO_ASM
+#ifdef OPENSSL_BN_ASM_MONT
+#include <alloca.h>
+/*
+ * This is essentially reference implementation, which may or may not
+ * result in performance improvement. E.g. on IA-32 this routine was
+ * observed to give 40% faster rsa1024 private key operations and 10%
+ * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
+ * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
+ * reference implementation, one to be used as starting point for
+ * platform-specific assembler. Mentioned numbers apply to compiler
+ * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
+ * can vary not only from platform to platform, but even for compiler
+ * versions. Assembler vs. assembler improvement coefficients can
+ * [and are known to] differ and are to be documented elsewhere.
+ */
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
+ {
+ BN_ULONG c0,c1,ml,*tp,n0;
+#ifdef mul64
+ BN_ULONG mh;
+#endif
+ volatile BN_ULONG *vp;
+ int i=0,j;
+
+#if 0 /* template for platform-specific implementation */
+ if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
+#endif
+ vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+ n0 = *n0p;
+
+ c0 = 0;
+ ml = bp[0];
+#ifdef mul64
+ mh = HBITS(ml);
+ ml = LBITS(ml);
+ for (j=0;j<num;++j)
+ mul(tp[j],ap[j],ml,mh,c0);
+#else
+ for (j=0;j<num;++j)
+ mul(tp[j],ap[j],ml,c0);
+#endif
+
+ tp[num] = c0;
+ tp[num+1] = 0;
+ goto enter;
+
+ for(i=0;i<num;i++)
+ {
+ c0 = 0;
+ ml = bp[i];
+#ifdef mul64
+ mh = HBITS(ml);
+ ml = LBITS(ml);
+ for (j=0;j<num;++j)
+ mul_add(tp[j],ap[j],ml,mh,c0);
+#else
+ for (j=0;j<num;++j)
+ mul_add(tp[j],ap[j],ml,c0);
+#endif
+ c1 = (tp[num] + c0)&BN_MASK2;
+ tp[num] = c1;
+ tp[num+1] = (c1<c0?1:0);
+ enter:
+ c1 = tp[0];
+ ml = (c1*n0)&BN_MASK2;
+ c0 = 0;
+#ifdef mul64
+ mh = HBITS(ml);
+ ml = LBITS(ml);
+ mul_add(c1,np[0],ml,mh,c0);
+#else
+ mul_add(c1,ml,np[0],c0);
+#endif
+ for(j=1;j<num;j++)
+ {
+ c1 = tp[j];
+#ifdef mul64
+ mul_add(c1,np[j],ml,mh,c0);
+#else
+ mul_add(c1,ml,np[j],c0);
+#endif
+ tp[j-1] = c1&BN_MASK2;
+ }
+ c1 = (tp[num] + c0)&BN_MASK2;
+ tp[num-1] = c1;
+ tp[num] = tp[num+1] + (c1<c0?1:0);
+ }
+
+ if (tp[num]!=0 || tp[num-1]>=np[num-1])
+ {
+ c0 = bn_sub_words(rp,tp,np,num);
+ if (tp[num]!=0 || c0==0)
+ {
+ for(i=0;i<num+2;i++) vp[i] = 0;
+ return 1;
+ }
+ }
+ for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0;
+ vp[num] = 0;
+ vp[num+1] = 0;
+ return 1;
+ }
+#else
+/*
+ * Return value of 0 indicates that multiplication/convolution was not
+ * performed to signal the caller to fall down to alternative/original
+ * code-path.
+ */
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
+{ return 0; }
+#endif /* OPENSSL_BN_ASM_MONT */
+#endif
+
#else /* !BN_MUL_COMBA */
/* hmm... is it faster just to do a multiply? */
#undef bn_sqr_comba4
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t[8];
bn_sqr_normal(r,a,4,t);
}
#undef bn_sqr_comba8
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t[16];
bn_sqr_normal(r,a,8,t);
@@ -829,4 +980,51 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
}
+#ifdef OPENSSL_NO_ASM
+#ifdef OPENSSL_BN_ASM_MONT
+#include <alloca.h>
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
+ {
+ BN_ULONG c0,c1,*tp,n0=*n0p;
+ volatile BN_ULONG *vp;
+ int i=0,j;
+
+ vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+ for(i=0;i<=num;i++) tp[i]=0;
+
+ for(i=0;i<num;i++)
+ {
+ c0 = bn_mul_add_words(tp,ap,num,bp[i]);
+ c1 = (tp[num] + c0)&BN_MASK2;
+ tp[num] = c1;
+ tp[num+1] = (c1<c0?1:0);
+
+ c0 = bn_mul_add_words(tp,np,num,tp[0]*n0);
+ c1 = (tp[num] + c0)&BN_MASK2;
+ tp[num] = c1;
+ tp[num+1] += (c1<c0?1:0);
+ for(j=0;j<=num;j++) tp[j]=tp[j+1];
+ }
+
+ if (tp[num]!=0 || tp[num-1]>=np[num-1])
+ {
+ c0 = bn_sub_words(rp,tp,np,num);
+ if (tp[num]!=0 || c0==0)
+ {
+ for(i=0;i<num+2;i++) vp[i] = 0;
+ return 1;
+ }
+ }
+ for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0;
+ vp[num] = 0;
+ vp[num+1] = 0;
+ return 1;
+ }
+#else
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
+{ return 0; }
+#endif /* OPENSSL_BN_ASM_MONT */
+#endif
+
#endif /* !BN_MUL_COMBA */
diff --git a/crypto/bn/bn_blind.c b/crypto/bn/bn_blind.c
index 011d37f1ff..e060592fdc 100644
--- a/crypto/bn/bn_blind.c
+++ b/crypto/bn/bn_blind.c
@@ -1,4 +1,57 @@
/* crypto/bn/bn_blind.c */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
@@ -60,11 +113,31 @@
#include "cryptlib.h"
#include "bn_lcl.h"
-BN_BLINDING *BN_BLINDING_new(BIGNUM *A, BIGNUM *Ai, BIGNUM *mod)
+#define BN_BLINDING_COUNTER 32
+
+struct bn_blinding_st
+ {
+ BIGNUM *A;
+ BIGNUM *Ai;
+ BIGNUM *e;
+ BIGNUM *mod; /* just a reference */
+#ifndef OPENSSL_NO_DEPRECATED
+ unsigned long thread_id; /* added in OpenSSL 0.9.6j and 0.9.7b;
+ * used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
+#endif
+ CRYPTO_THREADID tid;
+ unsigned int counter;
+ unsigned long flags;
+ BN_MONT_CTX *m_ctx;
+ int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+ const BIGNUM *m, BN_CTX *ctx,
+ BN_MONT_CTX *m_ctx);
+ };
+
+BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
{
BN_BLINDING *ret=NULL;
- bn_check_top(Ai);
bn_check_top(mod);
if ((ret=(BN_BLINDING *)OPENSSL_malloc(sizeof(BN_BLINDING))) == NULL)
@@ -73,11 +146,22 @@ BN_BLINDING *BN_BLINDING_new(BIGNUM *A, BIGNUM *Ai, BIGNUM *mod)
return(NULL);
}
memset(ret,0,sizeof(BN_BLINDING));
- if ((ret->A=BN_new()) == NULL) goto err;
- if ((ret->Ai=BN_new()) == NULL) goto err;
- if (!BN_copy(ret->A,A)) goto err;
- if (!BN_copy(ret->Ai,Ai)) goto err;
- ret->mod=mod;
+ if (A != NULL)
+ {
+ if ((ret->A = BN_dup(A)) == NULL) goto err;
+ }
+ if (Ai != NULL)
+ {
+ if ((ret->Ai = BN_dup(Ai)) == NULL) goto err;
+ }
+
+ /* save a copy of mod in the BN_BLINDING structure */
+ if ((ret->mod = BN_dup(mod)) == NULL) goto err;
+ if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
+ BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
+
+ ret->counter = BN_BLINDING_COUNTER;
+ CRYPTO_THREADID_current(&ret->tid);
return(ret);
err:
if (ret != NULL) BN_BLINDING_free(ret);
@@ -91,6 +175,8 @@ void BN_BLINDING_free(BN_BLINDING *r)
if (r->A != NULL) BN_free(r->A );
if (r->Ai != NULL) BN_free(r->Ai);
+ if (r->e != NULL) BN_free(r->e );
+ if (r->mod != NULL) BN_free(r->mod);
OPENSSL_free(r);
}
@@ -103,38 +189,76 @@ int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
BNerr(BN_F_BN_BLINDING_UPDATE,BN_R_NOT_INITIALIZED);
goto err;
}
-
- if (!BN_mod_mul(b->A,b->A,b->A,b->mod,ctx)) goto err;
- if (!BN_mod_mul(b->Ai,b->Ai,b->Ai,b->mod,ctx)) goto err;
+
+ if (--(b->counter) == 0 && b->e != NULL &&
+ !(b->flags & BN_BLINDING_NO_RECREATE))
+ {
+ /* re-create blinding parameters */
+ if (!BN_BLINDING_create_param(b, NULL, NULL, ctx, NULL, NULL))
+ goto err;
+ }
+ else if (!(b->flags & BN_BLINDING_NO_UPDATE))
+ {
+ if (!BN_mod_mul(b->A,b->A,b->A,b->mod,ctx)) goto err;
+ if (!BN_mod_mul(b->Ai,b->Ai,b->Ai,b->mod,ctx)) goto err;
+ }
ret=1;
err:
+ if (b->counter == 0)
+ b->counter = BN_BLINDING_COUNTER;
return(ret);
}
int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
{
+ return BN_BLINDING_convert_ex(n, NULL, b, ctx);
+ }
+
+int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
+ {
+ int ret = 1;
+
bn_check_top(n);
if ((b->A == NULL) || (b->Ai == NULL))
{
- BNerr(BN_F_BN_BLINDING_CONVERT,BN_R_NOT_INITIALIZED);
+ BNerr(BN_F_BN_BLINDING_CONVERT_EX,BN_R_NOT_INITIALIZED);
return(0);
}
- return(BN_mod_mul(n,n,b->A,b->mod,ctx));
+
+ if (r != NULL)
+ {
+ if (!BN_copy(r, b->Ai)) ret=0;
+ }
+
+ if (!BN_mod_mul(n,n,b->A,b->mod,ctx)) ret=0;
+
+ return ret;
}
int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
{
+ return BN_BLINDING_invert_ex(n, NULL, b, ctx);
+ }
+
+int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
+ {
int ret;
bn_check_top(n);
if ((b->A == NULL) || (b->Ai == NULL))
{
- BNerr(BN_F_BN_BLINDING_INVERT,BN_R_NOT_INITIALIZED);
+ BNerr(BN_F_BN_BLINDING_INVERT_EX,BN_R_NOT_INITIALIZED);
return(0);
}
- if ((ret=BN_mod_mul(n,n,b->Ai,b->mod,ctx)) >= 0)
+
+ if (r != NULL)
+ ret = BN_mod_mul(n, n, r, b->mod, ctx);
+ else
+ ret = BN_mod_mul(n, n, b->Ai, b->mod, ctx);
+
+ if (ret >= 0)
{
if (!BN_BLINDING_update(b,ctx))
return(0);
@@ -143,3 +267,110 @@ int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
return(ret);
}
+#ifndef OPENSSL_NO_DEPRECATED
+unsigned long BN_BLINDING_get_thread_id(const BN_BLINDING *b)
+ {
+ return b->thread_id;
+ }
+
+void BN_BLINDING_set_thread_id(BN_BLINDING *b, unsigned long n)
+ {
+ b->thread_id = n;
+ }
+#endif
+
+CRYPTO_THREADID *BN_BLINDING_thread_id(BN_BLINDING *b)
+ {
+ return &b->tid;
+ }
+
+unsigned long BN_BLINDING_get_flags(const BN_BLINDING *b)
+ {
+ return b->flags;
+ }
+
+void BN_BLINDING_set_flags(BN_BLINDING *b, unsigned long flags)
+ {
+ b->flags = flags;
+ }
+
+BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
+ const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
+ int (*bn_mod_exp)(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+ const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx),
+ BN_MONT_CTX *m_ctx)
+{
+ int retry_counter = 32;
+ BN_BLINDING *ret = NULL;
+
+ if (b == NULL)
+ ret = BN_BLINDING_new(NULL, NULL, m);
+ else
+ ret = b;
+
+ if (ret == NULL)
+ goto err;
+
+ if (ret->A == NULL && (ret->A = BN_new()) == NULL)
+ goto err;
+ if (ret->Ai == NULL && (ret->Ai = BN_new()) == NULL)
+ goto err;
+
+ if (e != NULL)
+ {
+ if (ret->e != NULL)
+ BN_free(ret->e);
+ ret->e = BN_dup(e);
+ }
+ if (ret->e == NULL)
+ goto err;
+
+ if (bn_mod_exp != NULL)
+ ret->bn_mod_exp = bn_mod_exp;
+ if (m_ctx != NULL)
+ ret->m_ctx = m_ctx;
+
+ do {
+ if (!BN_rand_range(ret->A, ret->mod)) goto err;
+ if (BN_mod_inverse(ret->Ai, ret->A, ret->mod, ctx) == NULL)
+ {
+ /* this should almost never happen for good RSA keys */
+ unsigned long error = ERR_peek_last_error();
+ if (ERR_GET_REASON(error) == BN_R_NO_INVERSE)
+ {
+ if (retry_counter-- == 0)
+ {
+ BNerr(BN_F_BN_BLINDING_CREATE_PARAM,
+ BN_R_TOO_MANY_ITERATIONS);
+ goto err;
+ }
+ ERR_clear_error();
+ }
+ else
+ goto err;
+ }
+ else
+ break;
+ } while (1);
+
+ if (ret->bn_mod_exp != NULL && ret->m_ctx != NULL)
+ {
+ if (!ret->bn_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx, ret->m_ctx))
+ goto err;
+ }
+ else
+ {
+ if (!BN_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx))
+ goto err;
+ }
+
+ return ret;
+err:
+ if (b == NULL && ret != NULL)
+ {
+ BN_BLINDING_free(ret);
+ ret = NULL;
+ }
+
+ return ret;
+}
diff --git a/crypto/bn/bn_const.c b/crypto/bn/bn_const.c
new file mode 100755
index 0000000000..eb60a25b3c
--- /dev/null
+++ b/crypto/bn/bn_const.c
@@ -0,0 +1,402 @@
+/* crypto/bn/knownprimes.c */
+/* Insert boilerplate */
+
+#include "bn.h"
+
+/* "First Oakley Default Group" from RFC2409, section 6.1.
+ *
+ * The prime is: 2^768 - 2 ^704 - 1 + 2^64 * { [2^638 pi] + 149686 }
+ *
+ * RFC2409 specifies a generator of 2.
+ * RFC2412 specifies a generator of of 22.
+ */
+
+BIGNUM *get_rfc2409_prime_768(BIGNUM *bn)
+ {
+ static const unsigned char RFC2409_PRIME_768[]={
+ 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+ 0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+ 0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+ 0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+ 0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+ 0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+ 0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+ 0xA6,0x3A,0x36,0x20,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+ };
+ return BN_bin2bn(RFC2409_PRIME_768,sizeof(RFC2409_PRIME_768),bn);
+ }
+
+/* "Second Oakley Default Group" from RFC2409, section 6.2.
+ *
+ * The prime is: 2^1024 - 2^960 - 1 + 2^64 * { [2^894 pi] + 129093 }.
+ *
+ * RFC2409 specifies a generator of 2.
+ * RFC2412 specifies a generator of 22.
+ */
+
+BIGNUM *get_rfc2409_prime_1024(BIGNUM *bn)
+ {
+ static const unsigned char RFC2409_PRIME_1024[]={
+ 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+ 0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+ 0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+ 0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+ 0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+ 0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+ 0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+ 0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+ 0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+ 0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE6,0x53,0x81,
+ 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+ };
+ return BN_bin2bn(RFC2409_PRIME_1024,sizeof(RFC2409_PRIME_1024),bn);
+ }
+
+/* "1536-bit MODP Group" from RFC3526, Section 2.
+ *
+ * The prime is: 2^1536 - 2^1472 - 1 + 2^64 * { [2^1406 pi] + 741804 }
+ *
+ * RFC3526 specifies a generator of 2.
+ * RFC2312 specifies a generator of 22.
+ */
+
+BIGNUM *get_rfc3526_prime_1536(BIGNUM *bn)
+ {
+ static const unsigned char RFC3526_PRIME_1536[]={
+ 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+ 0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+ 0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+ 0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+ 0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+ 0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+ 0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+ 0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+ 0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+ 0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+ 0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+ 0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+ 0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+ 0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+ 0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+ 0xCA,0x23,0x73,0x27,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+ };
+ return BN_bin2bn(RFC3526_PRIME_1536,sizeof(RFC3526_PRIME_1536),bn);
+ }
+
+/* "2048-bit MODP Group" from RFC3526, Section 3.
+ *
+ * The prime is: 2^2048 - 2^1984 - 1 + 2^64 * { [2^1918 pi] + 124476 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_2048(BIGNUM *bn)
+ {
+ static const unsigned char RFC3526_PRIME_2048[]={
+ 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+ 0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+ 0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+ 0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+ 0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+ 0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+ 0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+ 0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+ 0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+ 0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+ 0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+ 0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+ 0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+ 0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+ 0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+ 0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
+ 0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
+ 0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
+ 0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
+ 0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
+ 0x15,0x72,0x8E,0x5A,0x8A,0xAC,0xAA,0x68,0xFF,0xFF,0xFF,0xFF,
+ 0xFF,0xFF,0xFF,0xFF,
+ };
+ return BN_bin2bn(RFC3526_PRIME_2048,sizeof(RFC3526_PRIME_2048),bn);
+ }
+
+/* "3072-bit MODP Group" from RFC3526, Section 4.
+ *
+ * The prime is: 2^3072 - 2^3008 - 1 + 2^64 * { [2^2942 pi] + 1690314 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_3072(BIGNUM *bn)
+ {
+ static const unsigned char RFC3526_PRIME_3072[]={
+ 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+ 0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+ 0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+ 0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+ 0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+ 0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+ 0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+ 0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+ 0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+ 0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+ 0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+ 0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+ 0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+ 0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+ 0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+ 0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
+ 0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
+ 0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
+ 0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
+ 0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
+ 0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
+ 0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
+ 0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
+ 0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
+ 0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
+ 0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
+ 0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
+ 0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
+ 0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
+ 0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
+ 0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
+ 0xA9,0x3A,0xD2,0xCA,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+ };
+ return BN_bin2bn(RFC3526_PRIME_3072,sizeof(RFC3526_PRIME_3072),bn);
+ }
+
+/* "4096-bit MODP Group" from RFC3526, Section 5.
+ *
+ * The prime is: 2^4096 - 2^4032 - 1 + 2^64 * { [2^3966 pi] + 240904 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_4096(BIGNUM *bn)
+ {
+ static const unsigned char RFC3526_PRIME_4096[]={
+ 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+ 0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+ 0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+ 0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+ 0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+ 0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+ 0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+ 0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+ 0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+ 0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+ 0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+ 0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+ 0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+ 0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+ 0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+ 0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
+ 0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
+ 0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
+ 0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
+ 0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
+ 0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
+ 0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
+ 0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
+ 0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
+ 0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
+ 0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
+ 0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
+ 0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
+ 0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
+ 0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
+ 0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
+ 0xA9,0x21,0x08,0x01,0x1A,0x72,0x3C,0x12,0xA7,0x87,0xE6,0xD7,
+ 0x88,0x71,0x9A,0x10,0xBD,0xBA,0x5B,0x26,0x99,0xC3,0x27,0x18,
+ 0x6A,0xF4,0xE2,0x3C,0x1A,0x94,0x68,0x34,0xB6,0x15,0x0B,0xDA,
+ 0x25,0x83,0xE9,0xCA,0x2A,0xD4,0x4C,0xE8,0xDB,0xBB,0xC2,0xDB,
+ 0x04,0xDE,0x8E,0xF9,0x2E,0x8E,0xFC,0x14,0x1F,0xBE,0xCA,0xA6,
+ 0x28,0x7C,0x59,0x47,0x4E,0x6B,0xC0,0x5D,0x99,0xB2,0x96,0x4F,
+ 0xA0,0x90,0xC3,0xA2,0x23,0x3B,0xA1,0x86,0x51,0x5B,0xE7,0xED,
+ 0x1F,0x61,0x29,0x70,0xCE,0xE2,0xD7,0xAF,0xB8,0x1B,0xDD,0x76,
+ 0x21,0x70,0x48,0x1C,0xD0,0x06,0x91,0x27,0xD5,0xB0,0x5A,0xA9,
+ 0x93,0xB4,0xEA,0x98,0x8D,0x8F,0xDD,0xC1,0x86,0xFF,0xB7,0xDC,
+ 0x90,0xA6,0xC0,0x8F,0x4D,0xF4,0x35,0xC9,0x34,0x06,0x31,0x99,
+ 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+ };
+ return BN_bin2bn(RFC3526_PRIME_4096,sizeof(RFC3526_PRIME_4096),bn);
+ }
+
+/* "6144-bit MODP Group" from RFC3526, Section 6.
+ *
+ * The prime is: 2^6144 - 2^6080 - 1 + 2^64 * { [2^6014 pi] + 929484 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_6144(BIGNUM *bn)
+ {
+ static const unsigned char RFC3526_PRIME_6144[]={
+ 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+ 0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+ 0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+ 0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+ 0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+ 0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+ 0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+ 0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+ 0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+ 0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+ 0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+ 0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+ 0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+ 0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+ 0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+ 0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
+ 0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
+ 0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
+ 0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
+ 0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
+ 0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
+ 0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
+ 0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
+ 0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
+ 0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
+ 0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
+ 0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
+ 0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
+ 0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
+ 0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
+ 0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
+ 0xA9,0x21,0x08,0x01,0x1A,0x72,0x3C,0x12,0xA7,0x87,0xE6,0xD7,
+ 0x88,0x71,0x9A,0x10,0xBD,0xBA,0x5B,0x26,0x99,0xC3,0x27,0x18,
+ 0x6A,0xF4,0xE2,0x3C,0x1A,0x94,0x68,0x34,0xB6,0x15,0x0B,0xDA,
+ 0x25,0x83,0xE9,0xCA,0x2A,0xD4,0x4C,0xE8,0xDB,0xBB,0xC2,0xDB,
+ 0x04,0xDE,0x8E,0xF9,0x2E,0x8E,0xFC,0x14,0x1F,0xBE,0xCA,0xA6,
+ 0x28,0x7C,0x59,0x47,0x4E,0x6B,0xC0,0x5D,0x99,0xB2,0x96,0x4F,
+ 0xA0,0x90,0xC3,0xA2,0x23,0x3B,0xA1,0x86,0x51,0x5B,0xE7,0xED,
+ 0x1F,0x61,0x29,0x70,0xCE,0xE2,0xD7,0xAF,0xB8,0x1B,0xDD,0x76,
+ 0x21,0x70,0x48,0x1C,0xD0,0x06,0x91,0x27,0xD5,0xB0,0x5A,0xA9,
+ 0x93,0xB4,0xEA,0x98,0x8D,0x8F,0xDD,0xC1,0x86,0xFF,0xB7,0xDC,
+ 0x90,0xA6,0xC0,0x8F,0x4D,0xF4,0x35,0xC9,0x34,0x02,0x84,0x92,
+ 0x36,0xC3,0xFA,0xB4,0xD2,0x7C,0x70,0x26,0xC1,0xD4,0xDC,0xB2,
+ 0x60,0x26,0x46,0xDE,0xC9,0x75,0x1E,0x76,0x3D,0xBA,0x37,0xBD,
+ 0xF8,0xFF,0x94,0x06,0xAD,0x9E,0x53,0x0E,0xE5,0xDB,0x38,0x2F,
+ 0x41,0x30,0x01,0xAE,0xB0,0x6A,0x53,0xED,0x90,0x27,0xD8,0x31,
+ 0x17,0x97,0x27,0xB0,0x86,0x5A,0x89,0x18,0xDA,0x3E,0xDB,0xEB,
+ 0xCF,0x9B,0x14,0xED,0x44,0xCE,0x6C,0xBA,0xCE,0xD4,0xBB,0x1B,
+ 0xDB,0x7F,0x14,0x47,0xE6,0xCC,0x25,0x4B,0x33,0x20,0x51,0x51,
+ 0x2B,0xD7,0xAF,0x42,0x6F,0xB8,0xF4,0x01,0x37,0x8C,0xD2,0xBF,
+ 0x59,0x83,0xCA,0x01,0xC6,0x4B,0x92,0xEC,0xF0,0x32,0xEA,0x15,
+ 0xD1,0x72,0x1D,0x03,0xF4,0x82,0xD7,0xCE,0x6E,0x74,0xFE,0xF6,
+ 0xD5,0x5E,0x70,0x2F,0x46,0x98,0x0C,0x82,0xB5,0xA8,0x40,0x31,
+ 0x90,0x0B,0x1C,0x9E,0x59,0xE7,0xC9,0x7F,0xBE,0xC7,0xE8,0xF3,
+ 0x23,0xA9,0x7A,0x7E,0x36,0xCC,0x88,0xBE,0x0F,0x1D,0x45,0xB7,
+ 0xFF,0x58,0x5A,0xC5,0x4B,0xD4,0x07,0xB2,0x2B,0x41,0x54,0xAA,
+ 0xCC,0x8F,0x6D,0x7E,0xBF,0x48,0xE1,0xD8,0x14,0xCC,0x5E,0xD2,
+ 0x0F,0x80,0x37,0xE0,0xA7,0x97,0x15,0xEE,0xF2,0x9B,0xE3,0x28,
+ 0x06,0xA1,0xD5,0x8B,0xB7,0xC5,0xDA,0x76,0xF5,0x50,0xAA,0x3D,
+ 0x8A,0x1F,0xBF,0xF0,0xEB,0x19,0xCC,0xB1,0xA3,0x13,0xD5,0x5C,
+ 0xDA,0x56,0xC9,0xEC,0x2E,0xF2,0x96,0x32,0x38,0x7F,0xE8,0xD7,
+ 0x6E,0x3C,0x04,0x68,0x04,0x3E,0x8F,0x66,0x3F,0x48,0x60,0xEE,
+ 0x12,0xBF,0x2D,0x5B,0x0B,0x74,0x74,0xD6,0xE6,0x94,0xF9,0x1E,
+ 0x6D,0xCC,0x40,0x24,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+ };
+ return BN_bin2bn(RFC3526_PRIME_6144,sizeof(RFC3526_PRIME_6144),bn);
+ }
+
+/* "8192-bit MODP Group" from RFC3526, Section 7.
+ *
+ * The prime is: 2^8192 - 2^8128 - 1 + 2^64 * { [2^8062 pi] + 4743158 }
+ *
+ * RFC3526 specifies a generator of 2.
+ */
+
+BIGNUM *get_rfc3526_prime_8192(BIGNUM *bn)
+ {
+ static const unsigned char RFC3526_PRIME_8192[]={
+ 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xC9,0x0F,0xDA,0xA2,
+ 0x21,0x68,0xC2,0x34,0xC4,0xC6,0x62,0x8B,0x80,0xDC,0x1C,0xD1,
+ 0x29,0x02,0x4E,0x08,0x8A,0x67,0xCC,0x74,0x02,0x0B,0xBE,0xA6,
+ 0x3B,0x13,0x9B,0x22,0x51,0x4A,0x08,0x79,0x8E,0x34,0x04,0xDD,
+ 0xEF,0x95,0x19,0xB3,0xCD,0x3A,0x43,0x1B,0x30,0x2B,0x0A,0x6D,
+ 0xF2,0x5F,0x14,0x37,0x4F,0xE1,0x35,0x6D,0x6D,0x51,0xC2,0x45,
+ 0xE4,0x85,0xB5,0x76,0x62,0x5E,0x7E,0xC6,0xF4,0x4C,0x42,0xE9,
+ 0xA6,0x37,0xED,0x6B,0x0B,0xFF,0x5C,0xB6,0xF4,0x06,0xB7,0xED,
+ 0xEE,0x38,0x6B,0xFB,0x5A,0x89,0x9F,0xA5,0xAE,0x9F,0x24,0x11,
+ 0x7C,0x4B,0x1F,0xE6,0x49,0x28,0x66,0x51,0xEC,0xE4,0x5B,0x3D,
+ 0xC2,0x00,0x7C,0xB8,0xA1,0x63,0xBF,0x05,0x98,0xDA,0x48,0x36,
+ 0x1C,0x55,0xD3,0x9A,0x69,0x16,0x3F,0xA8,0xFD,0x24,0xCF,0x5F,
+ 0x83,0x65,0x5D,0x23,0xDC,0xA3,0xAD,0x96,0x1C,0x62,0xF3,0x56,
+ 0x20,0x85,0x52,0xBB,0x9E,0xD5,0x29,0x07,0x70,0x96,0x96,0x6D,
+ 0x67,0x0C,0x35,0x4E,0x4A,0xBC,0x98,0x04,0xF1,0x74,0x6C,0x08,
+ 0xCA,0x18,0x21,0x7C,0x32,0x90,0x5E,0x46,0x2E,0x36,0xCE,0x3B,
+ 0xE3,0x9E,0x77,0x2C,0x18,0x0E,0x86,0x03,0x9B,0x27,0x83,0xA2,
+ 0xEC,0x07,0xA2,0x8F,0xB5,0xC5,0x5D,0xF0,0x6F,0x4C,0x52,0xC9,
+ 0xDE,0x2B,0xCB,0xF6,0x95,0x58,0x17,0x18,0x39,0x95,0x49,0x7C,
+ 0xEA,0x95,0x6A,0xE5,0x15,0xD2,0x26,0x18,0x98,0xFA,0x05,0x10,
+ 0x15,0x72,0x8E,0x5A,0x8A,0xAA,0xC4,0x2D,0xAD,0x33,0x17,0x0D,
+ 0x04,0x50,0x7A,0x33,0xA8,0x55,0x21,0xAB,0xDF,0x1C,0xBA,0x64,
+ 0xEC,0xFB,0x85,0x04,0x58,0xDB,0xEF,0x0A,0x8A,0xEA,0x71,0x57,
+ 0x5D,0x06,0x0C,0x7D,0xB3,0x97,0x0F,0x85,0xA6,0xE1,0xE4,0xC7,
+ 0xAB,0xF5,0xAE,0x8C,0xDB,0x09,0x33,0xD7,0x1E,0x8C,0x94,0xE0,
+ 0x4A,0x25,0x61,0x9D,0xCE,0xE3,0xD2,0x26,0x1A,0xD2,0xEE,0x6B,
+ 0xF1,0x2F,0xFA,0x06,0xD9,0x8A,0x08,0x64,0xD8,0x76,0x02,0x73,
+ 0x3E,0xC8,0x6A,0x64,0x52,0x1F,0x2B,0x18,0x17,0x7B,0x20,0x0C,
+ 0xBB,0xE1,0x17,0x57,0x7A,0x61,0x5D,0x6C,0x77,0x09,0x88,0xC0,
+ 0xBA,0xD9,0x46,0xE2,0x08,0xE2,0x4F,0xA0,0x74,0xE5,0xAB,0x31,
+ 0x43,0xDB,0x5B,0xFC,0xE0,0xFD,0x10,0x8E,0x4B,0x82,0xD1,0x20,
+ 0xA9,0x21,0x08,0x01,0x1A,0x72,0x3C,0x12,0xA7,0x87,0xE6,0xD7,
+ 0x88,0x71,0x9A,0x10,0xBD,0xBA,0x5B,0x26,0x99,0xC3,0x27,0x18,
+ 0x6A,0xF4,0xE2,0x3C,0x1A,0x94,0x68,0x34,0xB6,0x15,0x0B,0xDA,
+ 0x25,0x83,0xE9,0xCA,0x2A,0xD4,0x4C,0xE8,0xDB,0xBB,0xC2,0xDB,
+ 0x04,0xDE,0x8E,0xF9,0x2E,0x8E,0xFC,0x14,0x1F,0xBE,0xCA,0xA6,
+ 0x28,0x7C,0x59,0x47,0x4E,0x6B,0xC0,0x5D,0x99,0xB2,0x96,0x4F,
+ 0xA0,0x90,0xC3,0xA2,0x23,0x3B,0xA1,0x86,0x51,0x5B,0xE7,0xED,
+ 0x1F,0x61,0x29,0x70,0xCE,0xE2,0xD7,0xAF,0xB8,0x1B,0xDD,0x76,
+ 0x21,0x70,0x48,0x1C,0xD0,0x06,0x91,0x27,0xD5,0xB0,0x5A,0xA9,
+ 0x93,0xB4,0xEA,0x98,0x8D,0x8F,0xDD,0xC1,0x86,0xFF,0xB7,0xDC,
+ 0x90,0xA6,0xC0,0x8F,0x4D,0xF4,0x35,0xC9,0x34,0x02,0x84,0x92,
+ 0x36,0xC3,0xFA,0xB4,0xD2,0x7C,0x70,0x26,0xC1,0xD4,0xDC,0xB2,
+ 0x60,0x26,0x46,0xDE,0xC9,0x75,0x1E,0x76,0x3D,0xBA,0x37,0xBD,
+ 0xF8,0xFF,0x94,0x06,0xAD,0x9E,0x53,0x0E,0xE5,0xDB,0x38,0x2F,
+ 0x41,0x30,0x01,0xAE,0xB0,0x6A,0x53,0xED,0x90,0x27,0xD8,0x31,
+ 0x17,0x97,0x27,0xB0,0x86,0x5A,0x89,0x18,0xDA,0x3E,0xDB,0xEB,
+ 0xCF,0x9B,0x14,0xED,0x44,0xCE,0x6C,0xBA,0xCE,0xD4,0xBB,0x1B,
+ 0xDB,0x7F,0x14,0x47,0xE6,0xCC,0x25,0x4B,0x33,0x20,0x51,0x51,
+ 0x2B,0xD7,0xAF,0x42,0x6F,0xB8,0xF4,0x01,0x37,0x8C,0xD2,0xBF,
+ 0x59,0x83,0xCA,0x01,0xC6,0x4B,0x92,0xEC,0xF0,0x32,0xEA,0x15,
+ 0xD1,0x72,0x1D,0x03,0xF4,0x82,0xD7,0xCE,0x6E,0x74,0xFE,0xF6,
+ 0xD5,0x5E,0x70,0x2F,0x46,0x98,0x0C,0x82,0xB5,0xA8,0x40,0x31,
+ 0x90,0x0B,0x1C,0x9E,0x59,0xE7,0xC9,0x7F,0xBE,0xC7,0xE8,0xF3,
+ 0x23,0xA9,0x7A,0x7E,0x36,0xCC,0x88,0xBE,0x0F,0x1D,0x45,0xB7,
+ 0xFF,0x58,0x5A,0xC5,0x4B,0xD4,0x07,0xB2,0x2B,0x41,0x54,0xAA,
+ 0xCC,0x8F,0x6D,0x7E,0xBF,0x48,0xE1,0xD8,0x14,0xCC,0x5E,0xD2,
+ 0x0F,0x80,0x37,0xE0,0xA7,0x97,0x15,0xEE,0xF2,0x9B,0xE3,0x28,
+ 0x06,0xA1,0xD5,0x8B,0xB7,0xC5,0xDA,0x76,0xF5,0x50,0xAA,0x3D,
+ 0x8A,0x1F,0xBF,0xF0,0xEB,0x19,0xCC,0xB1,0xA3,0x13,0xD5,0x5C,
+ 0xDA,0x56,0xC9,0xEC,0x2E,0xF2,0x96,0x32,0x38,0x7F,0xE8,0xD7,
+ 0x6E,0x3C,0x04,0x68,0x04,0x3E,0x8F,0x66,0x3F,0x48,0x60,0xEE,
+ 0x12,0xBF,0x2D,0x5B,0x0B,0x74,0x74,0xD6,0xE6,0x94,0xF9,0x1E,
+ 0x6D,0xBE,0x11,0x59,0x74,0xA3,0x92,0x6F,0x12,0xFE,0xE5,0xE4,
+ 0x38,0x77,0x7C,0xB6,0xA9,0x32,0xDF,0x8C,0xD8,0xBE,0xC4,0xD0,
+ 0x73,0xB9,0x31,0xBA,0x3B,0xC8,0x32,0xB6,0x8D,0x9D,0xD3,0x00,
+ 0x74,0x1F,0xA7,0xBF,0x8A,0xFC,0x47,0xED,0x25,0x76,0xF6,0x93,
+ 0x6B,0xA4,0x24,0x66,0x3A,0xAB,0x63,0x9C,0x5A,0xE4,0xF5,0x68,
+ 0x34,0x23,0xB4,0x74,0x2B,0xF1,0xC9,0x78,0x23,0x8F,0x16,0xCB,
+ 0xE3,0x9D,0x65,0x2D,0xE3,0xFD,0xB8,0xBE,0xFC,0x84,0x8A,0xD9,
+ 0x22,0x22,0x2E,0x04,0xA4,0x03,0x7C,0x07,0x13,0xEB,0x57,0xA8,
+ 0x1A,0x23,0xF0,0xC7,0x34,0x73,0xFC,0x64,0x6C,0xEA,0x30,0x6B,
+ 0x4B,0xCB,0xC8,0x86,0x2F,0x83,0x85,0xDD,0xFA,0x9D,0x4B,0x7F,
+ 0xA2,0xC0,0x87,0xE8,0x79,0x68,0x33,0x03,0xED,0x5B,0xDD,0x3A,
+ 0x06,0x2B,0x3C,0xF5,0xB3,0xA2,0x78,0xA6,0x6D,0x2A,0x13,0xF8,
+ 0x3F,0x44,0xF8,0x2D,0xDF,0x31,0x0E,0xE0,0x74,0xAB,0x6A,0x36,
+ 0x45,0x97,0xE8,0x99,0xA0,0x25,0x5D,0xC1,0x64,0xF3,0x1C,0xC5,
+ 0x08,0x46,0x85,0x1D,0xF9,0xAB,0x48,0x19,0x5D,0xED,0x7E,0xA1,
+ 0xB1,0xD5,0x10,0xBD,0x7E,0xE7,0x4D,0x73,0xFA,0xF3,0x6B,0xC3,
+ 0x1E,0xCF,0xA2,0x68,0x35,0x90,0x46,0xF4,0xEB,0x87,0x9F,0x92,
+ 0x40,0x09,0x43,0x8B,0x48,0x1C,0x6C,0xD7,0x88,0x9A,0x00,0x2E,
+ 0xD5,0xEE,0x38,0x2B,0xC9,0x19,0x0D,0xA6,0xFC,0x02,0x6E,0x47,
+ 0x95,0x58,0xE4,0x47,0x56,0x77,0xE9,0xAA,0x9E,0x30,0x50,0xE2,
+ 0x76,0x56,0x94,0xDF,0xC8,0x1F,0x56,0xE8,0x80,0xB9,0x6E,0x71,
+ 0x60,0xC9,0x80,0xDD,0x98,0xED,0xD3,0xDF,0xFF,0xFF,0xFF,0xFF,
+ 0xFF,0xFF,0xFF,0xFF,
+ };
+ return BN_bin2bn(RFC3526_PRIME_8192,sizeof(RFC3526_PRIME_8192),bn);
+ }
+
diff --git a/crypto/bn/bn_ctx.c b/crypto/bn/bn_ctx.c
index 5bd742a97c..3f2256f675 100644
--- a/crypto/bn/bn_ctx.c
+++ b/crypto/bn/bn_ctx.c
@@ -161,7 +161,7 @@ static void ctxdbg(BN_CTX *ctx)
fprintf(stderr,"(%08x): ", (unsigned int)ctx);
while(bnidx < ctx->used)
{
- fprintf(stderr,"%02x ", item->vals[bnidx++ % BN_CTX_POOL_SIZE].dmax);
+ fprintf(stderr,"%03x ", item->vals[bnidx++ % BN_CTX_POOL_SIZE].dmax);
if(!(bnidx % BN_CTX_POOL_SIZE))
item = item->next;
}
@@ -171,8 +171,8 @@ static void ctxdbg(BN_CTX *ctx)
while(fpidx < stack->depth)
{
while(bnidx++ < stack->indexes[fpidx])
- fprintf(stderr," ");
- fprintf(stderr,"^^ ");
+ fprintf(stderr," ");
+ fprintf(stderr,"^^^ ");
bnidx++;
fpidx++;
}
@@ -230,7 +230,10 @@ BN_CTX *BN_CTX_new(void)
void BN_CTX_free(BN_CTX *ctx)
{
+ if (ctx == NULL)
+ return;
#ifdef BN_CTX_DEBUG
+ {
BN_POOL_ITEM *pool = ctx->pool.head;
fprintf(stderr,"BN_CTX_free, stack-size=%d, pool-bignums=%d\n",
ctx->stack.size, ctx->pool.size);
@@ -242,6 +245,7 @@ void BN_CTX_free(BN_CTX *ctx)
pool = pool->next;
}
fprintf(stderr,"\n");
+ }
#endif
BN_STACK_finish(&ctx->stack);
BN_POOL_finish(&ctx->pool);
@@ -257,8 +261,7 @@ void BN_CTX_start(BN_CTX *ctx)
/* (Try to) get a new frame pointer */
else if(!BN_STACK_push(&ctx->stack, ctx->used))
{
- /* I know this isn't BN_CTX_get, but ... */
- BNerr(BN_F_BN_CTX_GET,BN_R_TOO_MANY_TEMPORARY_VARIABLES);
+ BNerr(BN_F_BN_CTX_START,BN_R_TOO_MANY_TEMPORARY_VARIABLES);
ctx->err_stack++;
}
CTXDBG_EXIT(ctx);
diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c
index 3b4392955e..899d07ca24 100644
--- a/crypto/bn/bn_div.c
+++ b/crypto/bn/bn_div.c
@@ -169,13 +169,15 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
#endif /* OPENSSL_NO_ASM */
-/* BN_div computes dv := num / divisor, rounding towards zero, and sets up
- * rm such that dv*divisor + rm = num holds.
+/* BN_div[_no_branch] computes dv := num / divisor, rounding towards
+ * zero, and sets up rm such that dv*divisor + rm = num holds.
* Thus:
* dv->neg == num->neg ^ divisor->neg (unless the result is zero)
* rm->neg == num->neg (unless the remainder is zero)
* If 'dv' or 'rm' is NULL, the respective value is not returned.
*/
+static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
+ const BIGNUM *divisor, BN_CTX *ctx);
int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_CTX *ctx)
{
@@ -185,11 +187,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_ULONG d0,d1;
int num_n,div_n;
- if (dv)
- bn_check_top(dv);
- if (rm)
- bn_check_top(rm);
+ /* Invalid zero-padding would have particularly bad consequences
+ * in the case of 'num', so don't just rely on bn_check_top() for this one
+ * (bn_check_top() works only for BN_DEBUG builds) */
+ if (num->top > 0 && num->d[num->top - 1] == 0)
+ {
+ BNerr(BN_F_BN_DIV,BN_R_NOT_INITIALIZED);
+ return 0;
+ }
+
bn_check_top(num);
+
+ if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
+ {
+ return BN_div_no_branch(dv, rm, num, divisor, ctx);
+ }
+
+ bn_check_top(dv);
+ bn_check_top(rm);
+ /* bn_check_top(num); */ /* 'num' has been checked already */
bn_check_top(divisor);
if (BN_is_zero(divisor))
@@ -213,7 +229,8 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
if (dv == NULL)
res=BN_CTX_get(ctx);
else res=dv;
- if (sdiv == NULL || res == NULL) goto err;
+ if (sdiv == NULL || res == NULL || tmp == NULL || snum == NULL)
+ goto err;
/* First we normalise the numbers */
norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
@@ -320,7 +337,7 @@ X) -> 0x%08X\n",
t2 -= d1;
}
#else /* !BN_LLONG */
- BN_ULONG t2l,t2h,ql,qh;
+ BN_ULONG t2l,t2h;
q=bn_div_words(n0,n1,d0);
#ifdef BN_DEBUG_LEVITTE
@@ -338,9 +355,12 @@ X) -> 0x%08X\n",
t2l = d1 * q;
t2h = BN_UMULT_HIGH(d1,q);
#else
+ {
+ BN_ULONG ql, qh;
t2l=LBITS(d1); t2h=HBITS(d1);
ql =LBITS(q); qh =HBITS(q);
mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+ }
#endif
for (;;)
@@ -394,8 +414,235 @@ X) -> 0x%08X\n",
BN_CTX_end(ctx);
return(1);
err:
- if (rm)
+ bn_check_top(rm);
+ BN_CTX_end(ctx);
+ return(0);
+ }
+
+
+/* BN_div_no_branch is a special version of BN_div. It does not contain
+ * branches that may leak sensitive information.
+ */
+static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
+ const BIGNUM *divisor, BN_CTX *ctx)
+ {
+ int norm_shift,i,loop;
+ BIGNUM *tmp,wnum,*snum,*sdiv,*res;
+ BN_ULONG *resp,*wnump;
+ BN_ULONG d0,d1;
+ int num_n,div_n;
+
+ bn_check_top(dv);
+ bn_check_top(rm);
+ /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */
+ bn_check_top(divisor);
+
+ if (BN_is_zero(divisor))
+ {
+ BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO);
+ return(0);
+ }
+
+ BN_CTX_start(ctx);
+ tmp=BN_CTX_get(ctx);
+ snum=BN_CTX_get(ctx);
+ sdiv=BN_CTX_get(ctx);
+ if (dv == NULL)
+ res=BN_CTX_get(ctx);
+ else res=dv;
+ if (sdiv == NULL || res == NULL) goto err;
+
+ /* First we normalise the numbers */
+ norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
+ if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
+ sdiv->neg=0;
+ norm_shift+=BN_BITS2;
+ if (!(BN_lshift(snum,num,norm_shift))) goto err;
+ snum->neg=0;
+
+ /* Since we don't know whether snum is larger than sdiv,
+ * we pad snum with enough zeroes without changing its
+ * value.
+ */
+ if (snum->top <= sdiv->top+1)
+ {
+ if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
+ for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
+ snum->top = sdiv->top + 2;
+ }
+ else
+ {
+ if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
+ snum->d[snum->top] = 0;
+ snum->top ++;
+ }
+
+ div_n=sdiv->top;
+ num_n=snum->top;
+ loop=num_n-div_n;
+ /* Lets setup a 'window' into snum
+ * This is the part that corresponds to the current
+ * 'area' being divided */
+ wnum.neg = 0;
+ wnum.d = &(snum->d[loop]);
+ wnum.top = div_n;
+ /* only needed when BN_ucmp messes up the values between top and max */
+ wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
+
+ /* Get the top 2 words of sdiv */
+ /* div_n=sdiv->top; */
+ d0=sdiv->d[div_n-1];
+ d1=(div_n == 1)?0:sdiv->d[div_n-2];
+
+ /* pointer to the 'top' of snum */
+ wnump= &(snum->d[num_n-1]);
+
+ /* Setup to 'res' */
+ res->neg= (num->neg^divisor->neg);
+ if (!bn_wexpand(res,(loop+1))) goto err;
+ res->top=loop-1;
+ resp= &(res->d[loop-1]);
+
+ /* space for temp */
+ if (!bn_wexpand(tmp,(div_n+1))) goto err;
+
+ /* if res->top == 0 then clear the neg value otherwise decrease
+ * the resp pointer */
+ if (res->top == 0)
+ res->neg = 0;
+ else
+ resp--;
+
+ for (i=0; i<loop-1; i++, wnump--, resp--)
+ {
+ BN_ULONG q,l0;
+ /* the first part of the loop uses the top two words of
+ * snum and sdiv to calculate a BN_ULONG q such that
+ * | wnum - sdiv * q | < sdiv */
+#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
+ BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
+ q=bn_div_3_words(wnump,d1,d0);
+#else
+ BN_ULONG n0,n1,rem=0;
+
+ n0=wnump[0];
+ n1=wnump[-1];
+ if (n0 == d0)
+ q=BN_MASK2;
+ else /* n0 < d0 */
+ {
+#ifdef BN_LLONG
+ BN_ULLONG t2;
+
+#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
+ q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
+#else
+ q=bn_div_words(n0,n1,d0);
+#ifdef BN_DEBUG_LEVITTE
+ fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
+X) -> 0x%08X\n",
+ n0, n1, d0, q);
+#endif
+#endif
+
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+ /*
+ * rem doesn't have to be BN_ULLONG. The least we
+ * know it's less that d0, isn't it?
+ */
+ rem=(n1-q*d0)&BN_MASK2;
+#endif
+ t2=(BN_ULLONG)d1*q;
+
+ for (;;)
+ {
+ if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
+ break;
+ q--;
+ rem += d0;
+ if (rem < d0) break; /* don't let rem overflow */
+ t2 -= d1;
+ }
+#else /* !BN_LLONG */
+ BN_ULONG t2l,t2h;
+
+ q=bn_div_words(n0,n1,d0);
+#ifdef BN_DEBUG_LEVITTE
+ fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
+X) -> 0x%08X\n",
+ n0, n1, d0, q);
+#endif
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+ rem=(n1-q*d0)&BN_MASK2;
+#endif
+
+#if defined(BN_UMULT_LOHI)
+ BN_UMULT_LOHI(t2l,t2h,d1,q);
+#elif defined(BN_UMULT_HIGH)
+ t2l = d1 * q;
+ t2h = BN_UMULT_HIGH(d1,q);
+#else
+ {
+ BN_ULONG ql, qh;
+ t2l=LBITS(d1); t2h=HBITS(d1);
+ ql =LBITS(q); qh =HBITS(q);
+ mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+ }
+#endif
+
+ for (;;)
+ {
+ if ((t2h < rem) ||
+ ((t2h == rem) && (t2l <= wnump[-2])))
+ break;
+ q--;
+ rem += d0;
+ if (rem < d0) break; /* don't let rem overflow */
+ if (t2l < d1) t2h--; t2l -= d1;
+ }
+#endif /* !BN_LLONG */
+ }
+#endif /* !BN_DIV3W */
+
+ l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
+ tmp->d[div_n]=l0;
+ wnum.d--;
+ /* ingore top values of the bignums just sub the two
+ * BN_ULONG arrays with bn_sub_words */
+ if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
+ {
+ /* Note: As we have considered only the leading
+ * two BN_ULONGs in the calculation of q, sdiv * q
+ * might be greater than wnum (but then (q-1) * sdiv
+ * is less or equal than wnum)
+ */
+ q--;
+ if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
+ /* we can't have an overflow here (assuming
+ * that q != 0, but if q == 0 then tmp is
+ * zero anyway) */
+ (*wnump)++;
+ }
+ /* store part of the result */
+ *resp = q;
+ }
+ bn_correct_top(snum);
+ if (rm != NULL)
+ {
+ /* Keep a copy of the neg flag in num because if rm==num
+ * BN_rshift() will overwrite it.
+ */
+ int neg = num->neg;
+ BN_rshift(rm,snum,norm_shift);
+ if (!BN_is_zero(rm))
+ rm->neg = neg;
bn_check_top(rm);
+ }
+ bn_correct_top(res);
+ BN_CTX_end(ctx);
+ return(1);
+err:
+ bn_check_top(rm);
BN_CTX_end(ctx);
return(0);
}
diff --git a/crypto/bn/bn_err.c b/crypto/bn/bn_err.c
index b42208ae0b..cfe2eb94a0 100644
--- a/crypto/bn/bn_err.c
+++ b/crypto/bn/bn_err.c
@@ -1,6 +1,6 @@
/* crypto/bn/bn_err.c */
/* ====================================================================
- * Copyright (c) 1999-2002 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -61,65 +61,77 @@
#include <stdio.h>
#include <openssl/err.h>
#include <openssl/bn.h>
-#include <openssl/opensslconf.h> /* To see if OPENSSL_NO_ERR is defined */
/* BEGIN ERROR CODES */
#ifndef OPENSSL_NO_ERR
+
+#define ERR_FUNC(func) ERR_PACK(ERR_LIB_BN,func,0)
+#define ERR_REASON(reason) ERR_PACK(ERR_LIB_BN,0,reason)
+
static ERR_STRING_DATA BN_str_functs[]=
{
-{ERR_PACK(0,BN_F_BN_BLINDING_CONVERT,0), "BN_BLINDING_convert"},
-{ERR_PACK(0,BN_F_BN_BLINDING_INVERT,0), "BN_BLINDING_invert"},
-{ERR_PACK(0,BN_F_BN_BLINDING_NEW,0), "BN_BLINDING_new"},
-{ERR_PACK(0,BN_F_BN_BLINDING_UPDATE,0), "BN_BLINDING_update"},
-{ERR_PACK(0,BN_F_BN_BN2DEC,0), "BN_bn2dec"},
-{ERR_PACK(0,BN_F_BN_BN2HEX,0), "BN_bn2hex"},
-{ERR_PACK(0,BN_F_BN_CTX_GET,0), "BN_CTX_get"},
-{ERR_PACK(0,BN_F_BN_CTX_NEW,0), "BN_CTX_new"},
-{ERR_PACK(0,BN_F_BN_DIV,0), "BN_div"},
-{ERR_PACK(0,BN_F_BN_EXPAND2,0), "bn_expand2"},
-{ERR_PACK(0,BN_F_BN_EXPAND_INTERNAL,0), "BN_EXPAND_INTERNAL"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD,0), "BN_GF2m_mod"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_DIV,0), "BN_GF2m_mod_div"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_EXP,0), "BN_GF2m_mod_exp"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_MUL,0), "BN_GF2m_mod_mul"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_SOLVE_QUAD,0), "BN_GF2m_mod_solve_quad"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR,0), "BN_GF2m_mod_solve_quad_arr"},
-{ERR_PACK(0,BN_F_BN_GF2M_MOD_SQR,0), "BN_GF2m_mod_sqr"},
-{ERR_PACK(0,BN_F_BN_MOD_EXP2_MONT,0), "BN_mod_exp2_mont"},
-{ERR_PACK(0,BN_F_BN_MOD_EXP_MONT,0), "BN_mod_exp_mont"},
-{ERR_PACK(0,BN_F_BN_MOD_EXP_MONT_WORD,0), "BN_mod_exp_mont_word"},
-{ERR_PACK(0,BN_F_BN_MOD_INVERSE,0), "BN_mod_inverse"},
-{ERR_PACK(0,BN_F_BN_MOD_LSHIFT_QUICK,0), "BN_mod_lshift_quick"},
-{ERR_PACK(0,BN_F_BN_MOD_MUL_RECIPROCAL,0), "BN_mod_mul_reciprocal"},
-{ERR_PACK(0,BN_F_BN_MOD_SQRT,0), "BN_mod_sqrt"},
-{ERR_PACK(0,BN_F_BN_MPI2BN,0), "BN_mpi2bn"},
-{ERR_PACK(0,BN_F_BN_NEW,0), "BN_new"},
-{ERR_PACK(0,BN_F_BN_RAND,0), "BN_rand"},
-{ERR_PACK(0,BN_F_BN_RAND_RANGE,0), "BN_rand_range"},
-{ERR_PACK(0,BN_F_BN_USUB,0), "BN_usub"},
+{ERR_FUNC(BN_F_BNRAND), "BNRAND"},
+{ERR_FUNC(BN_F_BN_BLINDING_CONVERT_EX), "BN_BLINDING_convert_ex"},
+{ERR_FUNC(BN_F_BN_BLINDING_CREATE_PARAM), "BN_BLINDING_create_param"},
+{ERR_FUNC(BN_F_BN_BLINDING_INVERT_EX), "BN_BLINDING_invert_ex"},
+{ERR_FUNC(BN_F_BN_BLINDING_NEW), "BN_BLINDING_new"},
+{ERR_FUNC(BN_F_BN_BLINDING_UPDATE), "BN_BLINDING_update"},
+{ERR_FUNC(BN_F_BN_BN2DEC), "BN_bn2dec"},
+{ERR_FUNC(BN_F_BN_BN2HEX), "BN_bn2hex"},
+{ERR_FUNC(BN_F_BN_CTX_GET), "BN_CTX_get"},
+{ERR_FUNC(BN_F_BN_CTX_NEW), "BN_CTX_new"},
+{ERR_FUNC(BN_F_BN_CTX_START), "BN_CTX_start"},
+{ERR_FUNC(BN_F_BN_DIV), "BN_div"},
+{ERR_FUNC(BN_F_BN_DIV_NO_BRANCH), "BN_div_no_branch"},
+{ERR_FUNC(BN_F_BN_DIV_RECP), "BN_div_recp"},
+{ERR_FUNC(BN_F_BN_EXP), "BN_exp"},
+{ERR_FUNC(BN_F_BN_EXPAND2), "bn_expand2"},
+{ERR_FUNC(BN_F_BN_EXPAND_INTERNAL), "BN_EXPAND_INTERNAL"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD), "BN_GF2m_mod"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_EXP), "BN_GF2m_mod_exp"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_MUL), "BN_GF2m_mod_mul"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD), "BN_GF2m_mod_solve_quad"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR), "BN_GF2m_mod_solve_quad_arr"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_SQR), "BN_GF2m_mod_sqr"},
+{ERR_FUNC(BN_F_BN_GF2M_MOD_SQRT), "BN_GF2m_mod_sqrt"},
+{ERR_FUNC(BN_F_BN_MOD_EXP2_MONT), "BN_mod_exp2_mont"},
+{ERR_FUNC(BN_F_BN_MOD_EXP_MONT), "BN_mod_exp_mont"},
+{ERR_FUNC(BN_F_BN_MOD_EXP_MONT_CONSTTIME), "BN_mod_exp_mont_consttime"},
+{ERR_FUNC(BN_F_BN_MOD_EXP_MONT_WORD), "BN_mod_exp_mont_word"},
+{ERR_FUNC(BN_F_BN_MOD_EXP_RECP), "BN_mod_exp_recp"},
+{ERR_FUNC(BN_F_BN_MOD_EXP_SIMPLE), "BN_mod_exp_simple"},
+{ERR_FUNC(BN_F_BN_MOD_INVERSE), "BN_mod_inverse"},
+{ERR_FUNC(BN_F_BN_MOD_INVERSE_NO_BRANCH), "BN_mod_inverse_no_branch"},
+{ERR_FUNC(BN_F_BN_MOD_LSHIFT_QUICK), "BN_mod_lshift_quick"},
+{ERR_FUNC(BN_F_BN_MOD_MUL_RECIPROCAL), "BN_mod_mul_reciprocal"},
+{ERR_FUNC(BN_F_BN_MOD_SQRT), "BN_mod_sqrt"},
+{ERR_FUNC(BN_F_BN_MPI2BN), "BN_mpi2bn"},
+{ERR_FUNC(BN_F_BN_NEW), "BN_new"},
+{ERR_FUNC(BN_F_BN_RAND), "BN_rand"},
+{ERR_FUNC(BN_F_BN_RAND_RANGE), "BN_rand_range"},
+{ERR_FUNC(BN_F_BN_USUB), "BN_usub"},
{0,NULL}
};
static ERR_STRING_DATA BN_str_reasons[]=
{
-{BN_R_ARG2_LT_ARG3 ,"arg2 lt arg3"},
-{BN_R_BAD_RECIPROCAL ,"bad reciprocal"},
-{BN_R_BIGNUM_TOO_LONG ,"bignum too long"},
-{BN_R_CALLED_WITH_EVEN_MODULUS ,"called with even modulus"},
-{BN_R_DIV_BY_ZERO ,"div by zero"},
-{BN_R_ENCODING_ERROR ,"encoding error"},
-{BN_R_EXPAND_ON_STATIC_BIGNUM_DATA ,"expand on static bignum data"},
-{BN_R_INPUT_NOT_REDUCED ,"input not reduced"},
-{BN_R_INVALID_LENGTH ,"invalid length"},
-{BN_R_INVALID_RANGE ,"invalid range"},
-{BN_R_NOT_A_SQUARE ,"not a square"},
-{BN_R_NOT_IMPLEMENTED ,"not implemented"},
-{BN_R_NOT_INITIALIZED ,"not initialized"},
-{BN_R_NO_INVERSE ,"no inverse"},
-{BN_R_NO_SOLUTION ,"no solution"},
-{BN_R_P_IS_NOT_PRIME ,"p is not prime"},
-{BN_R_TOO_MANY_ITERATIONS ,"too many iterations"},
-{BN_R_TOO_MANY_TEMPORARY_VARIABLES ,"too many temporary variables"},
+{ERR_REASON(BN_R_ARG2_LT_ARG3) ,"arg2 lt arg3"},
+{ERR_REASON(BN_R_BAD_RECIPROCAL) ,"bad reciprocal"},
+{ERR_REASON(BN_R_BIGNUM_TOO_LONG) ,"bignum too long"},
+{ERR_REASON(BN_R_CALLED_WITH_EVEN_MODULUS),"called with even modulus"},
+{ERR_REASON(BN_R_DIV_BY_ZERO) ,"div by zero"},
+{ERR_REASON(BN_R_ENCODING_ERROR) ,"encoding error"},
+{ERR_REASON(BN_R_EXPAND_ON_STATIC_BIGNUM_DATA),"expand on static bignum data"},
+{ERR_REASON(BN_R_INPUT_NOT_REDUCED) ,"input not reduced"},
+{ERR_REASON(BN_R_INVALID_LENGTH) ,"invalid length"},
+{ERR_REASON(BN_R_INVALID_RANGE) ,"invalid range"},
+{ERR_REASON(BN_R_NOT_A_SQUARE) ,"not a square"},
+{ERR_REASON(BN_R_NOT_INITIALIZED) ,"not initialized"},
+{ERR_REASON(BN_R_NO_INVERSE) ,"no inverse"},
+{ERR_REASON(BN_R_NO_SOLUTION) ,"no solution"},
+{ERR_REASON(BN_R_P_IS_NOT_PRIME) ,"p is not prime"},
+{ERR_REASON(BN_R_TOO_MANY_ITERATIONS) ,"too many iterations"},
+{ERR_REASON(BN_R_TOO_MANY_TEMPORARY_VARIABLES),"too many temporary variables"},
{0,NULL}
};
@@ -127,15 +139,12 @@ static ERR_STRING_DATA BN_str_reasons[]=
void ERR_load_BN_strings(void)
{
- static int init=1;
-
- if (init)
- {
- init=0;
#ifndef OPENSSL_NO_ERR
- ERR_load_strings(ERR_LIB_BN,BN_str_functs);
- ERR_load_strings(ERR_LIB_BN,BN_str_reasons);
-#endif
+ if (ERR_func_error_string(BN_str_functs[0].error) == NULL)
+ {
+ ERR_load_strings(0,BN_str_functs);
+ ERR_load_strings(0,BN_str_reasons);
}
+#endif
}
diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c
index d6bb2b4397..d9b6c737fc 100644
--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -56,7 +56,7 @@
* [including the GNU Public Licence.]
*/
/* ====================================================================
- * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -113,6 +113,7 @@
#include "cryptlib.h"
#include "bn_lcl.h"
+/* maximum precomputation table size for *variable* sliding windows */
#define TABLE_SIZE 32
/* this one works - simple but works */
@@ -121,12 +122,20 @@ int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
int i,bits,ret=0;
BIGNUM *v,*rr;
+ if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
+ {
+ /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+ BNerr(BN_F_BN_EXP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return -1;
+ }
+
BN_CTX_start(ctx);
if ((r == a) || (r == p))
rr = BN_CTX_get(ctx);
else
rr = r;
- if ((v = BN_CTX_get(ctx)) == NULL) goto err;
+ v = BN_CTX_get(ctx);
+ if (rr == NULL || v == NULL) goto err;
if (BN_copy(v,a) == NULL) goto err;
bits=BN_num_bits(p);
@@ -205,7 +214,7 @@ int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
if (BN_is_odd(m))
{
# ifdef MONT_EXP_WORD
- if (a->top == 1 && !a->neg)
+ if (a->top == 1 && !a->neg && (BN_get_flags(p, BN_FLG_CONSTTIME) == 0))
{
BN_ULONG A = a->d[0];
ret=BN_mod_exp_mont_word(r,A,p,m,ctx,NULL);
@@ -237,6 +246,13 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
BIGNUM *val[TABLE_SIZE];
BN_RECP_CTX recp;
+ if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
+ {
+ /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+ BNerr(BN_F_BN_MOD_EXP_RECP,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return -1;
+ }
+
bits=BN_num_bits(p);
if (bits == 0)
@@ -364,6 +380,11 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
BIGNUM *val[TABLE_SIZE];
BN_MONT_CTX *mont=NULL;
+ if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
+ {
+ return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
+ }
+
bn_check_top(a);
bn_check_top(p);
bn_check_top(m);
@@ -495,6 +516,212 @@ err:
return(ret);
}
+
+/* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
+ * so that accessing any of these table values shows the same access pattern as far
+ * as cache lines are concerned. The following functions are used to transfer a BIGNUM
+ * from/to that table. */
+
+static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+ {
+ size_t i, j;
+
+ if (bn_wexpand(b, top) == NULL)
+ return 0;
+ while (b->top < top)
+ {
+ b->d[b->top++] = 0;
+ }
+
+ for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
+ {
+ buf[j] = ((unsigned char*)b->d)[i];
+ }
+
+ bn_correct_top(b);
+ return 1;
+ }
+
+static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+ {
+ size_t i, j;
+
+ if (bn_wexpand(b, top) == NULL)
+ return 0;
+
+ for (i=0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
+ {
+ ((unsigned char*)b->d)[i] = buf[j];
+ }
+
+ b->top = top;
+ bn_correct_top(b);
+ return 1;
+ }
+
+/* Given a pointer value, compute the next address that is a cache line multiple. */
+#define MOD_EXP_CTIME_ALIGN(x_) \
+ ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
+
+/* This variant of BN_mod_exp_mont() uses fixed windows and the special
+ * precomputation memory layout to limit data-dependency to a minimum
+ * to protect secret exponents (cf. the hyper-threading timing attacks
+ * pointed out by Colin Percival,
+ * http://www.daemonology.net/hyperthreading-considered-harmful/)
+ */
+int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+ const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
+ {
+ int i,bits,ret=0,idx,window,wvalue;
+ int top;
+ BIGNUM *r;
+ const BIGNUM *aa;
+ BN_MONT_CTX *mont=NULL;
+
+ int numPowers;
+ unsigned char *powerbufFree=NULL;
+ int powerbufLen = 0;
+ unsigned char *powerbuf=NULL;
+ BIGNUM *computeTemp=NULL, *am=NULL;
+
+ bn_check_top(a);
+ bn_check_top(p);
+ bn_check_top(m);
+
+ top = m->top;
+
+ if (!(m->d[0] & 1))
+ {
+ BNerr(BN_F_BN_MOD_EXP_MONT_CONSTTIME,BN_R_CALLED_WITH_EVEN_MODULUS);
+ return(0);
+ }
+ bits=BN_num_bits(p);
+ if (bits == 0)
+ {
+ ret = BN_one(rr);
+ return ret;
+ }
+
+ /* Initialize BIGNUM context and allocate intermediate result */
+ BN_CTX_start(ctx);
+ r = BN_CTX_get(ctx);
+ if (r == NULL) goto err;
+
+ /* Allocate a montgomery context if it was not supplied by the caller.
+ * If this is not done, things will break in the montgomery part.
+ */
+ if (in_mont != NULL)
+ mont=in_mont;
+ else
+ {
+ if ((mont=BN_MONT_CTX_new()) == NULL) goto err;
+ if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
+ }
+
+ /* Get the window size to use with size of p. */
+ window = BN_window_bits_for_ctime_exponent_size(bits);
+
+ /* Allocate a buffer large enough to hold all of the pre-computed
+ * powers of a.
+ */
+ numPowers = 1 << window;
+ powerbufLen = sizeof(m->d[0])*top*numPowers;
+ if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
+ goto err;
+
+ powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
+ memset(powerbuf, 0, powerbufLen);
+
+ /* Initialize the intermediate result. Do this early to save double conversion,
+ * once each for a^0 and intermediate result.
+ */
+ if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
+
+ /* Initialize computeTemp as a^1 with montgomery precalcs */
+ computeTemp = BN_CTX_get(ctx);
+ am = BN_CTX_get(ctx);
+ if (computeTemp==NULL || am==NULL) goto err;
+
+ if (a->neg || BN_ucmp(a,m) >= 0)
+ {
+ if (!BN_mod(am,a,m,ctx))
+ goto err;
+ aa= am;
+ }
+ else
+ aa=a;
+ if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
+ if (!BN_copy(computeTemp, am)) goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
+
+ /* If the window size is greater than 1, then calculate
+ * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
+ * (even powers could instead be computed as (a^(i/2))^2
+ * to use the slight performance advantage of sqr over mul).
+ */
+ if (window > 1)
+ {
+ for (i=2; i<numPowers; i++)
+ {
+ /* Calculate a^i = a^(i-1) * a */
+ if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
+ goto err;
+ if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
+ }
+ }
+
+ /* Adjust the number of bits up to a multiple of the window size.
+ * If the exponent length is not a multiple of the window size, then
+ * this pads the most significant bits with zeros to normalize the
+ * scanning loop to there's no special cases.
+ *
+ * * NOTE: Making the window size a power of two less than the native
+ * * word size ensures that the padded bits won't go past the last
+ * * word in the internal BIGNUM structure. Going past the end will
+ * * still produce the correct result, but causes a different branch
+ * * to be taken in the BN_is_bit_set function.
+ */
+ bits = ((bits+window-1)/window)*window;
+ idx=bits-1; /* The top bit of the window */
+
+ /* Scan the exponent one window at a time starting from the most
+ * significant bits.
+ */
+ while (idx >= 0)
+ {
+ wvalue=0; /* The 'value' of the window */
+
+ /* Scan the window, squaring the result as we go */
+ for (i=0; i<window; i++,idx--)
+ {
+ if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) goto err;
+ wvalue = (wvalue<<1)+BN_is_bit_set(p,idx);
+ }
+
+ /* Fetch the appropriate pre-computed value from the pre-buf */
+ if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
+
+ /* Multiply the result into the intermediate result */
+ if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err;
+ }
+
+ /* Convert the final result from montgomery to standard format */
+ if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
+ ret=1;
+err:
+ if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
+ if (powerbuf!=NULL)
+ {
+ OPENSSL_cleanse(powerbuf,powerbufLen);
+ OPENSSL_free(powerbufFree);
+ }
+ if (am!=NULL) BN_clear(am);
+ if (computeTemp!=NULL) BN_clear(computeTemp);
+ BN_CTX_end(ctx);
+ return(ret);
+ }
+
int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
@@ -519,6 +746,13 @@ int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
#define BN_TO_MONTGOMERY_WORD(r, w, mont) \
(BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
+ if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
+ {
+ /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+ BNerr(BN_F_BN_MOD_EXP_MONT_WORD,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return -1;
+ }
+
bn_check_top(p);
bn_check_top(m);
@@ -648,6 +882,13 @@ int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
/* Table of variables obtained from 'ctx' */
BIGNUM *val[TABLE_SIZE];
+ if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0)
+ {
+ /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+ BNerr(BN_F_BN_MOD_EXP_SIMPLE,ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+ return -1;
+ }
+
bits=BN_num_bits(p);
if (bits == 0)
diff --git a/crypto/bn/bn_gcd.c b/crypto/bn/bn_gcd.c
index 0248753f6d..4a352119ba 100644
--- a/crypto/bn/bn_gcd.c
+++ b/crypto/bn/bn_gcd.c
@@ -203,6 +203,8 @@ err:
/* solves ax == 1 (mod n) */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
+ const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx);
BIGNUM *BN_mod_inverse(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
{
@@ -210,6 +212,11 @@ BIGNUM *BN_mod_inverse(BIGNUM *in,
BIGNUM *ret=NULL;
int sign;
+ if ((BN_get_flags(a, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(n, BN_FLG_CONSTTIME) != 0))
+ {
+ return BN_mod_inverse_no_branch(in, a, n, ctx);
+ }
+
bn_check_top(a);
bn_check_top(n);
@@ -488,7 +495,160 @@ BIGNUM *BN_mod_inverse(BIGNUM *in,
err:
if ((ret == NULL) && (in == NULL)) BN_free(R);
BN_CTX_end(ctx);
- if (ret)
- bn_check_top(ret);
+ bn_check_top(ret);
+ return(ret);
+ }
+
+
+/* BN_mod_inverse_no_branch is a special version of BN_mod_inverse.
+ * It does not contain branches that may leak sensitive information.
+ */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
+ const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
+ {
+ BIGNUM *A,*B,*X,*Y,*M,*D,*T,*R=NULL;
+ BIGNUM local_A, local_B;
+ BIGNUM *pA, *pB;
+ BIGNUM *ret=NULL;
+ int sign;
+
+ bn_check_top(a);
+ bn_check_top(n);
+
+ BN_CTX_start(ctx);
+ A = BN_CTX_get(ctx);
+ B = BN_CTX_get(ctx);
+ X = BN_CTX_get(ctx);
+ D = BN_CTX_get(ctx);
+ M = BN_CTX_get(ctx);
+ Y = BN_CTX_get(ctx);
+ T = BN_CTX_get(ctx);
+ if (T == NULL) goto err;
+
+ if (in == NULL)
+ R=BN_new();
+ else
+ R=in;
+ if (R == NULL) goto err;
+
+ BN_one(X);
+ BN_zero(Y);
+ if (BN_copy(B,a) == NULL) goto err;
+ if (BN_copy(A,n) == NULL) goto err;
+ A->neg = 0;
+
+ if (B->neg || (BN_ucmp(B, A) >= 0))
+ {
+ /* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+ * BN_div_no_branch will be called eventually.
+ */
+ pB = &local_B;
+ BN_with_flags(pB, B, BN_FLG_CONSTTIME);
+ if (!BN_nnmod(B, pB, A, ctx)) goto err;
+ }
+ sign = -1;
+ /* From B = a mod |n|, A = |n| it follows that
+ *
+ * 0 <= B < A,
+ * -sign*X*a == B (mod |n|),
+ * sign*Y*a == A (mod |n|).
+ */
+
+ while (!BN_is_zero(B))
+ {
+ BIGNUM *tmp;
+
+ /*
+ * 0 < B < A,
+ * (*) -sign*X*a == B (mod |n|),
+ * sign*Y*a == A (mod |n|)
+ */
+
+ /* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+ * BN_div_no_branch will be called eventually.
+ */
+ pA = &local_A;
+ BN_with_flags(pA, A, BN_FLG_CONSTTIME);
+
+ /* (D, M) := (A/B, A%B) ... */
+ if (!BN_div(D,M,pA,B,ctx)) goto err;
+
+ /* Now
+ * A = D*B + M;
+ * thus we have
+ * (**) sign*Y*a == D*B + M (mod |n|).
+ */
+
+ tmp=A; /* keep the BIGNUM object, the value does not matter */
+
+ /* (A, B) := (B, A mod B) ... */
+ A=B;
+ B=M;
+ /* ... so we have 0 <= B < A again */
+
+ /* Since the former M is now B and the former B is now A,
+ * (**) translates into
+ * sign*Y*a == D*A + B (mod |n|),
+ * i.e.
+ * sign*Y*a - D*A == B (mod |n|).
+ * Similarly, (*) translates into
+ * -sign*X*a == A (mod |n|).
+ *
+ * Thus,
+ * sign*Y*a + D*sign*X*a == B (mod |n|),
+ * i.e.
+ * sign*(Y + D*X)*a == B (mod |n|).
+ *
+ * So if we set (X, Y, sign) := (Y + D*X, X, -sign), we arrive back at
+ * -sign*X*a == B (mod |n|),
+ * sign*Y*a == A (mod |n|).
+ * Note that X and Y stay non-negative all the time.
+ */
+
+ if (!BN_mul(tmp,D,X,ctx)) goto err;
+ if (!BN_add(tmp,tmp,Y)) goto err;
+
+ M=Y; /* keep the BIGNUM object, the value does not matter */
+ Y=X;
+ X=tmp;
+ sign = -sign;
+ }
+
+ /*
+ * The while loop (Euclid's algorithm) ends when
+ * A == gcd(a,n);
+ * we have
+ * sign*Y*a == A (mod |n|),
+ * where Y is non-negative.
+ */
+
+ if (sign < 0)
+ {
+ if (!BN_sub(Y,n,Y)) goto err;
+ }
+ /* Now Y*a == A (mod |n|). */
+
+ if (BN_is_one(A))
+ {
+ /* Y*a == 1 (mod |n|) */
+ if (!Y->neg && BN_ucmp(Y,n) < 0)
+ {
+ if (!BN_copy(R,Y)) goto err;
+ }
+ else
+ {
+ if (!BN_nnmod(R,Y,n,ctx)) goto err;
+ }
+ }
+ else
+ {
+ BNerr(BN_F_BN_MOD_INVERSE_NO_BRANCH,BN_R_NO_INVERSE);
+ goto err;
+ }
+ ret=R;
+err:
+ if ((ret == NULL) && (in == NULL)) BN_free(R);
+ BN_CTX_end(ctx);
+ bn_check_top(ret);
return(ret);
}
diff --git a/crypto/bn/bn_gf2m.c b/crypto/bn/bn_gf2m.c
index 8a945f043f..f7551dacd9 100644
--- a/crypto/bn/bn_gf2m.c
+++ b/crypto/bn/bn_gf2m.c
@@ -121,74 +121,12 @@ static const BN_ULONG SQR_tb[16] =
SQR_tb[(w) >> 12 & 0xF] << 24 | SQR_tb[(w) >> 8 & 0xF] << 16 | \
SQR_tb[(w) >> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF]
#endif
-#ifdef SIXTEEN_BIT
-#define SQR1(w) \
- SQR_tb[(w) >> 12 & 0xF] << 8 | SQR_tb[(w) >> 8 & 0xF]
-#define SQR0(w) \
- SQR_tb[(w) >> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF]
-#endif
-#ifdef EIGHT_BIT
-#define SQR1(w) \
- SQR_tb[(w) >> 4 & 0xF]
-#define SQR0(w) \
- SQR_tb[(w) & 15]
-#endif
/* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
* result is a polynomial r with degree < 2 * BN_BITS - 1
* The caller MUST ensure that the variables have the right amount
* of space allocated.
*/
-#ifdef EIGHT_BIT
-static void bn_GF2m_mul_1x1(BN_ULONG *r1, BN_ULONG *r0, const BN_ULONG a, const BN_ULONG b)
- {
- register BN_ULONG h, l, s;
- BN_ULONG tab[4], top1b = a >> 7;
- register BN_ULONG a1, a2;
-
- a1 = a & (0x7F); a2 = a1 << 1;
-
- tab[0] = 0; tab[1] = a1; tab[2] = a2; tab[3] = a1^a2;
-
- s = tab[b & 0x3]; l = s;
- s = tab[b >> 2 & 0x3]; l ^= s << 2; h = s >> 6;
- s = tab[b >> 4 & 0x3]; l ^= s << 4; h ^= s >> 4;
- s = tab[b >> 6 ]; l ^= s << 6; h ^= s >> 2;
-
- /* compensate for the top bit of a */
-
- if (top1b & 01) { l ^= b << 7; h ^= b >> 1; }
-
- *r1 = h; *r0 = l;
- }
-#endif
-#ifdef SIXTEEN_BIT
-static void bn_GF2m_mul_1x1(BN_ULONG *r1, BN_ULONG *r0, const BN_ULONG a, const BN_ULONG b)
- {
- register BN_ULONG h, l, s;
- BN_ULONG tab[4], top1b = a >> 15;
- register BN_ULONG a1, a2;
-
- a1 = a & (0x7FFF); a2 = a1 << 1;
-
- tab[0] = 0; tab[1] = a1; tab[2] = a2; tab[3] = a1^a2;
-
- s = tab[b & 0x3]; l = s;
- s = tab[b >> 2 & 0x3]; l ^= s << 2; h = s >> 14;
- s = tab[b >> 4 & 0x3]; l ^= s << 4; h ^= s >> 12;
- s = tab[b >> 6 & 0x3]; l ^= s << 6; h ^= s >> 10;
- s = tab[b >> 8 & 0x3]; l ^= s << 8; h ^= s >> 8;
- s = tab[b >>10 & 0x3]; l ^= s << 10; h ^= s >> 6;
- s = tab[b >>12 & 0x3]; l ^= s << 12; h ^= s >> 4;
- s = tab[b >>14 ]; l ^= s << 14; h ^= s >> 2;
-
- /* compensate for the top bit of a */
-
- if (top1b & 01) { l ^= b << 15; h ^= b >> 1; }
-
- *r1 = h; *r0 = l;
- }
-#endif
#ifdef THIRTY_TWO_BIT
static void bn_GF2m_mul_1x1(BN_ULONG *r1, BN_ULONG *r0, const BN_ULONG a, const BN_ULONG b)
{
@@ -320,7 +258,7 @@ int BN_GF2m_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
/* Performs modular reduction of a and store result in r. r could be a. */
-int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[])
+int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
{
int j, k;
int n, dN, d0, d1;
@@ -384,7 +322,11 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[])
if (zz == 0) break;
d1 = BN_BITS2 - d0;
- if (d0) z[dN] = (z[dN] << d1) >> d1; /* clear up the top d1 bits */
+ /* clear up the top d1 bits */
+ if (d0)
+ z[dN] = (z[dN] << d1) >> d1;
+ else
+ z[dN] = 0;
z[0] ^= zz; /* reduction t^0 component */
for (k = 1; p[k] != 0; k++)
@@ -417,11 +359,11 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[])
int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
{
int ret = 0;
- const int max = BN_num_bits(p);
- unsigned int *arr=NULL;
+ const int max = BN_num_bits(p) + 1;
+ int *arr=NULL;
bn_check_top(a);
bn_check_top(p);
- if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) * max)) == NULL) goto err;
+ if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
ret = BN_GF2m_poly2arr(p, arr, max);
if (!ret || ret > max)
{
@@ -439,7 +381,7 @@ err:
/* Compute the product of two polynomials a and b, reduce modulo p, and store
* the result in r. r could be a or b; a could be b.
*/
-int BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const unsigned int p[], BN_CTX *ctx)
+int BN_GF2m_mod_mul_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const int p[], BN_CTX *ctx)
{
int zlen, i, j, k, ret = 0;
BIGNUM *s;
@@ -495,12 +437,12 @@ err:
int BN_GF2m_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *p, BN_CTX *ctx)
{
int ret = 0;
- const int max = BN_num_bits(p);
- unsigned int *arr=NULL;
+ const int max = BN_num_bits(p) + 1;
+ int *arr=NULL;
bn_check_top(a);
bn_check_top(b);
bn_check_top(p);
- if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) * max)) == NULL) goto err;
+ if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
ret = BN_GF2m_poly2arr(p, arr, max);
if (!ret || ret > max)
{
@@ -516,7 +458,7 @@ err:
/* Square a, reduce the result mod p, and store it in a. r could be a. */
-int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[], BN_CTX *ctx)
+int BN_GF2m_mod_sqr_arr(BIGNUM *r, const BIGNUM *a, const int p[], BN_CTX *ctx)
{
int i, ret = 0;
BIGNUM *s;
@@ -551,12 +493,12 @@ err:
int BN_GF2m_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
{
int ret = 0;
- const int max = BN_num_bits(p);
- unsigned int *arr=NULL;
+ const int max = BN_num_bits(p) + 1;
+ int *arr=NULL;
bn_check_top(a);
bn_check_top(p);
- if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) * max)) == NULL) goto err;
+ if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
ret = BN_GF2m_poly2arr(p, arr, max);
if (!ret || ret > max)
{
@@ -638,7 +580,7 @@ err:
* function is only provided for convenience; for best performance, use the
* BN_GF2m_mod_inv function.
*/
-int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *xx, const unsigned int p[], BN_CTX *ctx)
+int BN_GF2m_mod_inv_arr(BIGNUM *r, const BIGNUM *xx, const int p[], BN_CTX *ctx)
{
BIGNUM *field;
int ret = 0;
@@ -764,7 +706,7 @@ err:
* function is only provided for convenience; for best performance, use the
* BN_GF2m_mod_div function.
*/
-int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *yy, const BIGNUM *xx, const unsigned int p[], BN_CTX *ctx)
+int BN_GF2m_mod_div_arr(BIGNUM *r, const BIGNUM *yy, const BIGNUM *xx, const int p[], BN_CTX *ctx)
{
BIGNUM *field;
int ret = 0;
@@ -789,7 +731,7 @@ err:
* the result in r. r could be a.
* Uses simple square-and-multiply algorithm A.5.1 from IEEE P1363.
*/
-int BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const unsigned int p[], BN_CTX *ctx)
+int BN_GF2m_mod_exp_arr(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const int p[], BN_CTX *ctx)
{
int ret = 0, i, n;
BIGNUM *u;
@@ -835,12 +777,12 @@ err:
int BN_GF2m_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *p, BN_CTX *ctx)
{
int ret = 0;
- const int max = BN_num_bits(p);
- unsigned int *arr=NULL;
+ const int max = BN_num_bits(p) + 1;
+ int *arr=NULL;
bn_check_top(a);
bn_check_top(b);
bn_check_top(p);
- if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) * max)) == NULL) goto err;
+ if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
ret = BN_GF2m_poly2arr(p, arr, max);
if (!ret || ret > max)
{
@@ -858,7 +800,7 @@ err:
* the result in r. r could be a.
* Uses exponentiation as in algorithm A.4.1 from IEEE P1363.
*/
-int BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a, const unsigned int p[], BN_CTX *ctx)
+int BN_GF2m_mod_sqrt_arr(BIGNUM *r, const BIGNUM *a, const int p[], BN_CTX *ctx)
{
int ret = 0;
BIGNUM *u;
@@ -894,15 +836,15 @@ err:
int BN_GF2m_mod_sqrt(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
{
int ret = 0;
- const int max = BN_num_bits(p);
- unsigned int *arr=NULL;
+ const int max = BN_num_bits(p) + 1;
+ int *arr=NULL;
bn_check_top(a);
bn_check_top(p);
- if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) * max)) == NULL) goto err;
+ if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
ret = BN_GF2m_poly2arr(p, arr, max);
if (!ret || ret > max)
{
- BNerr(BN_F_BN_GF2M_MOD_EXP,BN_R_INVALID_LENGTH);
+ BNerr(BN_F_BN_GF2M_MOD_SQRT,BN_R_INVALID_LENGTH);
goto err;
}
ret = BN_GF2m_mod_sqrt_arr(r, a, arr, ctx);
@@ -915,10 +857,9 @@ err:
/* Find r such that r^2 + r = a mod p. r could be a. If no r exists returns 0.
* Uses algorithms A.4.7 and A.4.6 from IEEE P1363.
*/
-int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a_, const unsigned int p[], BN_CTX *ctx)
+int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a_, const int p[], BN_CTX *ctx)
{
- int ret = 0, count = 0;
- unsigned int j;
+ int ret = 0, count = 0, j;
BIGNUM *a, *z, *rho, *w, *w2, *tmp;
bn_check_top(a_);
@@ -1013,11 +954,11 @@ err:
int BN_GF2m_mod_solve_quad(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
{
int ret = 0;
- const int max = BN_num_bits(p);
- unsigned int *arr=NULL;
+ const int max = BN_num_bits(p) + 1;
+ int *arr=NULL;
bn_check_top(a);
bn_check_top(p);
- if ((arr = (unsigned int *)OPENSSL_malloc(sizeof(unsigned int) *
+ if ((arr = (int *)OPENSSL_malloc(sizeof(int) *
max)) == NULL) goto err;
ret = BN_GF2m_poly2arr(p, arr, max);
if (!ret || ret > max)
@@ -1033,20 +974,17 @@ err:
}
/* Convert the bit-string representation of a polynomial
- * ( \sum_{i=0}^n a_i * x^i , where a_0 is *not* zero) into an array
- * of integers corresponding to the bits with non-zero coefficient.
+ * ( \sum_{i=0}^n a_i * x^i) into an array of integers corresponding
+ * to the bits with non-zero coefficient. Array is terminated with -1.
* Up to max elements of the array will be filled. Return value is total
- * number of coefficients that would be extracted if array was large enough.
+ * number of array elements that would be filled if array was large enough.
*/
-int BN_GF2m_poly2arr(const BIGNUM *a, unsigned int p[], int max)
+int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max)
{
int i, j, k = 0;
BN_ULONG mask;
- if (BN_is_zero(a) || !BN_is_bit_set(a, 0))
- /* a_0 == 0 => return error (the unsigned int array
- * must be terminated by 0)
- */
+ if (BN_is_zero(a))
return 0;
for (i = a->top - 1; i >= 0; i--)
@@ -1066,23 +1004,28 @@ int BN_GF2m_poly2arr(const BIGNUM *a, unsigned int p[], int max)
}
}
+ if (k < max) {
+ p[k] = -1;
+ k++;
+ }
+
return k;
}
/* Convert the coefficient array representation of a polynomial to a
- * bit-string. The array must be terminated by 0.
+ * bit-string. The array must be terminated by -1.
*/
-int BN_GF2m_arr2poly(const unsigned int p[], BIGNUM *a)
+int BN_GF2m_arr2poly(const int p[], BIGNUM *a)
{
int i;
bn_check_top(a);
BN_zero(a);
- for (i = 0; p[i] != 0; i++)
+ for (i = 0; p[i] != -1; i++)
{
- BN_set_bit(a, p[i]);
+ if (BN_set_bit(a, p[i]) == 0)
+ return 0;
}
- BN_set_bit(a, 0);
bn_check_top(a);
return 1;
diff --git a/crypto/bn/bn_lcl.h b/crypto/bn/bn_lcl.h
index 45e19221aa..d7dff0d90c 100644
--- a/crypto/bn/bn_lcl.h
+++ b/crypto/bn/bn_lcl.h
@@ -163,6 +163,45 @@ extern "C" {
+/* BN_mod_exp_mont_conttime is based on the assumption that the
+ * L1 data cache line width of the target processor is at least
+ * the following value.
+ */
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH ( 64 )
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
+
+/* Window sizes optimized for fixed window size modular exponentiation
+ * algorithm (BN_mod_exp_mont_consttime).
+ *
+ * To achieve the security goals of BN_mode_exp_mont_consttime, the
+ * maximum size of the window must not exceed
+ * log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH).
+ *
+ * Window size thresholds are defined for cache line sizes of 32 and 64,
+ * cache line sizes where log_2(32)=5 and log_2(64)=6 respectively. A
+ * window size of 7 should only be used on processors that have a 128
+ * byte or greater cache line size.
+ */
+#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
+
+# define BN_window_bits_for_ctime_exponent_size(b) \
+ ((b) > 937 ? 6 : \
+ (b) > 306 ? 5 : \
+ (b) > 89 ? 4 : \
+ (b) > 22 ? 3 : 1)
+# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)
+
+#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
+
+# define BN_window_bits_for_ctime_exponent_size(b) \
+ ((b) > 306 ? 5 : \
+ (b) > 89 ? 4 : \
+ (b) > 22 ? 3 : 1)
+# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)
+
+#endif
+
+
/* Pentium pro 16,16,16,32,64 */
/* Alpha 16,16,16,16.64 */
#define BN_MULL_SIZE_NORMAL (16) /* 32 */
@@ -199,7 +238,7 @@ extern "C" {
# if defined(__DECC)
# include <c_asm.h>
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
-# elif defined(__GNUC__)
+# elif defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %1,%2,%0" \
@@ -208,7 +247,7 @@ extern "C" {
ret; })
# endif /* compiler */
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
-# if defined(__GNUC__)
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("mulhdu %0,%1,%2" \
@@ -216,8 +255,9 @@ extern "C" {
: "r"(a), "r"(b)); \
ret; })
# endif /* compiler */
-# elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
-# if defined(__GNUC__)
+# elif (defined(__x86_64) || defined(__x86_64__)) && \
+ (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
+# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret,discard; \
asm ("mulq %3" \
@@ -231,6 +271,28 @@ extern "C" {
: "a"(a),"g"(b) \
: "cc");
# endif
+# elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
+# if defined(_MSC_VER) && _MSC_VER>=1400
+ unsigned __int64 __umulh (unsigned __int64 a,unsigned __int64 b);
+ unsigned __int64 _umul128 (unsigned __int64 a,unsigned __int64 b,
+ unsigned __int64 *h);
+# pragma intrinsic(__umulh,_umul128)
+# define BN_UMULT_HIGH(a,b) __umulh((a),(b))
+# define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
+# endif
+# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+# if defined(__GNUC__) && __GNUC__>=2
+# define BN_UMULT_HIGH(a,b) ({ \
+ register BN_ULONG ret; \
+ asm ("dmultu %1,%2" \
+ : "=h"(ret) \
+ : "r"(a), "r"(b) : "l"); \
+ ret; })
+# define BN_UMULT_LOHI(low,high,a,b) \
+ asm ("dmultu %2,%3" \
+ : "=l"(low),"=h"(high) \
+ : "r"(a), "r"(b));
+# endif
# endif /* cpu */
#endif /* OPENSSL_NO_ASM */
@@ -274,6 +336,33 @@ extern "C" {
(r1)=Hw(t); \
}
+#elif defined(BN_UMULT_LOHI)
+#define mul_add(r,a,w,c) { \
+ BN_ULONG high,low,ret,tmp=(a); \
+ ret = (r); \
+ BN_UMULT_LOHI(low,high,w,tmp); \
+ ret += (c); \
+ (c) = (ret<(c))?1:0; \
+ (c) += high; \
+ ret += low; \
+ (c) += (ret<low)?1:0; \
+ (r) = ret; \
+ }
+
+#define mul(r,a,w,c) { \
+ BN_ULONG high,low,ret,ta=(a); \
+ BN_UMULT_LOHI(low,high,w,ta); \
+ ret = low + (c); \
+ (c) = high; \
+ (c) += (ret<low)?1:0; \
+ (r) = ret; \
+ }
+
+#define sqr(r0,r1,a) { \
+ BN_ULONG tmp=(a); \
+ BN_UMULT_LOHI(r0,r1,tmp,tmp); \
+ }
+
#elif defined(BN_UMULT_HIGH)
#define mul_add(r,a,w,c) { \
BN_ULONG high,low,ret,tmp=(a); \
@@ -406,6 +495,7 @@ BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
int cl, int dl);
BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
int cl, int dl);
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
#ifdef __cplusplus
}
diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c
index bbefd80309..5470fbe6ef 100644
--- a/crypto/bn/bn_lib.c
+++ b/crypto/bn/bn_lib.c
@@ -67,7 +67,7 @@
#include "cryptlib.h"
#include "bn_lcl.h"
-const char *BN_version="Big Number" OPENSSL_VERSION_PTEXT;
+const char BN_version[]="Big Number" OPENSSL_VERSION_PTEXT;
/* This stuff appears to be completely unused, so is deprecated */
#ifndef OPENSSL_NO_DEPRECATED
@@ -133,8 +133,8 @@ int BN_get_params(int which)
const BIGNUM *BN_value_one(void)
{
- static BN_ULONG data_one=1L;
- static BIGNUM const_one={&data_one,1,1,0,BN_FLG_STATIC_DATA};
+ static const BN_ULONG data_one=1L;
+ static const BIGNUM const_one={(BN_ULONG *)&data_one,1,1,0,BN_FLG_STATIC_DATA};
return(&const_one);
}
@@ -160,7 +160,7 @@ char *BN_options(void)
int BN_num_bits_word(BN_ULONG l)
{
- static const char bits[256]={
+ static const unsigned char bits[256]={
0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
@@ -235,7 +235,7 @@ int BN_num_bits_word(BN_ULONG l)
else
#endif
{
-#if defined(SIXTEEN_BIT) || defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
+#if defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
if (l & 0xff00L)
return(bits[(int)(l>>8)]+8);
else
@@ -531,46 +531,6 @@ BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b)
return(a);
}
-BIGNUM *BN_ncopy(BIGNUM *a, const BIGNUM *b, size_t n)
- {
- int i, min;
- BN_ULONG *A;
- const BN_ULONG *B;
-
- bn_check_top(b);
- if (a == b)
- return a;
-
- min = (b->top < (int)n)? b->top: (int)n;
- if (!min)
- {
- BN_zero(a);
- return a;
- }
- if (bn_wexpand(a, min) == NULL)
- return NULL;
-
- A=a->d;
- B=b->d;
- for (i=min>>2; i>0; i--, A+=4, B+=4)
- {
- BN_ULONG a0,a1,a2,a3;
- a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
- A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
- }
- switch (min&3)
- {
- case 3: A[2]=B[2];
- case 2: A[1]=B[1];
- case 1: A[0]=B[0];
- case 0: ;
- }
- a->top = min;
- a->neg = b->neg;
- bn_correct_top(a);
- return(a);
- }
-
void BN_swap(BIGNUM *a, BIGNUM *b)
{
int flags_old_a, flags_old_b;
@@ -803,7 +763,7 @@ int BN_is_bit_set(const BIGNUM *a, int n)
i=n/BN_BITS2;
j=n%BN_BITS2;
if (a->top <= i) return 0;
- return((a->d[i]&(((BN_ULONG)1)<<j))?1:0);
+ return (int)(((a->d[i])>>j)&((BN_ULONG)1));
}
int BN_mask_bits(BIGNUM *a, int n)
@@ -827,6 +787,14 @@ int BN_mask_bits(BIGNUM *a, int n)
return(1);
}
+void BN_set_negative(BIGNUM *a, int b)
+ {
+ if (b && !BN_is_zero(a))
+ a->neg = 1;
+ else
+ a->neg = 0;
+ }
+
int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n)
{
int i;
diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c
index 61416483cb..7224637ab3 100644
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@@ -55,6 +55,59 @@
* copied and put under another distribution licence
* [including the GNU Public Licence.]
*/
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
/*
* Details about Montgomery multiplication algorithms can be found at
@@ -69,11 +122,30 @@
#define MONT_WORD /* use the faster word-based algorithm */
+#ifdef MONT_WORD
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
+#endif
+
int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
BN_MONT_CTX *mont, BN_CTX *ctx)
{
BIGNUM *tmp;
int ret=0;
+#if defined(OPENSSL_BN_ASM_MONT) && defined(MONT_WORD)
+ int num = mont->N.top;
+
+ if (num>1 && a->top==num && b->top==num)
+ {
+ if (bn_wexpand(r,num) == NULL) return(0);
+ if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num))
+ {
+ r->neg = a->neg^b->neg;
+ r->top = num;
+ bn_correct_top(r);
+ return(1);
+ }
+ }
+#endif
BN_CTX_start(ctx);
tmp = BN_CTX_get(ctx);
@@ -89,7 +161,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
if (!BN_mul(tmp,a,b,ctx)) goto err;
}
/* reduce from aRR to aR */
+#ifdef MONT_WORD
+ if (!BN_from_montgomery_word(r,tmp,mont)) goto err;
+#else
if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err;
+#endif
bn_check_top(r);
ret=1;
err:
@@ -97,35 +173,25 @@ err:
return(ret);
}
-int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
- BN_CTX *ctx)
- {
- int retn=0;
-
#ifdef MONT_WORD
- BIGNUM *n,*r;
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
+ {
+ BIGNUM *n;
BN_ULONG *ap,*np,*rp,n0,v,*nrp;
int al,nl,max,i,x,ri;
- BN_CTX_start(ctx);
- if ((r = BN_CTX_get(ctx)) == NULL) goto err;
-
- if (!BN_copy(r,a)) goto err;
n= &(mont->N);
-
- ap=a->d;
/* mont->ri is the size of mont->N in bits (rounded up
to the word size) */
al=ri=mont->ri/BN_BITS2;
-
+
nl=n->top;
- if ((al == 0) || (nl == 0)) { r->top=0; return(1); }
+ if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
max=(nl+al+1); /* allow for overflow (no?) XXX */
- if (bn_wexpand(r,max) == NULL) goto err;
- if (bn_wexpand(ret,max) == NULL) goto err;
+ if (bn_wexpand(r,max) == NULL) return(0);
- r->neg=a->neg^n->neg;
+ r->neg^=n->neg;
np=n->d;
rp=r->d;
nrp= &(r->d[nl]);
@@ -139,10 +205,10 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
#endif
r->top=max;
- n0=mont->n0;
+ n0=mont->n0[0];
#ifdef BN_COUNT
- fprintf(stderr,"word BN_from_montgomery %d * %d\n",nl,nl);
+ fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
#endif
for (i=0; i<nl; i++)
{
@@ -174,20 +240,72 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
}
}
bn_correct_top(r);
-
- /* mont->ri will be a multiple of the word size */
-#if 0
- BN_rshift(ret,r,mont->ri);
-#else
- ret->neg = r->neg;
- x=ri;
+
+ /* mont->ri will be a multiple of the word size and below code
+ * is kind of BN_rshift(ret,r,mont->ri) equivalent */
+ if (r->top <= ri)
+ {
+ ret->top=0;
+ return(1);
+ }
+ al=r->top-ri;
+
+#define BRANCH_FREE 1
+#if BRANCH_FREE
+ if (bn_wexpand(ret,ri) == NULL) return(0);
+ x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
+ ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */
+ ret->neg=r->neg;
+
rp=ret->d;
- ap= &(r->d[x]);
- if (r->top < x)
- al=0;
- else
- al=r->top-x;
+ ap=&(r->d[ri]);
+
+ {
+ size_t m1,m2;
+
+ v=bn_sub_words(rp,ap,np,ri);
+ /* this ----------------^^ works even in al<ri case
+ * thanks to zealous zeroing of top of the vector in the
+ * beginning. */
+
+ /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
+ /* in other words if subtraction result is real, then
+ * trick unconditional memcpy below to perform in-place
+ * "refresh" instead of actual copy. */
+ m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al<ri */
+ m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1); /* al>ri */
+ m1|=m2; /* (al!=ri) */
+ m1|=(0-(size_t)v); /* (al!=ri || v) */
+ m1&=~m2; /* (al!=ri || v) && !al>ri */
+ nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
+ }
+
+ /* 'i<ri' is chosen to eliminate dependency on input data, even
+ * though it results in redundant copy in al<ri case. */
+ for (i=0,ri-=4; i<ri; i+=4)
+ {
+ BN_ULONG t1,t2,t3,t4;
+
+ t1=nrp[i+0];
+ t2=nrp[i+1];
+ t3=nrp[i+2]; ap[i+0]=0;
+ t4=nrp[i+3]; ap[i+1]=0;
+ rp[i+0]=t1; ap[i+2]=0;
+ rp[i+1]=t2; ap[i+3]=0;
+ rp[i+2]=t3;
+ rp[i+3]=t4;
+ }
+ for (ri+=4; i<ri; i++)
+ rp[i]=nrp[i], ap[i]=0;
+ bn_correct_top(r);
+ bn_correct_top(ret);
+#else
+ if (bn_wexpand(ret,al) == NULL) return(0);
ret->top=al;
+ ret->neg=r->neg;
+
+ rp=ret->d;
+ ap=&(r->d[ri]);
al-=4;
for (i=0; i<al; i+=4)
{
@@ -205,8 +323,30 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
al+=4;
for (; i<al; i++)
rp[i]=ap[i];
+
+ if (BN_ucmp(ret, &(mont->N)) >= 0)
+ {
+ if (!BN_usub(ret,ret,&(mont->N))) return(0);
+ }
#endif
-#else /* !MONT_WORD */
+ bn_check_top(ret);
+
+ return(1);
+ }
+#endif /* MONT_WORD */
+
+int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
+ BN_CTX *ctx)
+ {
+ int retn=0;
+#ifdef MONT_WORD
+ BIGNUM *t;
+
+ BN_CTX_start(ctx);
+ if ((t = BN_CTX_get(ctx)) && BN_copy(t,a))
+ retn = BN_from_montgomery_word(ret,t,mont);
+ BN_CTX_end(ctx);
+#else /* !MONT_WORD */
BIGNUM *t1,*t2;
BN_CTX_start(ctx);
@@ -223,7 +363,6 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
if (!BN_mul(t1,t2,&mont->N,ctx)) goto err;
if (!BN_add(t2,a,t1)) goto err;
if (!BN_rshift(ret,t2,mont->ri)) goto err;
-#endif /* MONT_WORD */
if (BN_ucmp(ret, &(mont->N)) >= 0)
{
@@ -233,6 +372,7 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
bn_check_top(ret);
err:
BN_CTX_end(ctx);
+#endif /* MONT_WORD */
return(retn);
}
@@ -254,6 +394,7 @@ void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
BN_init(&(ctx->RR));
BN_init(&(ctx->N));
BN_init(&(ctx->Ni));
+ ctx->n0[0] = ctx->n0[1] = 0;
ctx->flags=0;
}
@@ -285,16 +426,55 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
BIGNUM tmod;
BN_ULONG buf[2];
+ BN_init(&tmod);
+ tmod.d=buf;
+ tmod.dmax=2;
+ tmod.neg=0;
+
mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2;
+
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+ /* Only certain BN_BITS2<=32 platforms actually make use of
+ * n0[1], and we could use the #else case (with a shorter R
+ * value) for the others. However, currently only the assembler
+ * files do know which is which. */
+
+ BN_zero(R);
+ if (!(BN_set_bit(R,2*BN_BITS2))) goto err;
+
+ tmod.top=0;
+ if ((buf[0] = mod->d[0])) tmod.top=1;
+ if ((buf[1] = mod->top>1 ? mod->d[1] : 0)) tmod.top=2;
+
+ if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
+ goto err;
+ if (!BN_lshift(Ri,Ri,2*BN_BITS2)) goto err; /* R*Ri */
+ if (!BN_is_zero(Ri))
+ {
+ if (!BN_sub_word(Ri,1)) goto err;
+ }
+ else /* if N mod word size == 1 */
+ {
+ if (bn_expand(Ri,(int)sizeof(BN_ULONG)*2) == NULL)
+ goto err;
+ /* Ri-- (mod double word size) */
+ Ri->neg=0;
+ Ri->d[0]=BN_MASK2;
+ Ri->d[1]=BN_MASK2;
+ Ri->top=2;
+ }
+ if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
+ /* Ni = (R*Ri-1)/N,
+ * keep only couple of least significant words: */
+ mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+ mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
+#else
BN_zero(R);
if (!(BN_set_bit(R,BN_BITS2))) goto err; /* R */
buf[0]=mod->d[0]; /* tmod = N mod word size */
buf[1]=0;
- tmod.d=buf;
- tmod.top=1;
- tmod.dmax=2;
- tmod.neg=0;
+ tmod.top = buf[0] != 0 ? 1 : 0;
/* Ri = R^-1 mod N*/
if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
goto err;
@@ -310,7 +490,9 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
/* Ni = (R*Ri-1)/N,
* keep only least significant word: */
- mont->n0 = (Ri->top > 0) ? Ri->d[0] : 0;
+ mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+ mont->n0[1] = 0;
+#endif
}
#else /* !MONT_WORD */
{ /* bignum version */
@@ -346,7 +528,40 @@ BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
if (!BN_copy(&(to->N),&(from->N))) return NULL;
if (!BN_copy(&(to->Ni),&(from->Ni))) return NULL;
to->ri=from->ri;
- to->n0=from->n0;
+ to->n0[0]=from->n0[0];
+ to->n0[1]=from->n0[1];
return(to);
}
+BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
+ const BIGNUM *mod, BN_CTX *ctx)
+ {
+ int got_write_lock = 0;
+ BN_MONT_CTX *ret;
+
+ CRYPTO_r_lock(lock);
+ if (!*pmont)
+ {
+ CRYPTO_r_unlock(lock);
+ CRYPTO_w_lock(lock);
+ got_write_lock = 1;
+
+ if (!*pmont)
+ {
+ ret = BN_MONT_CTX_new();
+ if (ret && !BN_MONT_CTX_set(ret, mod, ctx))
+ BN_MONT_CTX_free(ret);
+ else
+ *pmont = ret;
+ }
+ }
+
+ ret = *pmont;
+
+ if (got_write_lock)
+ CRYPTO_w_unlock(lock);
+ else
+ CRYPTO_r_unlock(lock);
+
+ return ret;
+ }
diff --git a/crypto/bn/bn_mul.c b/crypto/bn/bn_mul.c
index aec1eafc65..a0e9ec3b46 100644
--- a/crypto/bn/bn_mul.c
+++ b/crypto/bn/bn_mul.c
@@ -389,6 +389,7 @@ BN_ULONG bn_add_part_words(BN_ULONG *r,
* a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
* a[1]*b[1]
*/
+/* dnX may not be positive, but n2/2+dnX has to be */
void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
int dna, int dnb, BN_ULONG *t)
{
@@ -398,7 +399,7 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
BN_ULONG ln,lo,*p;
# ifdef BN_COUNT
- fprintf(stderr," bn_mul_recursive %d * %d\n",n2,n2);
+ fprintf(stderr," bn_mul_recursive %d%+d * %d%+d\n",n2,dna,n2,dnb);
# endif
# ifdef BN_MUL_COMBA
# if 0
@@ -545,6 +546,7 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
/* n+tn is the word length
* t needs to be n*4 is size, as does r */
+/* tnX may not be negative but less than n */
void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
int tna, int tnb, BN_ULONG *t)
{
@@ -553,8 +555,8 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
BN_ULONG ln,lo,*p;
# ifdef BN_COUNT
- fprintf(stderr," bn_mul_part_recursive (%d+%d) * (%d+%d)\n",
- tna, n, tnb, n);
+ fprintf(stderr," bn_mul_part_recursive (%d%+d) * (%d%+d)\n",
+ n, tna, n, tnb);
# endif
if (n < 8)
{
@@ -655,14 +657,17 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
for (;;)
{
i/=2;
- if (i < tna && i < tnb)
+ /* these simplified conditions work
+ * exclusively because difference
+ * between tna and tnb is 1 or 0 */
+ if (i < tna || i < tnb)
{
bn_mul_part_recursive(&(r[n2]),
&(a[n]),&(b[n]),
i,tna-i,tnb-i,p);
break;
}
- else if (i <= tna && i <= tnb)
+ else if (i == tna || i == tnb)
{
bn_mul_recursive(&(r[n2]),
&(a[n]),&(b[n]),
@@ -1023,17 +1028,19 @@ int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
assert(j <= al || j <= bl);
k = j+j;
t = BN_CTX_get(ctx);
+ if (t == NULL)
+ goto err;
if (al > j || bl > j)
{
- bn_wexpand(t,k*4);
- bn_wexpand(rr,k*4);
+ if (bn_wexpand(t,k*4) == NULL) goto err;
+ if (bn_wexpand(rr,k*4) == NULL) goto err;
bn_mul_part_recursive(rr->d,a->d,b->d,
j,al-j,bl-j,t->d);
}
else /* al <= j || bl <= j */
{
- bn_wexpand(t,k*2);
- bn_wexpand(rr,k*2);
+ if (bn_wexpand(t,k*2) == NULL) goto err;
+ if (bn_wexpand(rr,k*2) == NULL) goto err;
bn_mul_recursive(rr->d,a->d,b->d,
j,al-j,bl-j,t->d);
}
diff --git a/crypto/bn/bn_nist.c b/crypto/bn/bn_nist.c
index bbe2cbe749..2ca5b01391 100644
--- a/crypto/bn/bn_nist.c
+++ b/crypto/bn/bn_nist.c
@@ -1,6 +1,9 @@
/* crypto/bn/bn_nist.c */
+/*
+ * Written by Nils Larsch for the OpenSSL project
+ */
/* ====================================================================
- * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved.
+ * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -56,795 +59,778 @@
#include "bn_lcl.h"
#include "cryptlib.h"
+
#define BN_NIST_192_TOP (192+BN_BITS2-1)/BN_BITS2
#define BN_NIST_224_TOP (224+BN_BITS2-1)/BN_BITS2
#define BN_NIST_256_TOP (256+BN_BITS2-1)/BN_BITS2
#define BN_NIST_384_TOP (384+BN_BITS2-1)/BN_BITS2
#define BN_NIST_521_TOP (521+BN_BITS2-1)/BN_BITS2
+/* pre-computed tables are "carry-less" values of modulus*(i+1) */
#if BN_BITS2 == 64
-const static BN_ULONG _nist_p_192[] =
- {0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFEULL,
- 0xFFFFFFFFFFFFFFFFULL};
-const static BN_ULONG _nist_p_224[] =
+static const BN_ULONG _nist_p_192[][BN_NIST_192_TOP] = {
+ {0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFFULL},
+ {0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFDULL,0xFFFFFFFFFFFFFFFFULL},
+ {0xFFFFFFFFFFFFFFFDULL,0xFFFFFFFFFFFFFFFCULL,0xFFFFFFFFFFFFFFFFULL}
+ };
+static const BN_ULONG _nist_p_192_sqr[] = {
+ 0x0000000000000001ULL,0x0000000000000002ULL,0x0000000000000001ULL,
+ 0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFDULL,0xFFFFFFFFFFFFFFFFULL
+ };
+static const BN_ULONG _nist_p_224[][BN_NIST_224_TOP] = {
{0x0000000000000001ULL,0xFFFFFFFF00000000ULL,
- 0xFFFFFFFFFFFFFFFFULL,0x00000000FFFFFFFFULL};
-const static BN_ULONG _nist_p_256[] =
+ 0xFFFFFFFFFFFFFFFFULL,0x00000000FFFFFFFFULL},
+ {0x0000000000000002ULL,0xFFFFFFFE00000000ULL,
+ 0xFFFFFFFFFFFFFFFFULL,0x00000001FFFFFFFFULL} /* this one is "carry-full" */
+ };
+static const BN_ULONG _nist_p_224_sqr[] = {
+ 0x0000000000000001ULL,0xFFFFFFFE00000000ULL,
+ 0xFFFFFFFFFFFFFFFFULL,0x0000000200000000ULL,
+ 0x0000000000000000ULL,0xFFFFFFFFFFFFFFFEULL,
+ 0xFFFFFFFFFFFFFFFFULL
+ };
+static const BN_ULONG _nist_p_256[][BN_NIST_256_TOP] = {
{0xFFFFFFFFFFFFFFFFULL,0x00000000FFFFFFFFULL,
- 0x0000000000000000ULL,0xFFFFFFFF00000001ULL};
-const static BN_ULONG _nist_p_384[] =
- {0x00000000FFFFFFFFULL,0xFFFFFFFF00000000ULL,
- 0xFFFFFFFFFFFFFFFEULL,0xFFFFFFFFFFFFFFFFULL,
- 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL};
-const static BN_ULONG _nist_p_521[] =
+ 0x0000000000000000ULL,0xFFFFFFFF00000001ULL},
+ {0xFFFFFFFFFFFFFFFEULL,0x00000001FFFFFFFFULL,
+ 0x0000000000000000ULL,0xFFFFFFFE00000002ULL},
+ {0xFFFFFFFFFFFFFFFDULL,0x00000002FFFFFFFFULL,
+ 0x0000000000000000ULL,0xFFFFFFFD00000003ULL},
+ {0xFFFFFFFFFFFFFFFCULL,0x00000003FFFFFFFFULL,
+ 0x0000000000000000ULL,0xFFFFFFFC00000004ULL},
+ {0xFFFFFFFFFFFFFFFBULL,0x00000004FFFFFFFFULL,
+ 0x0000000000000000ULL,0xFFFFFFFB00000005ULL},
+ };
+static const BN_ULONG _nist_p_256_sqr[] = {
+ 0x0000000000000001ULL,0xFFFFFFFE00000000ULL,
+ 0xFFFFFFFFFFFFFFFFULL,0x00000001FFFFFFFEULL,
+ 0x00000001FFFFFFFEULL,0x00000001FFFFFFFEULL,
+ 0xFFFFFFFE00000001ULL,0xFFFFFFFE00000002ULL
+ };
+static const BN_ULONG _nist_p_384[][BN_NIST_384_TOP] = {
+ {0x00000000FFFFFFFFULL,0xFFFFFFFF00000000ULL,0xFFFFFFFFFFFFFFFEULL,
+ 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
+ {0x00000001FFFFFFFEULL,0xFFFFFFFE00000000ULL,0xFFFFFFFFFFFFFFFDULL,
+ 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
+ {0x00000002FFFFFFFDULL,0xFFFFFFFD00000000ULL,0xFFFFFFFFFFFFFFFCULL,
+ 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
+ {0x00000003FFFFFFFCULL,0xFFFFFFFC00000000ULL,0xFFFFFFFFFFFFFFFBULL,
+ 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
+ {0x00000004FFFFFFFBULL,0xFFFFFFFB00000000ULL,0xFFFFFFFFFFFFFFFAULL,
+ 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL},
+ };
+static const BN_ULONG _nist_p_384_sqr[] = {
+ 0xFFFFFFFE00000001ULL,0x0000000200000000ULL,0xFFFFFFFE00000000ULL,
+ 0x0000000200000000ULL,0x0000000000000001ULL,0x0000000000000000ULL,
+ 0x00000001FFFFFFFEULL,0xFFFFFFFE00000000ULL,0xFFFFFFFFFFFFFFFDULL,
+ 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL
+ };
+static const BN_ULONG _nist_p_521[] =
{0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
0x00000000000001FFULL};
+static const BN_ULONG _nist_p_521_sqr[] = {
+ 0x0000000000000001ULL,0x0000000000000000ULL,0x0000000000000000ULL,
+ 0x0000000000000000ULL,0x0000000000000000ULL,0x0000000000000000ULL,
+ 0x0000000000000000ULL,0x0000000000000000ULL,0xFFFFFFFFFFFFFC00ULL,
+ 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
+ 0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,0xFFFFFFFFFFFFFFFFULL,
+ 0xFFFFFFFFFFFFFFFFULL,0x000000000003FFFFULL
+ };
#elif BN_BITS2 == 32
-const static BN_ULONG _nist_p_192[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFE,
- 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
-const static BN_ULONG _nist_p_224[] = {0x00000001,0x00000000,0x00000000,
- 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
-const static BN_ULONG _nist_p_256[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
- 0x00000000,0x00000000,0x00000000,0x00000001,0xFFFFFFFF};
-const static BN_ULONG _nist_p_384[] = {0xFFFFFFFF,0x00000000,0x00000000,
- 0xFFFFFFFF,0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
- 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
-const static BN_ULONG _nist_p_521[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
+static const BN_ULONG _nist_p_192[][BN_NIST_192_TOP] = {
+ {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+ {0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+ {0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFC,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}
+ };
+static const BN_ULONG _nist_p_192_sqr[] = {
+ 0x00000001,0x00000000,0x00000002,0x00000000,0x00000001,0x00000000,
+ 0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF
+ };
+static const BN_ULONG _nist_p_224[][BN_NIST_224_TOP] = {
+ {0x00000001,0x00000000,0x00000000,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+ {0x00000002,0x00000000,0x00000000,0xFFFFFFFE,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF}
+ };
+static const BN_ULONG _nist_p_224_sqr[] = {
+ 0x00000001,0x00000000,0x00000000,0xFFFFFFFE,
+ 0xFFFFFFFF,0xFFFFFFFF,0x00000000,0x00000002,
+ 0x00000000,0x00000000,0xFFFFFFFE,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF
+ };
+static const BN_ULONG _nist_p_256[][BN_NIST_256_TOP] = {
+ {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0x00000000,
+ 0x00000000,0x00000000,0x00000001,0xFFFFFFFF},
+ {0xFFFFFFFE,0xFFFFFFFF,0xFFFFFFFF,0x00000001,
+ 0x00000000,0x00000000,0x00000002,0xFFFFFFFE},
+ {0xFFFFFFFD,0xFFFFFFFF,0xFFFFFFFF,0x00000002,
+ 0x00000000,0x00000000,0x00000003,0xFFFFFFFD},
+ {0xFFFFFFFC,0xFFFFFFFF,0xFFFFFFFF,0x00000003,
+ 0x00000000,0x00000000,0x00000004,0xFFFFFFFC},
+ {0xFFFFFFFB,0xFFFFFFFF,0xFFFFFFFF,0x00000004,
+ 0x00000000,0x00000000,0x00000005,0xFFFFFFFB},
+ };
+static const BN_ULONG _nist_p_256_sqr[] = {
+ 0x00000001,0x00000000,0x00000000,0xFFFFFFFE,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFE,0x00000001,
+ 0xFFFFFFFE,0x00000001,0xFFFFFFFE,0x00000001,
+ 0x00000001,0xFFFFFFFE,0x00000002,0xFFFFFFFE
+ };
+static const BN_ULONG _nist_p_384[][BN_NIST_384_TOP] = {
+ {0xFFFFFFFF,0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFE,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+ {0xFFFFFFFE,0x00000001,0x00000000,0xFFFFFFFE,0xFFFFFFFD,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+ {0xFFFFFFFD,0x00000002,0x00000000,0xFFFFFFFD,0xFFFFFFFC,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+ {0xFFFFFFFC,0x00000003,0x00000000,0xFFFFFFFC,0xFFFFFFFB,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+ {0xFFFFFFFB,0x00000004,0x00000000,0xFFFFFFFB,0xFFFFFFFA,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF},
+ };
+static const BN_ULONG _nist_p_384_sqr[] = {
+ 0x00000001,0xFFFFFFFE,0x00000000,0x00000002,0x00000000,0xFFFFFFFE,
+ 0x00000000,0x00000002,0x00000001,0x00000000,0x00000000,0x00000000,
+ 0xFFFFFFFE,0x00000001,0x00000000,0xFFFFFFFE,0xFFFFFFFD,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF
+ };
+static const BN_ULONG _nist_p_521[] = {0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
0xFFFFFFFF,0x000001FF};
-#elif BN_BITS2 == 16
-const static BN_ULONG _nist_p_192[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFE,
- 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF};
-const static BN_ULONG _nist_p_224[] = {0x0001,0x0000,0x0000,0x0000,0x0000,
- 0x0000,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF};
-const static BN_ULONG _nist_p_256[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
- 0xFFFF,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0000,0xFFFF,
- 0xFFFF};
-const static BN_ULONG _nist_p_384[] = {0xFFFF,0xFFFF,0x0000,0x0000,0x0000,
- 0x0000,0xFFFF,0xFFFF,0xFFFE,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
- 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF};
-const static BN_ULONG _nist_p_521[] = {0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
- 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
- 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
- 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0x01FF};
-#elif BN_BITS2 == 8
-const static BN_ULONG _nist_p_192[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFE,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFF,0xFF};
-const static BN_ULONG _nist_p_224[] = {0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
-const static BN_ULONG _nist_p_256[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x01,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF};
-const static BN_ULONG _nist_p_384[] = {0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,
- 0x00,0x00,0x00,0x00,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
-const static BN_ULONG _nist_p_521[] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
- 0xFF,0x01};
+static const BN_ULONG _nist_p_521_sqr[] = {
+ 0x00000001,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,
+ 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,
+ 0x00000000,0x00000000,0x00000000,0x00000000,0xFFFFFC00,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
+ 0xFFFFFFFF,0xFFFFFFFF,0x0003FFFF
+ };
+#else
+#error "unsupported BN_BITS2"
#endif
+
+static const BIGNUM _bignum_nist_p_192 =
+ {
+ (BN_ULONG *)_nist_p_192[0],
+ BN_NIST_192_TOP,
+ BN_NIST_192_TOP,
+ 0,
+ BN_FLG_STATIC_DATA
+ };
+
+static const BIGNUM _bignum_nist_p_224 =
+ {
+ (BN_ULONG *)_nist_p_224[0],
+ BN_NIST_224_TOP,
+ BN_NIST_224_TOP,
+ 0,
+ BN_FLG_STATIC_DATA
+ };
+
+static const BIGNUM _bignum_nist_p_256 =
+ {
+ (BN_ULONG *)_nist_p_256[0],
+ BN_NIST_256_TOP,
+ BN_NIST_256_TOP,
+ 0,
+ BN_FLG_STATIC_DATA
+ };
+
+static const BIGNUM _bignum_nist_p_384 =
+ {
+ (BN_ULONG *)_nist_p_384[0],
+ BN_NIST_384_TOP,
+ BN_NIST_384_TOP,
+ 0,
+ BN_FLG_STATIC_DATA
+ };
+
+static const BIGNUM _bignum_nist_p_521 =
+ {
+ (BN_ULONG *)_nist_p_521,
+ BN_NIST_521_TOP,
+ BN_NIST_521_TOP,
+ 0,
+ BN_FLG_STATIC_DATA
+ };
+
+
const BIGNUM *BN_get0_nist_prime_192(void)
{
- static BIGNUM const_nist_192 = { (BN_ULONG *)_nist_p_192,
- BN_NIST_192_TOP, BN_NIST_192_TOP, 0, BN_FLG_STATIC_DATA };
- return &const_nist_192;
+ return &_bignum_nist_p_192;
}
const BIGNUM *BN_get0_nist_prime_224(void)
{
- static BIGNUM const_nist_224 = { (BN_ULONG *)_nist_p_224,
- BN_NIST_224_TOP, BN_NIST_224_TOP, 0, BN_FLG_STATIC_DATA };
- return &const_nist_224;
+ return &_bignum_nist_p_224;
}
const BIGNUM *BN_get0_nist_prime_256(void)
{
- static BIGNUM const_nist_256 = { (BN_ULONG *)_nist_p_256,
- BN_NIST_256_TOP, BN_NIST_256_TOP, 0, BN_FLG_STATIC_DATA };
- return &const_nist_256;
+ return &_bignum_nist_p_256;
}
const BIGNUM *BN_get0_nist_prime_384(void)
{
- static BIGNUM const_nist_384 = { (BN_ULONG *)_nist_p_384,
- BN_NIST_384_TOP, BN_NIST_384_TOP, 0, BN_FLG_STATIC_DATA };
- return &const_nist_384;
+ return &_bignum_nist_p_384;
}
const BIGNUM *BN_get0_nist_prime_521(void)
{
- static BIGNUM const_nist_521 = { (BN_ULONG *)_nist_p_521,
- BN_NIST_521_TOP, BN_NIST_521_TOP, 0, BN_FLG_STATIC_DATA };
- return &const_nist_521;
+ return &_bignum_nist_p_521;
}
-/* some misc internal functions */
-static BN_ULONG _256_data[BN_NIST_256_TOP*6];
-static int _is_set_256_data = 0;
-static void _init_256_data(void);
-static BN_ULONG _384_data[BN_NIST_384_TOP*8];
-static int _is_set_384_data = 0;
-static void _init_384_data(void);
+static void nist_cp_bn_0(BN_ULONG *buf, BN_ULONG *a, int top, int max)
+ {
+ int i;
+ BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
+
+#ifdef BN_DEBUG
+ OPENSSL_assert(top <= max);
+#endif
+ for (i = (top); i != 0; i--)
+ *_tmp1++ = *_tmp2++;
+ for (i = (max) - (top); i != 0; i--)
+ *_tmp1++ = (BN_ULONG) 0;
+ }
+
+static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
+ {
+ int i;
+ BN_ULONG *_tmp1 = (buf), *_tmp2 = (a);
+ for (i = (top); i != 0; i--)
+ *_tmp1++ = *_tmp2++;
+ }
-#define BN_NIST_ADD_ONE(a) while (!(++(*(a)))) ++(a);
-#define __buf_0 (BN_ULONG)0
-#define __buf_0_1 (BN_ULONG)0
-#define __buf_0_2 (BN_ULONG)0
#if BN_BITS2 == 64
-#define BN_64_BIT_BUF(n) BN_ULONG __buf_##n = (BN_ULONG)0;
-#define BN_CP_64_TO_BUF(n) __buf_##n = (a)[(n)];
-#define BN_CP_64_FROM_BUF(a,n) *(a)++ = __buf_##n;
-#define BN_CASE_64_BIT(n,a) case (n): __buf_##n = (a)[(n)];
-#if UINT_MAX == 4294967295UL
-#define nist32 unsigned int
-#define BN_32_BIT_BUF(n) nist32 __buf_##n = (nist32)0;
-#define BN_CP_32_TO_BUF(n) __buf_##n = ((nist32 *)(a))[(n)];
-#define BN_CP_32_FROM_BUF(a,n) *((nist32)(a))++ = __buf_##n;
-#define BN_CASE_32_BIT(n,a) case (n): __buf_##n = ((nist32)(a))[(n)];
-#elif ULONG_MAX == 4294967295UL
-#define nist32 unsigned long
-#define BN_32_BIT_BUF(n) nist32 __buf_##n = (nist32)0;
-#define BN_CP_32_TO_BUF(n) __buf_##n = ((nist32 *)(a))[(n)];
-#define BN_CP_32_FROM_BUF(a,n) *((nist32)(a))++ = __buf_##n;
-#define BN_CASE_32_BIT(n,a) case (n): __buf_##n = ((nist32)(a))[(n)];
+#define bn_cp_64(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0;
+#define bn_64_set_0(to, n) (to)[n] = (BN_ULONG)0;
+/*
+ * two following macros are implemented under assumption that they
+ * are called in a sequence with *ascending* n, i.e. as they are...
+ */
+#define bn_cp_32_naked(to, n, from, m) (((n)&1)?(to[(n)/2]|=((m)&1)?(from[(m)/2]&BN_MASK2h):(from[(m)/2]<<32))\
+ :(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l)))
+#define bn_32_set_0(to, n) (((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0));
+#define bn_cp_32(to,n,from,m) ((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n)
#else
-#define NO_32_BIT_TYPE
-#endif
-#elif BN_BITS2 == 32
-#define BN_64_BIT_BUF(n) BN_ULONG __buf_##n##_1 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_2 = (BN_ULONG)0;
-#define BN_CP_64_TO_BUF(n) __buf_##n##_2 = (a)[2*(n)+1];\
- __buf_##n##_1 = (a)[2*(n)];
-#define BN_CP_64_FROM_BUF(a,n) *(a)++ = __buf_##n##_1;\
- *(a)++ = __buf_##n##_2;
-#define BN_CASE_64_BIT(n,a) case 2*(n)+1: __buf_##n##_2 = (a)[2*(n)+1];\
- case 2*(n): __buf_##n##_1 = (a)[2*(n)];
-
-#define BN_32_BIT_BUF(n) BN_ULONG __buf_##n = (BN_ULONG)0;
-#define BN_CP_32_TO_BUF(n) __buf_##n = (a)[(n)];
-#define BN_CP_32_FROM_BUF(a,n) *(a)++ = __buf_##n;
-#define BN_CASE_32_BIT(n,a) case (n): __buf_##n = (a)[(n)];
-#elif BN_BITS2 == 16
-#define __buf_0_3 (BN_ULONG)0
-#define __buf_0_4 (BN_ULONG)0
-#define BN_64_BIT_BUF(n) BN_ULONG __buf_##n##_1 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_2 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_3 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_4 = (BN_ULONG)0;
-#define BN_CP_64_TO_BUF(n) __buf_##n##_4 = (a)[4*(n)+3];\
- __buf_##n##_3 = (a)[4*(n)+2];\
- __buf_##n##_2 = (a)[4*(n)+1];\
- __buf_##n##_1 = (a)[4*(n)];
-#define BN_CP_64_FROM_BUF(a,n) *(a)++ = __buf_##n##_1;\
- *(a)++ = __buf_##n##_2;\
- *(a)++ = __buf_##n##_3;\
- *(a)++ = __buf_##n##_4;
-#define BN_CASE_64_BIT(n,a) case 4*(n)+3: __buf_##n##_4 = (a)[4*(n)+3];\
- case 4*(n)+2: __buf_##n##_3 = (a)[4*(n)+2];\
- case 4*(n)+1: __buf_##n##_2 = (a)[4*(n)+1];\
- case 4*(n): __buf_##n##_1 = (a)[4*(n)];
-#define BN_32_BIT_BUF(n) BN_ULONG __buf_##n##_1 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_2 = (BN_ULONG)0;
-#define BN_CP_32_TO_BUF(n) __buf_##n##_1 = (a)[2*(n)];\
- __buf_##n##_2 = (a)[2*(n)+1];
-#define BN_CP_32_FROM_BUF(a,n) *(a)++ = __buf_##n##_1;\
- *(a)++ = __buf_##n##_2;
-#define BN_CASE_32_BIT(n,a) case 2*(n)+1: __buf_##n##_2 = (a)[2*(n)+1];\
- case 2*(n): __buf_##n##_1 = (a)[2*(n)];
-#elif BN_BITS2 == 8
-#define __buf_0_3 (BN_ULONG)0
-#define __buf_0_4 (BN_ULONG)0
-#define __buf_0_5 (BN_ULONG)0
-#define __buf_0_6 (BN_ULONG)0
-#define __buf_0_7 (BN_ULONG)0
-#define __buf_0_8 (BN_ULONG)0
-#define BN_64_BIT_BUF(n) BN_ULONG __buf_##n##_1 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_2 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_3 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_4 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_5 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_6 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_7 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_8 = (BN_ULONG)0;
-#define BN_CP_64_TO_BUF(n) __buf_##n##_8 = (a)[8*(n)+7];\
- __buf_##n##_7 = (a)[8*(n)+6];\
- __buf_##n##_6 = (a)[8*(n)+5];\
- __buf_##n##_5 = (a)[8*(n)+4];\
- __buf_##n##_4 = (a)[8*(n)+3];\
- __buf_##n##_3 = (a)[8*(n)+2];\
- __buf_##n##_2 = (a)[8*(n)+1];\
- __buf_##n##_1 = (a)[8*(n)];
-#define BN_CP_64_FROM_BUF(a,n) *(a)++ = __buf_##n##_1;\
- *(a)++ = __buf_##n##_2;\
- *(a)++ = __buf_##n##_3;\
- *(a)++ = __buf_##n##_4;\
- *(a)++ = __buf_##n##_5;\
- *(a)++ = __buf_##n##_6;\
- *(a)++ = __buf_##n##_7;\
- *(a)++ = __buf_##n##_8;
-#define BN_CASE_64_BIT(n,a) case 8*(n)+7: __buf_##n##_8 = (a)[8*(n)+7];\
- case 8*(n)+6: __buf_##n##_7 = (a)[8*(n)+6];\
- case 8*(n)+5: __buf_##n##_6 = (a)[8*(n)+5];\
- case 8*(n)+4: __buf_##n##_5 = (a)[8*(n)+4];\
- case 8*(n)+3: __buf_##n##_4 = (a)[8*(n)+3];\
- case 8*(n)+2: __buf_##n##_3 = (a)[8*(n)+2];\
- case 8*(n)+1: __buf_##n##_2 = (a)[8*(n)+1];\
- case 8*(n): __buf_##n##_1 = (a)[8*(n)];
-#define BN_32_BIT_BUF(n) BN_ULONG __buf_##n##_1 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_2 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_3 = (BN_ULONG)0;\
- BN_ULONG __buf_##n##_4 = (BN_ULONG)0;
-#define BN_CP_32_TO_BUF(n) __buf_##n##_1 = (a)[4*(n)];\
- __buf_##n##_2 = (a)[4*(n)+1];\
- __buf_##n##_3 = (a)[4*(n)+2];\
- __buf_##n##_4 = (a)[4*(n)+3];
-#define BN_CP_32_FROM_BUF(a,n) *(a)++ = __buf_##n##_1;\
- *(a)++ = __buf_##n##_2;\
- *(a)++ = __buf_##n##_3;\
- *(a)++ = __buf_##n##_4;
-#define BN_CASE_32_BIT(n,a) case 4*(n)+3: __buf_##n##_4 = (a)[4*(n)+3];\
- case 4*(n)+2: __buf_##n##_3 = (a)[4*(n)+2];\
- case 4*(n)+1: __buf_##n##_2 = (a)[4*(n)+1];\
- case 4*(n): __buf_##n##_1 = (a)[4*(n)];
+#define bn_cp_64(to, n, from, m) \
+ { \
+ bn_cp_32(to, (n)*2, from, (m)*2); \
+ bn_cp_32(to, (n)*2+1, from, (m)*2+1); \
+ }
+#define bn_64_set_0(to, n) \
+ { \
+ bn_32_set_0(to, (n)*2); \
+ bn_32_set_0(to, (n)*2+1); \
+ }
+#if BN_BITS2 == 32
+#define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0;
+#define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0;
#endif
+#endif /* BN_BITS2 != 64 */
-#define BN_192_SET(d,a1,a2,a3) \
- {\
- register BN_ULONG *td = (d);\
- BN_CP_64_FROM_BUF(td,a3); BN_CP_64_FROM_BUF(td,a2);\
- BN_CP_64_FROM_BUF(td,a1);\
+#define nist_set_192(to, from, a1, a2, a3) \
+ { \
+ bn_cp_64(to, 0, from, (a3) - 3) \
+ bn_cp_64(to, 1, from, (a2) - 3) \
+ bn_cp_64(to, 2, from, (a1) - 3) \
}
int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
BN_CTX *ctx)
{
- int top;
- BN_ULONG carry = 0;
- register BN_ULONG *r_d, *a_d;
- BN_ULONG t_d[BN_NIST_192_TOP];
- BN_64_BIT_BUF(3) BN_64_BIT_BUF(4)
- BN_64_BIT_BUF(5)
-
- top = BN_ucmp(field, a);
- if (top == 0)
+ int top = a->top, i;
+ int carry;
+ register BN_ULONG *r_d, *a_d = a->d;
+ BN_ULONG t_d[BN_NIST_192_TOP],
+ buf[BN_NIST_192_TOP],
+ c_d[BN_NIST_192_TOP],
+ *res;
+ size_t mask;
+ static const BIGNUM _bignum_nist_p_192_sqr = {
+ (BN_ULONG *)_nist_p_192_sqr,
+ sizeof(_nist_p_192_sqr)/sizeof(_nist_p_192_sqr[0]),
+ sizeof(_nist_p_192_sqr)/sizeof(_nist_p_192_sqr[0]),
+ 0,BN_FLG_STATIC_DATA };
+
+ field = &_bignum_nist_p_192; /* just to make sure */
+
+ if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_192_sqr)>=0)
+ return BN_nnmod(r, a, field, ctx);
+
+ i = BN_ucmp(field, a);
+ if (i == 0)
{
BN_zero(r);
return 1;
}
- else if (top > 0)
- return (r == a)? 1 : (BN_copy(r ,a) != NULL);
+ else if (i > 0)
+ return (r == a) ? 1 : (BN_copy(r ,a) != NULL);
if (r != a)
- if (!BN_ncopy(r, a, BN_NIST_192_TOP))
- return 0;
-
- r_d = r->d;
- a_d = a->d;
- top = a->top-1;
-
- switch (top)
{
- BN_CASE_64_BIT(5, a_d)
- BN_CASE_64_BIT(4, a_d)
- BN_CASE_64_BIT(3, a_d)
- break;
- default: /* a->top == field->top */
- return BN_usub(r, a, field);
+ if (!bn_wexpand(r, BN_NIST_192_TOP))
+ return 0;
+ r_d = r->d;
+ nist_cp_bn(r_d, a_d, BN_NIST_192_TOP);
}
+ else
+ r_d = a_d;
- BN_192_SET(t_d,0,3,3)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP))
- ++carry;
-
- BN_192_SET(t_d,4,4,0)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP))
- ++carry;
+ nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
- BN_192_SET(t_d,5,5,5)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP))
- ++carry;
+ nist_set_192(t_d, buf, 0, 3, 3);
+ carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
+ nist_set_192(t_d, buf, 4, 4, 0);
+ carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
+ nist_set_192(t_d, buf, 5, 5, 5)
+ carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
- while (carry)
- {
- if (bn_sub_words(r_d, r_d, _nist_p_192, BN_NIST_192_TOP))
- --carry;
- }
+ if (carry > 0)
+ carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP);
+ else
+ carry = 1;
+
+ /*
+ * we need 'if (carry==0 || result>=modulus) result-=modulus;'
+ * as comparison implies subtraction, we can write
+ * 'tmp=result-modulus; if (!carry || !borrow) result=tmp;'
+ * this is what happens below, but without explicit if:-) a.
+ */
+ mask = 0-(size_t)bn_sub_words(c_d,r_d,_nist_p_192[0],BN_NIST_192_TOP);
+ mask &= 0-(size_t)carry;
+ res = (BN_ULONG *)(((size_t)c_d&~mask) | ((size_t)r_d&mask));
+ nist_cp_bn(r_d, res, BN_NIST_192_TOP);
r->top = BN_NIST_192_TOP;
bn_correct_top(r);
- if (BN_ucmp(r, field) >= 0)
- {
- bn_sub_words(r_d, r_d, _nist_p_192, BN_NIST_192_TOP);
- bn_correct_top(r);
- }
- bn_check_top(r);
return 1;
}
-#define BN_224_SET(d,a1,a2,a3,a4,a5,a6,a7) \
- {\
- register BN_ULONG *td = (d);\
- BN_CP_32_FROM_BUF(td,a7); BN_CP_32_FROM_BUF(td,a6);\
- BN_CP_32_FROM_BUF(td,a5); BN_CP_32_FROM_BUF(td,a4);\
- BN_CP_32_FROM_BUF(td,a3); BN_CP_32_FROM_BUF(td,a2);\
- BN_CP_32_FROM_BUF(td,a1);\
+typedef BN_ULONG (*bn_addsub_f)(BN_ULONG *,const BN_ULONG *,const BN_ULONG *,int);
+
+#define nist_set_224(to, from, a1, a2, a3, a4, a5, a6, a7) \
+ { \
+ bn_cp_32(to, 0, from, (a7) - 7) \
+ bn_cp_32(to, 1, from, (a6) - 7) \
+ bn_cp_32(to, 2, from, (a5) - 7) \
+ bn_cp_32(to, 3, from, (a4) - 7) \
+ bn_cp_32(to, 4, from, (a3) - 7) \
+ bn_cp_32(to, 5, from, (a2) - 7) \
+ bn_cp_32(to, 6, from, (a1) - 7) \
}
int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
BN_CTX *ctx)
{
-#ifndef NO_32_BIT_TYPE
- int tmp_int;
- int carry = 0;
- BN_ULONG *r_d, *a_d;
- BN_ULONG t_d[BN_NIST_224_TOP];
- BN_32_BIT_BUF(7) BN_32_BIT_BUF(8)
- BN_32_BIT_BUF(9) BN_32_BIT_BUF(10)
- BN_32_BIT_BUF(11) BN_32_BIT_BUF(12)
- BN_32_BIT_BUF(13)
-
- tmp_int = BN_ucmp(field, a);
- if (tmp_int == 0)
+ int top = a->top, i;
+ int carry;
+ BN_ULONG *r_d, *a_d = a->d;
+ BN_ULONG t_d[BN_NIST_224_TOP],
+ buf[BN_NIST_224_TOP],
+ c_d[BN_NIST_224_TOP],
+ *res;
+ size_t mask;
+ union { bn_addsub_f f; size_t p; } u;
+ static const BIGNUM _bignum_nist_p_224_sqr = {
+ (BN_ULONG *)_nist_p_224_sqr,
+ sizeof(_nist_p_224_sqr)/sizeof(_nist_p_224_sqr[0]),
+ sizeof(_nist_p_224_sqr)/sizeof(_nist_p_224_sqr[0]),
+ 0,BN_FLG_STATIC_DATA };
+
+
+ field = &_bignum_nist_p_224; /* just to make sure */
+
+ if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_224_sqr)>=0)
+ return BN_nnmod(r, a, field, ctx);
+
+ i = BN_ucmp(field, a);
+ if (i == 0)
{
BN_zero(r);
return 1;
}
- else if (tmp_int > 0)
+ else if (i > 0)
return (r == a)? 1 : (BN_copy(r ,a) != NULL);
if (r != a)
- if (!BN_ncopy(r, a, BN_NIST_224_TOP))
- return 0;
-
- r_d = r->d;
- a_d = a->d;
-
- tmp_int = a->top-1;
-
- switch (tmp_int)
{
- BN_CASE_32_BIT(13, a_d)
- BN_CASE_32_BIT(12, a_d)
- BN_CASE_32_BIT(11, a_d)
- BN_CASE_32_BIT(10, a_d)
- BN_CASE_32_BIT(9, a_d)
- BN_CASE_32_BIT(8, a_d)
- BN_CASE_32_BIT(7, a_d)
- break;
- default: /* a->top == field->top */
- return BN_usub(r, a, field);
+ if (!bn_wexpand(r, BN_NIST_224_TOP))
+ return 0;
+ r_d = r->d;
+ nist_cp_bn(r_d, a_d, BN_NIST_224_TOP);
}
-
- BN_224_SET(t_d,10,9,8,7,0,0,0)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP))
- ++carry;
- BN_224_SET(t_d,0,13,12,11,0,0,0)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP))
- ++carry;
- BN_224_SET(t_d,13,12,11,10,9,8,7)
- if (bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP))
- --carry;
- BN_224_SET(t_d,0,0,0,0,13,12,11)
- if (bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP))
- --carry;
-
+ else
+ r_d = a_d;
+
+#if BN_BITS2==64
+ /* copy upper 256 bits of 448 bit number ... */
+ nist_cp_bn_0(t_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
+ /* ... and right shift by 32 to obtain upper 224 bits */
+ nist_set_224(buf, t_d, 14, 13, 12, 11, 10, 9, 8);
+ /* truncate lower part to 224 bits too */
+ r_d[BN_NIST_224_TOP-1] &= BN_MASK2l;
+#else
+ nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
+#endif
+ nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0);
+ carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
+ nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0);
+ carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
+ nist_set_224(t_d, buf, 13, 12, 11, 10, 9, 8, 7);
+ carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
+ nist_set_224(t_d, buf, 0, 0, 0, 0, 13, 12, 11);
+ carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_224_TOP);
+
+#if BN_BITS2==64
+ carry = (int)(r_d[BN_NIST_224_TOP-1]>>32);
+#endif
+ u.f = bn_sub_words;
if (carry > 0)
- while (carry)
- {
- if (bn_sub_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP))
- --carry;
- }
- else if (carry < 0)
- while (carry)
- {
- if (bn_add_words(r_d,r_d,_nist_p_224,BN_NIST_224_TOP))
- ++carry;
- }
-
- r->top = BN_NIST_224_TOP;
- bn_correct_top(r);
- if (BN_ucmp(r, field) >= 0)
{
- bn_sub_words(r_d, r_d, _nist_p_224, BN_NIST_224_TOP);
- bn_correct_top(r);
- }
- bn_check_top(r);
- return 1;
-#else
- return 0;
+ carry = (int)bn_sub_words(r_d,r_d,_nist_p_224[carry-1],BN_NIST_224_TOP);
+#if BN_BITS2==64
+ carry=(int)(~(r_d[BN_NIST_224_TOP-1]>>32))&1;
#endif
- }
-
-static void _init_256_data(void)
- {
- int i;
- BN_ULONG *tmp1 = _256_data;
- const BN_ULONG *tmp2 = tmp1;
-
- memcpy(tmp1, _nist_p_256, BN_NIST_256_TOP * sizeof(BN_ULONG));
- tmp1 += BN_NIST_256_TOP;
-
- for (i=0; i<5; i++)
+ }
+ else if (carry < 0)
{
- bn_add_words(tmp1, _nist_p_256, tmp2, BN_NIST_256_TOP);
- tmp2 = tmp1;
- tmp1 += BN_NIST_256_TOP;
+ /* it's a bit more comlicated logic in this case.
+ * if bn_add_words yields no carry, then result
+ * has to be adjusted by unconditionally *adding*
+ * the modulus. but if it does, then result has
+ * to be compared to the modulus and conditionally
+ * adjusted by *subtracting* the latter. */
+ carry = (int)bn_add_words(r_d,r_d,_nist_p_224[-carry-1],BN_NIST_224_TOP);
+ mask = 0-(size_t)carry;
+ u.p = ((size_t)bn_sub_words&mask) | ((size_t)bn_add_words&~mask);
}
- _is_set_256_data = 1;
+ else
+ carry = 1;
+
+ /* otherwise it's effectively same as in BN_nist_mod_192... */
+ mask = 0-(size_t)(*u.f)(c_d,r_d,_nist_p_224[0],BN_NIST_224_TOP);
+ mask &= 0-(size_t)carry;
+ res = (BN_ULONG *)(((size_t)c_d&~mask) | ((size_t)r_d&mask));
+ nist_cp_bn(r_d, res, BN_NIST_224_TOP);
+ r->top = BN_NIST_224_TOP;
+ bn_correct_top(r);
+
+ return 1;
}
-#define BN_256_SET(d,a1,a2,a3,a4,a5,a6,a7,a8) \
- {\
- register BN_ULONG *td = (d);\
- BN_CP_32_FROM_BUF(td,a8); BN_CP_32_FROM_BUF(td,a7);\
- BN_CP_32_FROM_BUF(td,a6); BN_CP_32_FROM_BUF(td,a5);\
- BN_CP_32_FROM_BUF(td,a4); BN_CP_32_FROM_BUF(td,a3);\
- BN_CP_32_FROM_BUF(td,a2); BN_CP_32_FROM_BUF(td,a1);\
+#define nist_set_256(to, from, a1, a2, a3, a4, a5, a6, a7, a8) \
+ { \
+ bn_cp_32(to, 0, from, (a8) - 8) \
+ bn_cp_32(to, 1, from, (a7) - 8) \
+ bn_cp_32(to, 2, from, (a6) - 8) \
+ bn_cp_32(to, 3, from, (a5) - 8) \
+ bn_cp_32(to, 4, from, (a4) - 8) \
+ bn_cp_32(to, 5, from, (a3) - 8) \
+ bn_cp_32(to, 6, from, (a2) - 8) \
+ bn_cp_32(to, 7, from, (a1) - 8) \
}
int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
BN_CTX *ctx)
{
-#ifndef NO_32_BIT_TYPE
- int tmp_int;
+ int i, top = a->top;
int carry = 0;
- register BN_ULONG *a_d, *r_d;
- BN_ULONG t_d[BN_NIST_256_TOP];
- BN_ULONG t_d2[BN_NIST_256_TOP];
- BN_32_BIT_BUF(8) BN_32_BIT_BUF(9)
- BN_32_BIT_BUF(10) BN_32_BIT_BUF(11)
- BN_32_BIT_BUF(12) BN_32_BIT_BUF(13)
- BN_32_BIT_BUF(14) BN_32_BIT_BUF(15)
-
- if (!_is_set_256_data)
- {
- CRYPTO_w_lock(CRYPTO_LOCK_BN);
-
- if (!_is_set_256_data)
- _init_256_data();
-
- CRYPTO_w_unlock(CRYPTO_LOCK_BN);
- }
-
- tmp_int = BN_ucmp(field, a);
- if (tmp_int == 0)
+ register BN_ULONG *a_d = a->d, *r_d;
+ BN_ULONG t_d[BN_NIST_256_TOP],
+ buf[BN_NIST_256_TOP],
+ c_d[BN_NIST_256_TOP],
+ *res;
+ size_t mask;
+ union { bn_addsub_f f; size_t p; } u;
+ static const BIGNUM _bignum_nist_p_256_sqr = {
+ (BN_ULONG *)_nist_p_256_sqr,
+ sizeof(_nist_p_256_sqr)/sizeof(_nist_p_256_sqr[0]),
+ sizeof(_nist_p_256_sqr)/sizeof(_nist_p_256_sqr[0]),
+ 0,BN_FLG_STATIC_DATA };
+
+ field = &_bignum_nist_p_256; /* just to make sure */
+
+ if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_256_sqr)>=0)
+ return BN_nnmod(r, a, field, ctx);
+
+ i = BN_ucmp(field, a);
+ if (i == 0)
{
BN_zero(r);
return 1;
}
- else if (tmp_int > 0)
+ else if (i > 0)
return (r == a)? 1 : (BN_copy(r ,a) != NULL);
if (r != a)
- if (!BN_ncopy(r, a, BN_NIST_256_TOP))
- return 0;
-
- tmp_int = a->top-1;
-
- a_d = a->d;
- r_d = r->d;
- switch (tmp_int)
{
- BN_CASE_32_BIT(15, a_d)
- BN_CASE_32_BIT(14, a_d)
- BN_CASE_32_BIT(13, a_d)
- BN_CASE_32_BIT(12, a_d)
- BN_CASE_32_BIT(11, a_d)
- BN_CASE_32_BIT(10, a_d)
- BN_CASE_32_BIT(9, a_d)
- BN_CASE_32_BIT(8, a_d)
- break;
- default: /* a->top == field->top */
- return BN_usub(r, a, field);
+ if (!bn_wexpand(r, BN_NIST_256_TOP))
+ return 0;
+ r_d = r->d;
+ nist_cp_bn(r_d, a_d, BN_NIST_256_TOP);
}
+ else
+ r_d = a_d;
+
+ nist_cp_bn_0(buf, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
/*S1*/
- BN_256_SET(t_d,15,14,13,12,11,0,0,0)
+ nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0);
/*S2*/
- BN_256_SET(t_d2,0,15,14,13,12,0,0,0)
- if (bn_add_words(t_d, t_d, t_d2, BN_NIST_256_TOP))
- carry = 2;
+ nist_set_256(c_d, buf, 0, 15, 14, 13, 12, 0, 0, 0);
+ carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
/* left shift */
{
register BN_ULONG *ap,t,c;
ap = t_d;
c=0;
- for (tmp_int=BN_NIST_256_TOP; tmp_int != 0; --tmp_int)
+ for (i = BN_NIST_256_TOP; i != 0; --i)
{
t= *ap;
*(ap++)=((t<<1)|c)&BN_MASK2;
c=(t & BN_TBIT)?1:0;
}
- if (c)
- ++carry;
+ carry <<= 1;
+ carry |= c;
}
-
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP))
- ++carry;
+ carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*S3*/
- BN_256_SET(t_d,15,14,0,0,0,10,9,8)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP))
- ++carry;
+ nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8);
+ carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*S4*/
- BN_256_SET(t_d,8,13,15,14,13,11,10,9)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP))
- ++carry;
+ nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9);
+ carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D1*/
- BN_256_SET(t_d,10,8,0,0,0,13,12,11)
- if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
- --carry;
+ nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11);
+ carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D2*/
- BN_256_SET(t_d,11,9,0,0,15,14,13,12)
- if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
- --carry;
+ nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12);
+ carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D3*/
- BN_256_SET(t_d,12,0,10,9,8,15,14,13)
- if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
- --carry;
+ nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13);
+ carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
/*D4*/
- BN_256_SET(t_d,13,0,11,10,9,0,15,14)
- if (bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP))
- --carry;
-
- if (carry)
+ nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14);
+ carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
+
+ /* see BN_nist_mod_224 for explanation */
+ u.f = bn_sub_words;
+ if (carry > 0)
+ carry = (int)bn_sub_words(r_d,r_d,_nist_p_256[carry-1],BN_NIST_256_TOP);
+ else if (carry < 0)
{
- if (carry > 0)
- bn_sub_words(r_d, r_d, _256_data + BN_NIST_256_TOP *
- --carry, BN_NIST_256_TOP);
- else
- {
- carry = -carry;
- bn_add_words(r_d, r_d, _256_data + BN_NIST_256_TOP *
- --carry, BN_NIST_256_TOP);
- }
+ carry = (int)bn_add_words(r_d,r_d,_nist_p_256[-carry-1],BN_NIST_256_TOP);
+ mask = 0-(size_t)carry;
+ u.p = ((size_t)bn_sub_words&mask) | ((size_t)bn_add_words&~mask);
}
+ else
+ carry = 1;
+ mask = 0-(size_t)(*u.f)(c_d,r_d,_nist_p_256[0],BN_NIST_256_TOP);
+ mask &= 0-(size_t)carry;
+ res = (BN_ULONG *)(((size_t)c_d&~mask) | ((size_t)r_d&mask));
+ nist_cp_bn(r_d, res, BN_NIST_256_TOP);
r->top = BN_NIST_256_TOP;
bn_correct_top(r);
- if (BN_ucmp(r, field) >= 0)
- {
- bn_sub_words(r_d, r_d, _nist_p_256, BN_NIST_256_TOP);
- bn_correct_top(r);
- }
- bn_check_top(r);
- return 1;
-#else
- return 0;
-#endif
- }
-
-static void _init_384_data(void)
- {
- int i;
- BN_ULONG *tmp1 = _384_data;
- const BN_ULONG *tmp2 = tmp1;
-
- memcpy(tmp1, _nist_p_384, BN_NIST_384_TOP * sizeof(BN_ULONG));
- tmp1 += BN_NIST_384_TOP;
- for (i=0; i<7; i++)
- {
- bn_add_words(tmp1, _nist_p_384, tmp2, BN_NIST_384_TOP);
- tmp2 = tmp1;
- tmp1 += BN_NIST_384_TOP;
- }
- _is_set_384_data = 1;
+ return 1;
}
-#define BN_384_SET(d,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12) \
- {\
- register BN_ULONG *td = (d);\
- BN_CP_32_FROM_BUF(td,a12); BN_CP_32_FROM_BUF(td,a11);\
- BN_CP_32_FROM_BUF(td,a10); BN_CP_32_FROM_BUF(td,a9);\
- BN_CP_32_FROM_BUF(td,a8); BN_CP_32_FROM_BUF(td,a7);\
- BN_CP_32_FROM_BUF(td,a6); BN_CP_32_FROM_BUF(td,a5);\
- BN_CP_32_FROM_BUF(td,a4); BN_CP_32_FROM_BUF(td,a3);\
- BN_CP_32_FROM_BUF(td,a2); BN_CP_32_FROM_BUF(td,a1);\
+#define nist_set_384(to,from,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12) \
+ { \
+ bn_cp_32(to, 0, from, (a12) - 12) \
+ bn_cp_32(to, 1, from, (a11) - 12) \
+ bn_cp_32(to, 2, from, (a10) - 12) \
+ bn_cp_32(to, 3, from, (a9) - 12) \
+ bn_cp_32(to, 4, from, (a8) - 12) \
+ bn_cp_32(to, 5, from, (a7) - 12) \
+ bn_cp_32(to, 6, from, (a6) - 12) \
+ bn_cp_32(to, 7, from, (a5) - 12) \
+ bn_cp_32(to, 8, from, (a4) - 12) \
+ bn_cp_32(to, 9, from, (a3) - 12) \
+ bn_cp_32(to, 10, from, (a2) - 12) \
+ bn_cp_32(to, 11, from, (a1) - 12) \
}
int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
BN_CTX *ctx)
{
-#ifndef NO_32_BIT_TYPE
- int tmp_int;
+ int i, top = a->top;
int carry = 0;
- register BN_ULONG *r_d, *a_d;
- BN_ULONG t_d[BN_NIST_384_TOP];
- BN_32_BIT_BUF(12) BN_32_BIT_BUF(13)
- BN_32_BIT_BUF(14) BN_32_BIT_BUF(15)
- BN_32_BIT_BUF(16) BN_32_BIT_BUF(17)
- BN_32_BIT_BUF(18) BN_32_BIT_BUF(19)
- BN_32_BIT_BUF(20) BN_32_BIT_BUF(21)
- BN_32_BIT_BUF(22) BN_32_BIT_BUF(23)
-
- if (!_is_set_384_data)
- {
- CRYPTO_w_lock(CRYPTO_LOCK_BN);
-
- if (!_is_set_384_data)
- _init_384_data();
-
- CRYPTO_w_unlock(CRYPTO_LOCK_BN);
- }
-
- tmp_int = BN_ucmp(field, a);
- if (tmp_int == 0)
+ register BN_ULONG *r_d, *a_d = a->d;
+ BN_ULONG t_d[BN_NIST_384_TOP],
+ buf[BN_NIST_384_TOP],
+ c_d[BN_NIST_384_TOP],
+ *res;
+ size_t mask;
+ union { bn_addsub_f f; size_t p; } u;
+ static const BIGNUM _bignum_nist_p_384_sqr = {
+ (BN_ULONG *)_nist_p_384_sqr,
+ sizeof(_nist_p_384_sqr)/sizeof(_nist_p_384_sqr[0]),
+ sizeof(_nist_p_384_sqr)/sizeof(_nist_p_384_sqr[0]),
+ 0,BN_FLG_STATIC_DATA };
+
+
+ field = &_bignum_nist_p_384; /* just to make sure */
+
+ if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_384_sqr)>=0)
+ return BN_nnmod(r, a, field, ctx);
+
+ i = BN_ucmp(field, a);
+ if (i == 0)
{
BN_zero(r);
return 1;
}
- else if (tmp_int > 0)
+ else if (i > 0)
return (r == a)? 1 : (BN_copy(r ,a) != NULL);
if (r != a)
- if (!BN_ncopy(r, a, BN_NIST_384_TOP))
- return 0;
-
- r_d = r->d;
- a_d = a->d;
- tmp_int = a->top-1;
-
- switch (tmp_int)
{
- BN_CASE_32_BIT(23, a_d)
- BN_CASE_32_BIT(22, a_d)
- BN_CASE_32_BIT(21, a_d)
- BN_CASE_32_BIT(20, a_d)
- BN_CASE_32_BIT(19, a_d)
- BN_CASE_32_BIT(18, a_d)
- BN_CASE_32_BIT(17, a_d)
- BN_CASE_32_BIT(16, a_d)
- BN_CASE_32_BIT(15, a_d)
- BN_CASE_32_BIT(14, a_d)
- BN_CASE_32_BIT(13, a_d)
- BN_CASE_32_BIT(12, a_d)
- break;
- default: /* a->top == field->top */
- return BN_usub(r, a, field);
+ if (!bn_wexpand(r, BN_NIST_384_TOP))
+ return 0;
+ r_d = r->d;
+ nist_cp_bn(r_d, a_d, BN_NIST_384_TOP);
}
+ else
+ r_d = a_d;
+
+ nist_cp_bn_0(buf, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
/*S1*/
- BN_256_SET(t_d,0,0,0,0,0,23,22,21)
+ nist_set_256(t_d, buf, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
/* left shift */
{
register BN_ULONG *ap,t,c;
ap = t_d;
c=0;
- for (tmp_int=BN_NIST_256_TOP; tmp_int != 0; --tmp_int)
+ for (i = 3; i != 0; --i)
{
t= *ap;
*(ap++)=((t<<1)|c)&BN_MASK2;
c=(t & BN_TBIT)?1:0;
}
+ *ap=c;
}
- if (bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2),
- t_d, BN_NIST_256_TOP))
- ++carry;
- /*S2*/
- BN_384_SET(t_d,23,22,21,20,19,18,17,16,15,14,13,12)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
- ++carry;
+ carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2),
+ t_d, BN_NIST_256_TOP);
+ /*S2 */
+ carry += (int)bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP);
/*S3*/
- BN_384_SET(t_d,20,19,18,17,16,15,14,13,12,23,22,21)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
- ++carry;
+ nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21);
+ carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S4*/
- BN_384_SET(t_d,19,18,17,16,15,14,13,12,20,0,23,0)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
- ++carry;
+ nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0);
+ carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S5*/
- BN_256_SET(t_d,0,0,0,0,23,22,21,20)
- if (bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2),
- t_d, BN_NIST_256_TOP))
- ++carry;
+ nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0);
+ carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*S6*/
- BN_384_SET(t_d,0,0,0,0,0,0,23,22,21,0,0,20)
- if (bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP))
- ++carry;
+ nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20);
+ carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D1*/
- BN_384_SET(t_d,22,21,20,19,18,17,16,15,14,13,12,23)
- if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP))
- --carry;
+ nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23);
+ carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D2*/
- BN_384_SET(t_d,0,0,0,0,0,0,0,23,22,21,20,0)
- if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP))
- --carry;
+ nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0);
+ carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
/*D3*/
- BN_384_SET(t_d,0,0,0,0,0,0,0,23,23,0,0,0)
- if (bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP))
- --carry;
-
- if (carry)
+ nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0);
+ carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
+
+ /* see BN_nist_mod_224 for explanation */
+ u.f = bn_sub_words;
+ if (carry > 0)
+ carry = (int)bn_sub_words(r_d,r_d,_nist_p_384[carry-1],BN_NIST_384_TOP);
+ else if (carry < 0)
{
- if (carry > 0)
- bn_sub_words(r_d, r_d, _384_data + BN_NIST_384_TOP *
- --carry, BN_NIST_384_TOP);
- else
- {
- carry = -carry;
- bn_add_words(r_d, r_d, _384_data + BN_NIST_384_TOP *
- --carry, BN_NIST_384_TOP);
- }
+ carry = (int)bn_add_words(r_d,r_d,_nist_p_384[-carry-1],BN_NIST_384_TOP);
+ mask = 0-(size_t)carry;
+ u.p = ((size_t)bn_sub_words&mask) | ((size_t)bn_add_words&~mask);
}
+ else
+ carry = 1;
+ mask = 0-(size_t)(*u.f)(c_d,r_d,_nist_p_384[0],BN_NIST_384_TOP);
+ mask &= 0-(size_t)carry;
+ res = (BN_ULONG *)(((size_t)c_d&~mask) | ((size_t)r_d&mask));
+ nist_cp_bn(r_d, res, BN_NIST_384_TOP);
r->top = BN_NIST_384_TOP;
bn_correct_top(r);
- if (BN_ucmp(r, field) >= 0)
- {
- bn_sub_words(r_d, r_d, _nist_p_384, BN_NIST_384_TOP);
- bn_correct_top(r);
- }
- bn_check_top(r);
+
return 1;
-#else
- return 0;
-#endif
}
+#define BN_NIST_521_RSHIFT (521%BN_BITS2)
+#define BN_NIST_521_LSHIFT (BN_BITS2-BN_NIST_521_RSHIFT)
+#define BN_NIST_521_TOP_MASK ((BN_ULONG)BN_MASK2>>BN_NIST_521_LSHIFT)
+
int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
BN_CTX *ctx)
{
-#if BN_BITS2 == 64
-#define BN_NIST_521_TOP_MASK (BN_ULONG)0x1FF
-#elif BN_BITS2 == 32
-#define BN_NIST_521_TOP_MASK (BN_ULONG)0x1FF
-#elif BN_BITS2 == 16
-#define BN_NIST_521_TOP_MASK (BN_ULONG)0x1FF
-#elif BN_BITS2 == 8
-#define BN_NIST_521_TOP_MASK (BN_ULONG)0x1
-#endif
- int top, ret = 0;
- BN_ULONG *r_d;
- BIGNUM *tmp;
-
- /* check whether a reduction is necessary */
- top = a->top;
- if (top < BN_NIST_521_TOP || ( top == BN_NIST_521_TOP &&
- (!(a->d[BN_NIST_521_TOP-1] & ~(BN_NIST_521_TOP_MASK)))))
+ int top = a->top, i;
+ BN_ULONG *r_d, *a_d = a->d,
+ t_d[BN_NIST_521_TOP],
+ val,tmp,*res;
+ size_t mask;
+ static const BIGNUM _bignum_nist_p_521_sqr = {
+ (BN_ULONG *)_nist_p_521_sqr,
+ sizeof(_nist_p_521_sqr)/sizeof(_nist_p_521_sqr[0]),
+ sizeof(_nist_p_521_sqr)/sizeof(_nist_p_521_sqr[0]),
+ 0,BN_FLG_STATIC_DATA };
+
+ field = &_bignum_nist_p_521; /* just to make sure */
+
+ if (BN_is_negative(a) || BN_ucmp(a,&_bignum_nist_p_521_sqr)>=0)
+ return BN_nnmod(r, a, field, ctx);
+
+ i = BN_ucmp(field, a);
+ if (i == 0)
+ {
+ BN_zero(r);
+ return 1;
+ }
+ else if (i > 0)
return (r == a)? 1 : (BN_copy(r ,a) != NULL);
- BN_CTX_start(ctx);
- tmp = BN_CTX_get(ctx);
- if (!tmp)
- goto err;
-
- if (!BN_ncopy(tmp, a, BN_NIST_521_TOP))
- return 0;
- if (!BN_rshift(r, a, 521))
- return 0;
-
- if (tmp->top == BN_NIST_521_TOP)
- tmp->d[BN_NIST_521_TOP-1] &= BN_NIST_521_TOP_MASK;
-
- bn_correct_top(tmp);
- if (!BN_uadd(r, tmp, r))
- return 0;
- top = r->top;
- r_d = r->d;
- if (top == BN_NIST_521_TOP &&
- (r_d[BN_NIST_521_TOP-1] & ~(BN_NIST_521_TOP_MASK)))
+ if (r != a)
{
- BN_NIST_ADD_ONE(r_d)
- r_d[BN_NIST_521_TOP-1] &= BN_NIST_521_TOP_MASK;
+ if (!bn_wexpand(r,BN_NIST_521_TOP))
+ return 0;
+ r_d = r->d;
+ nist_cp_bn(r_d,a_d, BN_NIST_521_TOP);
}
- bn_correct_top(r);
+ else
+ r_d = a_d;
- ret = 1;
-err:
- BN_CTX_end(ctx);
+ /* upper 521 bits, copy ... */
+ nist_cp_bn_0(t_d,a_d + (BN_NIST_521_TOP-1), top - (BN_NIST_521_TOP-1),BN_NIST_521_TOP);
+ /* ... and right shift */
+ for (val=t_d[0],i=0; i<BN_NIST_521_TOP-1; i++)
+ {
+ tmp = val>>BN_NIST_521_RSHIFT;
+ val = t_d[i+1];
+ t_d[i] = (tmp | val<<BN_NIST_521_LSHIFT) & BN_MASK2;
+ }
+ t_d[i] = val>>BN_NIST_521_RSHIFT;
+ /* lower 521 bits */
+ r_d[i] &= BN_NIST_521_TOP_MASK;
+
+ bn_add_words(r_d,r_d,t_d,BN_NIST_521_TOP);
+ mask = 0-(size_t)bn_sub_words(t_d,r_d,_nist_p_521,BN_NIST_521_TOP);
+ res = (BN_ULONG *)(((size_t)t_d&~mask) | ((size_t)r_d&mask));
+ nist_cp_bn(r_d,res,BN_NIST_521_TOP);
+ r->top = BN_NIST_521_TOP;
+ bn_correct_top(r);
- bn_check_top(r);
- return ret;
+ return 1;
}
diff --git a/crypto/bn/bn_prime.c b/crypto/bn/bn_prime.c
index d03403a600..7b25979dd1 100644
--- a/crypto/bn/bn_prime.c
+++ b/crypto/bn/bn_prime.c
@@ -258,7 +258,8 @@ int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
/* first look for small factors */
if (!BN_is_odd(a))
- return 0;
+ /* a is even => a is prime if and only if a == 2 */
+ return BN_is_word(a, 2);
if (do_trial_division)
{
for (i = 1; i < NUMPRIMES; i++)
@@ -376,14 +377,15 @@ static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
static int probable_prime(BIGNUM *rnd, int bits)
{
int i;
- BN_ULONG mods[NUMPRIMES];
- BN_ULONG delta,d;
+ prime_t mods[NUMPRIMES];
+ BN_ULONG delta,maxdelta;
again:
if (!BN_rand(rnd,bits,1,1)) return(0);
/* we now have a random number 'rand' to test. */
for (i=1; i<NUMPRIMES; i++)
- mods[i]=BN_mod_word(rnd,(BN_ULONG)primes[i]);
+ mods[i]=(prime_t)BN_mod_word(rnd,(BN_ULONG)primes[i]);
+ maxdelta=BN_MASK2 - primes[NUMPRIMES-1];
delta=0;
loop: for (i=1; i<NUMPRIMES; i++)
{
@@ -391,12 +393,8 @@ again:
* that gcd(rnd-1,primes) == 1 (except for 2) */
if (((mods[i]+delta)%primes[i]) <= 1)
{
- d=delta;
delta+=2;
- /* perhaps need to check for overflow of
- * delta (but delta can be up to 2^32)
- * 21-May-98 eay - added overflow check */
- if (delta < d) goto again;
+ if (delta > maxdelta) goto again;
goto loop;
}
}
diff --git a/crypto/bn/bn_prime.h b/crypto/bn/bn_prime.h
index b7cf9a9bfe..51d2194feb 100644
--- a/crypto/bn/bn_prime.h
+++ b/crypto/bn/bn_prime.h
@@ -58,10 +58,12 @@
#ifndef EIGHT_BIT
#define NUMPRIMES 2048
+typedef unsigned short prime_t;
#else
#define NUMPRIMES 54
+typedef unsigned char prime_t;
#endif
-static const unsigned int primes[NUMPRIMES]=
+static const prime_t primes[NUMPRIMES]=
{
2, 3, 5, 7, 11, 13, 17, 19,
23, 29, 31, 37, 41, 43, 47, 53,
diff --git a/crypto/bn/bn_prime.pl b/crypto/bn/bn_prime.pl
index e583d1d53b..3fafb6f3e9 100644
--- a/crypto/bn/bn_prime.pl
+++ b/crypto/bn/bn_prime.pl
@@ -101,10 +101,12 @@ for ($i=0; $i <= $#primes; $i++)
printf "#ifndef EIGHT_BIT\n";
printf "#define NUMPRIMES %d\n",$num;
+printf "typedef unsigned short prime_t;\n";
printf "#else\n";
printf "#define NUMPRIMES %d\n",$eight;
+printf "typedef unsigned char prime_t;\n";
printf "#endif\n";
-print "static const unsigned int primes[NUMPRIMES]=\n\t{\n\t";
+print "static const prime_t primes[NUMPRIMES]=\n\t{\n\t";
$init=0;
for ($i=0; $i <= $#primes; $i++)
{
diff --git a/crypto/bn/bn_print.c b/crypto/bn/bn_print.c
index 5fb8473f37..bebb466d08 100644
--- a/crypto/bn/bn_print.c
+++ b/crypto/bn/bn_print.c
@@ -62,7 +62,7 @@
#include <openssl/buffer.h>
#include "bn_lcl.h"
-static const char *Hex="0123456789ABCDEF";
+static const char Hex[]="0123456789ABCDEF";
/* Must 'OPENSSL_free' the returned data */
char *BN_bn2hex(const BIGNUM *a)
@@ -134,7 +134,7 @@ char *BN_bn2dec(const BIGNUM *a)
}
else
{
- if (BN_get_sign(t))
+ if (BN_is_negative(t))
*p++ = '-';
i=0;
@@ -294,6 +294,27 @@ err:
return(0);
}
+int BN_asc2bn(BIGNUM **bn, const char *a)
+ {
+ const char *p = a;
+ if (*p == '-')
+ p++;
+
+ if (p[0] == '0' && (p[1] == 'X' || p[1] == 'x'))
+ {
+ if (!BN_hex2bn(bn, p + 2))
+ return 0;
+ }
+ else
+ {
+ if (!BN_dec2bn(bn, p))
+ return 0;
+ }
+ if (*a == '-')
+ (*bn)->neg = 1;
+ return 1;
+ }
+
#ifndef OPENSSL_NO_BIO
#ifndef OPENSSL_NO_FP_API
int BN_print_fp(FILE *fp, const BIGNUM *a)
diff --git a/crypto/bn/bn_rand.c b/crypto/bn/bn_rand.c
index 323bfa74bc..b376c28ff3 100644
--- a/crypto/bn/bn_rand.c
+++ b/crypto/bn/bn_rand.c
@@ -134,7 +134,7 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
buf=(unsigned char *)OPENSSL_malloc(bytes);
if (buf == NULL)
{
- BNerr(BN_F_BN_RAND,ERR_R_MALLOC_FAILURE);
+ BNerr(BN_F_BNRAND,ERR_R_MALLOC_FAILURE);
goto err;
}
@@ -227,7 +227,7 @@ int BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom)
/* random number r: 0 <= r < range */
-static int bn_rand_range(int pseudo, BIGNUM *r, BIGNUM *range)
+static int bn_rand_range(int pseudo, BIGNUM *r, const BIGNUM *range)
{
int (*bn_rand)(BIGNUM *, int, int, int) = pseudo ? BN_pseudo_rand : BN_rand;
int n;
@@ -294,12 +294,12 @@ static int bn_rand_range(int pseudo, BIGNUM *r, BIGNUM *range)
}
-int BN_rand_range(BIGNUM *r, BIGNUM *range)
+int BN_rand_range(BIGNUM *r, const BIGNUM *range)
{
return bn_rand_range(0, r, range);
}
-int BN_pseudo_rand_range(BIGNUM *r, BIGNUM *range)
+int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range)
{
return bn_rand_range(1, r, range);
}
diff --git a/crypto/bn/bn_recp.c b/crypto/bn/bn_recp.c
index a08489e04a..2e8efb8dae 100644
--- a/crypto/bn/bn_recp.c
+++ b/crypto/bn/bn_recp.c
@@ -191,7 +191,7 @@ int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
{
if (j++ > 2)
{
- BNerr(BN_F_BN_MOD_MUL_RECIPROCAL,BN_R_BAD_RECIPROCAL);
+ BNerr(BN_F_BN_DIV_RECP,BN_R_BAD_RECIPROCAL);
goto err;
}
if (!BN_usub(r,r,&(recp->N))) goto err;
@@ -204,8 +204,8 @@ int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
ret=1;
err:
BN_CTX_end(ctx);
- if(dv) bn_check_top(dv);
- if(rem) bn_check_top(rem);
+ bn_check_top(dv);
+ bn_check_top(rem);
return(ret);
}
diff --git a/crypto/bn/bn_shift.c b/crypto/bn/bn_shift.c
index de9312dce2..c4d301afc4 100644
--- a/crypto/bn/bn_shift.c
+++ b/crypto/bn/bn_shift.c
@@ -177,7 +177,7 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
nw=n/BN_BITS2;
rb=n%BN_BITS2;
lb=BN_BITS2-rb;
- if (nw > a->top || a->top == 0)
+ if (nw >= a->top || a->top == 0)
{
BN_zero(r);
return(1);
diff --git a/crypto/bn/bn_sqr.c b/crypto/bn/bn_sqr.c
index 3b4b3f0d38..270d0cd348 100644
--- a/crypto/bn/bn_sqr.c
+++ b/crypto/bn/bn_sqr.c
@@ -148,8 +148,8 @@ int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
if (rr != r) BN_copy(r,rr);
ret = 1;
err:
- if(rr) bn_check_top(rr);
- if(tmp) bn_check_top(tmp);
+ bn_check_top(rr);
+ bn_check_top(tmp);
BN_CTX_end(ctx);
return(ret);
}
diff --git a/crypto/bn/bn_sqrt.c b/crypto/bn/bn_sqrt.c
index 924ee274df..6beaf9e5e5 100644
--- a/crypto/bn/bn_sqrt.c
+++ b/crypto/bn/bn_sqrt.c
@@ -1,4 +1,4 @@
-/* crypto/bn/bn_mod.c */
+/* crypto/bn/bn_sqrt.c */
/* Written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
* and Bodo Moeller for the OpenSSL project. */
/* ====================================================================
@@ -83,7 +83,8 @@ BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
goto end;
if (!BN_set_word(ret, BN_is_bit_set(a, 0)))
{
- BN_free(ret);
+ if (ret != in)
+ BN_free(ret);
return NULL;
}
bn_check_top(ret);
@@ -102,7 +103,8 @@ BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
goto end;
if (!BN_set_word(ret, BN_is_one(a)))
{
- BN_free(ret);
+ if (ret != in)
+ BN_free(ret);
return NULL;
}
bn_check_top(ret);
diff --git a/crypto/bn/bn_word.c b/crypto/bn/bn_word.c
index 1bcb37e292..ee7b87c45c 100644
--- a/crypto/bn/bn_word.c
+++ b/crypto/bn/bn_word.c
@@ -69,6 +69,9 @@ BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w)
#endif
int i;
+ if (w == 0)
+ return (BN_ULONG)-1;
+
bn_check_top(a);
w&=BN_MASK2;
for (i=a->top-1; i>=0; i--)
@@ -94,7 +97,7 @@ BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w)
if (!w)
/* actually this an error (division by zero) */
- return 0;
+ return (BN_ULONG)-1;
if (a->top == 0)
return 0;
@@ -102,7 +105,7 @@ BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w)
j = BN_BITS2 - BN_num_bits_word(w);
w <<= j;
if (!BN_lshift(a, a, j))
- return 0;
+ return (BN_ULONG)-1;
for (i=a->top-1; i>=0; i--)
{
@@ -175,7 +178,13 @@ int BN_sub_word(BIGNUM *a, BN_ULONG w)
/* degenerate case: w is zero */
if (!w) return 1;
/* degenerate case: a is zero */
- if(BN_is_zero(a)) return BN_set_word(a,w);
+ if(BN_is_zero(a))
+ {
+ i = BN_set_word(a,w);
+ if (i != 0)
+ BN_set_negative(a, 1);
+ return i;
+ }
/* handle 'a' when negative */
if (a->neg)
{
diff --git a/crypto/bn/bntest.c b/crypto/bn/bntest.c
index 9169cc8813..0cd99c5b4b 100644
--- a/crypto/bn/bntest.c
+++ b/crypto/bn/bntest.c
@@ -106,6 +106,7 @@ int test_mont(BIO *bp,BN_CTX *ctx);
int test_mod(BIO *bp,BN_CTX *ctx);
int test_mod_mul(BIO *bp,BN_CTX *ctx);
int test_mod_exp(BIO *bp,BN_CTX *ctx);
+int test_mod_exp_mont_consttime(BIO *bp,BN_CTX *ctx);
int test_exp(BIO *bp,BN_CTX *ctx);
int test_gf2m_add(BIO *bp);
int test_gf2m_mod(BIO *bp);
@@ -183,116 +184,120 @@ int main(int argc, char *argv[])
message(out,"BN_add");
if (!test_add(out)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_sub");
if (!test_sub(out)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_lshift1");
if (!test_lshift1(out)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_lshift (fixed)");
if (!test_lshift(out,ctx,BN_bin2bn(lst,sizeof(lst)-1,NULL)))
goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_lshift");
if (!test_lshift(out,ctx,NULL)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_rshift1");
if (!test_rshift1(out)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_rshift");
if (!test_rshift(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_sqr");
if (!test_sqr(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_mul");
if (!test_mul(out)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_div");
if (!test_div(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_div_word");
if (!test_div_word(out)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_div_recp");
if (!test_div_recp(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_mod");
if (!test_mod(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_mod_mul");
if (!test_mod_mul(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_mont");
if (!test_mont(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_mod_exp");
if (!test_mod_exp(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
+
+ message(out,"BN_mod_exp_mont_consttime");
+ if (!test_mod_exp_mont_consttime(out,ctx)) goto err;
+ (void)BIO_flush(out);
message(out,"BN_exp");
if (!test_exp(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_kronecker");
if (!test_kron(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_mod_sqrt");
if (!test_sqrt(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_GF2m_add");
if (!test_gf2m_add(out)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_GF2m_mod");
if (!test_gf2m_mod(out)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_GF2m_mod_mul");
if (!test_gf2m_mod_mul(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_GF2m_mod_sqr");
if (!test_gf2m_mod_sqr(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_GF2m_mod_inv");
if (!test_gf2m_mod_inv(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_GF2m_mod_div");
if (!test_gf2m_mod_div(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_GF2m_mod_exp");
if (!test_gf2m_mod_exp(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_GF2m_mod_sqrt");
if (!test_gf2m_mod_sqrt(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
message(out,"BN_GF2m_mod_solve_quad");
if (!test_gf2m_mod_solve_quad(out,ctx)) goto err;
- BIO_flush(out);
+ (void)BIO_flush(out);
BN_CTX_free(ctx);
BIO_free(out);
@@ -302,7 +307,7 @@ int main(int argc, char *argv[])
err:
BIO_puts(out,"1\n"); /* make sure the Perl script fed by bc notices
* the failure, see test_bn in test/Makefile.ssl*/
- BIO_flush(out);
+ (void)BIO_flush(out);
ERR_load_crypto_strings();
ERR_print_errors_fp(stderr);
EXIT(1);
@@ -481,7 +486,7 @@ static void print_word(BIO *bp,BN_ULONG w)
return;
}
#endif
- BIO_printf(bp,"%lX",w);
+ BIO_printf(bp,BN_HEX_FMT1,w);
}
int test_div_word(BIO *bp)
@@ -727,6 +732,8 @@ int test_mont(BIO *bp, BN_CTX *ctx)
BN_init(&n);
mont=BN_MONT_CTX_new();
+ if (mont == NULL)
+ return 0;
BN_bntest_rand(&a,100,0,0); /**/
BN_bntest_rand(&b,100,0,0); /**/
@@ -921,6 +928,57 @@ int test_mod_exp(BIO *bp, BN_CTX *ctx)
BN_bntest_rand(b,2+i,0,0); /**/
if (!BN_mod_exp(d,a,b,c,ctx))
+ return(0);
+
+ if (bp != NULL)
+ {
+ if (!results)
+ {
+ BN_print(bp,a);
+ BIO_puts(bp," ^ ");
+ BN_print(bp,b);
+ BIO_puts(bp," % ");
+ BN_print(bp,c);
+ BIO_puts(bp," - ");
+ }
+ BN_print(bp,d);
+ BIO_puts(bp,"\n");
+ }
+ BN_exp(e,a,b,ctx);
+ BN_sub(e,e,d);
+ BN_div(a,b,e,c,ctx);
+ if(!BN_is_zero(b))
+ {
+ fprintf(stderr,"Modulo exponentiation test failed!\n");
+ return 0;
+ }
+ }
+ BN_free(a);
+ BN_free(b);
+ BN_free(c);
+ BN_free(d);
+ BN_free(e);
+ return(1);
+ }
+
+int test_mod_exp_mont_consttime(BIO *bp, BN_CTX *ctx)
+ {
+ BIGNUM *a,*b,*c,*d,*e;
+ int i;
+
+ a=BN_new();
+ b=BN_new();
+ c=BN_new();
+ d=BN_new();
+ e=BN_new();
+
+ BN_bntest_rand(c,30,0,1); /* must be odd for montgomery */
+ for (i=0; i<num2; i++)
+ {
+ BN_bntest_rand(a,20+i*5,0,0); /**/
+ BN_bntest_rand(b,2+i,0,0); /**/
+
+ if (!BN_mod_exp_mont_consttime(d,a,b,c,ctx,NULL))
return(00);
if (bp != NULL)
@@ -971,8 +1029,8 @@ int test_exp(BIO *bp, BN_CTX *ctx)
BN_bntest_rand(a,20+i*5,0,0); /**/
BN_bntest_rand(b,2+i,0,0); /**/
- if (!BN_exp(d,a,b,ctx))
- return(00);
+ if (BN_exp(d,a,b,ctx) <= 0)
+ return(0);
if (bp != NULL)
{
@@ -1060,8 +1118,8 @@ int test_gf2m_mod(BIO *bp)
{
BIGNUM *a,*b[2],*c,*d,*e;
int i, j, ret = 0;
- unsigned int p0[] = {163,7,6,3,0};
- unsigned int p1[] = {193,15,0};
+ int p0[] = {163,7,6,3,0,-1};
+ int p1[] = {193,15,0,-1};
a=BN_new();
b[0]=BN_new();
@@ -1118,8 +1176,8 @@ int test_gf2m_mod_mul(BIO *bp,BN_CTX *ctx)
{
BIGNUM *a,*b[2],*c,*d,*e,*f,*g,*h;
int i, j, ret = 0;
- unsigned int p0[] = {163,7,6,3,0};
- unsigned int p1[] = {193,15,0};
+ int p0[] = {163,7,6,3,0,-1};
+ int p1[] = {193,15,0,-1};
a=BN_new();
b[0]=BN_new();
@@ -1189,8 +1247,8 @@ int test_gf2m_mod_sqr(BIO *bp,BN_CTX *ctx)
{
BIGNUM *a,*b[2],*c,*d;
int i, j, ret = 0;
- unsigned int p0[] = {163,7,6,3,0};
- unsigned int p1[] = {193,15,0};
+ int p0[] = {163,7,6,3,0,-1};
+ int p1[] = {193,15,0,-1};
a=BN_new();
b[0]=BN_new();
@@ -1248,8 +1306,8 @@ int test_gf2m_mod_inv(BIO *bp,BN_CTX *ctx)
{
BIGNUM *a,*b[2],*c,*d;
int i, j, ret = 0;
- unsigned int p0[] = {163,7,6,3,0};
- unsigned int p1[] = {193,15,0};
+ int p0[] = {163,7,6,3,0,-1};
+ int p1[] = {193,15,0,-1};
a=BN_new();
b[0]=BN_new();
@@ -1303,8 +1361,8 @@ int test_gf2m_mod_div(BIO *bp,BN_CTX *ctx)
{
BIGNUM *a,*b[2],*c,*d,*e,*f;
int i, j, ret = 0;
- unsigned int p0[] = {163,7,6,3,0};
- unsigned int p1[] = {193,15,0};
+ int p0[] = {163,7,6,3,0,-1};
+ int p1[] = {193,15,0,-1};
a=BN_new();
b[0]=BN_new();
@@ -1366,8 +1424,8 @@ int test_gf2m_mod_exp(BIO *bp,BN_CTX *ctx)
{
BIGNUM *a,*b[2],*c,*d,*e,*f;
int i, j, ret = 0;
- unsigned int p0[] = {163,7,6,3,0};
- unsigned int p1[] = {193,15,0};
+ int p0[] = {163,7,6,3,0,-1};
+ int p1[] = {193,15,0,-1};
a=BN_new();
b[0]=BN_new();
@@ -1437,8 +1495,8 @@ int test_gf2m_mod_sqrt(BIO *bp,BN_CTX *ctx)
{
BIGNUM *a,*b[2],*c,*d,*e,*f;
int i, j, ret = 0;
- unsigned int p0[] = {163,7,6,3,0};
- unsigned int p1[] = {193,15,0};
+ int p0[] = {163,7,6,3,0,-1};
+ int p1[] = {193,15,0,-1};
a=BN_new();
b[0]=BN_new();
@@ -1496,8 +1554,8 @@ int test_gf2m_mod_solve_quad(BIO *bp,BN_CTX *ctx)
{
BIGNUM *a,*b[2],*c,*d,*e;
int i, j, s = 0, t, ret = 0;
- unsigned int p0[] = {163,7,6,3,0};
- unsigned int p1[] = {193,15,0};
+ int p0[] = {163,7,6,3,0,-1};
+ int p1[] = {193,15,0,-1};
a=BN_new();
b[0]=BN_new();
diff --git a/crypto/bn/expspeed.c b/crypto/bn/expspeed.c
index 07a1bcf51c..4d5f221f33 100644
--- a/crypto/bn/expspeed.c
+++ b/crypto/bn/expspeed.c
@@ -321,7 +321,7 @@ void do_mul_exp(BIGNUM *r, BIGNUM *a, BIGNUM *b, BIGNUM *c, BN_CTX *ctx)
#else /* TEST_SQRT */
"2*sqrt [prime == %d (mod 64)] %4d %4d mod %4d"
#endif
- " -> %8.3fms %5.1f (%ld)\n",
+ " -> %8.6fms %5.1f (%ld)\n",
#ifdef TEST_SQRT
P_MOD_64,
#endif
diff --git a/crypto/bn/exptest.c b/crypto/bn/exptest.c
index 37aec55b89..074a8e882a 100644
--- a/crypto/bn/exptest.c
+++ b/crypto/bn/exptest.c
@@ -77,7 +77,7 @@ int main(int argc, char *argv[])
BIO *out=NULL;
int i,ret;
unsigned char c;
- BIGNUM *r_mont,*r_recp,*r_simple,*a,*b,*m;
+ BIGNUM *r_mont,*r_mont_const,*r_recp,*r_simple,*a,*b,*m;
RAND_seed(rnd_seed, sizeof rnd_seed); /* or BN_rand may fail, and we don't
* even check its return value
@@ -88,6 +88,7 @@ int main(int argc, char *argv[])
ctx=BN_CTX_new();
if (ctx == NULL) EXIT(1);
r_mont=BN_new();
+ r_mont_const=BN_new();
r_recp=BN_new();
r_simple=BN_new();
a=BN_new();
@@ -143,8 +144,17 @@ int main(int argc, char *argv[])
EXIT(1);
}
+ ret=BN_mod_exp_mont_consttime(r_mont_const,a,b,m,ctx,NULL);
+ if (ret <= 0)
+ {
+ printf("BN_mod_exp_mont_consttime() problems\n");
+ ERR_print_errors(out);
+ EXIT(1);
+ }
+
if (BN_cmp(r_simple, r_mont) == 0
- && BN_cmp(r_simple,r_recp) == 0)
+ && BN_cmp(r_simple,r_recp) == 0
+ && BN_cmp(r_simple,r_mont_const) == 0)
{
printf(".");
fflush(stdout);
@@ -153,6 +163,8 @@ int main(int argc, char *argv[])
{
if (BN_cmp(r_simple,r_mont) != 0)
printf("\nsimple and mont results differ\n");
+ if (BN_cmp(r_simple,r_mont_const) != 0)
+ printf("\nsimple and mont const time results differ\n");
if (BN_cmp(r_simple,r_recp) != 0)
printf("\nsimple and recp results differ\n");
@@ -162,18 +174,20 @@ int main(int argc, char *argv[])
printf("\nsimple ="); BN_print(out,r_simple);
printf("\nrecp ="); BN_print(out,r_recp);
printf("\nmont ="); BN_print(out,r_mont);
+ printf("\nmont_ct ="); BN_print(out,r_mont_const);
printf("\n");
EXIT(1);
}
}
BN_free(r_mont);
+ BN_free(r_mont_const);
BN_free(r_recp);
BN_free(r_simple);
BN_free(a);
BN_free(b);
BN_free(m);
BN_CTX_free(ctx);
- ERR_remove_state(0);
+ ERR_remove_thread_state(NULL);
CRYPTO_mem_leaks(out);
BIO_free(out);
printf(" done\n");